author     Dimitry Andric <dim@FreeBSD.org>    2023-12-18 20:30:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2024-04-06 20:11:55 +0000
commit     5f757f3ff9144b609b3c433dfd370cc6bdc191ad (patch)
tree       1b4e980b866cd26a00af34c0a653eb640bd09caf /contrib/llvm-project/llvm/lib/Target/AMDGPU
parent     3e1c8a35f741a5d114d0ba670b15191355711fe9 (diff)
parent     312c0ed19cc5276a17bacf2120097bec4515b0f1 (diff)
Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h | 176
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td | 258
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 28
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 135
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 233
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 132
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 160
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 19
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 355
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp | 32
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp | 41
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp | 68
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 548
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 102
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp | 186
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 298
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 257
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 19
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 336
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp | 122
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 111
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td | 28
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 646
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 18
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 50
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 885
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 15
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 1234
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp | 92
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h | 25
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 84
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp | 77
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 1
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h | 11
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 138
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp | 107
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 29
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 85
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 556
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 53
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp | 27
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp | 32
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 118
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h | 13
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 180
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 29
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 734
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td | 1339
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td | 385
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 324
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h | 26
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td | 26
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td | 386
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 27
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td | 16
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp | 320
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h | 101
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp | 68
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 31
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h | 173
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp | 20
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 192
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 53
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 68
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 11
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td | 759
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/R600.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 42
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h | 122
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 94
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 310
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 274
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2070
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h | 31
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 220
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td | 84
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1301
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h | 157
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td | 211
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td | 235
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp | 19
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 93
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp | 67
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp | 388
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.h | 97
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 136
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp | 141
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 195
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 103
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp | 154
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h | 63
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 41
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 20
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp | 41
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 119
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 39
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 158
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td | 64
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp | 21
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp | 9
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td | 157
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td | 1148
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp | 49
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 302
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 80
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 24
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h | 9
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VINTERPInstructions.td | 38
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td | 469
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td | 691
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td | 404
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 380
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td | 597
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPDInstructions.td | 141
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td | 414
161 files changed, 17573 insertions, 7907 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
index c25194c02f72..35d33cb60bc4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -12,10 +12,12 @@
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
#include "llvm/Support/CodeGen.h"
namespace llvm {
+class AMDGPUTargetMachine;
class TargetMachine;
// GlobalISel passes
@@ -34,6 +36,7 @@ FunctionPass *createSIAnnotateControlFlowPass();
FunctionPass *createSIFoldOperandsPass();
FunctionPass *createSIPeepholeSDWAPass();
FunctionPass *createSILowerI1CopiesPass();
+FunctionPass *createAMDGPUGlobalISelDivergenceLoweringPass();
FunctionPass *createSIShrinkInstructionsPass();
FunctionPass *createSILoadStoreOptimizerPass();
FunctionPass *createSIWholeQuadModePass();
@@ -41,25 +44,32 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIOptimizeVGPRLiveRangePass();
FunctionPass *createSIFixSGPRCopiesPass();
+FunctionPass *createLowerWWMCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIPreAllocateWWMRegsPass();
FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createSIPostRABundlerPass();
-FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
-FunctionPass *createAMDGPUUseNativeCallsPass();
+FunctionPass *createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *);
ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
FunctionPass *createAMDGPURewriteOutArgumentsPass();
-ModulePass *createAMDGPULowerModuleLDSPass();
+ModulePass *
+createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM = nullptr);
FunctionPass *createSIModeRegisterPass();
FunctionPass *createGCNPreRAOptimizationsPass();
struct AMDGPUSimplifyLibCallsPass : PassInfoMixin<AMDGPUSimplifyLibCallsPass> {
- AMDGPUSimplifyLibCallsPass(TargetMachine &TM) : TM(TM) {}
+ AMDGPUSimplifyLibCallsPass() {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+struct AMDGPUImageIntrinsicOptimizerPass
+ : PassInfoMixin<AMDGPUImageIntrinsicOptimizerPass> {
+ AMDGPUImageIntrinsicOptimizerPass(TargetMachine &TM) : TM(TM) {}
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
private:
@@ -78,8 +88,8 @@ extern char &AMDGPUMachineCFGStructurizerID;
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
Pass *createAMDGPUAnnotateKernelFeaturesPass();
-Pass *createAMDGPUAttributorPass();
-void initializeAMDGPUAttributorPass(PassRegistry &);
+Pass *createAMDGPUAttributorLegacyPass();
+void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
@@ -116,10 +126,13 @@ struct AMDGPULowerKernelAttributesPass
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-void initializeAMDGPULowerModuleLDSPass(PassRegistry &);
-extern char &AMDGPULowerModuleLDSID;
+void initializeAMDGPULowerModuleLDSLegacyPass(PassRegistry &);
+extern char &AMDGPULowerModuleLDSLegacyPassID;
struct AMDGPULowerModuleLDSPass : PassInfoMixin<AMDGPULowerModuleLDSPass> {
+ const AMDGPUTargetMachine &TM;
+ AMDGPULowerModuleLDSPass(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
+
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
};
@@ -144,9 +157,15 @@ extern char &SIFixSGPRCopiesID;
void initializeSIFixVGPRCopiesPass(PassRegistry &);
extern char &SIFixVGPRCopiesID;
+void initializeSILowerWWMCopiesPass(PassRegistry &);
+extern char &SILowerWWMCopiesID;
+
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
+void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &);
+extern char &AMDGPUGlobalISelDivergenceLoweringID;
+
void initializeSILowerSGPRSpillsPass(PassRegistry &);
extern char &SILowerSGPRSpillsID;
@@ -171,15 +190,15 @@ extern char &SIOptimizeExecMaskingID;
void initializeSIPreAllocateWWMRegsPass(PassRegistry &);
extern char &SIPreAllocateWWMRegsID;
-void initializeAMDGPUSimplifyLibCallsPass(PassRegistry &);
-extern char &AMDGPUSimplifyLibCallsID;
-
-void initializeAMDGPUUseNativeCallsPass(PassRegistry &);
-extern char &AMDGPUUseNativeCallsID;
+void initializeAMDGPUImageIntrinsicOptimizerPass(PassRegistry &);
+extern char &AMDGPUImageIntrinsicOptimizerID;
void initializeAMDGPUPerfHintAnalysisPass(PassRegistry &);
extern char &AMDGPUPerfHintAnalysisID;
+void initializeGCNRegPressurePrinterPass(PassRegistry &);
+extern char &GCNRegPressurePrinterID;
+
// Passes common to R600 and SI
FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
@@ -217,8 +236,7 @@ private:
};
Pass *createAMDGPUStructurizeCFGPass();
-FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
- CodeGenOpt::Level OptLevel);
+FunctionPass *createAMDGPUISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel);
ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
struct AMDGPUAlwaysInlinePass : PassInfoMixin<AMDGPUAlwaysInlinePass> {
@@ -239,6 +257,25 @@ public:
PreservedAnalyses run(Function &, FunctionAnalysisManager &);
};
+class AMDGPULowerKernelArgumentsPass
+ : public PassInfoMixin<AMDGPULowerKernelArgumentsPass> {
+private:
+ TargetMachine &TM;
+
+public:
+ AMDGPULowerKernelArgumentsPass(TargetMachine &TM) : TM(TM){};
+ PreservedAnalyses run(Function &, FunctionAnalysisManager &);
+};
+
+class AMDGPUAttributorPass : public PassInfoMixin<AMDGPUAttributorPass> {
+private:
+ TargetMachine &TM;
+
+public:
+ AMDGPUAttributorPass(TargetMachine &TM) : TM(TM){};
+ PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
FunctionPass *createAMDGPUAnnotateUniformValues();
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -279,9 +316,16 @@ extern char &AMDGPURemoveIncompatibleFunctionsID;
void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
extern char &AMDGPULateCodeGenPrepareID;
-FunctionPass *createAMDGPURewriteUndefForPHIPass();
-void initializeAMDGPURewriteUndefForPHIPass(PassRegistry &);
-extern char &AMDGPURewriteUndefForPHIPassID;
+FunctionPass *createAMDGPURewriteUndefForPHILegacyPass();
+void initializeAMDGPURewriteUndefForPHILegacyPass(PassRegistry &);
+extern char &AMDGPURewriteUndefForPHILegacyPassID;
+
+class AMDGPURewriteUndefForPHIPass
+ : public PassInfoMixin<AMDGPURewriteUndefForPHIPass> {
+public:
+ AMDGPURewriteUndefForPHIPass() = default;
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
void initializeSIAnnotateControlFlowPass(PassRegistry&);
extern char &SIAnnotateControlFlowPassID;
@@ -295,6 +339,9 @@ extern char &SIModeRegisterID;
void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
extern char &AMDGPUInsertDelayAluID;
+void initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &);
+extern char &AMDGPUInsertSingleUseVDSTID;
+
void initializeSIInsertHardClausesPass(PassRegistry &);
extern char &SIInsertHardClausesID;
@@ -347,72 +394,6 @@ enum TargetIndex {
TI_SCRATCH_RSRC_DWORD2,
TI_SCRATCH_RSRC_DWORD3
};
-}
-
-/// OpenCL uses address spaces to differentiate between
-/// various memory regions on the hardware. On the CPU
-/// all of the address spaces point to the same memory,
-/// however on the GPU, each address space points to
-/// a separate piece of memory that is unique from other
-/// memory locations.
-namespace AMDGPUAS {
-enum : unsigned {
- // The maximum value for flat, generic, local, private, constant and region.
- MAX_AMDGPU_ADDRESS = 8,
-
- FLAT_ADDRESS = 0, ///< Address space for flat memory.
- GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- REGION_ADDRESS = 2, ///< Address space for region memory. (GDS)
-
- CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2).
- LOCAL_ADDRESS = 3, ///< Address space for local memory.
- PRIVATE_ADDRESS = 5, ///< Address space for private memory.
-
- CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory.
-
- BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers.
- ///< Not used in backend.
-
- BUFFER_RESOURCE = 8, ///< Address space for 128-bit buffer resources.
-
- /// Internal address spaces. Can be freely renumbered.
- STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers.
- /// end Internal address spaces.
-
- /// Address space for direct addressable parameter memory (CONST0).
- PARAM_D_ADDRESS = 6,
- /// Address space for indirect addressable parameter memory (VTX1).
- PARAM_I_ADDRESS = 7,
-
- // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on
- // this order to be able to dynamically index a constant buffer, for
- // example:
- //
- // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
-
- CONSTANT_BUFFER_0 = 8,
- CONSTANT_BUFFER_1 = 9,
- CONSTANT_BUFFER_2 = 10,
- CONSTANT_BUFFER_3 = 11,
- CONSTANT_BUFFER_4 = 12,
- CONSTANT_BUFFER_5 = 13,
- CONSTANT_BUFFER_6 = 14,
- CONSTANT_BUFFER_7 = 15,
- CONSTANT_BUFFER_8 = 16,
- CONSTANT_BUFFER_9 = 17,
- CONSTANT_BUFFER_10 = 18,
- CONSTANT_BUFFER_11 = 19,
- CONSTANT_BUFFER_12 = 20,
- CONSTANT_BUFFER_13 = 21,
- CONSTANT_BUFFER_14 = 22,
- CONSTANT_BUFFER_15 = 23,
-
- // Some places use this if the address space can't be determined.
- UNKNOWN_ADDRESS_SPACE = ~0u,
-};
-}
-
-namespace AMDGPU {
// FIXME: Missing constant_32bit
inline bool isFlatGlobalAddrSpace(unsigned AS) {
@@ -429,24 +410,25 @@ inline bool isExtendedGlobalAddrSpace(unsigned AS) {
}
static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) {
- static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 8, "Addr space out of range");
+ static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 9, "Addr space out of range");
if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
return true;
- // This array is indexed by address space value enum elements 0 ... to 8
+ // This array is indexed by address space value enum elements 0 ... to 9
// clang-format off
- static const bool ASAliasRules[9][9] = {
- /* Flat Global Region Group Constant Private Const32 BufFatPtr BufRsrc */
- /* Flat */ {true, true, false, true, true, true, true, true, true},
- /* Global */ {true, true, false, false, true, false, true, true, true},
- /* Region */ {false, false, true, false, false, false, false, false, false},
- /* Group */ {true, false, false, true, false, false, false, false, false},
- /* Constant */ {true, true, false, false, false, false, true, true, true},
- /* Private */ {true, false, false, false, false, true, false, false, false},
- /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true},
- /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true},
- /* Buffer Resource */ {true, true, false, false, true, false, true, true, true},
+ static const bool ASAliasRules[10][10] = {
+ /* Flat Global Region Group Constant Private Const32 BufFatPtr BufRsrc BufStrdPtr */
+ /* Flat */ {true, true, false, true, true, true, true, true, true, true},
+ /* Global */ {true, true, false, false, true, false, true, true, true, true},
+ /* Region */ {false, false, true, false, false, false, false, false, false, false},
+ /* Group */ {true, false, false, true, false, false, false, false, false, false},
+ /* Constant */ {true, true, false, false, false, false, true, true, true, true},
+ /* Private */ {true, false, false, false, false, true, false, false, false, false},
+ /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true, true},
+ /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true, true},
+ /* Buffer Resource */ {true, true, false, false, true, false, true, true, true, true},
+ /* Buffer Strided Ptr */ {true, true, false, false, true, false, true, true, true, true},
};
// clang-format on
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
index b178623a319d..060fb66d38f7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -281,6 +281,12 @@ def FeatureMADIntraFwdBug : SubtargetFeature<"mad-intra-fwd-bug",
"MAD_U64/I64 intra instruction forwarding bug"
>;
+def FeatureMSAALoadDstSelBug : SubtargetFeature<"msaa-load-dst-sel-bug",
+ "HasMSAALoadDstSelBug",
+ "true",
+ "MSAA loads not honoring dst_sel bug"
+>;
+
class SubtargetFeatureLDSBankCount <int Value> : SubtargetFeature <
"ldsbankcount"#Value,
"LDSBankCount",
@@ -339,6 +345,12 @@ def FeatureGFX11Insts : SubtargetFeature<"gfx11-insts",
"Additional instructions for GFX11+"
>;
+def FeatureGFX12Insts : SubtargetFeature<"gfx12-insts",
+ "GFX12Insts",
+ "true",
+ "Additional instructions for GFX12+"
+>;
+
def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts",
"GFX10_3Insts",
"true",
@@ -375,6 +387,12 @@ def FeatureTrue16BitInsts : SubtargetFeature<"true16",
"True 16-bit operand instructions"
>;
+def FeatureRealTrue16Insts : SubtargetFeature<"real-true16",
+ "EnableRealTrue16Insts",
+ "true",
+ "Use true 16-bit registers"
+>;
+
def FeatureVOP3P : SubtargetFeature<"vop3p",
"HasVOP3PInsts",
"true",
@@ -393,6 +411,12 @@ def FeatureVGPRIndexMode : SubtargetFeature<"vgpr-index-mode",
"Has VGPR mode register indexing"
>;
+def FeatureScalarDwordx3Loads : SubtargetFeature<"scalar-dwordx3-loads",
+ "HasScalarDwordx3Loads",
+ "true",
+ "Has 96-bit scalar load instructions"
+>;
+
def FeatureScalarStores : SubtargetFeature<"scalar-stores",
"HasScalarStores",
"true",
@@ -454,10 +478,16 @@ def FeatureDPP8 : SubtargetFeature<"dpp8",
"Support DPP8 (Data Parallel Primitives) extension"
>;
-def Feature64BitDPP : SubtargetFeature<"dpp-64bit",
- "Has64BitDPP",
+def FeatureDPALU_DPP : SubtargetFeature<"dpp-64bit",
+ "HasDPALU_DPP",
"true",
- "Support DPP (Data Parallel Primitives) extension"
+ "Support DPP (Data Parallel Primitives) extension in DP ALU"
+>;
+
+def FeatureDPPSrc1SGPR : SubtargetFeature<"dpp-src1-sgpr",
+ "HasDPPSrc1SGPR",
+ "true",
+ "Support SGPR for Src1 of DPP instructions"
>;
def FeaturePackedFP32Ops : SubtargetFeature<"packed-fp32-ops",
@@ -675,6 +705,13 @@ def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf
[FeatureFlatGlobalInsts]
>;
+def FeatureAtomicCSubNoRtnInsts : SubtargetFeature<"atomic-csub-no-rtn-insts",
+ "HasAtomicCSubNoRtnInsts",
+ "true",
+ "Has buffer_atomic_csub and global_atomic_csub instructions that don't "
+ "return original value"
+>;
+
def FeatureFlatAtomicFaddF32Inst
: SubtargetFeature<"flat-atomic-fadd-f32-inst",
"HasFlatAtomicFaddF32Inst",
@@ -773,6 +810,30 @@ def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1",
"Has SC0 and SC1 on stores"
>;
+def FeatureSALUFloatInsts : SubtargetFeature<"salu-float",
+ "HasSALUFloatInsts",
+ "true",
+ "Has SALU floating point instructions"
+>;
+
+def FeatureVGPRSingleUseHintInsts : SubtargetFeature<"vgpr-singleuse-hint",
+ "HasVGPRSingleUseHintInsts",
+ "true",
+ "Has single-use VGPR hint instructions"
+>;
+
+def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans",
+ "HasPseudoScalarTrans",
+ "true",
+ "Has Pseudo Scalar Transcendental instructions"
+>;
+
+def FeatureHasRestrictedSOffset : SubtargetFeature<"restricted-soffset",
+ "HasRestrictedSOffset",
+ "true",
+ "Has restricted SOffset (immediate not supported)."
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -872,6 +933,12 @@ def FeatureTrigReducedRange : SubtargetFeature<"trig-reduced-range",
"Requires use of fract on arguments to trig instructions"
>;
+def FeatureKernargPreload : SubtargetFeature <"kernarg-preload",
+ "KernargPreload",
+ "true",
+ "Hardware supports preloading of kernel arguments in user SGPRs."
+>;
+
// Alignment enforcement is controlled by a configuration register:
// SH_MEM_CONFIG.alignment_mode
def FeatureUnalignedAccessMode : SubtargetFeature<"unaligned-access-mode",
@@ -899,6 +966,18 @@ def FeatureArchitectedSGPRs : SubtargetFeature<"architected-sgprs",
"Enable the architected SGPRs"
>;
+def FeatureGDS : SubtargetFeature<"gds",
+ "HasGDS",
+ "true",
+ "Has Global Data Share"
+>;
+
+def FeatureGWS : SubtargetFeature<"gws",
+ "HasGWS",
+ "true",
+ "Has Global Wave Sync"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
@@ -917,7 +996,8 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
- FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts
+ FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
+ FeatureGDS, FeatureGWS
]
>;
@@ -928,7 +1008,7 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
- FeatureImageInsts
+ FeatureImageInsts, FeatureGDS, FeatureGWS
]
>;
@@ -943,7 +1023,7 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
- FeatureUnalignedBufferAccess, FeatureImageInsts
+ FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS
]
>;
@@ -961,7 +1041,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
- FeatureNegativeScratchOffsetBug
+ FeatureNegativeScratchOffsetBug, FeatureGWS
]
>;
@@ -980,7 +1060,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
- FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts
+ FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
+ FeatureGDS, FeatureGWS
]
>;
@@ -999,7 +1080,28 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureVOP3Literal, FeatureDPP8, FeatureExtendedImageInsts,
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
- FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess
+ FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
+ FeatureGWS
+ ]
+>;
+
+def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
+ "gfx12",
+ [FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
+ FeatureFlatAddressSpace, Feature16BitInsts,
+ FeatureInv2PiInlineImm, FeatureApertureRegs,
+ FeatureCIInsts, FeatureGFX8Insts, FeatureGFX9Insts, FeatureGFX10Insts,
+ FeatureGFX10_AEncoding, FeatureGFX10_BEncoding, FeatureGFX10_3Insts,
+ FeatureGFX11Insts, FeatureGFX12Insts, FeatureVOP3P, FeatureVOPD,
+ FeatureMovrel, FeatureFastFMAF32, FeatureDPP, FeatureIntClamp,
+ FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
+ FeatureAddNoCarryInsts, FeatureFmaMixInsts,
+ FeatureNoSdstCMPX, FeatureVscnt,
+ FeatureVOP3Literal, FeatureDPP8,
+ FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
+ FeatureA16, FeatureFastDenormalF32, FeatureG16,
+ FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
+ FeatureGWS, FeatureTrue16BitInsts
]
>;
@@ -1104,28 +1206,32 @@ def FeatureISAVersion9_0_MI_Common : FeatureSet<
def FeatureISAVersion9_0_0 : FeatureSet<
!listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureMadMixInsts,
+ [FeatureGDS,
+ FeatureMadMixInsts,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
FeatureImageGather4D16Bug])>;
def FeatureISAVersion9_0_2 : FeatureSet<
!listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureMadMixInsts,
+ [FeatureGDS,
+ FeatureMadMixInsts,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
FeatureImageGather4D16Bug])>;
def FeatureISAVersion9_0_4 : FeatureSet<
!listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureDsSrc2Insts,
+ [FeatureGDS,
+ FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
FeatureFmaMixInsts,
FeatureImageGather4D16Bug])>;
def FeatureISAVersion9_0_6 : FeatureSet<
!listconcat(FeatureISAVersion9_0_Common.Features,
- [HalfRate64Ops,
+ [FeatureGDS,
+ HalfRate64Ops,
FeatureFmaMixInsts,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
@@ -1139,7 +1245,8 @@ def FeatureISAVersion9_0_6 : FeatureSet<
def FeatureISAVersion9_0_8 : FeatureSet<
!listconcat(FeatureISAVersion9_0_MI_Common.Features,
- [HalfRate64Ops,
+ [FeatureGDS,
+ HalfRate64Ops,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
FeatureAtomicBufferGlobalPkAddF16NoRtnInsts,
@@ -1148,7 +1255,8 @@ def FeatureISAVersion9_0_8 : FeatureSet<
def FeatureISAVersion9_0_9 : FeatureSet<
!listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureMadMixInsts,
+ [FeatureGDS,
+ FeatureMadMixInsts,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
FeatureImageInsts,
@@ -1158,17 +1266,19 @@ def FeatureISAVersion9_0_A : FeatureSet<
!listconcat(FeatureISAVersion9_0_MI_Common.Features,
[FeatureGFX90AInsts,
FeatureFmacF64Inst,
- Feature64BitDPP,
+ FeatureDPALU_DPP,
FeaturePackedFP32Ops,
FeatureAtomicFaddRtnInsts,
FeatureAtomicBufferGlobalPkAddF16Insts,
FeaturePackedTID,
FullRate64Ops,
- FeatureBackOffBarrier])>;
+ FeatureBackOffBarrier,
+ FeatureKernargPreload])>;
def FeatureISAVersion9_0_C : FeatureSet<
!listconcat(FeatureISAVersion9_0_Common.Features,
- [FeatureMadMixInsts,
+ [FeatureGDS,
+ FeatureMadMixInsts,
FeatureDsSrc2Insts,
FeatureExtendedImageInsts,
FeatureImageGather4D16Bug])>;
@@ -1191,7 +1301,7 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeatureDot10Insts,
FeatureAtomicDsPkAdd16Insts,
FeatureAtomicFlatPkAdd16Insts,
- Feature64BitDPP,
+ FeatureDPALU_DPP,
FeaturePackedFP32Ops,
FeatureMAIInsts,
FeatureFP8Insts,
@@ -1205,7 +1315,8 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeaturePackedTID,
FeatureArchitectedFlatScratch,
FullRate64Ops,
- FeatureBackOffBarrier]>;
+ FeatureBackOffBarrier,
+ FeatureKernargPreload]>;
def FeatureISAVersion9_4_0 : FeatureSet<
!listconcat(FeatureISAVersion9_4_Common.Features,
@@ -1313,7 +1424,8 @@ def FeatureISAVersion11_Common : FeatureSet<
def FeatureISAVersion11_0_Common : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
- [FeatureVALUTransUseHazard])>;
+ [FeatureMSAALoadDstSelBug,
+ FeatureVALUTransUseHazard])>;
def FeatureISAVersion11_0_0 : FeatureSet<
!listconcat(FeatureISAVersion11_0_Common.Features,
@@ -1334,11 +1446,44 @@ def FeatureISAVersion11_0_3 : FeatureSet<
def FeatureISAVersion11_5_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
- [])>;
+ [FeatureSALUFloatInsts,
+ FeatureDPPSrc1SGPR,
+ FeatureVGPRSingleUseHintInsts])>;
def FeatureISAVersion11_5_1 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
- [FeatureGFX11FullVGPRs])>;
+ [FeatureSALUFloatInsts,
+ FeatureDPPSrc1SGPR,
+ FeatureVGPRSingleUseHintInsts,
+ FeatureGFX11FullVGPRs])>;
+
+def FeatureISAVersion12 : FeatureSet<
+ [FeatureGFX12,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot5Insts,
+ FeatureDot7Insts,
+ FeatureDot8Insts,
+ FeatureDot9Insts,
+ FeatureDot10Insts,
+ FeatureNSAEncoding,
+ FeaturePartialNSAEncoding,
+ FeatureWavefrontSize32,
+ FeatureShaderCyclesRegister,
+ FeatureArchitectedFlatScratch,
+ FeatureAtomicFaddRtnInsts,
+ FeatureAtomicFaddNoRtnInsts,
+ FeatureFlatAtomicFaddF32Inst,
+ FeatureImageInsts,
+ FeatureExtendedImageInsts,
+ FeaturePackedTID,
+ FeatureVcmpxPermlaneHazard,
+ FeatureSALUFloatInsts,
+ FeaturePseudoScalarTrans,
+ FeatureHasRestrictedSOffset,
+ FeatureVGPRSingleUseHintInsts,
+ FeatureMADIntraFwdBug,
+ FeatureScalarDwordx3Loads]>;
//===----------------------------------------------------------------------===//
@@ -1494,6 +1639,10 @@ def isGFX6GFX7GFX8GFX9GFX10 :
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
AssemblerPredicate<(all_of (not FeatureGFX11Insts))>;
+def isNotGFX12Plus :
+ Predicate<"Subtarget->getGeneration() <= AMDGPUSubtarget::GFX11">,
+ AssemblerPredicate<(all_of (not FeatureGFX12Insts))>;
+
def isGFX7GFX8GFX9GFX10 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
@@ -1501,6 +1650,13 @@ def isGFX7GFX8GFX9GFX10 :
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
AssemblerPredicate<(all_of FeatureCIInsts, (not FeatureGFX11Insts))>;
+def isGFX8GFX9GFX10GFX11 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX9 ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX10 ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">,
+ AssemblerPredicate<(all_of FeatureGFX8Insts, (not FeatureGFX12Insts))>;
+
def isGFX7Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
AssemblerPredicate<(all_of FeatureCIInsts)>;
@@ -1573,6 +1729,11 @@ def isGFX10Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">,
AssemblerPredicate<(all_of FeatureGFX10Insts)>;
+def isGFX10GFX11 :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10 ||"
+ "Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">,
+ AssemblerPredicate<(all_of FeatureGFX10Insts, (not FeatureGFX12Insts))>;
+
def isGFX10Before1030 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10 &&"
"!Subtarget->hasGFX10_3Insts()">,
@@ -1591,12 +1752,20 @@ def isGFX8GFX9GFX10 :
def isGFX11Only :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX11">,
- AssemblerPredicate<(all_of FeatureGFX11Insts)>;
+ AssemblerPredicate<(all_of FeatureGFX11Insts, (not FeatureGFX12Insts))>;
def isGFX11Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11">,
AssemblerPredicate<(all_of FeatureGFX11Insts)>;
+def isGFX12Only :
+ Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::GFX12">,
+ AssemblerPredicate<(all_of FeatureGFX12Insts)>;
+
+def isGFX12Plus :
+ Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12">,
+ AssemblerPredicate<(all_of FeatureGFX12Insts)>;
+
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
@@ -1625,6 +1794,11 @@ def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>;
+def HasRestrictedSOffset : Predicate<"Subtarget->hasRestrictedSOffset()">,
+ AssemblerPredicate<(all_of FeatureHasRestrictedSOffset)>;
+def HasUnrestrictedSOffset : Predicate<"!Subtarget->hasRestrictedSOffset()">,
+ AssemblerPredicate<(all_of (not FeatureHasRestrictedSOffset))>;
+
def D16PreservesUnusedBits :
Predicate<"Subtarget->d16PreservesUnusedBits()">,
AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>;
@@ -1650,6 +1824,15 @@ def HasTrue16BitInsts : Predicate<"Subtarget->hasTrue16BitInsts()">,
AssemblerPredicate<(all_of FeatureTrue16BitInsts)>;
def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">;
+// Control use of True16 instructions. The real True16 instructions are
+// True16 instructions as they are defined in the ISA. Fake True16
+// instructions have the same encoding as real ones but syntactically
+// only allow 32-bit registers in operands and use low halves thereof.
+def UseRealTrue16Insts : Predicate<"Subtarget->useRealTrue16Insts()">,
+ AssemblerPredicate<(all_of FeatureTrue16BitInsts, FeatureRealTrue16Insts)>;
+def UseFakeTrue16Insts : Predicate<"Subtarget->hasTrue16BitInsts() && "
+ "!Subtarget->useRealTrue16Insts()">;
+
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<(all_of FeatureVOP3P)>;
@@ -1677,12 +1860,15 @@ def HasDPP : Predicate<"Subtarget->hasDPP()">,
def HasDPP8 : Predicate<"Subtarget->hasDPP8()">,
AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>;
-def Has64BitDPP : Predicate<"Subtarget->has64BitDPP()">,
- AssemblerPredicate<(all_of Feature64BitDPP)>;
+def HasDPALU_DPP : Predicate<"Subtarget->hasDPALU_DPP()">,
+ AssemblerPredicate<(all_of FeatureDPALU_DPP)>;
def HasPackedFP32Ops : Predicate<"Subtarget->hasPackedFP32Ops()">,
AssemblerPredicate<(all_of FeaturePackedFP32Ops)>;
+def HasPkMovB32 : Predicate<"Subtarget->hasPkMovB32()">,
+ AssemblerPredicate<(all_of FeatureGFX90AInsts)>;
+
def HasFmaakFmamkF32Insts :
Predicate<"Subtarget->hasFmaakFmamkF32Insts()">,
AssemblerPredicate<(any_of FeatureGFX10Insts, FeatureGFX940Insts)>;
@@ -1836,6 +2022,26 @@ def HasMADIntraFwdBug : Predicate<"Subtarget->hasMADIntraFwdBug()">;
def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">;
+def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">,
+ AssemblerPredicate<(all_of FeatureSALUFloatInsts)>;
+
+def HasVGPRSingleUseHintInsts : Predicate<"Subtarget->hasVGPRSingleUseHintInsts()">,
+ AssemblerPredicate<(all_of FeatureVGPRSingleUseHintInsts)>;
+
+def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">,
+ AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>;
+
+def HasGDS : Predicate<"Subtarget->hasGDS()">;
+
+def HasGWS : Predicate<"Subtarget->hasGWS()">;
+
+def HasCvtFP8VOP1Bug : Predicate<"Subtarget->hasCvtFP8VOP1Bug()">;
+def HasNoCvtFP8VOP1Bug : Predicate<"!Subtarget->hasCvtFP8VOP1Bug()">;
+
+def HasAtomicCSubNoRtnInsts : Predicate<"Subtarget->hasAtomicCSubNoRtnInsts()">;
+
+def HasScalarDwordx3Loads : Predicate<"Subtarget->hasScalarDwordx3Loads()">;
+
// Include AMDGPU TD files
include "SISchedule.td"
include "GCNProcessors.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 63942414bf3c..8d3eac686831 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -93,8 +93,7 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
}
}
- // Forward the query to the next alias analysis.
- return AAResultBase::alias(LocA, LocB, AAQI, nullptr);
+ return AliasResult::MayAlias;
}
ModRefInfo AMDGPUAAResult::getModRefInfoMask(const MemoryLocation &Loc,
@@ -111,5 +110,5 @@ ModRefInfo AMDGPUAAResult::getModRefInfoMask(const MemoryLocation &Loc,
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return ModRefInfo::NoModRef;
- return AAResultBase::getModRefInfoMask(Loc, AAQI, IgnoreLocals);
+ return ModRefInfo::ModRef;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index 1c18cbd855fc..de25f9241a50 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -60,6 +60,7 @@ bool AMDGPUArgumentUsageInfo::doFinalization(Module &M) {
return false;
}
+// TODO: Print preload kernargs?
void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
for (const auto &FI : ArgInfoMap) {
OS << "Arguments for " << FI.first->getName() << '\n'
@@ -148,7 +149,7 @@ AMDGPUFunctionArgInfo::getPreloadedValue(
llvm_unreachable("unexpected preloaded value type");
}
-constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
+AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
AMDGPUFunctionArgInfo AI;
AI.PrivateSegmentBuffer
= ArgDescriptor::createRegister(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index f595e469f998..42b33c50d9f8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUARGUMENTUSAGEINFO_H
+#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/Pass.h"
@@ -37,22 +38,19 @@ private:
bool IsSet : 1;
public:
- constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
- bool IsStack = false, bool IsSet = false)
- : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
+ ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u, bool IsStack = false,
+ bool IsSet = false)
+ : Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
- static constexpr ArgDescriptor createRegister(Register Reg,
- unsigned Mask = ~0u) {
+ static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
return ArgDescriptor(Reg, Mask, false, true);
}
- static constexpr ArgDescriptor createStack(unsigned Offset,
- unsigned Mask = ~0u) {
+ static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
return ArgDescriptor(Offset, Mask, true, true);
}
- static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
- unsigned Mask) {
+ static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
}
@@ -94,7 +92,13 @@ inline raw_ostream &operator<<(raw_ostream &OS, const ArgDescriptor &Arg) {
return OS;
}
+struct KernArgPreloadDescriptor : public ArgDescriptor {
+ KernArgPreloadDescriptor() {}
+ SmallVector<MCRegister> Regs;
+};
+
struct AMDGPUFunctionArgInfo {
+ // clang-format off
enum PreloadedValue {
// SGPRS:
PRIVATE_SEGMENT_BUFFER = 0,
@@ -117,6 +121,7 @@ struct AMDGPUFunctionArgInfo {
WORKITEM_ID_Z = 19,
FIRST_VGPR_VALUE = WORKITEM_ID_X
};
+ // clang-format on
// Kernel input registers setup for the HSA ABI in allocation order.
@@ -151,10 +156,13 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor WorkItemIDY;
ArgDescriptor WorkItemIDZ;
+ // Map the index of preloaded kernel arguments to its descriptor.
+ SmallDenseMap<int, KernArgPreloadDescriptor> PreloadKernArgs{};
+
std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
getPreloadedValue(PreloadedValue Value) const;
- static constexpr AMDGPUFunctionArgInfo fixedABILayout();
+ static AMDGPUFunctionArgInfo fixedABILayout();
};
class AMDGPUArgumentUsageInfo : public ImmutablePass {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 7cd8e53e6521..d317a733d433 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -121,26 +121,13 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
TM.getTargetTriple().getOS() != Triple::AMDPAL)
return;
- if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3)
- getTargetStreamer()->EmitDirectiveAMDGCNTarget();
+ getTargetStreamer()->EmitDirectiveAMDGCNTarget();
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
getTargetStreamer()->getPALMetadata()->readFromIR(M);
-
- if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3)
- return;
-
- // HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2.
- if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
- getTargetStreamer()->EmitDirectiveHSACodeObjectVersion(2, 1);
-
- // HSA and PAL emit NT_AMD_HSA_ISA_VERSION for code objects v2.
- IsaVersion Version = getIsaVersion(getGlobalSTI()->getCPU());
- getTargetStreamer()->EmitDirectiveHSACodeObjectISAV2(
- Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
@@ -148,8 +135,7 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
if (!IsTargetStreamerInitialized)
initTargetStreamer(M);
- if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
- CodeObjectVersion == AMDGPU::AMDHSA_COV2)
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
getTargetStreamer()->EmitISAVersion();
// Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
@@ -162,20 +148,6 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
}
}
-bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
- const MachineBasicBlock *MBB) const {
- if (!AsmPrinter::isBlockOnlyReachableByFallthrough(MBB))
- return false;
-
- if (MBB->empty())
- return true;
-
- // If this is a block implementing a long branch, an expression relative to
- // the start of the block is needed. to the start of the block.
- // XXX - Is there a smarter way to check this?
- return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
-}
-
void AMDGPUAsmPrinter::emitFunctionBodyStart() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &STM = MF->getSubtarget<GCNSubtarget>();
@@ -209,7 +181,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
if (!MFI.isEntryFunction())
return;
- if ((STM.isMesaKernel(F) || CodeObjectVersion == AMDGPU::AMDHSA_COV2) &&
+ if (STM.isMesaKernel(F) &&
(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
amd_kernel_code_t KernelCode;
@@ -219,6 +191,11 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
if (STM.isAmdHsaOS())
HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
+
+ if (MFI.getNumKernargPreloadedSGPRs() > 0) {
+ assert(AMDGPU::hasKernargPreload(STM));
+ getTargetStreamer()->EmitKernargPreloadHeader(*getGlobalSTI());
+ }
}
void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
@@ -226,8 +203,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
if (!MFI.isEntryFunction())
return;
- if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
- CodeObjectVersion == AMDGPU::AMDHSA_COV2)
+ if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
return;
auto &Streamer = getTargetStreamer()->getStreamer();
@@ -260,9 +236,23 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
Streamer.popSection();
}
+void AMDGPUAsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
+ Register RegNo = MI->getOperand(0).getReg();
+
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ OS << "implicit-def: "
+ << printReg(RegNo, MF->getSubtarget().getRegisterInfo());
+
+ if (MI->getAsmPrinterFlags() & AMDGPU::SGPR_SPILL)
+ OS << " : SGPR spill to VGPR lane";
+
+ OutStreamer->AddComment(OS.str());
+ OutStreamer->addBlankLine();
+}
+
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
- if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
- CodeObjectVersion >= AMDGPU::AMDHSA_COV3) {
+ if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
AsmPrinter::emitFunctionEntryLabel();
return;
}
@@ -337,12 +327,6 @@ bool AMDGPUAsmPrinter::doInitialization(Module &M) {
if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
switch (CodeObjectVersion) {
- case AMDGPU::AMDHSA_COV2:
- HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2());
- break;
- case AMDGPU::AMDHSA_COV3:
- HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3());
- break;
case AMDGPU::AMDHSA_COV4:
HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
break;
@@ -393,28 +377,29 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
uint16_t KernelCodeProperties = 0;
+ const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI.getUserSGPRInfo();
- if (MFI.hasPrivateSegmentBuffer()) {
+ if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
}
- if (MFI.hasDispatchPtr()) {
+ if (UserSGPRInfo.hasDispatchPtr()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
}
- if (MFI.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
+ if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
}
- if (MFI.hasKernargSegmentPtr()) {
+ if (UserSGPRInfo.hasKernargSegmentPtr()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
}
- if (MFI.hasDispatchID()) {
+ if (UserSGPRInfo.hasDispatchID()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
}
- if (MFI.hasFlatScratchInit()) {
+ if (UserSGPRInfo.hasFlatScratchInit()) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
}
@@ -435,12 +420,13 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
const SIProgramInfo &PI) const {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const Function &F = MF.getFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
amdhsa::kernel_descriptor_t KernelDescriptor;
memset(&KernelDescriptor, 0x0, sizeof(KernelDescriptor));
assert(isUInt<32>(PI.ScratchSize));
- assert(isUInt<32>(PI.getComputePGMRSrc1()));
+ assert(isUInt<32>(PI.getComputePGMRSrc1(STM)));
assert(isUInt<32>(PI.getComputePGMRSrc2()));
KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
@@ -449,7 +435,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
Align MaxKernArgAlign;
KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
- KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
+ KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1(STM);
KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
@@ -458,6 +444,10 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
KernelDescriptor.compute_pgm_rsrc3 =
CurrentProgramInfo.ComputePGMRSrc3GFX90A;
+ if (AMDGPU::hasKernargPreload(STM))
+ KernelDescriptor.kernarg_preload =
+ static_cast<uint16_t>(Info->getNumKernargPreloadedSGPRs());
+
return KernelDescriptor;
}
@@ -949,6 +939,17 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU);
+ const auto [MinWEU, MaxWEU] =
+ AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", {0, 0}, true);
+ if (ProgInfo.Occupancy < MinWEU) {
+ DiagnosticInfoOptimizationFailure Diag(
+ F, F.getSubprogram(),
+ "failed to meet occupancy target given by 'amdgpu-waves-per-eu' in "
+ "'" +
+ F.getName() + "': desired occupancy was " + Twine(MinWEU) +
+ ", final occupancy is " + Twine(ProgInfo.Occupancy));
+ F.getContext().diagnose(Diag);
+ }
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
@@ -973,7 +974,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
- OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
+ OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1(STM));
OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
@@ -1037,7 +1038,7 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
if (MD->getPALMajorVersion() < 3) {
- MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
+ MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC, STM));
if (AMDGPU::isCompute(CC)) {
MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
} else {
@@ -1113,17 +1114,19 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
auto *MD = getTargetStreamer()->getPALMetadata();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- MD->setFunctionScratchSize(MF, MFI.getStackSize());
+ StringRef FnName = MF.getFunction().getName();
+ MD->setFunctionScratchSize(FnName, MFI.getStackSize());
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
// Set compute registers
MD->setRsrc1(CallingConv::AMDGPU_CS,
- CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
+ CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS, ST));
MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
// Set optional info
- MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
- MD->setFunctionNumUsedVgprs(MF, CurrentProgramInfo.NumVGPRsForWavesPerEU);
- MD->setFunctionNumUsedSgprs(MF, CurrentProgramInfo.NumSGPRsForWavesPerEU);
+ MD->setFunctionLdsSize(FnName, CurrentProgramInfo.LDSSize);
+ MD->setFunctionNumUsedVgprs(FnName, CurrentProgramInfo.NumVGPRsForWavesPerEU);
+ MD->setFunctionNumUsedSgprs(FnName, CurrentProgramInfo.NumSGPRsForWavesPerEU);
}
// This is supposed to be log2(Size)
@@ -1153,7 +1156,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
AMDGPU::initDefaultAMDKernelCodeT(Out, &STM);
Out.compute_pgm_resource_registers =
- CurrentProgramInfo.getComputePGMRSrc1() |
+ CurrentProgramInfo.getComputePGMRSrc1(STM) |
(CurrentProgramInfo.getComputePGMRSrc2() << 32);
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
@@ -1164,27 +1167,28 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
getElementByteSizeValue(STM.getMaxPrivateElementSize(true)));
- if (MFI->hasPrivateSegmentBuffer()) {
+ const GCNUserSGPRUsageInfo &UserSGPRInfo = MFI->getUserSGPRInfo();
+ if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
Out.code_properties |=
AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
}
- if (MFI->hasDispatchPtr())
+ if (UserSGPRInfo.hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
- if (MFI->hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
+ if (UserSGPRInfo.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
- if (MFI->hasKernargSegmentPtr())
+ if (UserSGPRInfo.hasKernargSegmentPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
- if (MFI->hasDispatchID())
+ if (UserSGPRInfo.hasDispatchID())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
- if (MFI->hasFlatScratchInit())
+ if (UserSGPRInfo.hasFlatScratchInit())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
- if (MFI->hasDispatchPtr())
+ if (UserSGPRInfo.hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
if (STM.isXNACKEnabled())
@@ -1293,6 +1297,9 @@ void AMDGPUAsmPrinter::emitResourceUsageRemarks(
EmitResourceUsageRemark("NumAGPR", "AGPRs", CurrentProgramInfo.NumAccVGPR);
EmitResourceUsageRemark("ScratchSize", "ScratchSize [bytes/lane]",
CurrentProgramInfo.ScratchSize);
+ StringRef DynamicStackStr =
+ CurrentProgramInfo.DynamicCallStack ? "True" : "False";
+ EmitResourceUsageRemark("DynamicStack", "Dynamic Stack", DynamicStackStr);
EmitResourceUsageRemark("Occupancy", "Occupancy [waves/SIMD]",
CurrentProgramInfo.Occupancy);
EmitResourceUsageRemark("SGPRSpill", "SGPRs Spill",
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index d490209ce35e..79326cd3d328 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -116,6 +116,8 @@ public:
void emitFunctionBodyEnd() override;
+ void emitImplicitDef(const MachineInstr *MI) const override;
+
void emitFunctionEntryLabel() override;
void emitBasicBlockStart(const MachineBasicBlock &MBB) override;
@@ -126,9 +128,6 @@ public:
void emitEndOfAsmFile(Module &M) override;
- bool isBlockOnlyReachableByFallthrough(
- const MachineBasicBlock *MBB) const override;
-
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &O) override;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 9795928094f4..9ba74a23e8af 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -202,9 +202,18 @@ void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
+ case AtomicRMWInst::FMax:
+ case AtomicRMWInst::FMin:
break;
}
+ // Only 32-bit floating point atomic ops are supported.
+ if (AtomicRMWInst::isFPOperation(Op) && !I.getType()->isFloatTy()) {
+ return;
+ }
+
const unsigned PtrIdx = 0;
const unsigned ValIdx = 1;
@@ -344,8 +353,12 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
llvm_unreachable("Unhandled atomic op");
case AtomicRMWInst::Add:
return B.CreateBinOp(Instruction::Add, LHS, RHS);
+ case AtomicRMWInst::FAdd:
+ return B.CreateFAdd(LHS, RHS);
case AtomicRMWInst::Sub:
return B.CreateBinOp(Instruction::Sub, LHS, RHS);
+ case AtomicRMWInst::FSub:
+ return B.CreateFSub(LHS, RHS);
case AtomicRMWInst::And:
return B.CreateBinOp(Instruction::And, LHS, RHS);
case AtomicRMWInst::Or:
@@ -365,6 +378,10 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
case AtomicRMWInst::UMin:
Pred = CmpInst::ICMP_ULT;
break;
+ case AtomicRMWInst::FMax:
+ return B.CreateMaxNum(LHS, RHS);
+ case AtomicRMWInst::FMin:
+ return B.CreateMinNum(LHS, RHS);
}
Value *Cond = B.CreateICmp(Pred, LHS, RHS);
return B.CreateSelect(Cond, LHS, RHS);
@@ -376,10 +393,11 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
AtomicRMWInst::BinOp Op,
Value *V,
Value *const Identity) const {
- Type *const Ty = V->getType();
+ Type *AtomicTy = V->getType();
+ Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
// Reduce within each row of 16 lanes.
for (unsigned Idx = 0; Idx < 4; Idx++) {
@@ -392,39 +410,47 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
// Reduce within each pair of rows (i.e. 32 lanes).
assert(ST->hasPermLaneX16());
- V = buildNonAtomicBinOp(
- B, Op, V,
- B.CreateIntrinsic(
- Intrinsic::amdgcn_permlanex16, {},
- {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()}));
-
- if (ST->isWave32())
+ V = B.CreateBitCast(V, IntNTy);
+ Value *Permlanex16Call = B.CreateIntrinsic(
+ Intrinsic::amdgcn_permlanex16, {},
+ {V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
+ V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
+ B.CreateBitCast(Permlanex16Call, AtomicTy));
+ if (ST->isWave32()) {
return V;
+ }
if (ST->hasPermLane64()) {
// Reduce across the upper and lower 32 lanes.
- return buildNonAtomicBinOp(
- B, Op, V, B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V));
+ V = B.CreateBitCast(V, IntNTy);
+ Value *Permlane64Call =
+ B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {}, V);
+ return buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
+ B.CreateBitCast(Permlane64Call, AtomicTy));
}
// Pick an arbitrary lane from 0..31 and an arbitrary lane from 32..63 and
// combine them with a scalar operation.
Function *ReadLane =
Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
- Value *const Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
- Value *const Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
- return buildNonAtomicBinOp(B, Op, Lane0, Lane32);
+ V = B.CreateBitCast(V, IntNTy);
+ Value *Lane0 = B.CreateCall(ReadLane, {V, B.getInt32(0)});
+ Value *Lane32 = B.CreateCall(ReadLane, {V, B.getInt32(32)});
+ return buildNonAtomicBinOp(B, Op, B.CreateBitCast(Lane0, AtomicTy),
+ B.CreateBitCast(Lane32, AtomicTy));
}
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
AtomicRMWInst::BinOp Op, Value *V,
- Value *const Identity) const {
- Type *const Ty = V->getType();
+ Value *Identity) const {
+ Type *AtomicTy = V->getType();
+ Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
+
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
for (unsigned Idx = 0; Idx < 4; Idx++) {
V = buildNonAtomicBinOp(
@@ -452,23 +478,29 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
// Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
// 48..63).
assert(ST->hasPermLaneX16());
- Value *const PermX = B.CreateIntrinsic(
+ V = B.CreateBitCast(V, IntNTy);
+ Value *PermX = B.CreateIntrinsic(
Intrinsic::amdgcn_permlanex16, {},
{V, V, B.getInt32(-1), B.getInt32(-1), B.getFalse(), B.getFalse()});
- V = buildNonAtomicBinOp(
- B, Op, V,
- B.CreateCall(UpdateDPP,
- {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
- B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
+
+ Value *UpdateDPPCall =
+ B.CreateCall(UpdateDPP, {Identity, B.CreateBitCast(PermX, AtomicTy),
+ B.getInt32(DPP::QUAD_PERM_ID), B.getInt32(0xa),
+ B.getInt32(0xf), B.getFalse()});
+ V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy), UpdateDPPCall);
+
if (!ST->isWave32()) {
// Combine lane 31 into lanes 32..63.
+ V = B.CreateBitCast(V, IntNTy);
Value *const Lane31 = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
{V, B.getInt32(31)});
- V = buildNonAtomicBinOp(
- B, Op, V,
- B.CreateCall(UpdateDPP,
- {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
- B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
+
+ Value *UpdateDPPCall = B.CreateCall(
+ UpdateDPP, {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
+ B.getInt32(0xc), B.getInt32(0xf), B.getFalse()});
+
+ V = buildNonAtomicBinOp(B, Op, B.CreateBitCast(V, AtomicTy),
+ UpdateDPPCall);
}
}
return V;
@@ -477,12 +509,13 @@ Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
- Value *const Identity) const {
- Type *const Ty = V->getType();
+ Value *Identity) const {
+ Type *AtomicTy = V->getType();
+ Type *IntNTy = B.getIntNTy(AtomicTy->getPrimitiveSizeInBits());
+
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
- Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
-
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, AtomicTy);
if (ST->hasDPPWavefrontShifts()) {
// GFX9 has DPP wavefront shift operations.
V = B.CreateCall(UpdateDPP,
@@ -502,19 +535,24 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
// Copy the old lane 15 to the new lane 16.
- V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
- B.getInt32(16), V});
-
+ V = B.CreateCall(
+ WriteLane,
+ {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy), B.getInt32(15)}),
+ B.getInt32(16), B.CreateBitCast(V, IntNTy)});
+ V = B.CreateBitCast(V, AtomicTy);
if (!ST->isWave32()) {
// Copy the old lane 31 to the new lane 32.
- V = B.CreateCall(
- WriteLane,
- {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
+ V = B.CreateBitCast(V, IntNTy);
+ V = B.CreateCall(WriteLane,
+ {B.CreateCall(ReadLane, {B.CreateBitCast(Old, IntNTy),
+ B.getInt32(31)}),
+ B.getInt32(32), V});
// Copy the old lane 47 to the new lane 48.
V = B.CreateCall(
WriteLane,
{B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
+ V = B.CreateBitCast(V, AtomicTy);
}
}
@@ -529,7 +567,6 @@ Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
-
auto *Ty = I.getType();
auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
auto *EntryBB = I.getParent();
@@ -554,18 +591,25 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
  // Use llvm.cttz intrinsic to find the lowest remaining active lane.
auto *FF1 =
B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
- auto *LaneIdxInt = B.CreateTrunc(FF1, Ty);
+
+ Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
+ auto *LaneIdxInt = B.CreateTrunc(FF1, IntNTy);
// Get the value required for atomic operation
- auto *LaneValue =
+ V = B.CreateBitCast(V, IntNTy);
+ Value *LaneValue =
B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+ LaneValue = B.CreateBitCast(LaneValue, Ty);
// Perform writelane if intermediate scan results are required later in the
// kernel computations
Value *OldValue = nullptr;
if (NeedResult) {
- OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
- {Accumulator, LaneIdxInt, OldValuePhi});
+ OldValue =
+ B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
+ {B.CreateBitCast(Accumulator, IntNTy), LaneIdxInt,
+ B.CreateBitCast(OldValuePhi, IntNTy)});
+ OldValue = B.CreateBitCast(OldValue, Ty);
OldValuePhi->addIncoming(OldValue, ComputeLoop);
}
@@ -590,8 +634,10 @@ std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
return {OldValue, NewAccumulator};
}
-static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
- unsigned BitWidth) {
+static Constant *getIdentityValueForAtomicOp(Type *const Ty,
+ AtomicRMWInst::BinOp Op) {
+ LLVMContext &C = Ty->getContext();
+ const unsigned BitWidth = Ty->getPrimitiveSizeInBits();
switch (Op) {
default:
llvm_unreachable("Unhandled atomic op");
@@ -600,14 +646,22 @@ static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
case AtomicRMWInst::Or:
case AtomicRMWInst::Xor:
case AtomicRMWInst::UMax:
- return APInt::getMinValue(BitWidth);
+ return ConstantInt::get(C, APInt::getMinValue(BitWidth));
case AtomicRMWInst::And:
case AtomicRMWInst::UMin:
- return APInt::getMaxValue(BitWidth);
+ return ConstantInt::get(C, APInt::getMaxValue(BitWidth));
case AtomicRMWInst::Max:
- return APInt::getSignedMinValue(BitWidth);
+ return ConstantInt::get(C, APInt::getSignedMinValue(BitWidth));
case AtomicRMWInst::Min:
- return APInt::getSignedMaxValue(BitWidth);
+ return ConstantInt::get(C, APInt::getSignedMaxValue(BitWidth));
+ case AtomicRMWInst::FAdd:
+ return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), true));
+ case AtomicRMWInst::FSub:
+ return ConstantFP::get(C, APFloat::getZero(Ty->getFltSemantics(), false));
+ case AtomicRMWInst::FMin:
+ return ConstantFP::get(C, APFloat::getInf(Ty->getFltSemantics(), false));
+ case AtomicRMWInst::FMax:
+ return ConstantFP::get(C, APFloat::getInf(Ty->getFltSemantics(), true));
}
}
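
The constants returned above are the usual IEEE-754 identity elements: negative zero for fadd (x + -0.0 returns x unchanged for every x, including x == -0.0, whereas adding +0.0 would turn -0.0 into +0.0), +0.0 for fsub (x - 0.0 == x), +infinity for fmin and -infinity for fmax. A small self-contained C++ check of the finite-value cases; NaN behaviour is deliberately left to the minnum/maxnum semantics used by buildNonAtomicBinOp.

// Quick check of the FP identity elements listed above (finite values only).
#include <cassert>
#include <cmath>
#include <limits>

int main() {
  const float Inf = std::numeric_limits<float>::infinity();
  const float Vals[] = {-3.5f, -0.0f, 0.0f, 1.0f, 1e30f};
  for (float X : Vals) {
    assert(X + (-0.0f) == X);        // fadd identity
    assert(X - 0.0f == X);           // fsub identity
    assert(std::fmin(X, Inf) == X);  // fmin identity
    assert(std::fmax(X, -Inf) == X); // fmax identity
  }
  // Why -0.0 rather than +0.0 for fadd: only -0.0 preserves the sign of zero.
  assert(std::signbit(-0.0f + -0.0f) && !std::signbit(-0.0f + 0.0f));
  return 0;
}
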
@@ -623,6 +677,10 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// Start building just before the instruction.
IRBuilder<> B(&I);
+ if (AtomicRMWInst::isFPOperation(Op)) {
+ B.setIsFPConstrained(I.getFunction()->hasFnAttribute(Attribute::StrictFP));
+ }
+
// If we are in a pixel shader, because of how we have to mask out helper
// lane invocations, we need to record the entry and exit BB's.
BasicBlock *PixelEntryBB = nullptr;
@@ -649,12 +707,15 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
}
Type *const Ty = I.getType();
+ Type *Int32Ty = B.getInt32Ty();
+ Type *IntNTy = B.getIntNTy(Ty->getPrimitiveSizeInBits());
+ bool isAtomicFloatingPointTy = Ty->isFloatingPointTy();
const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
- auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);
+ auto *const VecTy = FixedVectorType::get(Int32Ty, 2);
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
- Value *const V = I.getOperand(ValIdx);
+ Value *V = I.getOperand(ValIdx);
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
@@ -671,39 +732,47 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
{Ballot, B.getInt32(0)});
} else {
- Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
- Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
- Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
+ Value *const ExtractLo = B.CreateTrunc(Ballot, Int32Ty);
+ Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(Ballot, 32), Int32Ty);
Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
{ExtractLo, B.getInt32(0)});
Mbcnt =
B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
}
- Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
- Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
+ Function *F = I.getFunction();
+ LLVMContext &C = F->getContext();
+
+ // For atomic sub, perform scan with add operation and allow one lane to
+ // subtract the reduced value later.
+ AtomicRMWInst::BinOp ScanOp = Op;
+ if (Op == AtomicRMWInst::Sub) {
+ ScanOp = AtomicRMWInst::Add;
+ } else if (Op == AtomicRMWInst::FSub) {
+ ScanOp = AtomicRMWInst::FAdd;
+ }
+ Value *Identity = getIdentityValueForAtomicOp(Ty, ScanOp);
Value *ExclScan = nullptr;
Value *NewV = nullptr;
const bool NeedResult = !I.use_empty();
- Function *F = I.getFunction();
- LLVMContext &C = F->getContext();
BasicBlock *ComputeLoop = nullptr;
BasicBlock *ComputeEnd = nullptr;
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
- const AtomicRMWInst::BinOp ScanOp =
- Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
if (ScanImpl == ScanOptions::DPP) {
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
- NewV =
- B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
- const AtomicRMWInst::BinOp ScanOp =
- Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+ V = B.CreateBitCast(V, IntNTy);
+ Identity = B.CreateBitCast(Identity, IntNTy);
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, IntNTy,
+ {V, Identity});
+ NewV = B.CreateBitCast(NewV, Ty);
+ V = B.CreateBitCast(V, Ty);
+ Identity = B.CreateBitCast(Identity, Ty);
if (!NeedResult && ST->hasPermLaneX16()) {
// On GFX10 the permlanex16 instruction helps us build a reduction
// without too many readlanes and writelanes, which are generally bad
@@ -718,8 +787,10 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// which we will provide to the atomic operation.
Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
assert(TyBitWidth == 32);
+ NewV = B.CreateBitCast(NewV, IntNTy);
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
{NewV, LastLaneIdx});
+ NewV = B.CreateBitCast(NewV, Ty);
}
// Finally mark the readlanes in the WWM section.
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
@@ -746,13 +817,22 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
NewV = buildMul(B, V, Ctpop);
break;
}
-
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub: {
+ Value *const Ctpop = B.CreateIntCast(
+ B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Int32Ty, false);
+ Value *const CtpopFP = B.CreateUIToFP(Ctpop, Ty);
+ NewV = B.CreateFMul(V, CtpopFP);
+ break;
+ }
case AtomicRMWInst::And:
case AtomicRMWInst::Or:
case AtomicRMWInst::Max:
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
+ case AtomicRMWInst::FMin:
+ case AtomicRMWInst::FMax:
// These operations with a uniform value are idempotent: doing the atomic
// operation multiple times has the same effect as doing it once.
NewV = V;
@@ -771,7 +851,7 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
// We only want a single lane to enter our new control flow, and we do this
// by checking if there are any active lanes below us. Only one lane will
// have 0 active lanes below us, so that will be the only one to progress.
- Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getIntN(TyBitWidth, 0));
+ Value *const Cond = B.CreateICmpEQ(Mbcnt, B.getInt32(0));
// Store I's original basic block before we split the block.
BasicBlock *const EntryBB = I.getParent();
@@ -840,9 +920,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
Value *BroadcastI = nullptr;
if (TyBitWidth == 64) {
- Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
+ Value *const ExtractLo = B.CreateTrunc(PHI, Int32Ty);
+ Value *const ExtractHi = B.CreateTrunc(B.CreateLShr(PHI, 32), Int32Ty);
CallInst *const ReadFirstLaneLo =
B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
CallInst *const ReadFirstLaneHi =
@@ -853,8 +932,11 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
BroadcastI = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
-    BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
+    Value *CastedPhi = B.CreateBitCast(PHI, IntNTy);
+    BroadcastI =
+        B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, CastedPhi);
+    BroadcastI = B.CreateBitCast(BroadcastI, Ty);
} else {
llvm_unreachable("Unhandled atomic bit width");
}
@@ -874,6 +956,8 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
llvm_unreachable("Atomic Optimzer is disabled for None strategy");
}
} else {
+ Mbcnt = isAtomicFloatingPointTy ? B.CreateUIToFP(Mbcnt, Ty)
+ : B.CreateIntCast(Mbcnt, Ty, false);
switch (Op) {
default:
llvm_unreachable("Unhandled atomic op");
@@ -887,18 +971,25 @@ void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
case AtomicRMWInst::Min:
case AtomicRMWInst::UMax:
case AtomicRMWInst::UMin:
+ case AtomicRMWInst::FMin:
+ case AtomicRMWInst::FMax:
LaneOffset = B.CreateSelect(Cond, Identity, V);
break;
case AtomicRMWInst::Xor:
LaneOffset = buildMul(B, V, B.CreateAnd(Mbcnt, 1));
break;
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub: {
+ LaneOffset = B.CreateFMul(V, Mbcnt);
+ break;
+ }
}
}
Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
if (IsPixelShader) {
// Need a final PHI to reconverge to above the helper lane branch mask.
- B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+ B.SetInsertPoint(PixelExitBB, PixelExitBB->getFirstNonPHIIt());
PHINode *const PHI = B.CreatePHI(Ty, 2);
PHI->addIncoming(PoisonValue::get(Ty), PixelEntryBB);
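
Most of the rewriting above reduces to one piece of lane arithmetic. In the uniform-value case, a single lane performs one atomic of V * popcount(exec), and every lane reconstructs the value it would have read as broadcast + V * mbcnt, where mbcnt counts the active lanes below it. This is the integer add case; the new fadd/fsub paths use the same shape with uitofp and fmul, so their results may differ from the naive form by reassociation. The CPU-side C++ sketch below replays that bookkeeping against a naive per-lane loop; the ballot mask and values are arbitrary.

// Sketch: uniform-value atomic-add optimization, simulated on a CPU (C++20 for std::popcount).
#include <bit>
#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Ballot = 0x00F0F0F0F0F0F0F1ull; // active lanes
  const uint32_t V = 7;                          // uniform per-lane operand
  uint32_t Memory = 100;                         // the atomic destination

  // Reference: every active lane performs its own atomic, in lane order.
  uint32_t RefMem = Memory;
  uint32_t RefOld[64];
  for (int Lane = 0; Lane < 64; ++Lane)
    if (Ballot >> Lane & 1) { RefOld[Lane] = RefMem; RefMem += V; }

  // Optimized: one lane adds V * popcount, every lane derives its own result.
  uint32_t Broadcast = Memory;          // value read back by the single lane
  Memory += V * std::popcount(Ballot);  // the one combined atomic
  for (int Lane = 0; Lane < 64; ++Lane) {
    if (!(Ballot >> Lane & 1))
      continue;
    uint32_t Mbcnt = std::popcount(Ballot & ((1ull << Lane) - 1)); // active lanes below
    assert(Broadcast + V * Mbcnt == RefOld[Lane]);
  }
  assert(Memory == RefMem);
  return 0;
}
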
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 57c873f00a4a..5fd9e571282d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -28,6 +28,10 @@ void initializeCycleInfoWrapperPassPass(PassRegistry &);
using namespace llvm;
+static cl::opt<unsigned> KernargPreloadCount(
+ "amdgpu-kernarg-preload-count",
+ cl::desc("How many kernel arguments to preload onto SGPRs"), cl::init(0));
+
#define AMDGPU_ATTRIBUTE(Name, Str) Name##_POS,
enum ImplicitArgumentPositions {
@@ -914,9 +918,68 @@ AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
}
-class AMDGPUAttributor : public ModulePass {
+static void addPreloadKernArgHint(Function &F, TargetMachine &TM) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ for (unsigned I = 0;
+ I < F.arg_size() &&
+ I < std::min(KernargPreloadCount.getValue(), ST.getMaxNumUserSGPRs());
+ ++I) {
+ Argument &Arg = *F.getArg(I);
+ // Check for incompatible attributes.
+ if (Arg.hasByRefAttr() || Arg.hasNestAttr())
+ break;
+
+ Arg.addAttr(Attribute::InReg);
+ }
+}
+
+static bool runImpl(Module &M, AnalysisGetter &AG, TargetMachine &TM) {
+ SetVector<Function *> Functions;
+ for (Function &F : M) {
+ if (!F.isIntrinsic())
+ Functions.insert(&F);
+ }
+
+ CallGraphUpdater CGUpdater;
+ BumpPtrAllocator Allocator;
+ AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, TM);
+ DenseSet<const char *> Allowed(
+ {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
+ &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
+ &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
+ &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});
+
+ AttributorConfig AC(CGUpdater);
+ AC.Allowed = &Allowed;
+ AC.IsModulePass = true;
+ AC.DefaultInitializeLiveInternals = false;
+ AC.IPOAmendableCB = [](const Function &F) {
+ return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+ };
+
+ Attributor A(Functions, InfoCache, AC);
+
+ for (Function &F : M) {
+ if (!F.isIntrinsic()) {
+ A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
+ A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
+ CallingConv::ID CC = F.getCallingConv();
+ if (!AMDGPU::isEntryFunctionCC(CC)) {
+ A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
+ A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
+ } else if (CC == CallingConv::AMDGPU_KERNEL) {
+ addPreloadKernArgHint(F, TM);
+ }
+ }
+ }
+
+ ChangeStatus Change = A.run();
+ return Change == ChangeStatus::CHANGED;
+}
+
+class AMDGPUAttributorLegacy : public ModulePass {
public:
- AMDGPUAttributor() : ModulePass(ID) {}
+ AMDGPUAttributorLegacy() : ModulePass(ID) {}
/// doInitialization - Virtual method overridden by subclasses to do
/// any necessary initialization before any pass is run.
@@ -930,45 +993,8 @@ public:
}
bool runOnModule(Module &M) override {
- SetVector<Function *> Functions;
AnalysisGetter AG(this);
- for (Function &F : M) {
- if (!F.isIntrinsic())
- Functions.insert(&F);
- }
-
- CallGraphUpdater CGUpdater;
- BumpPtrAllocator Allocator;
- AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
- DenseSet<const char *> Allowed(
- {&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
- &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
- &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
- &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});
-
- AttributorConfig AC(CGUpdater);
- AC.Allowed = &Allowed;
- AC.IsModulePass = true;
- AC.DefaultInitializeLiveInternals = false;
- AC.IPOAmendableCB = [](const Function &F) {
- return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
- };
-
- Attributor A(Functions, InfoCache, AC);
-
- for (Function &F : M) {
- if (!F.isIntrinsic()) {
- A.getOrCreateAAFor<AAAMDAttributes>(IRPosition::function(F));
- A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
- if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
- A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
- A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
- }
- }
- }
-
- ChangeStatus Change = A.run();
- return Change == ChangeStatus::CHANGED;
+ return runImpl(M, AG, *TM);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -981,11 +1007,25 @@ public:
};
} // namespace
-char AMDGPUAttributor::ID = 0;
+PreservedAnalyses llvm::AMDGPUAttributorPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ AnalysisGetter AG(FAM);
+
+ // TODO: Probably preserves CFG
+ return runImpl(M, AG, TM) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
+
+char AMDGPUAttributorLegacy::ID = 0;
-Pass *llvm::createAMDGPUAttributorPass() { return new AMDGPUAttributor(); }
-INITIALIZE_PASS_BEGIN(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
- false)
+Pass *llvm::createAMDGPUAttributorLegacyPass() {
+ return new AMDGPUAttributorLegacy();
+}
+INITIALIZE_PASS_BEGIN(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
+ false, false)
INITIALIZE_PASS_DEPENDENCY(CycleInfoWrapperPass);
-INITIALIZE_PASS_END(AMDGPUAttributor, DEBUG_TYPE, "AMDGPU Attributor", false,
- false)
+INITIALIZE_PASS_END(AMDGPUAttributorLegacy, DEBUG_TYPE, "AMDGPU Attributor",
+ false, false)
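
A brief usage note on the new knob, assuming the standard cl::opt plumbing applies: -amdgpu-kernarg-preload-count=N can be passed to llc (or through -mllvm from a compiler driver), and addPreloadKernArgHint above then marks at most min(N, the subtarget's user-SGPR limit) leading kernel arguments inreg, stopping early at the first byref or nest argument.
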
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index 9ba5ea8fb73f..cf2896f80f19 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -32,7 +32,7 @@ namespace {
/// Wrapper around extendRegister to ensure we extend to a full 32-bit register.
static Register extendRegisterMin32(CallLowering::ValueHandler &Handler,
- Register ValVReg, CCValAssign &VA) {
+ Register ValVReg, const CCValAssign &VA) {
if (VA.getLocVT().getSizeInBits() < 32) {
// 16-bit types are reported as legal for 32-bit registers. We need to
// extend and do a 32-bit copy to avoid the verifier complaining about it.
@@ -56,12 +56,13 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
}
void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
- MachinePointerInfo &MPO, CCValAssign &VA) override {
+ const MachinePointerInfo &MPO,
+ const CCValAssign &VA) override {
llvm_unreachable("not implemented");
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign VA) override {
+ const CCValAssign &VA) override {
Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
// If this is a scalar return, insert a readfirstlane just in case the value
@@ -82,9 +83,10 @@ struct AMDGPUOutgoingValueHandler : public CallLowering::OutgoingValueHandler {
ExtReg = MIRBuilder.buildBitcast(S32, ExtReg).getReg(0);
}
- auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
- {MRI.getType(ExtReg)}, false)
- .addReg(ExtReg);
+ auto ToSGPR = MIRBuilder
+ .buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
+ {MRI.getType(ExtReg)})
+ .addReg(ExtReg);
ExtReg = ToSGPR.getReg(0);
}
@@ -116,7 +118,7 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign VA) override {
+ const CCValAssign &VA) override {
markPhysRegUsed(PhysReg);
if (VA.getLocVT().getSizeInBits() < 32) {
@@ -136,7 +138,8 @@ struct AMDGPUIncomingArgHandler : public CallLowering::IncomingValueHandler {
}
void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
- MachinePointerInfo &MPO, CCValAssign &VA) override {
+ const MachinePointerInfo &MPO,
+ const CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
auto MMO = MF.getMachineMemOperand(
@@ -228,14 +231,15 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
}
void assignValueToReg(Register ValVReg, Register PhysReg,
- CCValAssign VA) override {
+ const CCValAssign &VA) override {
MIB.addUse(PhysReg, RegState::Implicit);
Register ExtReg = extendRegisterMin32(*this, ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
}
void assignValueToAddress(Register ValVReg, Register Addr, LLT MemTy,
- MachinePointerInfo &MPO, CCValAssign &VA) override {
+ const MachinePointerInfo &MPO,
+ const CCValAssign &VA) override {
MachineFunction &MF = MIRBuilder.getMF();
uint64_t LocMemOffset = VA.getLocMemOffset();
const auto &ST = MF.getSubtarget<GCNSubtarget>();
@@ -248,7 +252,8 @@ struct AMDGPUOutgoingArgHandler : public AMDGPUOutgoingValueHandler {
void assignValueToAddress(const CallLowering::ArgInfo &Arg,
unsigned ValRegIndex, Register Addr, LLT MemTy,
- MachinePointerInfo &MPO, CCValAssign &VA) override {
+ const MachinePointerInfo &MPO,
+ const CCValAssign &VA) override {
Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
? extendRegister(Arg.Regs[ValRegIndex], VA)
: Arg.Regs[ValRegIndex];
@@ -454,27 +459,28 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
- if (Info.hasPrivateSegmentBuffer()) {
+ const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
+ if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
- if (Info.hasDispatchPtr()) {
+ if (UserSGPRInfo.hasDispatchPtr()) {
Register DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
const Module *M = MF.getFunction().getParent();
- if (Info.hasQueuePtr() &&
+ if (UserSGPRInfo.hasQueuePtr() &&
AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
- if (Info.hasKernargSegmentPtr()) {
+ if (UserSGPRInfo.hasKernargSegmentPtr()) {
MachineRegisterInfo &MRI = MF.getRegInfo();
Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
@@ -485,13 +491,13 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(InputPtrReg);
}
- if (Info.hasDispatchID()) {
+ if (UserSGPRInfo.hasDispatchID()) {
Register DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
- if (Info.hasFlatScratchInit()) {
+ if (UserSGPRInfo.hasFlatScratchInit()) {
Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
@@ -596,15 +602,16 @@ bool AMDGPUCallLowering::lowerFormalArguments(
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
+ const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
- if (Info->hasImplicitBufferPtr()) {
+ if (UserSGPRInfo.hasImplicitBufferPtr()) {
Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
// FIXME: This probably isn't defined for mesa
- if (Info->hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
+ if (UserSGPRInfo.hasFlatScratchInit() && !Subtarget.isAmdPalOS()) {
Register FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
@@ -954,12 +961,18 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
}
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
- bool IsTailCall, CallingConv::ID CC) {
- assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, "
- "because the address can be divergent");
+ bool IsTailCall, bool isWave32,
+ CallingConv::ID CC) {
+ // For calls to amdgpu_cs_chain functions, the address is known to be uniform.
+ assert((AMDGPU::isChainCC(CC) || !IsIndirect || !IsTailCall) &&
+ "Indirect calls can't be tail calls, "
+ "because the address can be divergent");
if (!IsTailCall)
return AMDGPU::G_SI_CALL;
+ if (AMDGPU::isChainCC(CC))
+ return isWave32 ? AMDGPU::SI_CS_CHAIN_TC_W32 : AMDGPU::SI_CS_CHAIN_TC_W64;
+
return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
AMDGPU::SI_TCRETURN;
}
@@ -1147,14 +1160,20 @@ bool AMDGPUCallLowering::isEligibleForTailCallOptimization(
void AMDGPUCallLowering::handleImplicitCallArguments(
MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
const GCNSubtarget &ST, const SIMachineFunctionInfo &FuncInfo,
+ CallingConv::ID CalleeCC,
ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const {
if (!ST.enableFlatScratch()) {
// Insert copies for the SRD. In the HSA case, this should be an identity
// copy.
auto ScratchRSrcReg = MIRBuilder.buildCopy(LLT::fixed_vector(4, 32),
FuncInfo.getScratchRSrcReg());
- MIRBuilder.buildCopy(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
- CallInst.addReg(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, RegState::Implicit);
+
+ auto CalleeRSrcReg = AMDGPU::isChainCC(CalleeCC)
+ ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
+ : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
+
+ MIRBuilder.buildCopy(CalleeRSrcReg, ScratchRSrcReg);
+ CallInst.addReg(CalleeRSrcReg, RegState::Implicit);
}
for (std::pair<MCRegister, Register> ArgReg : ImplicitArgRegs) {
@@ -1186,7 +1205,8 @@ bool AMDGPUCallLowering::lowerTailCall(
if (!IsSibCall)
CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
- unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC);
+ unsigned Opc =
+ getCallOpcode(MF, Info.Callee.isReg(), true, ST.isWave32(), CalleeCC);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
if (!addCallTargetOperands(MIB, MIRBuilder, Info))
return false;
@@ -1195,8 +1215,27 @@ bool AMDGPUCallLowering::lowerTailCall(
// be 0.
MIB.addImm(0);
- // Tell the call which registers are clobbered.
+ // If this is a chain call, we need to pass in the EXEC mask.
const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ if (AMDGPU::isChainCC(Info.CallConv)) {
+ ArgInfo ExecArg = Info.OrigArgs[1];
+ assert(ExecArg.Regs.size() == 1 && "Too many regs for EXEC");
+
+ if (!ExecArg.Ty->isIntegerTy(ST.getWavefrontSize()))
+ return false;
+
+ if (auto CI = dyn_cast<ConstantInt>(ExecArg.OrigValue)) {
+ MIB.addImm(CI->getSExtValue());
+ } else {
+ MIB.addReg(ExecArg.Regs[0]);
+ unsigned Idx = MIB->getNumOperands() - 1;
+ MIB->getOperand(Idx).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *ST.getInstrInfo(), *ST.getRegBankInfo(), *MIB,
+ MIB->getDesc(), MIB->getOperand(Idx), Idx));
+ }
+ }
+
+ // Tell the call which registers are clobbered.
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
MIB.addRegMask(Mask);
@@ -1250,7 +1289,8 @@ bool AMDGPUCallLowering::lowerTailCall(
// after the ordinary user argument registers.
SmallVector<std::pair<MCRegister, Register>, 12> ImplicitArgRegs;
- if (Info.CallConv != CallingConv::AMDGPU_Gfx) {
+ if (Info.CallConv != CallingConv::AMDGPU_Gfx &&
+ !AMDGPU::isChainCC(Info.CallConv)) {
// With a fixed ABI, allocate fixed registers before user arguments.
if (!passSpecialInputs(MIRBuilder, CCInfo, ImplicitArgRegs, Info))
return false;
@@ -1266,7 +1306,8 @@ bool AMDGPUCallLowering::lowerTailCall(
if (!handleAssignments(Handler, OutArgs, CCInfo, ArgLocs, MIRBuilder))
return false;
- handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, ImplicitArgRegs);
+ handleImplicitCallArguments(MIRBuilder, MIB, ST, *FuncInfo, CalleeCC,
+ ImplicitArgRegs);
// If we have -tailcallopt, we need to adjust the stack. We'll do the call
// sequence start and end here.
@@ -1300,8 +1341,62 @@ bool AMDGPUCallLowering::lowerTailCall(
return true;
}
+/// Lower a call to the @llvm.amdgcn.cs.chain intrinsic.
+bool AMDGPUCallLowering::lowerChainCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const {
+ ArgInfo Callee = Info.OrigArgs[0];
+ ArgInfo SGPRArgs = Info.OrigArgs[2];
+ ArgInfo VGPRArgs = Info.OrigArgs[3];
+ ArgInfo Flags = Info.OrigArgs[4];
+
+ assert(cast<ConstantInt>(Flags.OrigValue)->isZero() &&
+ "Non-zero flags aren't supported yet.");
+ assert(Info.OrigArgs.size() == 5 && "Additional args aren't supported yet.");
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ // The function to jump to is actually the first argument, so we'll change the
+ // Callee and other info to match that before using our existing helper.
+ const Value *CalleeV = Callee.OrigValue->stripPointerCasts();
+ if (const Function *F = dyn_cast<Function>(CalleeV)) {
+ Info.Callee = MachineOperand::CreateGA(F, 0);
+ Info.CallConv = F->getCallingConv();
+ } else {
+ assert(Callee.Regs.size() == 1 && "Too many regs for the callee");
+ Info.Callee = MachineOperand::CreateReg(Callee.Regs[0], false);
+ Info.CallConv = CallingConv::AMDGPU_CS_Chain; // amdgpu_cs_chain_preserve
+ // behaves the same here.
+ }
+
+ // The function that we're calling cannot be vararg (only the intrinsic is).
+ Info.IsVarArg = false;
+
+ assert(std::all_of(SGPRArgs.Flags.begin(), SGPRArgs.Flags.end(),
+ [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+ "SGPR arguments should be marked inreg");
+ assert(std::none_of(VGPRArgs.Flags.begin(), VGPRArgs.Flags.end(),
+ [](ISD::ArgFlagsTy F) { return F.isInReg(); }) &&
+ "VGPR arguments should not be marked inreg");
+
+ SmallVector<ArgInfo, 8> OutArgs;
+ splitToValueTypes(SGPRArgs, OutArgs, DL, Info.CallConv);
+ splitToValueTypes(VGPRArgs, OutArgs, DL, Info.CallConv);
+
+ Info.IsMustTailCall = true;
+ return lowerTailCall(MIRBuilder, Info, OutArgs);
+}
+
bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const {
+ if (Function *F = Info.CB->getCalledFunction())
+ if (F->isIntrinsic()) {
+ assert(F->getIntrinsicID() == Intrinsic::amdgcn_cs_chain &&
+ "Unexpected intrinsic");
+ return lowerChainCall(MIRBuilder, Info);
+ }
+
if (Info.IsVarArg) {
LLVM_DEBUG(dbgs() << "Variadic functions not implemented\n");
return false;
@@ -1350,11 +1445,15 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
- unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv);
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, ST.isWave32(),
+ Info.CallConv);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.addDef(TRI->getReturnAddressReg(MF));
+ if (!Info.IsConvergent)
+ MIB.setMIFlag(MachineInstr::NoConvergent);
+
if (!addCallTargetOperands(MIB, MIRBuilder, Info))
return false;
@@ -1389,7 +1488,8 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
+ handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, Info.CallConv,
+ ImplicitArgRegs);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getStackSize();
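
For reference, the llvm.amdgcn.cs.chain lowering above consumes the intrinsic's operands positionally: OrigArgs[0] is the callee, OrigArgs[1] the EXEC mask (an integer whose width must match the wavefront size), OrigArgs[2] the inreg SGPR arguments, OrigArgs[3] the VGPR arguments, and OrigArgs[4] the flags word, which is currently required to be zero. For chain callees the scratch RSrc is also copied to SGPR48-51 instead of SGPR0-3, as handleImplicitCallArguments now encodes.
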
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 569c6d75204d..a6e801f2a547 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -75,10 +75,13 @@ public:
void handleImplicitCallArguments(
MachineIRBuilder &MIRBuilder, MachineInstrBuilder &CallInst,
const GCNSubtarget &ST, const SIMachineFunctionInfo &MFI,
+ CallingConv::ID CalleeCC,
ArrayRef<std::pair<MCRegister, Register>> ImplicitArgRegs) const;
bool lowerTailCall(MachineIRBuilder &MIRBuilder, CallLoweringInfo &Info,
SmallVectorImpl<ArgInfo> &OutArgs) const;
+ bool lowerChainCall(MachineIRBuilder &MIRBuilder,
+ CallLoweringInfo &Info) const;
bool lowerCall(MachineIRBuilder &MIRBuilder,
CallLoweringInfo &Info) const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 2b70665ab95c..9036b26a6f6b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -176,6 +176,10 @@ def CSR_AMDGPU_SI_Gfx_GFX90AInsts : CalleeSavedRegs<
(add CSR_AMDGPU_SI_Gfx, CSR_AMDGPU_AGPRs)
>;
+def CSR_AMDGPU_CS_ChainPreserve : CalleeSavedRegs<
+ (sequence "VGPR%u", 8, 255)
+>;
+
def CSR_AMDGPU_NoRegs : CalleeSavedRegs<(add)>;
// Calling convention for leaf functions
@@ -183,6 +187,11 @@ def CC_AMDGPU_Func : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
CCIfType<[i1], CCPromoteToType<i32>>,
CCIfType<[i8, i16], CCIfExtend<CCPromoteToType<i32>>>,
+
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+ !foreach(i, !range(0, 30), !cast<Register>("SGPR"#i)) // SGPR0-29
+ >>>,
+
CCIfType<[i32, f32, i16, f16, v2i16, v2f16, i1], CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
@@ -213,6 +222,16 @@ def CC_AMDGPU : CallingConv<[
CCDelegateTo<CC_AMDGPU_Func>>
]>;
+def CC_AMDGPU_CS_CHAIN : CallingConv<[
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+ !foreach(i, !range(105), !cast<Register>("SGPR"#i))
+ >>>,
+
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<
+ !foreach(i, !range(8, 255), !cast<Register>("VGPR"#i))
+ >>>
+]>;
+
// Trivial class to denote when a def is used only to get a RegMask, i.e.
// SaveList is ignored and the def is not used as part of any calling
// convention.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 4ec85f3c5588..87b1957c799e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -52,17 +52,17 @@ static cl::opt<bool> Widen16BitOps(
cl::init(true));
static cl::opt<bool>
- ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis",
- cl::desc("Break large PHI nodes for DAGISel"),
- cl::ReallyHidden, cl::init(true));
+ BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
+ cl::desc("Break large PHI nodes for DAGISel"),
+ cl::ReallyHidden, cl::init(true));
static cl::opt<bool>
- ForceScalarizeLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
- cl::desc("For testing purposes, always break large "
- "PHIs even if it isn't profitable."),
- cl::ReallyHidden, cl::init(false));
+ ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
+ cl::desc("For testing purposes, always break large "
+ "PHIs even if it isn't profitable."),
+ cl::ReallyHidden, cl::init(false));
-static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
+static cl::opt<unsigned> BreakLargePHIsThreshold(
"amdgpu-codegenprepare-break-large-phis-threshold",
cl::desc("Minimum type size in bits for breaking large PHI nodes"),
cl::ReallyHidden, cl::init(32));
@@ -108,9 +108,31 @@ public:
bool HasUnsafeFPMath = false;
bool HasFP32DenormalFlush = false;
bool FlowChanged = false;
+ mutable Function *SqrtF32 = nullptr;
+ mutable Function *LdexpF32 = nullptr;
DenseMap<const PHINode *, bool> BreakPhiNodesCache;
+ Function *getSqrtF32() const {
+ if (SqrtF32)
+ return SqrtF32;
+
+ LLVMContext &Ctx = Mod->getContext();
+ SqrtF32 = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_sqrt,
+ {Type::getFloatTy(Ctx)});
+ return SqrtF32;
+ }
+
+ Function *getLdexpF32() const {
+ if (LdexpF32)
+ return LdexpF32;
+
+ LLVMContext &Ctx = Mod->getContext();
+ LdexpF32 = Intrinsic::getDeclaration(
+ Mod, Intrinsic::ldexp, {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
+ return LdexpF32;
+ }
+
bool canBreakPHINode(const PHINode &I);
/// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
@@ -276,6 +298,8 @@ public:
bool IsNegative) const;
Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
FastMathFlags FMF) const;
+ Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
+ FastMathFlags FMF) const;
public:
bool visitFDiv(BinaryOperator &I);
@@ -290,6 +314,7 @@ public:
bool visitIntrinsicInst(IntrinsicInst &I);
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
bool visitMinNum(IntrinsicInst &I);
+ bool visitSqrt(IntrinsicInst &I);
bool run(Function &F);
};
@@ -319,6 +344,7 @@ public:
} // end anonymous namespace
bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
+ BreakPhiNodesCache.clear();
bool MadeChange = false;
Function::iterator NextBB;
@@ -598,34 +624,6 @@ static Value *insertValues(IRBuilder<> &Builder,
return NewVal;
}
-// Returns 24-bit or 48-bit (as per `NumBits` and `Size`) mul of `LHS` and
-// `RHS`. `NumBits` is the number of KnownBits of the result and `Size` is the
-// width of the original destination.
-static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
- unsigned Size, unsigned NumBits, bool IsSigned) {
- if (Size <= 32 || NumBits <= 32) {
- Intrinsic::ID ID =
- IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
- return Builder.CreateIntrinsic(ID, {}, {LHS, RHS});
- }
-
- assert(NumBits <= 48);
-
- Intrinsic::ID LoID =
- IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
- Intrinsic::ID HiID =
- IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
-
- Value *Lo = Builder.CreateIntrinsic(LoID, {}, {LHS, RHS});
- Value *Hi = Builder.CreateIntrinsic(HiID, {}, {LHS, RHS});
-
- IntegerType *I64Ty = Builder.getInt64Ty();
- Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
- Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);
-
- return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
-}
-
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
if (I.getOpcode() != Instruction::Mul)
return false;
@@ -665,26 +663,20 @@ bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
extractValues(Builder, RHSVals, RHS);
IntegerType *I32Ty = Builder.getInt32Ty();
- for (int I = 0, E = LHSVals.size(); I != E; ++I) {
- Value *LHS, *RHS;
- if (IsSigned) {
- LHS = Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty);
- RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
- } else {
- LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
- RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
- }
-
- Value *Result =
- getMul24(Builder, LHS, RHS, Size, LHSBits + RHSBits, IsSigned);
+ IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
+ Type *DstTy = LHSVals[0]->getType();
- if (IsSigned) {
- ResultVals.push_back(
- Builder.CreateSExtOrTrunc(Result, LHSVals[I]->getType()));
- } else {
- ResultVals.push_back(
- Builder.CreateZExtOrTrunc(Result, LHSVals[I]->getType()));
- }
+ for (int I = 0, E = LHSVals.size(); I != E; ++I) {
+ Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
+ : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
+ Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
+ : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
+ Intrinsic::ID ID =
+ IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
+ Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
+ Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
+ : Builder.CreateZExtOrTrunc(Result, DstTy);
+ ResultVals.push_back(Result);
}
Value *NewVal = insertValues(Builder, Ty, ResultVals);
@@ -809,14 +801,10 @@ Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
// range won't underflow to denormal. The hard part is knowing the
// result. We need a range check, the result could be denormal for
// 0x1p+126 < den <= 0x1p+127.
-
- Type *Ty = Src->getType();
-
auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
- return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()},
- {Rcp, ScaleFactor});
+ return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
}
/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
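
Both the rcp path above and the fdiv path in the next hunk rely on the same exponent identity: if frexp splits x into m * 2^e with m in [0.5, 1), then 1/x == ldexp(1/m, -e), so the reciprocal only ever sees a well-scaled mantissa. A minimal standard-library sketch of that identity follows; the amdgcn.rcp intrinsic is simply stood in for by a plain division.

// Sketch: reciprocal via frexp/ldexp rescaling, mirroring emitRcpIEEE1ULP above.
#include <cassert>
#include <cmath>

static float scaledRcp(float Src) {
  int Exp = 0;
  float Mant = std::frexp(Src, &Exp); // Src == Mant * 2^Exp, |Mant| in [0.5, 1)
  float Rcp = 1.0f / Mant;            // stand-in for the hardware rcp of the mantissa
  return std::ldexp(Rcp, -Exp);       // 1/Src == (1/Mant) * 2^-Exp
}

int main() {
  const float Xs[] = {0.5f, 3.0f, -2.25f, 1.0e-30f, 1.0e30f};
  for (float X : Xs)
    assert(std::fabs(scaledRcp(X) - 1.0f / X) <= std::fabs(1.0f / X) * 1e-6f);
  return 0;
}
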
@@ -832,8 +820,6 @@ Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
// We're scaling the LHS to avoid a denormal input, and scale the denominator
// to avoid large values underflowing the result.
- Type *Ty = LHS->getType();
-
auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
Value *Rcp =
@@ -845,8 +831,30 @@ Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
// We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
// result.
Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
- return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()},
- {Mul, ExpDiff});
+ return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
+}
+
+/// Emit a sqrt that handles denormals and is accurate to 2ulp.
+Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
+ Value *Src,
+ FastMathFlags FMF) const {
+ Type *Ty = Src->getType();
+ APFloat SmallestNormal =
+ APFloat::getSmallestNormalized(Ty->getFltSemantics());
+ Value *NeedScale =
+ Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
+
+ ConstantInt *Zero = Builder.getInt32(0);
+ Value *InputScaleFactor =
+ Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
+
+ Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
+
+ Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
+
+ Value *OutputScaleFactor =
+ Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
+ return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
}
/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
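
emitSqrtIEEE2ULP above uses the companion identity for square roots: sqrt(x * 2^32) * 2^-16 == sqrt(x), since sqrt(2^32) == 2^16, so an input below the smallest normal can be pre-scaled into the normal range, square-rooted, and rescaled afterwards. A matching standard-library sketch, with the amdgcn.sqrt intrinsic replaced by std::sqrt:

// Sketch: denormal-safe sqrt via ldexp pre/post scaling, as in emitSqrtIEEE2ULP.
#include <cassert>
#include <cmath>
#include <limits>

static float scaledSqrt(float Src) {
  bool NeedScale = Src < std::numeric_limits<float>::min(); // below smallest normal
  float Scaled = std::ldexp(Src, NeedScale ? 32 : 0); // x * 2^32 is normal again
  float Sqrt = std::sqrt(Scaled);
  return std::ldexp(Sqrt, NeedScale ? -16 : 0);       // undo: sqrt(2^32) == 2^16
}

int main() {
  const float Xs[] = {1.0e-45f, 1.0e-40f, 2.0f, 1.0e20f};
  for (float X : Xs) {
    float Ref = std::sqrt(X); // host libm already handles denormal inputs directly
    assert(std::fabs(scaledSqrt(X) - Ref) <= Ref * 1e-6f);
  }
  return 0;
}
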
@@ -890,8 +898,8 @@ bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
}
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
- IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
- FastMathFlags SqrtFMF, const Instruction *CtxI) const {
+ IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
+ const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
// The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
assert(DivFMF.allowContract() && SqrtFMF.allowContract());
@@ -910,10 +918,9 @@ Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
// Add in the sqrt flags.
IRBuilder<>::FastMathFlagGuard Guard(Builder);
- DivFMF |= SqrtFMF;
- Builder.setFastMathFlags(DivFMF);
+ Builder.setFastMathFlags(DivFMF | SqrtFMF);
- if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
+ if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) || HasUnsafeFPMath ||
canIgnoreDenormalInput(Den, CtxI)) {
Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
// -1.0 / sqrt(x) -> fneg(rsq(x))
@@ -1077,6 +1084,21 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
const FastMathFlags DivFMF = FPOp->getFastMathFlags();
const float ReqdAccuracy = FPOp->getFPAccuracy();
+ FastMathFlags SqrtFMF;
+
+ Value *Num = FDiv.getOperand(0);
+ Value *Den = FDiv.getOperand(1);
+
+ Value *RsqOp = nullptr;
+ auto *DenII = dyn_cast<IntrinsicInst>(Den);
+ if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
+ DenII->hasOneUse()) {
+ const auto *SqrtOp = cast<FPMathOperator>(DenII);
+ SqrtFMF = SqrtOp->getFastMathFlags();
+ if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
+ RsqOp = SqrtOp->getOperand(0);
+ }
+
// Inaccurate rcp is allowed with unsafe-fp-math or afn.
//
// Defer to codegen to handle this.
@@ -1087,28 +1109,13 @@ bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
// don't need any pre-consideration here when we have better information. A
// more conservative interpretation could use handling here.
const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
- if (AllowInaccurateRcp)
+ if (!RsqOp && AllowInaccurateRcp)
return false;
// Defer the correct implementations to codegen.
if (ReqdAccuracy < 1.0f)
return false;
- FastMathFlags SqrtFMF;
-
- Value *Num = FDiv.getOperand(0);
- Value *Den = FDiv.getOperand(1);
-
- Value *RsqOp = nullptr;
- auto *DenII = dyn_cast<IntrinsicInst>(Den);
- if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
- DenII->hasOneUse()) {
- const auto *SqrtOp = cast<FPMathOperator>(DenII);
- SqrtFMF = SqrtOp->getFastMathFlags();
- if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
- RsqOp = SqrtOp->getOperand(0);
- }
-
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
Builder.setFastMathFlags(DivFMF);
Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
@@ -1777,47 +1784,79 @@ static bool isInterestingPHIIncomingValue(const Value *V) {
return false;
}
+static void collectPHINodes(const PHINode &I,
+ SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
+ const auto [It, Inserted] = SeenPHIs.insert(&I);
+ if (!Inserted)
+ return;
+
+ for (const Value *Inc : I.incoming_values()) {
+ if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
+ collectPHINodes(*PhiInc, SeenPHIs);
+ }
+
+ for (const User *U : I.users()) {
+ if (const auto *PhiU = dyn_cast<PHINode>(U))
+ collectPHINodes(*PhiU, SeenPHIs);
+ }
+}
+
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
- // Check in the cache, or add an entry for this node.
- //
- // We init with false because we consider all PHI nodes unbreakable until we
- // reach a conclusion. Doing the opposite - assuming they're break-able until
- // proven otherwise - can be harmful in some pathological cases so we're
- // conservative for now.
- const auto [It, DidInsert] = BreakPhiNodesCache.insert({&I, false});
- if (!DidInsert)
+ // Check in the cache first.
+ if (const auto It = BreakPhiNodesCache.find(&I);
+ It != BreakPhiNodesCache.end())
return It->second;
- // This function may recurse, so to guard against infinite looping, this PHI
- // is conservatively considered unbreakable until we reach a conclusion.
+ // We consider PHI nodes as part of "chains", so given a PHI node I, we
+ // recursively consider all its users and incoming values that are also PHI
+ // nodes. We then make a decision about all of those PHIs at once. Either they
+ // all get broken up, or none of them do. That way, we avoid cases where a
+ // single PHI is/is not broken and we end up reforming/exploding a vector
+ // multiple times, or even worse, doing it in a loop.
+ SmallPtrSet<const PHINode *, 8> WorkList;
+ collectPHINodes(I, WorkList);
+
+#ifndef NDEBUG
+ // Check that none of the PHI nodes in the worklist are in the map. If some of
+ // them are, it means we're not good enough at collecting related PHIs.
+ for (const PHINode *WLP : WorkList) {
+ assert(BreakPhiNodesCache.count(WLP) == 0);
+ }
+#endif
- // Don't break PHIs that have no interesting incoming values. That is, where
- // there is no clear opportunity to fold the "extractelement" instructions we
- // would add.
+ // To consider a PHI profitable to break, we need to see some interesting
+ // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
+ // must have one to consider all PHIs breakable.
//
- // Note: IC does not run after this pass, so we're only interested in the
- // foldings that the DAG combiner can do.
- if (none_of(I.incoming_values(),
- [&](Value *V) { return isInterestingPHIIncomingValue(V); }))
- return false;
-
- // Now, check users for unbreakable PHI nodes. If we have an unbreakable PHI
- // node as user, we don't want to break this PHI either because it's unlikely
- // to be beneficial. We would just explode the vector and reassemble it
- // directly, wasting instructions.
+ // This threshold has been determined through performance testing.
+ //
+ // Note that the computation below is equivalent to
+ //
+ // (unsigned)ceil((K / 3.0) * 2)
//
- // In the case where multiple users are PHI nodes, we want at least half of
- // them to be breakable.
- int Score = 0;
- for (const Value *U : I.users()) {
- if (const auto *PU = dyn_cast<PHINode>(U))
- Score += canBreakPHINode(*PU) ? 1 : -1;
+ // It's simply written this way to avoid mixing integral/FP arithmetic.
+ const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
+ unsigned NumBreakablePHIs = 0;
+ bool CanBreak = false;
+ for (const PHINode *Cur : WorkList) {
+ // Don't break PHIs that have no interesting incoming values. That is, where
+ // there is no clear opportunity to fold the "extractelement" instructions
+ // we would add.
+ //
+ // Note: IC does not run after this pass, so we're only interested in the
+ // foldings that the DAG combiner can do.
+ if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
+ if (++NumBreakablePHIs >= Threshold) {
+ CanBreak = true;
+ break;
+ }
+ }
}
- if (Score < 0)
- return false;
+ for (const PHINode *Cur : WorkList)
+ BreakPhiNodesCache[Cur] = CanBreak;
- return BreakPhiNodesCache[&I] = true;
+ return CanBreak;
}
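A standalone check of the threshold arithmetic used above: alignTo rounds its first argument up to the next multiple of the second, so alignTo(2 * K, 3) / 3 equals ceil((K / 3.0) * 2). The sketch below uses a local reference implementation of alignTo rather than the LLVM helper:

    // Verify alignToRef(2*K, 3) / 3 == (unsigned)ceil((K / 3.0) * 2) for small K.
    #include <cmath>
    #include <cstdio>

    static unsigned alignToRef(unsigned V, unsigned A) { return (V + A - 1) / A * A; }

    int main() {
      for (unsigned K = 1; K <= 12; ++K) {
        unsigned Threshold = alignToRef(K * 2, 3) / 3;
        unsigned Expected = (unsigned)std::ceil((K / 3.0) * 2);
        std::printf("K=%2u threshold=%u ceil=%u\n", K, Threshold, Expected);
      }
      return 0;
    }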
/// Helper class for "break large PHIs" (visitPHINode).
@@ -1898,14 +1937,15 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
// operations with most elements being "undef". This inhibits a lot of
// optimization opportunities and can result in unreasonably high register
// pressure and the inevitable stack spilling.
- if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
+ if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
return false;
FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
- if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
+ if (!FVT || FVT->getNumElements() == 1 ||
+ DL->getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
return false;
- if (!ForceScalarizeLargePHIs && !canBreakPHINode(I))
+ if (!ForceBreakLargePHIs && !canBreakPHINode(I))
return false;
std::vector<VectorSlice> Slices;
@@ -1930,8 +1970,7 @@ bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
Slices.emplace_back(EltTy, Idx, 1);
}
- if (Slices.size() == 1)
- return false;
+ assert(Slices.size() > 1);
// Create one PHI per vector piece. The "VectorSlice" class takes care of
// creating the necessary instruction to extract the relevant slices of each
@@ -1977,6 +2016,8 @@ bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
return visitBitreverseIntrinsicInst(I);
case Intrinsic::minnum:
return visitMinNum(I);
+ case Intrinsic::sqrt:
+ return visitSqrt(I);
default:
return false;
}
@@ -2070,9 +2111,75 @@ bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
return true;
}
+static bool isOneOrNegOne(const Value *Val) {
+ const APFloat *C;
+ return match(Val, m_APFloat(C)) && C->getExactLog2Abs() == 0;
+}
+
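The comparison against 0 works because getExactLog2Abs() returns the exponent n when |C| == 2^n exactly (and a sentinel otherwise), so a result of 0 means |C| == 1.0. A trivial reference sketch of the same predicate for plain doubles (hypothetical helper, independent of APFloat):

    #include <cassert>
    #include <cmath>

    // Reference semantics of isOneOrNegOne for ordinary doubles.
    static bool isOneOrNegOneRef(double C) { return std::fabs(C) == 1.0; }

    int main() {
      assert(isOneOrNegOneRef(1.0) && isOneOrNegOneRef(-1.0));
      assert(!isOneOrNegOneRef(2.0) && !isOneOrNegOneRef(0.5) && !isOneOrNegOneRef(0.0));
      return 0;
    }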
+// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
+bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
+ Type *Ty = Sqrt.getType()->getScalarType();
+ if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST->has16BitInsts()))
+ return false;
+
+ const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
+ FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
+
+ // We're trying to handle the fast-but-not-that-fast case only. The lowering
+ // of fast llvm.sqrt will give the raw instruction anyway.
+ if (SqrtFMF.approxFunc() || HasUnsafeFPMath)
+ return false;
+
+ const float ReqdAccuracy = FPOp->getFPAccuracy();
+
+ // Defer correctly rounded expansion to codegen.
+ if (ReqdAccuracy < 1.0f)
+ return false;
+
+ // FIXME: This is an ugly hack for this pass using forward iteration instead
+ // of reverse. If it worked like a normal combiner, the rsq would form before
+ // we saw a sqrt call.
+ auto *FDiv =
+ dyn_cast_or_null<FPMathOperator>(Sqrt.getUniqueUndroppableUser());
+ if (FDiv && FDiv->getOpcode() == Instruction::FDiv &&
+ FDiv->getFPAccuracy() >= 1.0f &&
+ canOptimizeWithRsq(FPOp, FDiv->getFastMathFlags(), SqrtFMF) &&
+ // TODO: We should also handle the arcp case for the fdiv with non-1 value
+ isOneOrNegOne(FDiv->getOperand(0)))
+ return false;
+
+ Value *SrcVal = Sqrt.getOperand(0);
+ bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
+
+ // The raw instruction is 1 ulp, but the correction for denormal handling
+ // brings it to 2.
+ if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
+ return false;
+
+ IRBuilder<> Builder(&Sqrt);
+ SmallVector<Value *, 4> SrcVals;
+ extractValues(Builder, SrcVals, SrcVal);
+
+ SmallVector<Value *, 4> ResultVals(SrcVals.size());
+ for (int I = 0, E = SrcVals.size(); I != E; ++I) {
+ if (CanTreatAsDAZ)
+ ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
+ else
+ ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
+ }
+
+ Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
+ NewSqrt->takeName(&Sqrt);
+ Sqrt.replaceAllUsesWith(NewSqrt);
+ Sqrt.eraseFromParent();
+ return true;
+}
+
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Impl.Mod = &M;
Impl.DL = &Impl.Mod->getDataLayout();
+ Impl.SqrtF32 = nullptr;
+ Impl.LdexpF32 = nullptr;
return false;
}
@@ -2092,7 +2199,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
- SIModeRegisterDefaults Mode(F);
+ SIModeRegisterDefaults Mode(F, *Impl.ST);
Impl.HasFP32DenormalFlush =
Mode.FP32Denormals == DenormalMode::getPreserveSign();
return Impl.run(F);
@@ -2109,7 +2216,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
- SIModeRegisterDefaults Mode(F);
+ SIModeRegisterDefaults Mode(F, *Impl.ST);
Impl.HasFP32DenormalFlush =
Mode.FP32Denormals == DenormalMode::getPreserveSign();
PreservedAnalyses PA = PreservedAnalyses::none();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index 892e1eef27a8..8d4cad4c07bc 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -139,19 +139,21 @@ def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
// Combines which should only apply on VI
def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;
-def AMDGPUPreLegalizerCombiner: GICombinerHelper<
+def AMDGPUPreLegalizerCombiner: GICombiner<
"AMDGPUPreLegalizerCombinerImpl",
[all_combines, clamp_i64_to_i16, foldable_fneg]> {
+ let CombineAllMethodName = "tryCombineAllImpl";
}
-def AMDGPUPostLegalizerCombiner: GICombinerHelper<
+def AMDGPUPostLegalizerCombiner: GICombiner<
"AMDGPUPostLegalizerCombinerImpl",
[all_combines, gfx6gfx7_combines, gfx8_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
rcp_sqrt_to_rsq, sign_extension_in_reg]> {
+ let CombineAllMethodName = "tryCombineAllImpl";
}
-def AMDGPURegBankCombiner : GICombinerHelper<
+def AMDGPURegBankCombiner : GICombiner<
"AMDGPURegBankCombinerImpl",
[unmerge_merge, unmerge_cst, unmerge_undef,
zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index 78fdedc0b511..69dc78d33c83 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -9,6 +9,7 @@
#include "AMDGPUCombinerHelper.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"
@@ -28,6 +29,8 @@ static bool fnegFoldsIntoMI(const MachineInstr &MI) {
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_FSIN:
case AMDGPU::G_FPEXT:
case AMDGPU::G_INTRINSIC_TRUNC:
@@ -42,7 +45,7 @@ static bool fnegFoldsIntoMI(const MachineInstr &MI) {
case AMDGPU::G_AMDGPU_FMAX_LEGACY:
return true;
case AMDGPU::G_INTRINSIC: {
- unsigned IntrinsicID = MI.getIntrinsicID();
+ unsigned IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
@@ -66,8 +69,7 @@ static bool fnegFoldsIntoMI(const MachineInstr &MI) {
LLVM_READONLY
static bool opMustUseVOP3Encoding(const MachineInstr &MI,
const MachineRegisterInfo &MRI) {
- return MI.getNumOperands() >
- (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 4u : 3u) ||
+ return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
}
@@ -85,14 +87,16 @@ static bool hasSourceMods(const MachineInstr &MI) {
case TargetOpcode::INLINEASM:
case TargetOpcode::INLINEASM_BR:
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+ case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
case AMDGPU::G_BITCAST:
case AMDGPU::G_ANYEXT:
case AMDGPU::G_BUILD_VECTOR:
case AMDGPU::G_BUILD_VECTOR_TRUNC:
case AMDGPU::G_PHI:
return false;
- case AMDGPU::G_INTRINSIC: {
- unsigned IntrinsicID = MI.getIntrinsicID();
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT: {
+ unsigned IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_interp_p1:
case Intrinsic::amdgcn_interp_p2:
@@ -172,6 +176,10 @@ static unsigned inverseMinMax(unsigned Opc) {
return AMDGPU::G_FMINNUM_IEEE;
case AMDGPU::G_FMINNUM_IEEE:
return AMDGPU::G_FMAXNUM_IEEE;
+ case AMDGPU::G_FMAXIMUM:
+ return AMDGPU::G_FMINIMUM;
+ case AMDGPU::G_FMINIMUM:
+ return AMDGPU::G_FMAXIMUM;
case AMDGPU::G_AMDGPU_FMAX_LEGACY:
return AMDGPU::G_AMDGPU_FMIN_LEGACY;
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
@@ -205,6 +213,8 @@ bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
case AMDGPU::G_AMDGPU_FMAX_LEGACY:
// 0 doesn't have a negated inline immediate.
@@ -227,8 +237,9 @@ bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
case AMDGPU::G_FCANONICALIZE:
case AMDGPU::G_AMDGPU_RCP_IFLAG:
return true;
- case AMDGPU::G_INTRINSIC: {
- unsigned IntrinsicID = MatchInfo->getIntrinsicID();
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT: {
+ unsigned IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
@@ -301,6 +312,8 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
NegateOperand(MatchInfo->getOperand(1));
@@ -326,8 +339,9 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
case AMDGPU::G_FPTRUNC:
NegateOperand(MatchInfo->getOperand(1));
break;
- case AMDGPU::G_INTRINSIC: {
- unsigned IntrinsicID = MatchInfo->getIntrinsicID();
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT: {
+ unsigned IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
index a13447586bd4..3afefcf55d49 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
@@ -53,13 +53,22 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
//
// extern "C" void * __init_array_start[];
// extern "C" void * __init_array_end[];
+// extern "C" void * __fini_array_start[];
+// extern "C" void * __fini_array_end[];
//
// using InitCallback = void();
+// using FiniCallback = void(void);
//
// void call_init_array_callbacks() {
// for (auto start = __init_array_start; start != __init_array_end; ++start)
// reinterpret_cast<InitCallback *>(*start)();
// }
+//
+// void call_fini_array_callbacks() {
+// size_t fini_array_size = __fini_array_end - __fini_array_start;
+// for (size_t i = fini_array_size; i > 0; --i)
+// reinterpret_cast<FiniCallback *>(__fini_array_start[i - 1])();
+// }
static void createInitOrFiniCalls(Function &F, bool IsCtor) {
Module &M = *F.getParent();
LLVMContext &C = M.getContext();
@@ -96,15 +105,37 @@ static void createInitOrFiniCalls(Function &F, bool IsCtor) {
// for now we just call them with no arguments.
auto *CallBackTy = FunctionType::get(IRB.getVoidTy(), {});
- IRB.CreateCondBr(IRB.CreateICmpNE(Begin, End), LoopBB, ExitBB);
+ Value *Start = Begin;
+ Value *Stop = End;
+  // The destructor array must be called in reverse order. Compute a pointer
+  // to the last element of the array and iterate backwards instead.
+ if (!IsCtor) {
+ Type *Int64Ty = IntegerType::getInt64Ty(C);
+ auto *EndPtr = IRB.CreatePtrToInt(End, Int64Ty);
+ auto *BeginPtr = IRB.CreatePtrToInt(Begin, Int64Ty);
+ auto *ByteSize = IRB.CreateSub(EndPtr, BeginPtr);
+ auto *Size = IRB.CreateAShr(ByteSize, ConstantInt::get(Int64Ty, 3));
+ auto *Offset = IRB.CreateSub(Size, ConstantInt::get(Int64Ty, 1));
+ Start = IRB.CreateInBoundsGEP(
+ ArrayType::get(IRB.getPtrTy(), 0), Begin,
+ ArrayRef<Value *>({ConstantInt::get(Int64Ty, 0), Offset}));
+ Stop = Begin;
+ }
+
+ IRB.CreateCondBr(
+ IRB.CreateCmp(IsCtor ? ICmpInst::ICMP_NE : ICmpInst::ICMP_UGE, Start,
+ Stop),
+ LoopBB, ExitBB);
IRB.SetInsertPoint(LoopBB);
auto *CallBackPHI = IRB.CreatePHI(PtrTy, 2, "ptr");
- auto *CallBack = IRB.CreateLoad(CallBackTy->getPointerTo(F.getAddressSpace()),
+ auto *CallBack = IRB.CreateLoad(IRB.getPtrTy(F.getAddressSpace()),
CallBackPHI, "callback");
IRB.CreateCall(CallBackTy, CallBack);
- auto *NewCallBack = IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, 1, "next");
- auto *EndCmp = IRB.CreateICmpEQ(NewCallBack, End, "end");
- CallBackPHI->addIncoming(Begin, &F.getEntryBlock());
+ auto *NewCallBack =
+ IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, IsCtor ? 1 : -1, "next");
+ auto *EndCmp = IRB.CreateCmp(IsCtor ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_ULT,
+ NewCallBack, Stop, "end");
+ CallBackPHI->addIncoming(Start, &F.getEntryBlock());
CallBackPHI->addIncoming(NewCallBack, LoopBB);
IRB.CreateCondBr(EndCmp, ExitBB, LoopBB);
IRB.SetInsertPoint(ExitBB);
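The lowered loop for the destructor case mirrors this host-side pattern: the element count is the byte distance shifted right by 3 (8-byte pointers), the walk starts at the last element, and it stops once the pointer would move below the start. A hypothetical standalone sketch:

    // Call an array of callbacks in reverse, the same way the generated kernel
    // walks __fini_array_start/__fini_array_end.
    #include <cstdint>
    #include <cstdio>

    using FiniCallback = void();

    static void First() { std::puts("First"); }
    static void Second() { std::puts("Second"); }
    static void Third() { std::puts("Third"); }

    int main() {
      FiniCallback *FiniArray[] = {First, Second, Third}; // stand-in for __fini_array_*
      FiniCallback **Begin = FiniArray;
      FiniCallback **End = FiniArray + 3;

      // Element count = byte distance >> 3, assuming 8-byte pointers as the pass does.
      int64_t Size = ((intptr_t)End - (intptr_t)Begin) >> 3;
      for (int64_t I = Size; I > 0; --I)
        Begin[I - 1]();                                   // prints Third, Second, First
      return 0;
    }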
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 37df4f68c265..2b85024a9b40 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -105,6 +105,11 @@ def gi_global_saddr :
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
GIComplexPatternEquiv<MUBUFScratchOffset>;
+
+def gi_buf_soffset :
+ GIComplexOperandMatcher<s32, "selectBUFSOffset">,
+ GIComplexPatternEquiv<BUFSOffset>;
+
def gi_mubuf_scratch_offen :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">,
GIComplexPatternEquiv<MUBUFScratchOffen>;
@@ -379,3 +384,6 @@ def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">,
def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">,
GISDNodeXFormEquiv<frameindex_to_targetframeindex>;
+
+def gi_fp_pow2_to_exponent : GICustomOperandRenderer<"renderFPPow2ToExponent">,
+ GISDNodeXFormEquiv<FPPow2ToExponentXForm>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
new file mode 100644
index 000000000000..4cd8b1ec1051
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -0,0 +1,68 @@
+//===-- AMDGPUGlobalISelDivergenceLowering.cpp ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// GlobalISel pass that selects divergent i1 phis as lane mask phis.
+/// Lane mask merging uses the same algorithm as SDAG in SILowerI1Copies.
+/// Handles all cases of temporal divergence.
+/// For divergent non-phi i1 and uniform i1 uses outside of the cycle, this
+/// pass currently depends on LCSSA to insert phis with one incoming value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+
+#define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ AMDGPUGlobalISelDivergenceLowering() : MachineFunctionPass(ID) {
+ initializeAMDGPUGlobalISelDivergenceLoweringPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU GlobalISel divergence lowering";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
+ "AMDGPU GlobalISel divergence lowering", false, false)
+INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
+ "AMDGPU GlobalISel divergence lowering", false, false)
+
+char AMDGPUGlobalISelDivergenceLowering::ID = 0;
+
+char &llvm::AMDGPUGlobalISelDivergenceLoweringID =
+ AMDGPUGlobalISelDivergenceLowering::ID;
+
+FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
+ return new AMDGPUGlobalISelDivergenceLowering();
+}
+
+bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
+ MachineFunction &MF) {
+ return false;
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 09930dc9612c..5a756602eb1a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -18,7 +18,7 @@ using namespace MIPatternMatch;
std::pair<Register, unsigned>
AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
- GISelKnownBits *KnownBits) {
+ GISelKnownBits *KnownBits, bool CheckNUW) {
MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
if (Def->getOpcode() == TargetOpcode::G_CONSTANT) {
unsigned Offset;
@@ -33,6 +33,12 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
int64_t Offset;
if (Def->getOpcode() == TargetOpcode::G_ADD) {
+ // A 32-bit (address + offset) should not cause unsigned 32-bit integer
+ // wraparound, because s_load instructions perform the addition in 64 bits.
+ if (CheckNUW && !Def->getFlag(MachineInstr::NoUWrap)) {
+ assert(MRI.getType(Reg).getScalarSizeInBits() == 32);
+ return std::pair(Reg, 0);
+ }
// TODO: Handle G_OR used for add case
if (mi_match(Def->getOperand(2).getReg(), MRI, m_ICst(Offset)))
return std::pair(Def->getOperand(1).getReg(), Offset);
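The wraparound concern behind the new CheckNUW bail-out is easy to see with concrete values: folding the constant out of a 32-bit add is only sound when the add cannot wrap, because the s_load re-adds the offset at 64-bit width. An illustrative sketch:

    // If base + offset wraps in 32 bits, splitting the offset out and letting
    // the 64-bit hardware add re-apply it yields a different address.
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t Base = 0xFFFFFFF0u;
      uint32_t Offset = 0x20u;
      uint32_t Wrapped32 = Base + Offset;          // 0x00000010 (wrapped)
      uint64_t Added64 = (uint64_t)Base + Offset;  // 0x100000010 (64-bit result)
      std::printf("32-bit: 0x%08X  64-bit: 0x%llX\n", (unsigned)Wrapped32,
                  (unsigned long long)Added64);
      return 0;
    }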
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index ff4edf02a84d..5ee888d9db00 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -9,7 +9,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/Register.h"
#include <utility>
@@ -25,7 +24,8 @@ namespace AMDGPU {
/// Returns base register and constant offset.
std::pair<Register, unsigned>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg,
- GISelKnownBits *KnownBits = nullptr);
+ GISelKnownBits *KnownBits = nullptr,
+ bool CheckNUW = false);
bool hasAtomicFaddRtnForTy(const GCNSubtarget &Subtarget, const LLT &Ty);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index dadc0c92ef8b..b51a876750b5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -49,443 +49,14 @@ namespace AMDGPU {
namespace HSAMD {
//===----------------------------------------------------------------------===//
-// HSAMetadataStreamerV2
-//===----------------------------------------------------------------------===//
-void MetadataStreamerYamlV2::dump(StringRef HSAMetadataString) const {
- errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
-}
-
-void MetadataStreamerYamlV2::verify(StringRef HSAMetadataString) const {
- errs() << "AMDGPU HSA Metadata Parser Test: ";
-
- HSAMD::Metadata FromHSAMetadataString;
- if (fromString(HSAMetadataString, FromHSAMetadataString)) {
- errs() << "FAIL\n";
- return;
- }
-
- std::string ToHSAMetadataString;
- if (toString(FromHSAMetadataString, ToHSAMetadataString)) {
- errs() << "FAIL\n";
- return;
- }
-
- errs() << (HSAMetadataString == ToHSAMetadataString ? "PASS" : "FAIL")
- << '\n';
- if (HSAMetadataString != ToHSAMetadataString) {
- errs() << "Original input: " << HSAMetadataString << '\n'
- << "Produced output: " << ToHSAMetadataString << '\n';
- }
-}
-
-AccessQualifier
-MetadataStreamerYamlV2::getAccessQualifier(StringRef AccQual) const {
- if (AccQual.empty())
- return AccessQualifier::Unknown;
-
- return StringSwitch<AccessQualifier>(AccQual)
- .Case("read_only", AccessQualifier::ReadOnly)
- .Case("write_only", AccessQualifier::WriteOnly)
- .Case("read_write", AccessQualifier::ReadWrite)
- .Default(AccessQualifier::Default);
-}
-
-AddressSpaceQualifier
-MetadataStreamerYamlV2::getAddressSpaceQualifier(unsigned AddressSpace) const {
- switch (AddressSpace) {
- case AMDGPUAS::PRIVATE_ADDRESS:
- return AddressSpaceQualifier::Private;
- case AMDGPUAS::GLOBAL_ADDRESS:
- return AddressSpaceQualifier::Global;
- case AMDGPUAS::CONSTANT_ADDRESS:
- return AddressSpaceQualifier::Constant;
- case AMDGPUAS::LOCAL_ADDRESS:
- return AddressSpaceQualifier::Local;
- case AMDGPUAS::FLAT_ADDRESS:
- return AddressSpaceQualifier::Generic;
- case AMDGPUAS::REGION_ADDRESS:
- return AddressSpaceQualifier::Region;
- default:
- return AddressSpaceQualifier::Unknown;
- }
-}
-
-ValueKind MetadataStreamerYamlV2::getValueKind(Type *Ty, StringRef TypeQual,
- StringRef BaseTypeName) const {
- if (TypeQual.contains("pipe"))
- return ValueKind::Pipe;
-
- return StringSwitch<ValueKind>(BaseTypeName)
- .Case("image1d_t", ValueKind::Image)
- .Case("image1d_array_t", ValueKind::Image)
- .Case("image1d_buffer_t", ValueKind::Image)
- .Case("image2d_t", ValueKind::Image)
- .Case("image2d_array_t", ValueKind::Image)
- .Case("image2d_array_depth_t", ValueKind::Image)
- .Case("image2d_array_msaa_t", ValueKind::Image)
- .Case("image2d_array_msaa_depth_t", ValueKind::Image)
- .Case("image2d_depth_t", ValueKind::Image)
- .Case("image2d_msaa_t", ValueKind::Image)
- .Case("image2d_msaa_depth_t", ValueKind::Image)
- .Case("image3d_t", ValueKind::Image)
- .Case("sampler_t", ValueKind::Sampler)
- .Case("queue_t", ValueKind::Queue)
- .Default(isa<PointerType>(Ty) ?
- (Ty->getPointerAddressSpace() ==
- AMDGPUAS::LOCAL_ADDRESS ?
- ValueKind::DynamicSharedPointer :
- ValueKind::GlobalBuffer) :
- ValueKind::ByValue);
-}
-
-std::string MetadataStreamerYamlV2::getTypeName(Type *Ty, bool Signed) const {
- switch (Ty->getTypeID()) {
- case Type::IntegerTyID: {
- if (!Signed)
- return (Twine('u') + getTypeName(Ty, true)).str();
-
- auto BitWidth = Ty->getIntegerBitWidth();
- switch (BitWidth) {
- case 8:
- return "char";
- case 16:
- return "short";
- case 32:
- return "int";
- case 64:
- return "long";
- default:
- return (Twine('i') + Twine(BitWidth)).str();
- }
- }
- case Type::HalfTyID:
- return "half";
- case Type::FloatTyID:
- return "float";
- case Type::DoubleTyID:
- return "double";
- case Type::FixedVectorTyID: {
- auto VecTy = cast<FixedVectorType>(Ty);
- auto ElTy = VecTy->getElementType();
- auto NumElements = VecTy->getNumElements();
- return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
- }
- default:
- return "unknown";
- }
-}
-
-std::vector<uint32_t>
-MetadataStreamerYamlV2::getWorkGroupDimensions(MDNode *Node) const {
- std::vector<uint32_t> Dims;
- if (Node->getNumOperands() != 3)
- return Dims;
-
- for (auto &Op : Node->operands())
- Dims.push_back(mdconst::extract<ConstantInt>(Op)->getZExtValue());
- return Dims;
-}
-
-Kernel::CodeProps::Metadata MetadataStreamerYamlV2::getHSACodeProps(
- const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const {
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
- HSAMD::Kernel::CodeProps::Metadata HSACodeProps;
- const Function &F = MF.getFunction();
-
- assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
- F.getCallingConv() == CallingConv::SPIR_KERNEL);
-
- Align MaxKernArgAlign;
- HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
- MaxKernArgAlign);
- HSACodeProps.mKernargSegmentAlign =
- std::max(MaxKernArgAlign, Align(4)).value();
-
- HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
- HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
- HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
- HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
- HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;
- HSACodeProps.mMaxFlatWorkGroupSize = MFI.getMaxFlatWorkGroupSize();
- HSACodeProps.mIsDynamicCallStack = ProgramInfo.DynamicCallStack;
- HSACodeProps.mIsXNACKEnabled = STM.isXNACKEnabled();
- HSACodeProps.mNumSpilledSGPRs = MFI.getNumSpilledSGPRs();
- HSACodeProps.mNumSpilledVGPRs = MFI.getNumSpilledVGPRs();
-
- return HSACodeProps;
-}
-
-Kernel::DebugProps::Metadata MetadataStreamerYamlV2::getHSADebugProps(
- const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const {
- return HSAMD::Kernel::DebugProps::Metadata();
-}
-
-void MetadataStreamerYamlV2::emitVersion() {
- auto &Version = HSAMetadata.mVersion;
-
- Version.push_back(VersionMajorV2);
- Version.push_back(VersionMinorV2);
-}
-
-void MetadataStreamerYamlV2::emitPrintf(const Module &Mod) {
- auto &Printf = HSAMetadata.mPrintf;
-
- auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
- if (!Node)
- return;
-
- for (auto *Op : Node->operands())
- if (Op->getNumOperands())
- Printf.push_back(
- std::string(cast<MDString>(Op->getOperand(0))->getString()));
-}
-
-void MetadataStreamerYamlV2::emitKernelLanguage(const Function &Func) {
- auto &Kernel = HSAMetadata.mKernels.back();
-
- // TODO: What about other languages?
- auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
- if (!Node || !Node->getNumOperands())
- return;
- auto Op0 = Node->getOperand(0);
- if (Op0->getNumOperands() <= 1)
- return;
-
- Kernel.mLanguage = "OpenCL C";
- Kernel.mLanguageVersion.push_back(
- mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue());
- Kernel.mLanguageVersion.push_back(
- mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue());
-}
-
-void MetadataStreamerYamlV2::emitKernelAttrs(const Function &Func) {
- auto &Attrs = HSAMetadata.mKernels.back().mAttrs;
-
- if (auto Node = Func.getMetadata("reqd_work_group_size"))
- Attrs.mReqdWorkGroupSize = getWorkGroupDimensions(Node);
- if (auto Node = Func.getMetadata("work_group_size_hint"))
- Attrs.mWorkGroupSizeHint = getWorkGroupDimensions(Node);
- if (auto Node = Func.getMetadata("vec_type_hint")) {
- Attrs.mVecTypeHint = getTypeName(
- cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
- mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue());
- }
- if (Func.hasFnAttribute("runtime-handle")) {
- Attrs.mRuntimeHandle =
- Func.getFnAttribute("runtime-handle").getValueAsString().str();
- }
-}
-
-void MetadataStreamerYamlV2::emitKernelArgs(const Function &Func,
- const GCNSubtarget &ST) {
- for (auto &Arg : Func.args())
- emitKernelArg(Arg);
-
- emitHiddenKernelArgs(Func, ST);
-}
-
-void MetadataStreamerYamlV2::emitKernelArg(const Argument &Arg) {
- auto Func = Arg.getParent();
- auto ArgNo = Arg.getArgNo();
- const MDNode *Node;
-
- StringRef Name;
- Node = Func->getMetadata("kernel_arg_name");
- if (Node && ArgNo < Node->getNumOperands())
- Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
- else if (Arg.hasName())
- Name = Arg.getName();
-
- StringRef TypeName;
- Node = Func->getMetadata("kernel_arg_type");
- if (Node && ArgNo < Node->getNumOperands())
- TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
-
- StringRef BaseTypeName;
- Node = Func->getMetadata("kernel_arg_base_type");
- if (Node && ArgNo < Node->getNumOperands())
- BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
-
- StringRef AccQual;
- if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() &&
- Arg.hasNoAliasAttr()) {
- AccQual = "read_only";
- } else {
- Node = Func->getMetadata("kernel_arg_access_qual");
- if (Node && ArgNo < Node->getNumOperands())
- AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
- }
-
- StringRef TypeQual;
- Node = Func->getMetadata("kernel_arg_type_qual");
- if (Node && ArgNo < Node->getNumOperands())
- TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
-
- const DataLayout &DL = Func->getParent()->getDataLayout();
-
- MaybeAlign PointeeAlign;
- if (auto PtrTy = dyn_cast<PointerType>(Arg.getType())) {
- if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- // FIXME: Should report this for all address spaces
- PointeeAlign = Arg.getParamAlign().valueOrOne();
- }
- }
-
- Type *ArgTy;
- Align ArgAlign;
- std::tie(ArgTy, ArgAlign) = getArgumentTypeAlign(Arg, DL);
-
- emitKernelArg(DL, ArgTy, ArgAlign,
- getValueKind(ArgTy, TypeQual, BaseTypeName), PointeeAlign, Name,
- TypeName, BaseTypeName, AccQual, TypeQual);
-}
-
-void MetadataStreamerYamlV2::emitKernelArg(
- const DataLayout &DL, Type *Ty, Align Alignment, ValueKind ValueKind,
- MaybeAlign PointeeAlign, StringRef Name, StringRef TypeName,
- StringRef BaseTypeName, StringRef AccQual, StringRef TypeQual) {
- HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
- auto &Arg = HSAMetadata.mKernels.back().mArgs.back();
-
- Arg.mName = std::string(Name);
- Arg.mTypeName = std::string(TypeName);
- Arg.mSize = DL.getTypeAllocSize(Ty);
- Arg.mAlign = Alignment.value();
- Arg.mValueKind = ValueKind;
- Arg.mPointeeAlign = PointeeAlign ? PointeeAlign->value() : 0;
-
- if (auto PtrTy = dyn_cast<PointerType>(Ty))
- Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace());
-
- Arg.mAccQual = getAccessQualifier(AccQual);
-
- // TODO: Emit Arg.mActualAccQual.
-
- SmallVector<StringRef, 1> SplitTypeQuals;
- TypeQual.split(SplitTypeQuals, " ", -1, false);
- for (StringRef Key : SplitTypeQuals) {
- auto P = StringSwitch<bool*>(Key)
- .Case("const", &Arg.mIsConst)
- .Case("restrict", &Arg.mIsRestrict)
- .Case("volatile", &Arg.mIsVolatile)
- .Case("pipe", &Arg.mIsPipe)
- .Default(nullptr);
- if (P)
- *P = true;
- }
-}
-
-void MetadataStreamerYamlV2::emitHiddenKernelArgs(const Function &Func,
- const GCNSubtarget &ST) {
- unsigned HiddenArgNumBytes = ST.getImplicitArgNumBytes(Func);
- if (!HiddenArgNumBytes)
- return;
-
- auto &DL = Func.getParent()->getDataLayout();
- auto Int64Ty = Type::getInt64Ty(Func.getContext());
-
- if (HiddenArgNumBytes >= 8)
- emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetX);
- if (HiddenArgNumBytes >= 16)
- emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetY);
- if (HiddenArgNumBytes >= 24)
- emitKernelArg(DL, Int64Ty, Align(8), ValueKind::HiddenGlobalOffsetZ);
-
- auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
- AMDGPUAS::GLOBAL_ADDRESS);
-
- if (HiddenArgNumBytes >= 32) {
- // We forbid the use of features requiring hostcall when compiling OpenCL
- // before code object V5, which makes the mutual exclusion between the
- // "printf buffer" and "hostcall buffer" here sound.
- if (Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
- emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenPrintfBuffer);
- else if (!Func.hasFnAttribute("amdgpu-no-hostcall-ptr"))
- emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenHostcallBuffer);
- else
- emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
- }
-
- // Emit "default queue" and "completion action" arguments if enqueue kernel is
- // used, otherwise emit dummy "none" arguments.
- if (HiddenArgNumBytes >= 40) {
- if (!Func.hasFnAttribute("amdgpu-no-default-queue")) {
- emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenDefaultQueue);
- } else {
- emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
- }
- }
-
- if (HiddenArgNumBytes >= 48) {
- if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
- emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenCompletionAction);
- } else {
- emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
- }
- }
-
- // Emit the pointer argument for multi-grid object.
- if (HiddenArgNumBytes >= 56) {
- if (!Func.hasFnAttribute("amdgpu-no-multigrid-sync-arg"))
- emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenMultiGridSyncArg);
- else
- emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
- }
-}
-
-bool MetadataStreamerYamlV2::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
- return TargetStreamer.EmitHSAMetadata(getHSAMetadata());
-}
-
-void MetadataStreamerYamlV2::begin(const Module &Mod,
- const IsaInfo::AMDGPUTargetID &TargetID) {
- emitVersion();
- emitPrintf(Mod);
-}
-
-void MetadataStreamerYamlV2::end() {
- std::string HSAMetadataString;
- if (toString(HSAMetadata, HSAMetadataString))
- return;
-
- if (DumpHSAMetadata)
- dump(HSAMetadataString);
- if (VerifyHSAMetadata)
- verify(HSAMetadataString);
-}
-
-void MetadataStreamerYamlV2::emitKernel(const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) {
- auto &Func = MF.getFunction();
- if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
- return;
-
- auto CodeProps = getHSACodeProps(MF, ProgramInfo);
- auto DebugProps = getHSADebugProps(MF, ProgramInfo);
-
- HSAMetadata.mKernels.push_back(Kernel::Metadata());
- auto &Kernel = HSAMetadata.mKernels.back();
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- Kernel.mName = std::string(Func.getName());
- Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str();
- emitKernelLanguage(Func);
- emitKernelAttrs(Func);
- emitKernelArgs(Func, ST);
- HSAMetadata.mKernels.back().mCodeProps = CodeProps;
- HSAMetadata.mKernels.back().mDebugProps = DebugProps;
-}
-
-//===----------------------------------------------------------------------===//
-// HSAMetadataStreamerV3
+// HSAMetadataStreamerV4
//===----------------------------------------------------------------------===//
-void MetadataStreamerMsgPackV3::dump(StringRef HSAMetadataString) const {
+void MetadataStreamerMsgPackV4::dump(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata:\n" << HSAMetadataString << '\n';
}
-void MetadataStreamerMsgPackV3::verify(StringRef HSAMetadataString) const {
+void MetadataStreamerMsgPackV4::verify(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata Parser Test: ";
msgpack::Document FromHSAMetadataString;
@@ -507,7 +78,7 @@ void MetadataStreamerMsgPackV3::verify(StringRef HSAMetadataString) const {
}
std::optional<StringRef>
-MetadataStreamerMsgPackV3::getAccessQualifier(StringRef AccQual) const {
+MetadataStreamerMsgPackV4::getAccessQualifier(StringRef AccQual) const {
return StringSwitch<std::optional<StringRef>>(AccQual)
.Case("read_only", StringRef("read_only"))
.Case("write_only", StringRef("write_only"))
@@ -515,7 +86,7 @@ MetadataStreamerMsgPackV3::getAccessQualifier(StringRef AccQual) const {
.Default(std::nullopt);
}
-std::optional<StringRef> MetadataStreamerMsgPackV3::getAddressSpaceQualifier(
+std::optional<StringRef> MetadataStreamerMsgPackV4::getAddressSpaceQualifier(
unsigned AddressSpace) const {
switch (AddressSpace) {
case AMDGPUAS::PRIVATE_ADDRESS:
@@ -536,7 +107,7 @@ std::optional<StringRef> MetadataStreamerMsgPackV3::getAddressSpaceQualifier(
}
StringRef
-MetadataStreamerMsgPackV3::getValueKind(Type *Ty, StringRef TypeQual,
+MetadataStreamerMsgPackV4::getValueKind(Type *Ty, StringRef TypeQual,
StringRef BaseTypeName) const {
if (TypeQual.contains("pipe"))
return "pipe";
@@ -563,7 +134,7 @@ MetadataStreamerMsgPackV3::getValueKind(Type *Ty, StringRef TypeQual,
: "by_value");
}
-std::string MetadataStreamerMsgPackV3::getTypeName(Type *Ty,
+std::string MetadataStreamerMsgPackV4::getTypeName(Type *Ty,
bool Signed) const {
switch (Ty->getTypeID()) {
case Type::IntegerTyID: {
@@ -602,7 +173,7 @@ std::string MetadataStreamerMsgPackV3::getTypeName(Type *Ty,
}
msgpack::ArrayDocNode
-MetadataStreamerMsgPackV3::getWorkGroupDimensions(MDNode *Node) const {
+MetadataStreamerMsgPackV4::getWorkGroupDimensions(MDNode *Node) const {
auto Dims = HSAMetadataDoc->getArrayNode();
if (Node->getNumOperands() != 3)
return Dims;
@@ -613,14 +184,20 @@ MetadataStreamerMsgPackV3::getWorkGroupDimensions(MDNode *Node) const {
return Dims;
}
-void MetadataStreamerMsgPackV3::emitVersion() {
+void MetadataStreamerMsgPackV4::emitVersion() {
auto Version = HSAMetadataDoc->getArrayNode();
- Version.push_back(Version.getDocument()->getNode(VersionMajorV3));
- Version.push_back(Version.getDocument()->getNode(VersionMinorV3));
+ Version.push_back(Version.getDocument()->getNode(VersionMajorV4));
+ Version.push_back(Version.getDocument()->getNode(VersionMinorV4));
getRootMetadata("amdhsa.version") = Version;
}
-void MetadataStreamerMsgPackV3::emitPrintf(const Module &Mod) {
+void MetadataStreamerMsgPackV4::emitTargetID(
+ const IsaInfo::AMDGPUTargetID &TargetID) {
+ getRootMetadata("amdhsa.target") =
+ HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true);
+}
+
+void MetadataStreamerMsgPackV4::emitPrintf(const Module &Mod) {
auto Node = Mod.getNamedMetadata("llvm.printf.fmts");
if (!Node)
return;
@@ -633,7 +210,7 @@ void MetadataStreamerMsgPackV3::emitPrintf(const Module &Mod) {
getRootMetadata("amdhsa.printf") = Printf;
}
-void MetadataStreamerMsgPackV3::emitKernelLanguage(const Function &Func,
+void MetadataStreamerMsgPackV4::emitKernelLanguage(const Function &Func,
msgpack::MapDocNode Kern) {
// TODO: What about other languages?
auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
@@ -652,7 +229,7 @@ void MetadataStreamerMsgPackV3::emitKernelLanguage(const Function &Func,
Kern[".language_version"] = LanguageVersion;
}
-void MetadataStreamerMsgPackV3::emitKernelAttrs(const Function &Func,
+void MetadataStreamerMsgPackV4::emitKernelAttrs(const Function &Func,
msgpack::MapDocNode Kern) {
if (auto Node = Func.getMetadata("reqd_work_group_size"))
@@ -677,7 +254,7 @@ void MetadataStreamerMsgPackV3::emitKernelAttrs(const Function &Func,
Kern[".kind"] = Kern.getDocument()->getNode("fini");
}
-void MetadataStreamerMsgPackV3::emitKernelArgs(const MachineFunction &MF,
+void MetadataStreamerMsgPackV4::emitKernelArgs(const MachineFunction &MF,
msgpack::MapDocNode Kern) {
auto &Func = MF.getFunction();
unsigned Offset = 0;
@@ -690,7 +267,7 @@ void MetadataStreamerMsgPackV3::emitKernelArgs(const MachineFunction &MF,
Kern[".args"] = Args;
}
-void MetadataStreamerMsgPackV3::emitKernelArg(const Argument &Arg,
+void MetadataStreamerMsgPackV4::emitKernelArg(const Argument &Arg,
unsigned &Offset,
msgpack::ArrayDocNode Args) {
auto Func = Arg.getParent();
@@ -714,16 +291,20 @@ void MetadataStreamerMsgPackV3::emitKernelArg(const Argument &Arg,
if (Node && ArgNo < Node->getNumOperands())
BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
- StringRef AccQual;
- if (Arg.getType()->isPointerTy() && Arg.onlyReadsMemory() &&
- Arg.hasNoAliasAttr()) {
- AccQual = "read_only";
- } else {
- Node = Func->getMetadata("kernel_arg_access_qual");
- if (Node && ArgNo < Node->getNumOperands())
- AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+ StringRef ActAccQual;
+  // Do we really need the NoAlias check here?
+ if (Arg.getType()->isPointerTy() && Arg.hasNoAliasAttr()) {
+ if (Arg.onlyReadsMemory())
+ ActAccQual = "read_only";
+ else if (Arg.hasAttribute(Attribute::WriteOnly))
+ ActAccQual = "write_only";
}
+ StringRef AccQual;
+ Node = Func->getMetadata("kernel_arg_access_qual");
+ if (Node && ArgNo < Node->getNumOperands())
+ AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
StringRef TypeQual;
Node = Func->getMetadata("kernel_arg_type_qual");
if (Node && ArgNo < Node->getNumOperands())
@@ -747,14 +328,15 @@ void MetadataStreamerMsgPackV3::emitKernelArg(const Argument &Arg,
emitKernelArg(DL, ArgTy, ArgAlign,
getValueKind(ArgTy, TypeQual, BaseTypeName), Offset, Args,
- PointeeAlign, Name, TypeName, BaseTypeName, AccQual, TypeQual);
+ PointeeAlign, Name, TypeName, BaseTypeName, ActAccQual,
+ AccQual, TypeQual);
}
-void MetadataStreamerMsgPackV3::emitKernelArg(
+void MetadataStreamerMsgPackV4::emitKernelArg(
const DataLayout &DL, Type *Ty, Align Alignment, StringRef ValueKind,
unsigned &Offset, msgpack::ArrayDocNode Args, MaybeAlign PointeeAlign,
StringRef Name, StringRef TypeName, StringRef BaseTypeName,
- StringRef AccQual, StringRef TypeQual) {
+ StringRef ActAccQual, StringRef AccQual, StringRef TypeQual) {
auto Arg = Args.getDocument()->getMapNode();
if (!Name.empty())
@@ -780,7 +362,8 @@ void MetadataStreamerMsgPackV3::emitKernelArg(
if (auto AQ = getAccessQualifier(AccQual))
Arg[".access"] = Arg.getDocument()->getNode(*AQ, /*Copy=*/true);
- // TODO: Emit Arg[".actual_access"].
+ if (auto AAQ = getAccessQualifier(ActAccQual))
+ Arg[".actual_access"] = Arg.getDocument()->getNode(*AAQ, /*Copy=*/true);
SmallVector<StringRef, 1> SplitTypeQuals;
TypeQual.split(SplitTypeQuals, " ", -1, false);
@@ -798,7 +381,7 @@ void MetadataStreamerMsgPackV3::emitKernelArg(
Args.push_back(Arg);
}
-void MetadataStreamerMsgPackV3::emitHiddenKernelArgs(
+void MetadataStreamerMsgPackV4::emitHiddenKernelArgs(
const MachineFunction &MF, unsigned &Offset, msgpack::ArrayDocNode Args) {
auto &Func = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -824,7 +407,7 @@ void MetadataStreamerMsgPackV3::emitHiddenKernelArgs(
Args);
auto Int8PtrTy =
- Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+ PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
if (HiddenArgNumBytes >= 32) {
// We forbid the use of features requiring hostcall when compiling OpenCL
@@ -871,9 +454,10 @@ void MetadataStreamerMsgPackV3::emitHiddenKernelArgs(
}
}
-msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps(
- const MachineFunction &MF, const SIProgramInfo &ProgramInfo,
- unsigned CodeObjectVersion) const {
+msgpack::MapDocNode
+MetadataStreamerMsgPackV4::getHSAKernelProps(const MachineFunction &MF,
+ const SIProgramInfo &ProgramInfo,
+ unsigned CodeObjectVersion) const {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const Function &F = MF.getFunction();
@@ -918,18 +502,19 @@ msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps(
return Kern;
}
-bool MetadataStreamerMsgPackV3::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
+bool MetadataStreamerMsgPackV4::emitTo(AMDGPUTargetStreamer &TargetStreamer) {
return TargetStreamer.EmitHSAMetadata(*HSAMetadataDoc, true);
}
-void MetadataStreamerMsgPackV3::begin(const Module &Mod,
+void MetadataStreamerMsgPackV4::begin(const Module &Mod,
const IsaInfo::AMDGPUTargetID &TargetID) {
emitVersion();
+ emitTargetID(TargetID);
emitPrintf(Mod);
getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
}
-void MetadataStreamerMsgPackV3::end() {
+void MetadataStreamerMsgPackV4::end() {
std::string HSAMetadataString;
raw_string_ostream StrOS(HSAMetadataString);
HSAMetadataDoc->toYAML(StrOS);
@@ -940,7 +525,7 @@ void MetadataStreamerMsgPackV3::end() {
verify(StrOS.str());
}
-void MetadataStreamerMsgPackV3::emitKernel(const MachineFunction &MF,
+void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) {
auto &Func = MF.getFunction();
if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
@@ -966,31 +551,6 @@ void MetadataStreamerMsgPackV3::emitKernel(const MachineFunction &MF,
}
//===----------------------------------------------------------------------===//
-// HSAMetadataStreamerV4
-//===----------------------------------------------------------------------===//
-
-void MetadataStreamerMsgPackV4::emitVersion() {
- auto Version = HSAMetadataDoc->getArrayNode();
- Version.push_back(Version.getDocument()->getNode(VersionMajorV4));
- Version.push_back(Version.getDocument()->getNode(VersionMinorV4));
- getRootMetadata("amdhsa.version") = Version;
-}
-
-void MetadataStreamerMsgPackV4::emitTargetID(
- const IsaInfo::AMDGPUTargetID &TargetID) {
- getRootMetadata("amdhsa.target") =
- HSAMetadataDoc->getNode(TargetID.toString(), /*Copy=*/true);
-}
-
-void MetadataStreamerMsgPackV4::begin(const Module &Mod,
- const IsaInfo::AMDGPUTargetID &TargetID) {
- emitVersion();
- emitTargetID(TargetID);
- emitPrintf(Mod);
- getRootMetadata("amdhsa.kernels") = HSAMetadataDoc->getArrayNode();
-}
-
-//===----------------------------------------------------------------------===//
// HSAMetadataStreamerV5
//===----------------------------------------------------------------------===//
@@ -1044,7 +604,7 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
Offset += 6; // Reserved.
auto Int8PtrTy =
- Type::getInt8PtrTy(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
+ PointerType::get(Func.getContext(), AMDGPUAS::GLOBAL_ADDRESS);
if (M->getNamedMetadata("llvm.printf.fmts")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_printf_buffer", Offset,
@@ -1097,13 +657,13 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
Offset += 8; // Skipped.
}
- if (MFI.hasQueuePtr())
+ if (MFI.getUserSGPRInfo().hasQueuePtr())
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_queue_ptr", Offset, Args);
}
void MetadataStreamerMsgPackV5::emitKernelAttrs(const Function &Func,
msgpack::MapDocNode Kern) {
- MetadataStreamerMsgPackV3::emitKernelAttrs(Func, Kern);
+ MetadataStreamerMsgPackV4::emitKernelAttrs(Func, Kern);
if (Func.getFnAttribute("uniform-work-group-size").getValueAsBool())
Kern[".uniform_work_group_size"] = Kern.getDocument()->getNode(1);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 7d7080e920f5..6d6bd86711b1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -30,7 +30,6 @@ class MDNode;
class Module;
struct SIProgramInfo;
class Type;
-class GCNSubtarget;
namespace AMDGPU {
@@ -62,7 +61,7 @@ protected:
msgpack::MapDocNode Kern) = 0;
};
-class MetadataStreamerMsgPackV3 : public MetadataStreamer {
+class MetadataStreamerMsgPackV4 : public MetadataStreamer {
protected:
std::unique_ptr<msgpack::Document> HSAMetadataDoc =
std::make_unique<msgpack::Document>();
@@ -89,6 +88,8 @@ protected:
void emitVersion() override;
+ void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);
+
void emitPrintf(const Module &Mod);
void emitKernelLanguage(const Function &Func, msgpack::MapDocNode Kern);
@@ -105,8 +106,8 @@ protected:
msgpack::ArrayDocNode Args,
MaybeAlign PointeeAlign = std::nullopt,
StringRef Name = "", StringRef TypeName = "",
- StringRef BaseTypeName = "", StringRef AccQual = "",
- StringRef TypeQual = "");
+ StringRef BaseTypeName = "", StringRef ActAccQual = "",
+ StringRef AccQual = "", StringRef TypeQual = "");
void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
msgpack::ArrayDocNode Args) override;
@@ -120,8 +121,8 @@ protected:
}
public:
- MetadataStreamerMsgPackV3() = default;
- ~MetadataStreamerMsgPackV3() = default;
+ MetadataStreamerMsgPackV4() = default;
+ ~MetadataStreamerMsgPackV4() = default;
bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
@@ -134,19 +135,6 @@ public:
const SIProgramInfo &ProgramInfo) override;
};
-class MetadataStreamerMsgPackV4 : public MetadataStreamerMsgPackV3 {
-protected:
- void emitVersion() override;
- void emitTargetID(const IsaInfo::AMDGPUTargetID &TargetID);
-
-public:
- MetadataStreamerMsgPackV4() = default;
- ~MetadataStreamerMsgPackV4() = default;
-
- void begin(const Module &Mod,
- const IsaInfo::AMDGPUTargetID &TargetID) override;
-};
-
class MetadataStreamerMsgPackV5 final : public MetadataStreamerMsgPackV4 {
protected:
void emitVersion() override;
@@ -159,82 +147,6 @@ public:
~MetadataStreamerMsgPackV5() = default;
};
-// TODO: Rename MetadataStreamerV2 -> MetadataStreamerYamlV2.
-class MetadataStreamerYamlV2 final : public MetadataStreamer {
-private:
- Metadata HSAMetadata;
-
- void dump(StringRef HSAMetadataString) const;
-
- void verify(StringRef HSAMetadataString) const;
-
- AccessQualifier getAccessQualifier(StringRef AccQual) const;
-
- AddressSpaceQualifier getAddressSpaceQualifier(unsigned AddressSpace) const;
-
- ValueKind getValueKind(Type *Ty, StringRef TypeQual,
- StringRef BaseTypeName) const;
-
- std::string getTypeName(Type *Ty, bool Signed) const;
-
- std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const;
-
- Kernel::CodeProps::Metadata getHSACodeProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
- Kernel::DebugProps::Metadata getHSADebugProps(
- const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
-
- void emitPrintf(const Module &Mod);
-
- void emitKernelLanguage(const Function &Func);
-
- void emitKernelAttrs(const Function &Func);
-
- void emitKernelArgs(const Function &Func, const GCNSubtarget &ST);
-
- void emitKernelArg(const Argument &Arg);
-
- void emitKernelArg(const DataLayout &DL, Type *Ty, Align Alignment,
- ValueKind ValueKind,
- MaybeAlign PointeeAlign = std::nullopt,
- StringRef Name = "", StringRef TypeName = "",
- StringRef BaseTypeName = "", StringRef AccQual = "",
- StringRef TypeQual = "");
-
- void emitHiddenKernelArgs(const Function &Func, const GCNSubtarget &ST);
-
- const Metadata &getHSAMetadata() const {
- return HSAMetadata;
- }
-
-protected:
- void emitVersion() override;
- void emitHiddenKernelArgs(const MachineFunction &MF, unsigned &Offset,
- msgpack::ArrayDocNode Args) override {
- llvm_unreachable("Dummy override should not be invoked!");
- }
- void emitKernelAttrs(const Function &Func,
- msgpack::MapDocNode Kern) override {
- llvm_unreachable("Dummy override should not be invoked!");
- }
-
-public:
- MetadataStreamerYamlV2() = default;
- ~MetadataStreamerYamlV2() = default;
-
- bool emitTo(AMDGPUTargetStreamer &TargetStreamer) override;
-
- void begin(const Module &Mod,
- const IsaInfo::AMDGPUTargetID &TargetID) override;
-
- void end() override;
-
- void emitKernel(const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) override;
-};
-
} // end namespace HSAMD
} // end namespace AMDGPU
} // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index ffa6c88f9d41..0a17b1536040 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -345,13 +345,13 @@ class PipelineSolver {
// return the number of edges missed.
int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
- // Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
- // returns the cost (in terms of missed pipeline edges), and tracks the edges
- // added in \p AddedEdges
+ /// Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
+ /// returns the cost (in terms of missed pipeline edges), and tracks the edges
+ /// added in \p AddedEdges
template <typename T>
int linkSUnit(SUnit *SU, int SGID,
std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
- // Remove the edges passed via \p AddedEdges
+ /// Remove the edges passed via \p AddedEdges
void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
// Convert the passed in maps to arrays for bidirectional iterators
void convertSyncMapsToArrays();
@@ -593,11 +593,10 @@ void PipelineSolver::populateReadyList(
for (; I != E; ++I) {
std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
int CandSGID = *I;
- SchedGroup *Match;
- for (auto &SG : SyncPipeline) {
- if (SG.getSGID() == CandSGID)
- Match = &SG;
- }
+ SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
+ return SG.getSGID() == CandSGID;
+ });
+ assert(Match);
if (UseCostHeur) {
if (Match->isFull()) {
@@ -739,11 +738,10 @@ void PipelineSolver::greedyFind(
for (; I != E; ++I) {
std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
int CandSGID = *I;
- SchedGroup *Match;
- for (auto &SG : SyncPipeline) {
- if (SG.getSGID() == CandSGID)
- Match = &SG;
- }
+ SchedGroup *Match = llvm::find_if(SyncPipeline, [CandSGID](SchedGroup &SG) {
+ return SG.getSGID() == CandSGID;
+ });
+ assert(Match);
LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
<< (int)Match->getMask() << "\n");
@@ -849,10 +847,11 @@ protected:
const SIInstrInfo *TII;
public:
- // Add SchedGroups to \p Pipeline to implement this Strategy.
+ /// Add SchedGroups to \p SyncedSchedGroups to implement this Strategy.
virtual void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
- DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) = 0;
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ bool IsReentry) = 0;
// Returns true if this strategy should be applied to a ScheduleDAG.
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
@@ -870,7 +869,8 @@ private:
public:
void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
- DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ bool IsReentry) override;
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
@@ -882,7 +882,8 @@ public:
void MFMASmallGemmOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
- DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ bool IsReentry) {
// Count the number of MFMA instructions.
unsigned MFMACount = 0;
for (const MachineInstr &I : *DAG)
@@ -964,11 +965,10 @@ private:
// Does the VALU have a DS_WRITE successor that is the same as other
// VALU already in the group. The V_PERMs will all share 1 DS_W succ
- return std::any_of(Cache->begin(), Cache->end(), [&SU](SUnit *Elt) {
- return std::any_of(SU->Succs.begin(), SU->Succs.end(),
- [&Elt](const SDep &ThisSucc) {
- return ThisSucc.getSUnit() == Elt;
- });
+ return llvm::any_of(*Cache, [&SU](SUnit *Elt) {
+ return llvm::any_of(SU->Succs, [&Elt](const SDep &ThisSucc) {
+ return ThisSucc.getSUnit() == Elt;
+ });
});
}
@@ -1045,8 +1045,8 @@ private:
: InstructionRule(TII, SGID, NeedsCache) {}
};
- // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
- // that is /p Distance steps away
+ /// Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
+ /// that is \p Distance steps away
class SharesPredWithPrevNthGroup final : public InstructionRule {
private:
unsigned Distance = 1;
@@ -1078,16 +1078,18 @@ private:
Cache->push_back(Pred.getSUnit());
}
}
+
+ // If the other group has no PERM preds, then this group won't share any
+ if (!Cache->size())
+ return false;
}
- assert(Cache->size());
auto DAG = SyncPipe[0].DAG;
// Does the previous DS_WRITE share a V_PERM predecessor with this
// VMEM_READ
- return (
- std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) {
- return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
- }));
+ return llvm::any_of(*Cache, [&SU, &DAG](SUnit *Elt) {
+ return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
+ });
}
SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
unsigned SGID, bool NeedsCache = false)
@@ -1097,7 +1099,8 @@ private:
public:
void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
- DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ bool IsReentry) override;
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
@@ -1107,14 +1110,20 @@ public:
}
};
+static unsigned DSWCount = 0;
+static unsigned DSWWithPermCount = 0;
+static unsigned DSWWithSharedVMEMCount = 0;
+
void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
- DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
+ bool IsReentry) {
unsigned MFMACount = 0;
- unsigned DSWCount = 0;
- unsigned DSWWithPermCount = 0;
- unsigned DSWWithSharedVMEMCount = 0;
unsigned DSRCount = 0;
+
+ assert((IsReentry || (DSWCount == 0 && DSWWithPermCount == 0 &&
+ DSWWithSharedVMEMCount == 0)) &&
+ "DSWCounters should be zero in pre-RA scheduling!");
SmallVector<SUnit *, 6> DSWithPerms;
for (auto &SU : DAG->SUnits) {
auto I = SU.getInstr();
@@ -1123,7 +1132,7 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
else if (TII->isDS(*I)) {
if (I->mayLoad())
++DSRCount;
- else if (I->mayStore()) {
+ else if (I->mayStore() && !IsReentry) {
++DSWCount;
for (auto Pred : SU.Preds) {
if (Pred.getSUnit()->getInstr()->getOpcode() ==
@@ -1135,57 +1144,59 @@ void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
}
}
}
- DSWWithPermCount = DSWithPerms.size();
- auto I = DSWithPerms.begin();
- auto E = DSWithPerms.end();
-
- // Get the count of DS_WRITES with V_PERM predecessors which
- // have loop carried dependencies (WAR) on the same VMEM_READs.
- // We consider partial overlap as a miss -- in other words,
- // for a given DS_W, we only consider another DS_W as matching
- // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
- // for every V_PERM pred of this DS_W.
- DenseMap<MachineInstr *, SUnit *> VMEMLookup;
- SmallVector<SUnit *, 6> Counted;
- for (; I != E; I++) {
- SUnit *Cand = nullptr;
- bool MissedAny = false;
- for (auto &Pred : (*I)->Preds) {
- if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
- continue;
- if (Cand &&
- std::find(Counted.begin(), Counted.end(), Cand) != Counted.end())
- break;
-
- for (auto &Succ : Pred.getSUnit()->Succs) {
- auto MI = Succ.getSUnit()->getInstr();
- if (!TII->isVMEM(*MI) || !MI->mayLoad())
+ if (!IsReentry) {
+ DSWWithPermCount = DSWithPerms.size();
+ auto I = DSWithPerms.begin();
+ auto E = DSWithPerms.end();
+
+ // Get the count of DS_WRITES with V_PERM predecessors which
+ // have loop carried dependencies (WAR) on the same VMEM_READs.
+ // We consider partial overlap as a miss -- in other words,
+ // for a given DS_W, we only consider another DS_W as matching
+ // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
+ // for every V_PERM pred of this DS_W.
+ DenseMap<MachineInstr *, SUnit *> VMEMLookup;
+ SmallVector<SUnit *, 6> Counted;
+ for (; I != E; I++) {
+ SUnit *Cand = nullptr;
+ bool MissedAny = false;
+ for (auto &Pred : (*I)->Preds) {
+ if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
continue;
- if (MissedAny || !VMEMLookup.size()) {
- MissedAny = true;
- VMEMLookup[MI] = *I;
- continue;
- }
+ if (Cand && llvm::is_contained(Counted, Cand))
+ break;
- if (!VMEMLookup.contains(MI)) {
- MissedAny = true;
- VMEMLookup[MI] = *I;
- continue;
- }
+ for (auto &Succ : Pred.getSUnit()->Succs) {
+ auto MI = Succ.getSUnit()->getInstr();
+ if (!TII->isVMEM(*MI) || !MI->mayLoad())
+ continue;
- Cand = VMEMLookup[MI];
- if (std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) {
- MissedAny = true;
- break;
+ if (MissedAny || !VMEMLookup.size()) {
+ MissedAny = true;
+ VMEMLookup[MI] = *I;
+ continue;
+ }
+
+ if (!VMEMLookup.contains(MI)) {
+ MissedAny = true;
+ VMEMLookup[MI] = *I;
+ continue;
+ }
+
+ Cand = VMEMLookup[MI];
+ if (llvm::is_contained(Counted, Cand)) {
+ MissedAny = true;
+ break;
+ }
}
}
- }
- if (!MissedAny && Cand) {
- DSWWithSharedVMEMCount += 2;
- Counted.push_back(Cand);
- Counted.push_back(*I);
+ if (!MissedAny && Cand) {
+ DSWWithSharedVMEMCount += 2;
+ Counted.push_back(Cand);
+ Counted.push_back(*I);
+ }
}
}
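
The counting loop above implements the matching rule spelled out in the comment: a DS_WRITE only pairs with another DS_WRITE when every one of its V_PERM predecessors leads to the same VMEM_READs, and each matched pair bumps DSWWithSharedVMEMCount by two. A minimal standalone sketch of that pairing rule (not part of the patch; the real code walks SUnit Preds/Succs with a per-VMEM lookup, while this reduces each hypothetical DS_WRITE to the set of VMEM_READ ids reached through its V_PERM preds):

#include <cstdio>
#include <map>
#include <set>
#include <vector>

// Hypothetical stand-in: each DS_WRITE is described only by the set of
// VMEM_READ ids reachable through its V_PERM predecessors.
using VMEMSet = std::set<int>;

// Count DS_WRITEs that pair with another DS_WRITE using exactly the same
// VMEM_READ set; partial overlap is a miss, and every DS_WRITE is counted at
// most once (mirroring the Counted list above).
static unsigned countSharedVMEMWrites(const std::vector<VMEMSet> &DSWrites) {
  std::map<VMEMSet, size_t> Unpaired; // VMEM set -> index of an unpaired write
  unsigned Count = 0;
  for (size_t I = 0; I != DSWrites.size(); ++I) {
    auto It = Unpaired.find(DSWrites[I]);
    if (It != Unpaired.end()) {
      Count += 2; // both writes of the matched pair are counted
      Unpaired.erase(It);
    } else {
      Unpaired[DSWrites[I]] = I;
    }
  }
  return Count;
}

int main() {
  // Writes 0 and 2 share {1, 2}; write 1 only partially overlaps, so it misses.
  std::vector<VMEMSet> Writes = {{1, 2}, {1}, {1, 2}, {3}};
  std::printf("DSWWithSharedVMEMCount = %u\n", countSharedVMEMWrites(Writes));
}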
@@ -1401,7 +1412,11 @@ public:
// first created SchedGroup first.
bool IsBottomUp = 1;
+ // Whether or not this is a reentry into the IGroupLPDAGMutation.
+ bool IsReentry = false;
+
IGroupLPDAGMutation() = default;
+ IGroupLPDAGMutation(bool IsReentry) : IsReentry(IsReentry) {}
};
unsigned SchedGroup::NumSchedGroups = 0;
@@ -1689,7 +1704,7 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
auto S = createIGLPStrategy(StrategyID, DAG, TII);
if (S->shouldApplyStrategy(DAG)) {
IsBottomUp = S->IsBottomUp;
- S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
+ S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups, IsReentry);
}
}
@@ -1697,8 +1712,13 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
namespace llvm {
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation() {
- return std::make_unique<IGroupLPDAGMutation>();
+/// \p IsReentry specifies whether or not this is a reentry into the
+/// IGroupLPDAGMutation. Since there may be multiple scheduling passes on the
+/// same scheduling region (e.g. pre and post-RA scheduling / multiple
+/// scheduling "phases"), we can reenter this mutation framework more than once
+/// for a given region.
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry) {
+ return std::make_unique<IGroupLPDAGMutation>(IsReentry);
}
} // end namespace llvm
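
Moving the DSW counters to file-level statics and gating the analysis on !IsReentry means a later scheduling phase over the same region reuses the counts instead of recomputing or double-counting them. A small self-contained sketch of that first-entry/reentry pattern (not part of the patch; analyzeRegion() is a hypothetical stand-in for the counting loops):

#include <cassert>
#include <cstdio>

// Hypothetical stand-in for the expensive pre-RA analysis.
static unsigned analyzeRegion(int RegionId) { return RegionId * 10 + 3; }

// File-scope cache, mirroring DSWCount / DSWWithPermCount / ...
static unsigned CachedCount = 0;

// Compute the counters only on the first entry; reuse them on reentry
// (e.g. a post-RA scheduling pass over the same region).
static void applyStrategy(int RegionId, bool IsReentry) {
  assert((IsReentry || CachedCount == 0) &&
         "counters should be zero on the first (pre-RA) entry");
  if (!IsReentry)
    CachedCount = analyzeRegion(RegionId);
  std::printf("region %d, reentry=%d, count=%u\n", RegionId, IsReentry,
              CachedCount);
}

int main() {
  applyStrategy(4, /*IsReentry=*/false); // pre-RA scheduling: compute
  applyStrategy(4, /*IsReentry=*/true);  // post-RA scheduling: reuse
}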
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
index ae0faba0780d..3ec8be4f8892 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.h
@@ -14,7 +14,7 @@
namespace llvm {
-std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation();
+std::unique_ptr<ScheduleDAGMutation> createIGroupLPDAGMutation(bool IsReentry);
} // namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 825c6f0acd0f..b0eac567ec9f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -19,6 +19,7 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
+#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -81,10 +82,9 @@ static bool isExtractHiElt(SDValue In, SDValue &Out) {
// same register.
static SDValue stripExtractLoElt(SDValue In) {
if (In.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
- if (ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(In.getOperand(1))) {
- if (Idx->isZero() && In.getValueSizeInBits() <= 32)
- return In.getOperand(0);
- }
+ SDValue Idx = In.getOperand(1);
+ if (isNullConstant(Idx) && In.getValueSizeInBits() <= 32)
+ return In.getOperand(0);
}
if (In.getOpcode() == ISD::TRUNCATE) {
@@ -113,12 +113,12 @@ INITIALIZE_PASS_END(AMDGPUDAGToDAGISel, "amdgpu-isel",
/// This pass converts a legalized DAG into an AMDGPU-specific
// DAG, ready for instruction scheduling.
FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM,
- CodeGenOpt::Level OptLevel) {
+ CodeGenOptLevel OptLevel) {
return new AMDGPUDAGToDAGISel(TM, OptLevel);
}
AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM,
- CodeGenOpt::Level OptLevel)
+ CodeGenOptLevel OptLevel)
: SelectionDAGISel(ID, TM, OptLevel) {
EnableLateStructurizeCFG = AMDGPUTargetMachine::EnableLateStructurizeCFG;
}
@@ -132,7 +132,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
}
#endif
Subtarget = &MF.getSubtarget<GCNSubtarget>();
- Mode = SIModeRegisterDefaults(MF.getFunction());
+ Mode = SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -164,6 +164,7 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
case ISD::FTRUNC:
case ISD::FRINT:
case ISD::FNEARBYINT:
+ case ISD::FROUNDEVEN:
case ISD::FROUND:
case ISD::FFLOOR:
case ISD::FMINNUM:
@@ -596,11 +597,15 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
break;
uint64_t Imm;
- if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
+ if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N)) {
Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
- else {
+ if (AMDGPU::isValid32BitLiteral(Imm, true))
+ break;
+ } else {
ConstantSDNode *C = cast<ConstantSDNode>(N);
Imm = C->getZExtValue();
+ if (AMDGPU::isValid32BitLiteral(Imm, false))
+ break;
}
SDLoc DL(N);
@@ -664,6 +669,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::BRCOND:
SelectBRCOND(N);
return;
+ case ISD::FP_EXTEND:
+ SelectFP_EXTEND(N);
+ return;
case AMDGPUISD::CVT_PKRTZ_F16_F32:
case AMDGPUISD::CVT_PKNORM_I16_F32:
case AMDGPUISD::CVT_PKNORM_U16_F32:
@@ -692,6 +700,14 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectINTRINSIC_VOID(N);
return;
}
+ case AMDGPUISD::WAVE_ADDRESS: {
+ SelectWAVE_ADDRESS(N);
+ return;
+ }
+ case ISD::STACKRESTORE: {
+ SelectSTACKRESTORE(N);
+ return;
+ }
}
SelectCode(N);
@@ -1136,13 +1152,69 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
return CurDAG->SignBitIsZero(Base);
}
-bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
- uint64_t FlatVariant) const {
- if (FlatVariant != SIInstrFlags::FlatScratch)
+// Return whether the operation has NoUnsignedWrap property.
+static bool isNoUnsignedWrap(SDValue Addr) {
+ return (Addr.getOpcode() == ISD::ADD &&
+ Addr->getFlags().hasNoUnsignedWrap()) ||
+ Addr->getOpcode() == ISD::OR;
+}
+
+// Check that the base address of a flat scratch load/store in the form of
+// `base + offset` is legal to be put in an SGPR/VGPR (i.e. unsigned per the
+// hardware requirement). We always treat the first operand as the base
+// address here.
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
+ if (isNoUnsignedWrap(Addr))
return true;
- // When value in 32-bit Base can be negative calculate scratch offset using
- // 32-bit add instruction, otherwise use Base(unsigned) + offset.
- return CurDAG->SignBitIsZero(Base);
+
+ // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+ // values.
+ if (AMDGPU::isGFX12Plus(*Subtarget))
+ return true;
+
+ auto LHS = Addr.getOperand(0);
+ auto RHS = Addr.getOperand(1);
+
+ // If the immediate offset is negative and within a certain range, the base
+ // address cannot also be negative. If the base were also negative, the sum
+ // would be either negative or much larger than the valid range of scratch
+ // memory a thread can access.
+ ConstantSDNode *ImmOp = nullptr;
+ if (Addr.getOpcode() == ISD::ADD && (ImmOp = dyn_cast<ConstantSDNode>(RHS))) {
+ if (ImmOp->getSExtValue() < 0 && ImmOp->getSExtValue() > -0x40000000)
+ return true;
+ }
+
+ return CurDAG->SignBitIsZero(LHS);
+}
+
+// Check that the address values in SGPR/VGPR are legal for flat scratch in
+// the form: SGPR + VGPR.
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
+ if (isNoUnsignedWrap(Addr))
+ return true;
+
+ auto LHS = Addr.getOperand(0);
+ auto RHS = Addr.getOperand(1);
+ return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
+}
+
+// Check that the address values in SGPR/VGPR are legal for flat scratch in
+// the form: SGPR + VGPR + Imm.
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
+ auto Base = Addr.getOperand(0);
+ auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
+ // If the immediate offset is negative and within a certain range, the base
+ // address cannot also be negative. If the base were also negative, the sum
+ // would be either negative or much larger than the valid range of scratch
+ // memory a thread can access.
+ if (isNoUnsignedWrap(Base) &&
+ (isNoUnsignedWrap(Addr) ||
+ (RHSImm->getSExtValue() < 0 && RHSImm->getSExtValue() > -0x40000000)))
+ return true;
+
+ auto LHS = Base.getOperand(0);
+ auto RHS = Base.getOperand(1);
+ return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
}
// TODO: If offset is too big, put low 16-bit into offset.
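
The new checks lean on the argument in the comment: with a small negative immediate offset (greater than -0x40000000), a base whose sign bit is set would make the effective address either negative or far larger than any scratch a thread may touch, so an in-range access implies the base is non-negative and SignBitIsZero only has to be proven for the other operand. A tiny arithmetic illustration of that wraparound argument (not part of the patch; the aperture size is purely hypothetical):

#include <cstdint>
#include <cstdio>

int main() {
  const int32_t Imm = -0x100;             // small negative offset > -0x40000000
  const uint32_t ApertureSize = 1u << 26; // hypothetical per-thread scratch size

  // Non-negative base: the sum stays small and can be in range.
  uint32_t GoodBase = 0x2000;
  uint32_t GoodSum = GoodBase + static_cast<uint32_t>(Imm);
  std::printf("base 0x%x + imm %d -> 0x%x (in range: %d)\n", (unsigned)GoodBase,
              (int)Imm, (unsigned)GoodSum, GoodSum < ApertureSize);

  // Base with the sign bit set: base - |Imm| never drops below
  // 0x80000000 - 0x40000000 = 0x40000000, i.e. the address is negative as a
  // signed value and far beyond the aperture as an unsigned one, so a legal
  // access rules this case out.
  uint32_t BadBase = 0x80001000u;
  uint32_t BadSum = BadBase + static_cast<uint32_t>(Imm);
  std::printf("base 0x%x + imm %d -> 0x%x (in range: %d)\n", (unsigned)BadBase,
              (int)Imm, (unsigned)BadSum, BadSum < ApertureSize);
}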
@@ -1252,7 +1324,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Addr64 = CurDAG->getTargetConstant(0, DL, MVT::i1);
- SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ SOffset = Subtarget->hasRestrictedSOffset()
+ ? CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32)
+ : CurDAG->getTargetConstant(0, DL, MVT::i32);
ConstantSDNode *C1 = nullptr;
SDValue N0 = Addr;
@@ -1307,7 +1381,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
return true;
}
- if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ if (TII->isLegalMUBUFImmOffset(C1->getZExtValue())) {
// Legal offset for instruction.
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
return true;
@@ -1381,7 +1456,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
// Don't fold null pointer.
if (Imm != NullPtr) {
- const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset();
+ const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
SDValue HighBits =
CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
MachineSDNode *MovHighBits = CurDAG->getMachineNode(
@@ -1415,8 +1490,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
// Therefore it should be safe to fold any VGPR offset on gfx9 into the
// MUBUF vaddr, but not on older subtargets which can only do this if the
// sign bit is known 0.
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
- if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
+ if (TII->isLegalMUBUFImmOffset(C1->getZExtValue()) &&
(!Subtarget->privateMemoryResourceIsRangeChecked() ||
CurDAG->SignBitIsZero(N0))) {
std::tie(VAddr, SOffset) = foldFrameIndex(N0);
@@ -1448,6 +1524,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SDValue &Offset) const {
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
SDLoc DL(Addr);
@@ -1464,14 +1541,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
if (Addr.getOpcode() == ISD::ADD) {
// Add (CopyFromReg <sgpr>) <constant>
CAddr = dyn_cast<ConstantSDNode>(Addr.getOperand(1));
- if (!CAddr || !SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue()))
+ if (!CAddr || !TII->isLegalMUBUFImmOffset(CAddr->getZExtValue()))
return false;
if (!IsCopyFromSGPR(*TRI, Addr.getOperand(0)))
return false;
SOffset = Addr.getOperand(0);
} else if ((CAddr = dyn_cast<ConstantSDNode>(Addr)) &&
- SIInstrInfo::isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
+ TII->isLegalMUBUFImmOffset(CAddr->getZExtValue())) {
// <constant>
SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
} else {
@@ -1488,8 +1565,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &SOffset, SDValue &Offset
) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
- const SIInstrInfo *TII =
- static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64))
return false;
@@ -1510,6 +1586,21 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
return false;
}
+bool AMDGPUDAGToDAGISel::SelectBUFSOffset(SDValue ByteOffsetNode,
+ SDValue &SOffset) const {
+ if (Subtarget->hasRestrictedSOffset()) {
+ if (auto SOffsetConst = dyn_cast<ConstantSDNode>(ByteOffsetNode)) {
+ if (SOffsetConst->isZero()) {
+ SOffset = CurDAG->getRegister(AMDGPU::SGPR_NULL, MVT::i32);
+ return true;
+ }
+ }
+ }
+
+ SOffset = ByteOffsetNode;
+ return true;
+}
+
// Find a load or store from corresponding pattern root.
// Roots may be build_vector, bitconvert or their combinations.
static MemSDNode* findMemSDNode(SDNode *N) {
@@ -1539,7 +1630,8 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
if (isBaseWithConstantOffset64(Addr, N0, N1) &&
- isFlatScratchBaseLegal(N0, FlatVariant)) {
+ (FlatVariant != SIInstrFlags::FlatScratch ||
+ isFlatScratchBaseLegal(Addr))) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1614,7 +1706,7 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
}
VAddr = Addr;
- Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i32);
return true;
}
@@ -1682,7 +1774,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VOffset = SDValue(VMov, 0);
SAddr = LHS;
- Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i32);
return true;
}
}
@@ -1722,7 +1814,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
}
if (SAddr) {
- Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
return true;
}
}
@@ -1738,7 +1830,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalSAddr(SDNode *N,
CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32, SDLoc(Addr), MVT::i32,
CurDAG->getTargetConstant(0, SDLoc(), MVT::i32));
VOffset = SDValue(VMov, 0);
- Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i16);
+ Offset = CurDAG->getTargetConstant(ImmOffset, SDLoc(), MVT::i32);
return true;
}
@@ -1771,8 +1863,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
int64_t COffsetVal = 0;
- if (CurDAG->isBaseWithConstantOffset(Addr) &&
- isFlatScratchBaseLegal(Addr.getOperand(0))) {
+ if (CurDAG->isBaseWithConstantOffset(Addr) && isFlatScratchBaseLegal(Addr)) {
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
SAddr = Addr.getOperand(0);
} else {
@@ -1829,6 +1920,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
int64_t ImmOffset = 0;
SDValue LHS, RHS;
+ SDValue OrigAddr = Addr;
if (isBaseWithConstantOffset64(Addr, LHS, RHS)) {
int64_t COffsetVal = cast<ConstantSDNode>(RHS)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1850,7 +1942,7 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
SAddr = LHS;
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ if (!isFlatScratchBaseLegal(Addr))
return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
@@ -1876,8 +1968,13 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
}
- if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
- return false;
+ if (OrigAddr != Addr) {
+ if (!isFlatScratchBaseLegalSVImm(OrigAddr))
+ return false;
+ } else {
+ if (!isFlatScratchBaseLegalSV(OrigAddr))
+ return false;
+ }
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
return false;
@@ -2249,6 +2346,33 @@ bool AMDGPUDAGToDAGISel::isCBranchSCC(const SDNode *N) const {
return false;
}
+static SDValue combineBallotPattern(SDValue VCMP, bool &Negate) {
+ assert(VCMP->getOpcode() == AMDGPUISD::SETCC);
+ // Special case for amdgcn.ballot:
+ // %Cond = i1 (and/or combination of i1 ISD::SETCCs)
+ // %VCMP = i(WaveSize) AMDGPUISD::SETCC (ext %Cond), 0, setne/seteq
+ // =>
+ // Use i1 %Cond value instead of i(WaveSize) %VCMP.
+ // This is possible because divergent ISD::SETCC is selected as V_CMP and
+ // Cond becomes an i(WaveSize) full mask value.
+ // Note that ballot doesn't use the SETEQ condition, but it's easy to support
+ // it here for completeness, so in this case Negate is set true on return.
+ auto VCMP_CC = cast<CondCodeSDNode>(VCMP.getOperand(2))->get();
+ if ((VCMP_CC == ISD::SETEQ || VCMP_CC == ISD::SETNE) &&
+ isNullConstant(VCMP.getOperand(1))) {
+
+ auto Cond = VCMP.getOperand(0);
+ if (ISD::isExtOpcode(Cond->getOpcode())) // Skip extension.
+ Cond = Cond.getOperand(0);
+
+ if (isBoolSGPR(Cond)) {
+ Negate = VCMP_CC == ISD::SETEQ;
+ return Cond;
+ }
+ }
+ return SDValue();
+}
+
void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
SDValue Cond = N->getOperand(1);
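
The key identity behind combineBallotPattern is that a divergent i1 condition already materializes as a full wave mask (a V_CMP per lane), so comparing the ballot result against zero with SETNE reproduces exactly that mask, and SETEQ reproduces its complement; the selector can therefore reuse Cond and only toggle the branch polarity. A host-side simulation of that mask identity (not part of the patch; the wave is modeled as a plain 64-bit mask):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Build the wave mask of a per-lane boolean condition, i.e. what a divergent
// i1 becomes once it is selected to a V_CMP.
static uint64_t waveMask(const std::vector<bool> &Lanes) {
  uint64_t Mask = 0;
  for (size_t Lane = 0; Lane < Lanes.size(); ++Lane)
    if (Lanes[Lane])
      Mask |= uint64_t(1) << Lane;
  return Mask;
}

int main() {
  std::vector<bool> Cond = {true, false, true, true, false, false, true, false};
  const uint64_t LaneBits = (uint64_t(1) << Cond.size()) - 1;
  const uint64_t CondMask = waveMask(Cond);

  // %VCMP = setcc (ext %Cond), 0, setne/seteq evaluated per lane.
  std::vector<bool> SetNE(Cond.size()), SetEQ(Cond.size());
  for (size_t Lane = 0; Lane < Cond.size(); ++Lane) {
    SetNE[Lane] = (Cond[Lane] ? 1 : 0) != 0;
    SetEQ[Lane] = (Cond[Lane] ? 1 : 0) == 0;
  }

  assert(waveMask(SetNE) == CondMask);                 // reuse Cond directly
  assert(waveMask(SetEQ) == ((~CondMask) & LaneBits)); // reuse Cond, negated
  std::printf("cond mask = 0x%llx\n", (unsigned long long)CondMask);
}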
@@ -2262,11 +2386,50 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
const SIRegisterInfo *TRI = ST->getRegisterInfo();
bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
- unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
+ bool AndExec = !UseSCCBr;
+ bool Negate = false;
+
+ if (Cond.getOpcode() == ISD::SETCC &&
+ Cond->getOperand(0)->getOpcode() == AMDGPUISD::SETCC) {
+ SDValue VCMP = Cond->getOperand(0);
+ auto CC = cast<CondCodeSDNode>(Cond->getOperand(2))->get();
+ if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ isNullConstant(Cond->getOperand(1)) &&
+ // TODO: make condition below an assert after fixing ballot bitwidth.
+ VCMP.getValueType().getSizeInBits() == ST->getWavefrontSize()) {
+ // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
+ // %C = i1 ISD::SETCC %VCMP, 0, setne/seteq
+ // BRCOND i1 %C, %BB
+ // =>
+ // %VCMP = i(WaveSize) AMDGPUISD::SETCC ...
+ // VCC = COPY i(WaveSize) %VCMP
+ // S_CBRANCH_VCCNZ/VCCZ %BB
+ Negate = CC == ISD::SETEQ;
+ bool NegatedBallot = false;
+ if (auto BallotCond = combineBallotPattern(VCMP, NegatedBallot)) {
+ Cond = BallotCond;
+ UseSCCBr = !BallotCond->isDivergent();
+ Negate = Negate ^ NegatedBallot;
+ } else {
+ // TODO: don't use SCC here assuming that AMDGPUISD::SETCC is always
+ // selected as V_CMP, but this may change for uniform condition.
+ Cond = VCMP;
+ UseSCCBr = false;
+ }
+ }
+ // Cond is either a V_CMP resulting from AMDGPUISD::SETCC, a combination of
+ // V_CMPs resulting from a ballot, or the ballot has a uniform condition and
+ // SCC is used.
+ AndExec = false;
+ }
+
+ unsigned BrOp =
+ UseSCCBr ? (Negate ? AMDGPU::S_CBRANCH_SCC0 : AMDGPU::S_CBRANCH_SCC1)
+ : (Negate ? AMDGPU::S_CBRANCH_VCCZ : AMDGPU::S_CBRANCH_VCCNZ);
Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
SDLoc SL(N);
- if (!UseSCCBr) {
+ if (AndExec) {
// This is the case that we are selecting to S_CBRANCH_VCCNZ. We have not
// analyzed what generates the vcc value, so we do not know whether vcc
// bits for disabled lanes are 0. Thus we need to mask out bits for
@@ -2296,6 +2459,22 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
VCC.getValue(0));
}
+void AMDGPUDAGToDAGISel::SelectFP_EXTEND(SDNode *N) {
+ if (Subtarget->hasSALUFloatInsts() && N->getValueType(0) == MVT::f32 &&
+ !N->isDivergent()) {
+ SDValue Src = N->getOperand(0);
+ if (Src.getValueType() == MVT::f16) {
+ if (isExtractHiElt(Src, Src)) {
+ CurDAG->SelectNodeTo(N, AMDGPU::S_CVT_HI_F32_F16, N->getVTList(),
+ {Src});
+ return;
+ }
+ }
+ }
+
+ SelectCode(N);
+}
+
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
// The address is assumed to be uniform, so if it ends up in a VGPR, it will
// be copied to an SGPR with readfirstlane.
@@ -2369,8 +2548,9 @@ static unsigned gwsIntrinToOpcode(unsigned IntrID) {
}
void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
- if (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
- !Subtarget->hasGWSSemaReleaseAll()) {
+ if (!Subtarget->hasGWS() ||
+ (IntrID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
+ !Subtarget->hasGWSSemaReleaseAll())) {
// Let this error.
SelectCode(N);
return;
@@ -2568,6 +2748,45 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
SelectCode(N);
}
+void AMDGPUDAGToDAGISel::SelectWAVE_ADDRESS(SDNode *N) {
+ SDValue Log2WaveSize =
+ CurDAG->getTargetConstant(Subtarget->getWavefrontSizeLog2(), SDLoc(N), MVT::i32);
+ CurDAG->SelectNodeTo(N, AMDGPU::S_LSHR_B32, N->getVTList(),
+ {N->getOperand(0), Log2WaveSize});
+}
+
+void AMDGPUDAGToDAGISel::SelectSTACKRESTORE(SDNode *N) {
+ SDValue SrcVal = N->getOperand(1);
+ if (SrcVal.getValueType() != MVT::i32) {
+ SelectCode(N); // Emit default error
+ return;
+ }
+
+ SDValue CopyVal;
+ Register SP = TLI->getStackPointerRegisterToSaveRestore();
+ SDLoc SL(N);
+
+ if (SrcVal.getOpcode() == AMDGPUISD::WAVE_ADDRESS) {
+ CopyVal = SrcVal.getOperand(0);
+ } else {
+ SDValue Log2WaveSize = CurDAG->getTargetConstant(
+ Subtarget->getWavefrontSizeLog2(), SL, MVT::i32);
+
+ if (N->isDivergent()) {
+ SrcVal = SDValue(CurDAG->getMachineNode(AMDGPU::V_READFIRSTLANE_B32, SL,
+ MVT::i32, SrcVal),
+ 0);
+ }
+
+ CopyVal = SDValue(CurDAG->getMachineNode(AMDGPU::S_LSHL_B32, SL, MVT::i32,
+ {SrcVal, Log2WaveSize}),
+ 0);
+ }
+
+ SDValue CopyToSP = CurDAG->getCopyToReg(N->getOperand(0), SL, SP, CopyVal);
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), CopyToSP);
+}
+
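
SelectWAVE_ADDRESS and SelectSTACKRESTORE are inverse shifts by the wavefront-size log2: the selector emits S_LSHR_B32 to turn the wave-uniform stack pointer into a value usable with vector offsets, and S_LSHL_B32 to scale a plain i32 back up before copying it into SP. A quick round-trip sketch of that arithmetic (not part of the patch; the wave size and pointer value are only examples):

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const unsigned WavefrontSizeLog2 = 6; // e.g. a wave64 target

  // Hypothetical wave-uniform stack pointer value in its unswizzled units;
  // it is wave-size aligned, so the round trip below is lossless.
  uint32_t SP = 0x4000;

  // WAVE_ADDRESS: S_LSHR_B32 makes the address compatible with vector offsets.
  uint32_t WaveAddr = SP >> WavefrontSizeLog2;

  // STACKRESTORE of a plain i32: S_LSHL_B32 scales it back into SP units.
  uint32_t RestoredSP = WaveAddr << WavefrontSizeLog2;

  assert(RestoredSP == SP);
  std::printf("SP=0x%x wave-addr=0x%x restored=0x%x\n", (unsigned)SP,
              (unsigned)WaveAddr, (unsigned)RestoredSP);
}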
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
unsigned &Mods,
bool IsCanonicalizing,
@@ -2948,7 +3167,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
if (!RC || SIRI->isSGPRClass(RC))
return false;
- if (RC != &AMDGPU::VS_32RegClass) {
+ if (RC != &AMDGPU::VS_32RegClass && RC != &AMDGPU::VS_64RegClass) {
AllUsesAcceptSReg = false;
SDNode * User = *U;
if (User->isMachineOpcode()) {
@@ -2960,7 +3179,8 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
if (SII->findCommutedOpIndices(Desc, OpIdx, CommuteIdx1)) {
unsigned CommutedOpNo = CommuteIdx1 - Desc.getNumDefs();
const TargetRegisterClass *CommutedRC = getOperandRegClass(*U, CommutedOpNo);
- if (CommutedRC == &AMDGPU::VS_32RegClass)
+ if (CommutedRC == &AMDGPU::VS_32RegClass ||
+ CommutedRC == &AMDGPU::VS_64RegClass)
AllUsesAcceptSReg = true;
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 0605baf3a0cc..374108af08cd 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -92,7 +92,7 @@ public:
AMDGPUDAGToDAGISel() = delete;
- explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel);
+ explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOptLevel OptLevel);
~AMDGPUDAGToDAGISel() override = default;
void getAnalysisUsage(AnalysisUsage &AU) const override;
@@ -154,8 +154,10 @@ private:
bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
unsigned Size) const;
- bool isFlatScratchBaseLegal(
- SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+
+ bool isFlatScratchBaseLegal(SDValue Addr) const;
+ bool isFlatScratchBaseLegalSV(SDValue Addr) const;
+ bool isFlatScratchBaseLegalSVImm(SDValue Addr) const;
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
@@ -177,6 +179,7 @@ private:
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
+ bool SelectBUFSOffset(SDValue Addr, SDValue &SOffset) const;
bool SelectFlatOffsetImpl(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, uint64_t FlatVariant) const;
@@ -273,6 +276,7 @@ private:
bool isCBranchSCC(const SDNode *N) const;
void SelectBRCOND(SDNode *N);
void SelectFMAD_FMA(SDNode *N);
+ void SelectFP_EXTEND(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDSBvhStackIntrinsic(SDNode *N);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
@@ -280,6 +284,8 @@ private:
void SelectINTRINSIC_W_CHAIN(SDNode *N);
void SelectINTRINSIC_WO_CHAIN(SDNode *N);
void SelectINTRINSIC_VOID(SDNode *N);
+ void SelectWAVE_ADDRESS(SDNode *N);
+ void SelectSTACKRESTORE(SDNode *N);
protected:
// Include the pieces autogenerated from the target description.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 39e00a037bdd..9d7443012e3d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -323,24 +323,26 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
- // This is totally unsupported, just custom lower to produce an error.
+ // For R600, this is totally unsupported, just custom lower to produce an
+ // error.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
// Library functions. These default to Expand, but we have instructions
// for them.
- setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, ISD::FRINT,
- ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
+ setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
+ ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
MVT::f32, Legal);
setOperationAction(ISD::FLOG2, MVT::f32, Custom);
setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
- setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2}, MVT::f32,
- Custom);
+ setOperationAction(
+ {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
+ Custom);
setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
- setOperationAction(ISD::FROUNDEVEN, {MVT::f16, MVT::f32, MVT::f64}, Custom);
+ setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
@@ -351,7 +353,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
}
- setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP}, MVT::f16, Custom);
+ setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
+ Custom);
// FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
// scalarization code. Can be removed when IS_FPCLASS expand isn't called by
@@ -383,7 +386,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64,
MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64,
- MVT::v16f64, MVT::v16i64},
+ MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16},
Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
@@ -456,14 +459,17 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
for (MVT VT : FloatVectorTypes) {
setOperationAction(
- {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD,
- ISD::FCEIL, ISD::FCOS, ISD::FDIV, ISD::FEXP2,
- ISD::FEXP, ISD::FLOG2, ISD::FREM, ISD::FLOG,
- ISD::FLOG10, ISD::FPOW, ISD::FFLOOR, ISD::FTRUNC,
- ISD::FMUL, ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
- ISD::FSQRT, ISD::FSIN, ISD::FSUB, ISD::FNEG,
- ISD::VSELECT, ISD::SELECT_CC, ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
- ISD::SETCC, ISD::FCANONICALIZE},
+ {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
+ ISD::FADD, ISD::FCEIL, ISD::FCOS,
+ ISD::FDIV, ISD::FEXP2, ISD::FEXP,
+ ISD::FEXP10, ISD::FLOG2, ISD::FREM,
+ ISD::FLOG, ISD::FLOG10, ISD::FPOW,
+ ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
+ ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
+ ISD::FSQRT, ISD::FSIN, ISD::FSUB,
+ ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
+ ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
+ ISD::FCANONICALIZE, ISD::FROUNDEVEN},
VT, Expand);
}
@@ -579,11 +585,14 @@ static bool fnegFoldsIntoOpcode(unsigned Opc) {
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
case ISD::SELECT:
case ISD::FSIN:
case ISD::FTRUNC:
case ISD::FRINT:
case ISD::FNEARBYINT:
+ case ISD::FROUNDEVEN:
case ISD::FCANONICALIZE:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
@@ -1001,6 +1010,9 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_LS:
return CC_AMDGPU;
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
+ return CC_AMDGPU_CS_CHAIN;
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Cold:
@@ -1024,6 +1036,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_LS:
@@ -1315,6 +1329,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FLOG10:
return LowerFLOGCommon(Op, DAG);
case ISD::FEXP:
+ case ISD::FEXP10:
return lowerFEXP(Op, DAG);
case ISD::FEXP2:
return lowerFEXP2(Op, DAG);
@@ -1360,6 +1375,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Lowered);
return;
case ISD::FEXP:
+ case ISD::FEXP10:
if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
Results.push_back(Lowered);
return;
@@ -1714,7 +1730,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
Load->getChain(), BasePtr, SrcValue, LoMemVT,
BaseAlign, Load->getMemOperand()->getFlags());
- SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
+ SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
SDValue HiLoad =
DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
@@ -2362,7 +2378,8 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
}
-SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -2389,18 +2406,19 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
}
-SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
+ SelectionDAG &DAG) const {
// FNEARBYINT and FRINT are the same, except in their handling of FP
// exceptions. Those aren't really meaningful for us, and OpenCL only has
// rint, so just treat them as equivalent.
- return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
+ return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(0));
}
-SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
auto VT = Op.getValueType();
auto Arg = Op.getOperand(0u);
- return DAG.getNode(ISD::FRINT, SDLoc(Op), VT, Arg);
+ return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
}
// XXX - May require not supporting f32 denormals?
@@ -2423,18 +2441,16 @@ SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
const SDValue One = DAG.getConstantFP(1.0, SL, VT);
- const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
-
- SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
EVT SetCCVT =
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
+ SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
- SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
-
- return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
+ SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
+ return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
}
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
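
The rewritten LowerFROUND computes round-half-away-from-zero as trunc(x) plus copysign(select(|x - trunc(x)| >= 0.5, 1.0, 0.0), x). A host-side check of that formula against libm's round (not part of the patch, purely illustrative):

#include <cmath>
#include <cstdio>

// round(x) lowered as: T = trunc(x); off = (|x - T| >= 0.5) ? 1.0 : 0.0;
// result = T + copysign(off, x).
static double loweredRound(double X) {
  double T = std::trunc(X);
  double Off = (std::fabs(X - T) >= 0.5) ? 1.0 : 0.0;
  return T + std::copysign(Off, X);
}

int main() {
  const double Tests[] = {2.3, 2.5, 2.7, -2.3, -2.5, -2.7, -0.4, 0.5};
  for (double X : Tests)
    std::printf("x=%+.2f lowered=%+.1f libm=%+.1f\n", X, loweredRound(X),
                std::round(X));
}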
@@ -2468,7 +2484,18 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) {
case ISD::FP_EXTEND:
return Src.getOperand(0).getValueType() == MVT::f16;
case ISD::FP16_TO_FP:
+ case ISD::FFREXP:
return true;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(Src.getOperand(0))->getZExtValue();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_frexp_mant:
+ return true;
+ default:
+ return false;
+ }
+ }
default:
return false;
}
@@ -2476,15 +2503,17 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) {
llvm_unreachable("covered opcode switch");
}
-static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags) {
+bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
+ SDNodeFlags Flags) {
if (Flags.hasApproximateFuncs())
return true;
auto &Options = DAG.getTarget().Options;
return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
}
-static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src,
- SDNodeFlags Flags) {
+bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
+ SDValue Src,
+ SDNodeFlags Flags) {
return !valueIsKnownNeverF32Denorm(Src) &&
DAG.getMachineFunction()
.getDenormalMode(APFloat::IEEEsingle())
@@ -2697,7 +2726,8 @@ SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
SelectionDAG &DAG, bool IsLog10,
SDNodeFlags Flags) const {
EVT VT = Src.getValueType();
- unsigned LogOp = VT == MVT::f32 ? AMDGPUISD::LOG : ISD::FLOG2;
+ unsigned LogOp =
+ VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
double Log2BaseInverted =
IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
@@ -2782,14 +2812,95 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
}
-SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue Op, const SDLoc &SL,
+SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
SelectionDAG &DAG,
SDNodeFlags Flags) const {
- // exp2(M_LOG2E_F * f);
- EVT VT = Op.getValueType();
- const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Op, K, Flags);
- return DAG.getNode(VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2, SL, VT, Mul,
+ EVT VT = X.getValueType();
+ const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
+
+ if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
+ // exp2(M_LOG2E_F * f);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
+ return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
+ : (unsigned)ISD::FEXP2,
+ SL, VT, Mul, Flags);
+ }
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
+ SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
+
+ SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
+
+ SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
+
+ SDValue AdjustedX =
+ DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
+
+ SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
+
+ SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
+
+ SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
+ SDValue AdjustedResult =
+ DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
+
+ return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
+ Flags);
+}
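
When f32 denormal results must be handled, lowerFEXPUnsafe pre-scales very negative inputs: below the roughly -87.33 threshold it evaluates exp(x + 64) through the hardware exp2 and multiplies the result by approximately e^-64 (the 0x1.969d48p-93 factor), so the exp2 itself never has to produce a denormal. A quick double-precision check of that algebra (not part of the patch; constants are copied from the code, and the double evaluation only illustrates the identity):

#include <cmath>
#include <cstdio>

int main() {
  const double Log2E = 0x1.71547652b82fep+0; // log2(e)
  const double ResultScale = 0x1.969d48p-93; // ~= exp(-64), from the lowering
  const double Threshold = -0x1.5d58a0p+6;   // ~= -87.33, from the lowering

  double X = -100.0; // below the threshold: exp2(X * Log2E) is denormal in f32
  double Direct = std::exp(X);
  double Scaled = std::exp2((X + 64.0) * Log2E) * ResultScale;

  std::printf("threshold    = %g\n", Threshold);
  std::printf("exp(%g)      = %.6e\n", X, Direct);
  std::printf("scaled path  = %.6e\n", Scaled);
}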
+
+/// Emit a lowering for exp10 appropriate under approximate-function semantics;
+/// inf/nan should still be handled correctly.
+SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
+ SelectionDAG &DAG,
+ SDNodeFlags Flags) const {
+ const EVT VT = X.getValueType();
+ const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
+
+ if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
+ // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
+ SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
+ SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
+
+ SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
+ SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
+ SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
+ SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
+ return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
+ }
+
+ // bool s = x < -0x1.2f7030p+5f;
+ // x += s ? 0x1.0p+5f : 0.0f;
+ // exp10 = exp2(x * 0x1.a92000p+1f) *
+ // exp2(x * 0x1.4f0978p-11f) *
+ // (s ? 0x1.9f623ep-107f : 1.0f);
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
+ SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
+
+ SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
+ SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
+ SDValue AdjustedX =
+ DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
+
+ SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
+ SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
+
+ SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
+ SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
+ SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
+ SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
+
+ SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
+
+ SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
+ SDValue AdjustedResult =
+ DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
+
+ return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
Flags);
}
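
lowerFEXP10Unsafe splits log2(10) into a coarse constant K0 = 0x1.a92000p+1 and a small correction K1 = 0x1.4f0978p-11 so that exp10(x) = exp2(x*K0) * exp2(x*K1) retains the precision that a single rounded x*log2(10) product would lose. A double-precision check that the constants sum to log2(10) and that the product form tracks pow(10, x) (not part of the patch; constants copied from the code):

#include <cmath>
#include <cstdio>

int main() {
  const double K0 = 0x1.a92000p+1;  // coarse part of log2(10), from the code
  const double K1 = 0x1.4f0978p-11; // low-order correction, from the code

  std::printf("K0 + K1  = %.17g\n", K0 + K1);
  std::printf("log2(10) = %.17g\n", std::log2(10.0));

  // exp10(x) lowered as exp2(x * K0) * exp2(x * K1).
  for (double X : {0.5, 3.0, -7.25}) {
    double Lowered = std::exp2(X * K0) * std::exp2(X * K1);
    std::printf("x=%g lowered=%.9e pow10=%.9e\n", X, Lowered,
                std::pow(10.0, X));
  }
}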
@@ -2798,7 +2909,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
SDNodeFlags Flags = Op->getFlags();
- const bool IsExp10 = false; // TODO: For some reason exp10 is missing
+ const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
if (VT.getScalarType() == MVT::f16) {
// v_exp_f16 (fmul x, log2e)
@@ -2822,9 +2933,9 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
// TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
// library behavior. Also, is known-not-daz source sufficient?
- if (allowApproxFunc(DAG, Flags) && !needsDenormHandlingF32(DAG, X, Flags)) {
- assert(!IsExp10 && "todo exp10 support");
- return lowerFEXPUnsafe(X, SL, DAG, Flags);
+ if (allowApproxFunc(DAG, Flags)) {
+ return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
+ : lowerFEXPUnsafe(X, SL, DAG, Flags);
}
// Algorithm:
@@ -2891,7 +3002,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
}
- SDValue E = DAG.getNode(ISD::FRINT, SL, VT, PH, Flags);
+ SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
// It is unsafe to contract this fsub into the PH multiply.
SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
@@ -3698,8 +3809,7 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
- case Intrinsic::amdgcn_rsq_clamp:
- case Intrinsic::amdgcn_ldexp: {
+ case Intrinsic::amdgcn_rsq_clamp: {
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(1);
return Src.isUndef() ? Src : SDValue();
@@ -4012,8 +4122,7 @@ static SDValue getAddOneOp(const SDNode *V) {
if (V->getOpcode() != ISD::ADD)
return SDValue();
- auto *C = dyn_cast<ConstantSDNode>(V->getOperand(1));
- return C && C->isOne() ? V->getOperand(0) : SDValue();
+ return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
}
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
@@ -4243,8 +4352,7 @@ SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
SDValue LHS, SDValue RHS,
DAGCombinerInfo &DCI) const {
- ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
- if (!CmpRhs || !CmpRhs->isZero())
+ if (!isNullConstant(Cond.getOperand(1)))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -4466,6 +4574,10 @@ static unsigned inverseMinMax(unsigned Opc) {
return ISD::FMINNUM_IEEE;
case ISD::FMINNUM_IEEE:
return ISD::FMAXNUM_IEEE;
+ case ISD::FMAXIMUM:
+ return ISD::FMINIMUM;
+ case ISD::FMINIMUM:
+ return ISD::FMAXIMUM;
case AMDGPUISD::FMAX_LEGACY:
return AMDGPUISD::FMIN_LEGACY;
case AMDGPUISD::FMIN_LEGACY:
@@ -4589,6 +4701,8 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
case AMDGPUISD::FMAX_LEGACY:
case AMDGPUISD::FMIN_LEGACY: {
// fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
@@ -4638,6 +4752,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FTRUNC:
case ISD::FRINT:
case ISD::FNEARBYINT: // XXX - Should fround be handled?
+ case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FCANONICALIZE:
case AMDGPUISD::RCP:
@@ -4999,6 +5114,36 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performAssertSZExtCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
return performIntrinsicWOChainCombine(N, DCI);
+ case AMDGPUISD::FMAD_FTZ: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+
+ // FMAD_FTZ is a FMAD + flush denormals to zero.
+ // We flush the inputs, the intermediate step, and the output.
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
+ if (N0CFP && N1CFP && N2CFP) {
+ const auto FTZ = [](const APFloat &V) {
+ if (V.isDenormal()) {
+ APFloat Zero(V.getSemantics(), 0);
+ return V.isNegative() ? -Zero : Zero;
+ }
+ return V;
+ };
+
+ APFloat V0 = FTZ(N0CFP->getValueAPF());
+ APFloat V1 = FTZ(N1CFP->getValueAPF());
+ APFloat V2 = FTZ(N2CFP->getValueAPF());
+ V0.multiply(V1, APFloat::rmNearestTiesToEven);
+ V0 = FTZ(V0);
+ V0.add(V2, APFloat::rmNearestTiesToEven);
+ return DAG.getConstantFP(FTZ(V0), DL, VT);
+ }
+ break;
+ }
}
return SDValue();
}
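
The new FMAD_FTZ combine constant-folds the multiply-add while flushing denormal inputs, the intermediate product, and the final sum to a correctly signed zero. A minimal float rendition of that flush-then-fold logic (not part of the patch; plain float arithmetic stands in for APFloat):

#include <cfloat>
#include <cmath>
#include <cstdio>

// Flush a subnormal value to a correctly signed zero, like the FTZ lambda.
static float flushDenorm(float V) {
  if (std::fpclassify(V) == FP_SUBNORMAL)
    return std::signbit(V) ? -0.0f : 0.0f;
  return V;
}

// FMAD_FTZ(a, b, c): flush the inputs, the product, and the sum.
static float fmadFTZ(float A, float B, float C) {
  float Prod = flushDenorm(flushDenorm(A) * flushDenorm(B));
  return flushDenorm(Prod + flushDenorm(C));
}

int main() {
  float Denorm = FLT_MIN / 4.0f; // subnormal input: flushed to +0.0
  std::printf("fmadFTZ(denorm, 2, 3)     = %g\n", fmadFTZ(Denorm, 2.0f, 3.0f));
  std::printf("fmadFTZ(1.5, 2, 0.25)     = %g\n", fmadFTZ(1.5f, 2.0f, 0.25f));
  std::printf("fmadFTZ(FLT_MIN, 0.25, 0) = %g\n",
              fmadFTZ(FLT_MIN, 0.25f, 0.0f)); // product is subnormal -> 0
}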
@@ -5140,8 +5285,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(TC_RETURN_GFX)
+ NODE_NAME_CASE(TC_RETURN_CHAIN)
NODE_NAME_CASE(TRAP)
NODE_NAME_CASE(RET_GLUE)
+ NODE_NAME_CASE(WAVE_ADDRESS)
NODE_NAME_CASE(RETURN_TO_EPILOG)
NODE_NAME_CASE(ENDPGM)
NODE_NAME_CASE(ENDPGM_TRAP)
@@ -5166,6 +5313,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FMED3)
NODE_NAME_CASE(SMED3)
NODE_NAME_CASE(UMED3)
+ NODE_NAME_CASE(FMAXIMUM3)
+ NODE_NAME_CASE(FMINIMUM3)
NODE_NAME_CASE(FDOT2)
NODE_NAME_CASE(URECIP)
NODE_NAME_CASE(DIV_SCALE)
@@ -5620,6 +5769,8 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case AMDGPUISD::FMED3:
case AMDGPUISD::FMIN3:
case AMDGPUISD::FMAX3:
+ case AMDGPUISD::FMINIMUM3:
+ case AMDGPUISD::FMAXIMUM3:
case AMDGPUISD::FMAD_FTZ: {
if (SNaN)
return true;
@@ -5734,12 +5885,6 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
}
}
-bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
- unsigned Opc, LLT Ty1, LLT Ty2) const {
- return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
- Ty2 == LLT::scalar(32);
-}
-
/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index c39093b9bb6b..827fb106b551 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -61,6 +61,9 @@ protected:
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
+ static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags);
+ static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src,
+ SDNodeFlags Flags);
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op,
SDNodeFlags Flags) const;
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const;
@@ -77,6 +80,8 @@ protected:
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG,
SDNodeFlags Flags) const;
+ SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG,
+ SDNodeFlags Flags) const;
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;
@@ -242,9 +247,7 @@ public:
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
- SDValue LowerDYNAMIC_STACKALLOC(SDValue Op,
- SelectionDAG &DAG) const;
-
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
void ReplaceNodeResults(SDNode * N,
@@ -371,9 +374,6 @@ public:
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
- bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1,
- LLT Ty2) const override;
-
bool shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
};
@@ -391,6 +391,7 @@ enum NodeType : unsigned {
CALL,
TC_RETURN,
TC_RETURN_GFX,
+ TC_RETURN_CHAIN,
TRAP,
// Masked control flow nodes.
@@ -410,6 +411,10 @@ enum NodeType : unsigned {
// Return with values from a non-entry function.
RET_GLUE,
+ // Convert an unswizzled, wave-uniform stack address to an address compatible
+ // with a vector offset for use in stack accesses.
+ WAVE_ADDRESS,
+
DWORDADDR,
FRACT,
@@ -444,6 +449,8 @@ enum NodeType : unsigned {
FMED3,
SMED3,
UMED3,
+ FMAXIMUM3,
+ FMINIMUM3,
FDOT2,
URECIP,
DIV_SCALE,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
new file mode 100644
index 000000000000..e5fbcca1e7d1
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -0,0 +1,336 @@
+//===- AMDGPUImageIntrinsicOptimizer.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to combine multiple image_load intrinsics with dim=2dmsaa
+// or dim=2darraymsaa into a single image_msaa_load intrinsic if:
+//
+// - they refer to the same vaddr except for sample_id,
+// - they use a constant sample_id and they fall into the same group,
+// - they have the same dmask and the number of intrinsics and the number of
+// vaddr/vdata dword transfers is reduced by the combine.
+//
+// Examples of the tradeoff (all assume 2DMsaa for vaddr):
+//
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | popcount | a16 | d16 | #load | vaddr / | #msaa_load | vaddr / | combine? |
+// | (dmask) | | | | vdata | | vdata | |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 0 | 4 | 12 / 4 | 1 | 3 / 4 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 0 | 2 | 6 / 2 | 1 | 3 / 4 | yes? |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 2 | 0 | 0 | 4 | 12 / 8 | 2 | 6 / 8 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 2 | 0 | 0 | 2 | 6 / 4 | 2 | 6 / 8 | no |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+// | 1 | 0 | 1 | 2 | 6 / 2 | 1 | 3 / 2 | yes |
+// +----------+-----+-----+-------+---------+------------+---------+----------+
+//
+// Some cases are of questionable benefit, like the one marked with "yes?"
+// above: fewer intrinsics and fewer vaddr and fewer total transfers between SP
+// and TX, but higher vdata. We start by erring on the side of converting these
+// to MSAA_LOAD.
+//
+// clang-format off
+//
+// This pass will combine intrinsics such as (not necessarily consecutive):
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 1, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 2, <8 x i32> %rsrc, i32 0, i32 0)
+// call float @llvm.amdgcn.image.load.2dmsaa.f32.i32(i32 1, i32 %s, i32 %t, i32 3, <8 x i32> %rsrc, i32 0, i32 0)
+// ==>
+// call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
+//
+// clang-format on
+//
+// Future improvements:
+//
+// - We may occasionally not want to do the combine if it increases the maximum
+// register pressure.
+//
+// - Ensure clausing when multiple MSAA_LOAD are generated.
+//
+// Note: Even though the image_msaa_load intrinsic already exists on gfx10, this
+// combine only applies to gfx11, due to a limitation in gfx10: the gfx10
+// IMAGE_MSAA_LOAD only works correctly with single-channel texture formats, and
+// we don't know the format at compile time.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-image-intrinsic-opt"
+
+namespace {
+class AMDGPUImageIntrinsicOptimizer : public FunctionPass {
+ const TargetMachine *TM;
+
+public:
+ static char ID;
+
+ AMDGPUImageIntrinsicOptimizer(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM) {}
+
+ bool runOnFunction(Function &F) override;
+
+}; // End of class AMDGPUImageIntrinsicOptimizer
+} // End anonymous namespace
+
+INITIALIZE_PASS(AMDGPUImageIntrinsicOptimizer, DEBUG_TYPE,
+ "AMDGPU Image Intrinsic Optimizer", false, false)
+
+char AMDGPUImageIntrinsicOptimizer::ID = 0;
+
+void addInstToMergeableList(
+ IntrinsicInst *II,
+ SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts,
+ const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) {
+ for (SmallVector<IntrinsicInst *, 4> &IIList : MergeableInsts) {
+ // Check Dim.
+ if (IIList.front()->getIntrinsicID() != II->getIntrinsicID())
+ continue;
+
+ // Check D16.
+ if (IIList.front()->getType() != II->getType())
+ continue;
+
+ // Check all arguments (DMask, VAddr, RSrc etc).
+ bool AllEqual = true;
+ assert(IIList.front()->arg_size() == II->arg_size());
+ for (int I = 1, E = II->arg_size(); AllEqual && I != E; ++I) {
+ Value *ArgList = IIList.front()->getArgOperand(I);
+ Value *Arg = II->getArgOperand(I);
+ if (I == ImageDimIntr->VAddrEnd - 1) {
+ // Check FragId group.
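+        // Sample ids 0-3 fall in group 0, ids 4-7 in group 1, and so on; only
+        // loads whose FragId lands in the same group of four can share one
+        // msaa_load.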
+ auto FragIdList = cast<ConstantInt>(IIList.front()->getArgOperand(I));
+ auto FragId = cast<ConstantInt>(II->getArgOperand(I));
+ AllEqual = FragIdList->getValue().udiv(4) == FragId->getValue().udiv(4);
+ } else {
+ // Check all arguments except FragId.
+ AllEqual = ArgList == Arg;
+ }
+ }
+ if (!AllEqual)
+ continue;
+
+ // Add to the list.
+ IIList.emplace_back(II);
+ return;
+ }
+
+ // Similar instruction not found, so add a new list.
+ MergeableInsts.emplace_back(1, II);
+ LLVM_DEBUG(dbgs() << "New: " << *II << "\n");
+}
+
+// Collect a list of all the instructions we know how to merge in a subset of
+// the block. Returns an iterator to the instruction after the last one
+// analyzed.
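+// A section ends at the first instruction that may have side effects (such as
+// a store or a memory barrier), so loads are only merged within a run of
+// side-effect-free instructions.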
+BasicBlock::iterator collectMergeableInsts(
+ BasicBlock::iterator I, BasicBlock::iterator E,
+ SmallVector<SmallVector<IntrinsicInst *, 4>> &MergeableInsts) {
+ for (; I != E; ++I) {
+ // Don't combine if there is a store in the middle or if there is a memory
+ // barrier.
+ if (I->mayHaveSideEffects()) {
+ ++I;
+ break;
+ }
+
+ // Ignore non-intrinsics.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ Intrinsic::ID IntrinID = II->getIntrinsicID();
+
+ // Ignore other intrinsics.
+ if (IntrinID != Intrinsic::amdgcn_image_load_2dmsaa &&
+ IntrinID != Intrinsic::amdgcn_image_load_2darraymsaa)
+ continue;
+
+ // Check for constant FragId.
+ const auto *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinID);
+ const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+ if (!isa<ConstantInt>(II->getArgOperand(FragIdIndex)))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Merge: " << *II << "\n");
+ addInstToMergeableList(II, MergeableInsts, ImageDimIntr);
+ }
+ }
+
+ return I;
+}
+
+bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
+ bool Modified = false;
+
+ SmallVector<Instruction *, 4> InstrsToErase;
+ for (const auto &IIList : MergeableInsts) {
+ if (IIList.size() <= 1)
+ continue;
+
+ // Assume the arguments are unchanged and later override them, if needed.
+ SmallVector<Value *, 16> Args(IIList.front()->args());
+
+ // Validate function argument and return types, extracting overloaded
+ // types along the way.
+ SmallVector<Type *, 6> OverloadTys;
+ Function *F = IIList.front()->getCalledFunction();
+ if (!Intrinsic::getIntrinsicSignature(F, OverloadTys))
+ continue;
+
+ Intrinsic::ID IntrinID = IIList.front()->getIntrinsicID();
+ const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrinID);
+
+ Type *EltTy = IIList.front()->getType()->getScalarType();
+ Type *NewTy = FixedVectorType::get(EltTy, 4);
+ OverloadTys[0] = NewTy;
+ bool isD16 = EltTy->isHalfTy();
+
+ ConstantInt *DMask = cast<ConstantInt>(
+ IIList.front()->getArgOperand(ImageDimIntr->DMaskIndex));
+ unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+ unsigned NumElts = popcount(DMaskVal);
+
+    // The number of instructions and the number of vaddr/vdata dword
+    // transfers should not be increased by the combine.
+ unsigned NumLoads = IIList.size();
+ unsigned NumMsaas = NumElts;
+ unsigned NumVAddrLoads = 3 * NumLoads;
+ unsigned NumVDataLoads = divideCeil(NumElts, isD16 ? 2 : 1) * NumLoads;
+ unsigned NumVAddrMsaas = 3 * NumMsaas;
+ unsigned NumVDataMsaas = divideCeil(4, isD16 ? 2 : 1) * NumMsaas;
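+    // For example (illustrative): 4 loads with a dmask popcount of 1 and no
+    // d16 cost 12 vaddr + 4 vdata dwords, versus 3 vaddr + 4 vdata dwords for
+    // a single msaa_load, so the combine is profitable.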
+
+ if (NumLoads < NumMsaas ||
+ (NumVAddrLoads + NumVDataLoads < NumVAddrMsaas + NumVDataMsaas))
+ continue;
+
+ const uint8_t FragIdIndex = ImageDimIntr->VAddrEnd - 1;
+ auto FragId = cast<ConstantInt>(IIList.front()->getArgOperand(FragIdIndex));
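+    // Round the FragId down to the base of its group of four samples; this is
+    // the FragId passed to the new msaa_load.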
+ const APInt &NewFragIdVal = FragId->getValue().udiv(4) * 4;
+
+ // Create the new instructions.
+ IRBuilder<> B(IIList.front());
+
+ // Create the new image_msaa_load intrinsic.
+ SmallVector<Instruction *, 4> NewCalls;
+ while (DMaskVal != 0) {
+ unsigned NewMaskVal = 1 << countr_zero(DMaskVal);
+
+ Intrinsic::ID NewIntrinID;
+ if (IntrinID == Intrinsic::amdgcn_image_load_2dmsaa)
+ NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2dmsaa;
+ else
+ NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;
+
+ Function *NewIntrin = Intrinsic::getDeclaration(
+ IIList.front()->getModule(), NewIntrinID, OverloadTys);
+ Args[ImageDimIntr->DMaskIndex] =
+ ConstantInt::get(DMask->getType(), NewMaskVal);
+ Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
+ CallInst *NewCall = B.CreateCall(NewIntrin, Args);
+ LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");
+
+ NewCalls.push_back(NewCall);
+ DMaskVal -= NewMaskVal;
+ }
+
+ // Create the new extractelement instructions.
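+    // Each original load is rebuilt by taking element urem(FragId, 4) of the
+    // new msaa_load result; with a multi-channel dmask, one element is taken
+    // from each per-channel msaa_load and re-packed into a vector of the
+    // original load's type.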
+ for (auto &II : IIList) {
+ Value *VecOp = nullptr;
+ auto Idx = cast<ConstantInt>(II->getArgOperand(FragIdIndex));
+ B.SetCurrentDebugLocation(II->getDebugLoc());
+ if (NumElts == 1) {
+ VecOp = B.CreateExtractElement(NewCalls[0], Idx->getValue().urem(4));
+ LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
+ } else {
+ VecOp = UndefValue::get(II->getType());
+ for (unsigned I = 0; I < NumElts; ++I) {
+ VecOp = B.CreateInsertElement(
+ VecOp,
+ B.CreateExtractElement(NewCalls[I], Idx->getValue().urem(4)), I);
+ LLVM_DEBUG(dbgs() << "Add: " << *VecOp << "\n");
+ }
+ }
+
+ // Replace the old instruction.
+ II->replaceAllUsesWith(VecOp);
+ VecOp->takeName(II);
+ InstrsToErase.push_back(II);
+ }
+
+ Modified = true;
+ }
+
+ for (auto I : InstrsToErase)
+ I->eraseFromParent();
+
+ return Modified;
+}
+
+static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) {
+ if (!TM)
+ return false;
+
+ // This optimization only applies to GFX11 and beyond.
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ if (!AMDGPU::isGFX11Plus(ST) || ST.hasMSAALoadDstSelBug())
+ return false;
+
+ Module *M = F.getParent();
+
+ // Early test to determine if the intrinsics are used.
+ if (std::none_of(M->begin(), M->end(), [](Function &F) {
+ return !F.users().empty() &&
+ (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa ||
+ F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa);
+ }))
+ return false;
+
+ bool Modified = false;
+ for (auto &BB : F) {
+ BasicBlock::iterator SectionEnd;
+ for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E;
+ I = SectionEnd) {
+ SmallVector<SmallVector<IntrinsicInst *, 4>> MergeableInsts;
+
+ SectionEnd = collectMergeableInsts(I, E, MergeableInsts);
+ Modified |= optimizeSection(MergeableInsts);
+ }
+ }
+
+ return Modified;
+}
+
+bool AMDGPUImageIntrinsicOptimizer::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ return imageIntrinsicOptimizerImpl(F, TM);
+}
+
+FunctionPass *
+llvm::createAMDGPUImageIntrinsicOptimizerPass(const TargetMachine *TM) {
+ return new AMDGPUImageIntrinsicOptimizer(TM);
+}
+
+PreservedAnalyses
+AMDGPUImageIntrinsicOptimizerPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ bool Changed = imageIntrinsicOptimizerImpl(F, &TM);
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
new file mode 100644
index 000000000000..93ed77bb6f7e
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp
@@ -0,0 +1,122 @@
+//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU
+/// instructions that produce single-use VGPR values. If the value is forwarded
+/// to the consumer instruction prior to VGPR writeback, the hardware can
+/// then skip (kill) the VGPR write.
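+///
+/// For example (illustrative): if a VGPR defined by a v_add is read exactly
+/// once by the following v_mul and is not live out of the block, the pass
+/// inserts an s_singleuse_vdst immediately before the v_add.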
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCRegister.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-insert-single-use-vdst"
+
+namespace {
+class AMDGPUInsertSingleUseVDST : public MachineFunctionPass {
+private:
+ const SIInstrInfo *SII;
+
+public:
+ static char ID;
+
+ AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
+
+ void emitSingleUseVDST(MachineInstr &MI) const {
+ // Mark the following instruction as a single-use producer:
+ // s_singleuse_vdst { supr0: 1 }
+ BuildMI(*MI.getParent(), MI, DebugLoc(), SII->get(AMDGPU::S_SINGLEUSE_VDST))
+ .addImm(0x1);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ const auto &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasVGPRSingleUseHintInsts())
+ return false;
+
+ SII = ST.getInstrInfo();
+ const auto *TRI = &SII->getRegisterInfo();
+ bool InstructionEmitted = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ DenseMap<MCPhysReg, unsigned> RegisterUseCount; // TODO: MCRegUnits
+
+      // Handle registers that are live out of the basic block separately to
+      // avoid false positives: if a register is live at the end of the block,
+      // assume it has more uses later on.
+ for (const auto &Liveouts : MBB.liveouts())
+ RegisterUseCount[Liveouts.PhysReg] = 2;
+
+ for (MachineInstr &MI : reverse(MBB.instrs())) {
+ // All registers in all operands need to be single use for an
+        // instruction to be marked as a single-use producer.
+ bool AllProducerOperandsAreSingleUse = true;
+
+ for (const auto &Operand : MI.operands()) {
+ if (!Operand.isReg())
+ continue;
+ const auto Reg = Operand.getReg();
+
+ // Count the number of times each register is read.
+ if (Operand.readsReg())
+ RegisterUseCount[Reg]++;
+
+ // Do not attempt to optimise across exec mask changes.
+ if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+ for (auto &UsedReg : RegisterUseCount)
+ UsedReg.second = 2;
+ }
+
+ // If we are at the point where the register first became live,
+ // check if the operands are single use.
+ if (!MI.modifiesRegister(Reg, TRI))
+ continue;
+ if (RegisterUseCount[Reg] > 1)
+ AllProducerOperandsAreSingleUse = false;
+ // Reset uses count when a register is no longer live.
+ RegisterUseCount.erase(Reg);
+ }
+ if (AllProducerOperandsAreSingleUse && SIInstrInfo::isVALU(MI)) {
+ // TODO: Replace with candidate logging for instruction grouping
+ // later.
+ emitSingleUseVDST(MI);
+ InstructionEmitted = true;
+ }
+ }
+ }
+ return InstructionEmitted;
+ }
+};
+} // namespace
+
+char AMDGPUInsertSingleUseVDST::ID = 0;
+
+char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
+
+INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
+ "AMDGPU Insert SingleUseVDST", false, false)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 3c399e497227..ee93d9eb4c0a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -408,6 +408,13 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
int DMaskIdx = -1,
bool IsLoad = true);
+/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
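+/// into llvm.amdgcn.rsq: f32 requires the afn flag or !fpmath metadata
+/// allowing at least 1.0 ulp of error on the sqrt; f16 is always allowed.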
+static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
+ return (SqrtOp->getType()->isFloatTy() &&
+ (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
+ SqrtOp->getType()->isHalfTy();
+}
+
std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();
@@ -437,6 +444,37 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
}
+ FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
+ if (!FMF.allowContract())
+ break;
+ auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
+ if (!SrcCI)
+ break;
+
+ auto IID = SrcCI->getIntrinsicID();
+ // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
+ //
+ // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
+ // relaxed.
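+  //
+  // For example (illustrative; the fast-math flags shown are just one
+  // qualifying combination):
+  //   %s = call contract afn float @llvm.sqrt.f32(float %x)
+  //   %r = call contract float @llvm.amdgcn.rcp.f32(float %s)
+  // ==>
+  //   %r = call contract afn float @llvm.amdgcn.rsq.f32(float %x)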
+ if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
+ const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
+ FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
+ if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
+ break;
+
+ if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
+ break;
+
+ Function *NewDecl = Intrinsic::getDeclaration(
+ SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});
+
+ InnerFMF |= FMF;
+ II.setFastMathFlags(InnerFMF);
+
+ II.setCalledFunction(NewDecl);
+ return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
+ }
+
break;
}
case Intrinsic::amdgcn_sqrt:
@@ -450,6 +488,14 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, QNaN);
}
+ // f16 amdgcn.sqrt is identical to regular sqrt.
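+  // e.g. (illustrative) llvm.amdgcn.sqrt.f16(%x) ==> llvm.sqrt.f16(%x)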
+ if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
+ Function *NewDecl = Intrinsic::getDeclaration(
+ II.getModule(), Intrinsic::sqrt, {II.getType()});
+ II.setCalledFunction(NewDecl);
+ return &II;
+ }
+
break;
}
case Intrinsic::amdgcn_log:
@@ -784,7 +830,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
if (CCmp->isNullValue()) {
return IC.replaceInstUsesWith(
- II, ConstantExpr::getSExt(CCmp, II.getType()));
+ II, IC.Builder.CreateSExt(CCmp, II.getType()));
}
// The result of V_ICMP/V_FCMP assembly instructions (which this
@@ -946,14 +992,27 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceOperand(II, 0, UndefValue::get(Old->getType()));
}
case Intrinsic::amdgcn_permlane16:
- case Intrinsic::amdgcn_permlanex16: {
+ case Intrinsic::amdgcn_permlane16_var:
+ case Intrinsic::amdgcn_permlanex16:
+ case Intrinsic::amdgcn_permlanex16_var: {
// Discard vdst_in if it's not going to be read.
Value *VDstIn = II.getArgOperand(0);
if (isa<UndefValue>(VDstIn))
break;
- ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(4));
- ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(5));
+ // FetchInvalid operand idx.
+ unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
+ IID == Intrinsic::amdgcn_permlanex16)
+ ? 4 /* for permlane16 and permlanex16 */
+ : 3; /* for permlane16_var and permlanex16_var */
+
+ // BoundCtrl operand idx.
+    // For permlane16 and permlanex16 it should be 5.
+    // For permlane16_var and permlanex16_var it should be 4.
+ unsigned int BcIdx = FiIdx + 1;
+
+ ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
+ ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
break;
@@ -1002,50 +1061,6 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
break;
}
- case Intrinsic::amdgcn_ldexp: {
- // FIXME: This doesn't introduce new instructions and belongs in
- // InstructionSimplify.
- Type *Ty = II.getType();
- Value *Op0 = II.getArgOperand(0);
- Value *Op1 = II.getArgOperand(1);
-
- // Folding undef to qnan is safe regardless of the FP mode.
- if (isa<UndefValue>(Op0)) {
- auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
- return IC.replaceInstUsesWith(II, QNaN);
- }
-
- const APFloat *C = nullptr;
- match(Op0, PatternMatch::m_APFloat(C));
-
- // FIXME: Should flush denorms depending on FP mode, but that's ignored
- // everywhere else.
- //
- // These cases should be safe, even with strictfp.
- // ldexp(0.0, x) -> 0.0
- // ldexp(-0.0, x) -> -0.0
- // ldexp(inf, x) -> inf
- // ldexp(-inf, x) -> -inf
- if (C && (C->isZero() || C->isInfinity())) {
- return IC.replaceInstUsesWith(II, Op0);
- }
-
- // With strictfp, be more careful about possibly needing to flush denormals
- // or not, and snan behavior depends on ieee_mode.
- if (II.isStrictFP())
- break;
-
- if (C && C->isNaN())
- return IC.replaceInstUsesWith(II, ConstantFP::get(Ty, C->makeQuiet()));
-
- // ldexp(x, 0) -> x
- // ldexp(x, undef) -> x
- if (isa<UndefValue>(Op1) || match(Op1, PatternMatch::m_ZeroInt())) {
- return IC.replaceInstUsesWith(II, Op0);
- }
-
- break;
- }
case Intrinsic::amdgcn_fmul_legacy: {
Value *Op0 = II.getArgOperand(0);
Value *Op1 = II.getArgOperand(1);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index f2d62956e25b..d41e704a4a11 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -14,6 +14,7 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPU.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
@@ -26,6 +27,9 @@ using namespace llvm;
AMDGPUInstrInfo::AMDGPUInstrInfo(const GCNSubtarget &ST) { }
+Intrinsic::ID AMDGPU::getIntrinsicID(const MachineInstr &I) {
+ return I.getOperand(I.getNumExplicitDefs()).getIntrinsicID();
+}
// TODO: Should largely merge with AMDGPUTTIImpl::isSourceOfDivergence.
bool AMDGPUInstrInfo::isUniformMMO(const MachineMemOperand *MMO) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index e7ee36447682..515decea3921 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -21,6 +21,7 @@ namespace llvm {
class GCNSubtarget;
class MachineMemOperand;
+class MachineInstr;
class AMDGPUInstrInfo {
public:
@@ -31,6 +32,13 @@ public:
namespace AMDGPU {
+/// Return the intrinsic ID for opcodes with the G_AMDGPU_INTRIN_ prefix.
+///
+/// These opcodes have an Intrinsic::ID operand similar to a GIntrinsic. But
+/// they are not actual instances of GIntrinsics, so we cannot use
+/// GIntrinsic::getIntrinsicID() on them.
+unsigned getIntrinsicID(const MachineInstr &I);
+
struct RsrcIntrinsic {
unsigned Intr;
uint8_t RsrcArg;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index b69cae0c73b3..82f58ea38fd0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -94,6 +94,11 @@ def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
+def AMDGPUtc_return_chain: SDNode<"AMDGPUISD::TC_RETURN_CHAIN",
+ SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
SDTypeProfile<0, -1, [SDTCisVT<0, i16>]>,
[SDNPHasChain, SDNPVariadic, SDNPSideEffect, SDNPInGlue]
@@ -165,6 +170,11 @@ def AMDGPUfmax3 : SDNode<"AMDGPUISD::FMAX3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;
+// out = max(a, b, c) a, b and c are floats. Operation is IEEE2019 compliant.
+def AMDGPUfmaximum3 : SDNode<"AMDGPUISD::FMAXIMUM3", SDTFPTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
// out = max(a, b, c) a, b, and c are signed ints
def AMDGPUsmax3 : SDNode<"AMDGPUISD::SMAX3", AMDGPUDTIntTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
@@ -180,6 +190,11 @@ def AMDGPUfmin3 : SDNode<"AMDGPUISD::FMIN3", SDTFPTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
>;
+// out = min(a, b, c) a, b and c are floats. Operation is IEEE2019 compliant.
+def AMDGPUfminimum3 : SDNode<"AMDGPUISD::FMINIMUM3", SDTFPTernaryOp,
+ [/*SDNPCommutative, SDNPAssociative*/]
+>;
+
// out = min(a, b, c) a, b and c are signed ints
def AMDGPUsmin3 : SDNode<"AMDGPUISD::SMIN3", AMDGPUDTIntTernaryOp,
[/*SDNPCommutative, SDNPAssociative*/]
@@ -265,9 +280,6 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
-def AMDGPUround : SDNode<"ISD::FROUND",
- SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
-
def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
@@ -279,11 +291,15 @@ def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;
def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;
// Signed and unsigned 24-bit multiply. The highest 8 bits are ignored
-// when performing the multiply. The result is a 32-bit value.
-def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
+// when performing the multiply. The result is a 32- or 64-bit value.
+def AMDGPUMul24Op : SDTypeProfile<1, 2, [
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2>
+]>;
+
+def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", AMDGPUMul24Op,
[SDNPCommutative, SDNPAssociative]
>;
-def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
+def AMDGPUmul_i24_impl : SDNode<"AMDGPUISD::MUL_I24", AMDGPUMul24Op,
[SDNPCommutative, SDNPAssociative]
>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 747f9fe2f8ae..88ef4b577424 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -21,6 +21,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -71,6 +72,13 @@ void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
InstructionSelector::setupMF(MF, KB, CoverageInfo, PSI, BFI);
}
+// Return the wave level SGPR base address if this is a wave address.
+static Register getWaveAddress(const MachineInstr *Def) {
+ return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
+ ? Def->getOperand(1).getReg()
+ : Register();
+}
+
bool AMDGPUInstructionSelector::isVCC(Register Reg,
const MachineRegisterInfo &MRI) const {
// The verifier is oblivious to s1 being a valid value for wavesize registers.
@@ -158,11 +166,15 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
// TODO: Skip masking high bits if def is known boolean.
+ bool IsSGPR = TRI.isSGPRClass(SrcRC);
unsigned AndOpc =
- TRI.isSGPRClass(SrcRC) ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
- BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
+ IsSGPR ? AMDGPU::S_AND_B32 : AMDGPU::V_AND_B32_e32;
+ auto And = BuildMI(*BB, &I, DL, TII.get(AndOpc), MaskedReg)
.addImm(1)
.addReg(SrcReg);
+ if (IsSGPR)
+ And.setOperandDead(3); // Dead scc
+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
.addImm(0)
.addReg(MaskedReg);
@@ -322,7 +334,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
MachineInstr *Add =
BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
.add(I.getOperand(1))
- .add(I.getOperand(2));
+ .add(I.getOperand(2))
+ .setOperandDead(3); // Dead scc
I.eraseFromParent();
return constrainSelectedInstRegOperands(*Add, TII, TRI, RBI);
}
@@ -369,7 +382,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
.add(Lo2);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
.add(Hi1)
- .add(Hi2);
+ .add(Hi2)
+ .setOperandDead(3); // Dead scc
} else {
const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
Register CarryReg = MRI->createVirtualRegister(CarryRC);
@@ -436,14 +450,18 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
unsigned NoCarryOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
- BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
+ auto CarryInst = BuildMI(*BB, &I, DL, TII.get(HasCarryIn ? CarryOpc : NoCarryOpc), Dst0Reg)
.add(I.getOperand(2))
.add(I.getOperand(3));
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
- .addReg(AMDGPU::SCC);
- if (!MRI->getRegClassOrNull(Dst1Reg))
- MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
+ if (MRI->use_nodbg_empty(Dst1Reg)) {
+ CarryInst.setOperandDead(3); // Dead scc
+ } else {
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
+ .addReg(AMDGPU::SCC);
+ if (!MRI->getRegClassOrNull(Dst1Reg))
+ MRI->setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
+ }
if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
!RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, *MRI) ||
@@ -740,7 +758,8 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
// build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
.addReg(ShiftSrc0)
- .addImm(16);
+ .addImm(16)
+ .setOperandDead(3); // Dead scc
MI.eraseFromParent();
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
@@ -1001,7 +1020,7 @@ bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
}
bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
- unsigned IntrinsicID = I.getIntrinsicID();
+ unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_if_break: {
MachineBasicBlock *BB = I.getParent();
@@ -1192,36 +1211,104 @@ int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
}
}
- if (Size != 32)
- return -1;
+ if (Size == 32) {
+ switch (P) {
+ case CmpInst::ICMP_NE:
+ return AMDGPU::S_CMP_LG_U32;
+ case CmpInst::ICMP_EQ:
+ return AMDGPU::S_CMP_EQ_U32;
+ case CmpInst::ICMP_SGT:
+ return AMDGPU::S_CMP_GT_I32;
+ case CmpInst::ICMP_SGE:
+ return AMDGPU::S_CMP_GE_I32;
+ case CmpInst::ICMP_SLT:
+ return AMDGPU::S_CMP_LT_I32;
+ case CmpInst::ICMP_SLE:
+ return AMDGPU::S_CMP_LE_I32;
+ case CmpInst::ICMP_UGT:
+ return AMDGPU::S_CMP_GT_U32;
+ case CmpInst::ICMP_UGE:
+ return AMDGPU::S_CMP_GE_U32;
+ case CmpInst::ICMP_ULT:
+ return AMDGPU::S_CMP_LT_U32;
+ case CmpInst::ICMP_ULE:
+ return AMDGPU::S_CMP_LE_U32;
+ case CmpInst::FCMP_OEQ:
+ return AMDGPU::S_CMP_EQ_F32;
+ case CmpInst::FCMP_OGT:
+ return AMDGPU::S_CMP_GT_F32;
+ case CmpInst::FCMP_OGE:
+ return AMDGPU::S_CMP_GE_F32;
+ case CmpInst::FCMP_OLT:
+ return AMDGPU::S_CMP_LT_F32;
+ case CmpInst::FCMP_OLE:
+ return AMDGPU::S_CMP_LE_F32;
+ case CmpInst::FCMP_ONE:
+ return AMDGPU::S_CMP_LG_F32;
+ case CmpInst::FCMP_ORD:
+ return AMDGPU::S_CMP_O_F32;
+ case CmpInst::FCMP_UNO:
+ return AMDGPU::S_CMP_U_F32;
+ case CmpInst::FCMP_UEQ:
+ return AMDGPU::S_CMP_NLG_F32;
+ case CmpInst::FCMP_UGT:
+ return AMDGPU::S_CMP_NLE_F32;
+ case CmpInst::FCMP_UGE:
+ return AMDGPU::S_CMP_NLT_F32;
+ case CmpInst::FCMP_ULT:
+ return AMDGPU::S_CMP_NGE_F32;
+ case CmpInst::FCMP_ULE:
+ return AMDGPU::S_CMP_NGT_F32;
+ case CmpInst::FCMP_UNE:
+ return AMDGPU::S_CMP_NEQ_F32;
+ default:
+ llvm_unreachable("Unknown condition code!");
+ }
+ }
- switch (P) {
- case CmpInst::ICMP_NE:
- return AMDGPU::S_CMP_LG_U32;
- case CmpInst::ICMP_EQ:
- return AMDGPU::S_CMP_EQ_U32;
- case CmpInst::ICMP_SGT:
- return AMDGPU::S_CMP_GT_I32;
- case CmpInst::ICMP_SGE:
- return AMDGPU::S_CMP_GE_I32;
- case CmpInst::ICMP_SLT:
- return AMDGPU::S_CMP_LT_I32;
- case CmpInst::ICMP_SLE:
- return AMDGPU::S_CMP_LE_I32;
- case CmpInst::ICMP_UGT:
- return AMDGPU::S_CMP_GT_U32;
- case CmpInst::ICMP_UGE:
- return AMDGPU::S_CMP_GE_U32;
- case CmpInst::ICMP_ULT:
- return AMDGPU::S_CMP_LT_U32;
- case CmpInst::ICMP_ULE:
- return AMDGPU::S_CMP_LE_U32;
- default:
- llvm_unreachable("Unknown condition code!");
+ if (Size == 16) {
+ if (!STI.hasSALUFloatInsts())
+ return -1;
+
+ switch (P) {
+ case CmpInst::FCMP_OEQ:
+ return AMDGPU::S_CMP_EQ_F16;
+ case CmpInst::FCMP_OGT:
+ return AMDGPU::S_CMP_GT_F16;
+ case CmpInst::FCMP_OGE:
+ return AMDGPU::S_CMP_GE_F16;
+ case CmpInst::FCMP_OLT:
+ return AMDGPU::S_CMP_LT_F16;
+ case CmpInst::FCMP_OLE:
+ return AMDGPU::S_CMP_LE_F16;
+ case CmpInst::FCMP_ONE:
+ return AMDGPU::S_CMP_LG_F16;
+ case CmpInst::FCMP_ORD:
+ return AMDGPU::S_CMP_O_F16;
+ case CmpInst::FCMP_UNO:
+ return AMDGPU::S_CMP_U_F16;
+ case CmpInst::FCMP_UEQ:
+ return AMDGPU::S_CMP_NLG_F16;
+ case CmpInst::FCMP_UGT:
+ return AMDGPU::S_CMP_NLE_F16;
+ case CmpInst::FCMP_UGE:
+ return AMDGPU::S_CMP_NLT_F16;
+ case CmpInst::FCMP_ULT:
+ return AMDGPU::S_CMP_NGE_F16;
+ case CmpInst::FCMP_ULE:
+ return AMDGPU::S_CMP_NGT_F16;
+ case CmpInst::FCMP_UNE:
+ return AMDGPU::S_CMP_NEQ_F16;
+ default:
+ llvm_unreachable("Unknown condition code!");
+ }
}
+
+ return -1;
}
-bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_ICMP_or_FCMP(MachineInstr &I) const {
+
MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -1247,6 +1334,9 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
return Ret;
}
+ if (I.getOpcode() == AMDGPU::G_FCMP)
+ return false;
+
int Opcode = getV_CMPOpcode(Pred, Size, *Subtarget);
if (Opcode == -1)
return false;
@@ -1569,8 +1659,8 @@ static unsigned gwsIntrinToOpcode(unsigned IntrID) {
bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
Intrinsic::ID IID) const {
- if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
- !STI.hasGWSSemaReleaseAll())
+ if (!STI.hasGWS() || (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
+ !STI.hasGWSSemaReleaseAll()))
return false;
// intrinsic ID, vsrc, offset
@@ -1629,7 +1719,8 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
.addReg(BaseOffset)
- .addImm(16);
+ .addImm(16)
+ .setOperandDead(3); // Dead scc
BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
.addReg(M0Base);
@@ -1690,7 +1781,7 @@ bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
}
bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
- if (TM.getOptLevel() > CodeGenOpt::None) {
+ if (TM.getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = STI.getFlatWorkGroupSizes(MF->getFunction()).second;
if (WGSize <= STI.getWavefrontSize()) {
MachineBasicBlock *MBB = MI.getParent();
@@ -1700,6 +1791,19 @@ bool AMDGPUInstructionSelector::selectSBarrier(MachineInstr &MI) const {
return true;
}
}
+
+ // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
+ if (STI.hasSplitBarriers()) {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_IMM))
+ .addImm(AMDGPU::Barrier::WORKGROUP);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_BARRIER_WAIT))
+ .addImm(AMDGPU::Barrier::WORKGROUP);
+ MI.eraseFromParent();
+ return true;
+ }
+
return selectImpl(MI, *CoverageInfo);
}
@@ -1728,6 +1832,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
unsigned IntrOpcode = Intr->BaseOpcode;
const bool IsGFX10Plus = AMDGPU::isGFX10Plus(STI);
const bool IsGFX11Plus = AMDGPU::isGFX11Plus(STI);
+ const bool IsGFX12Plus = AMDGPU::isGFX12Plus(STI);
const unsigned ArgOffset = MI.getNumExplicitDefs() + 1;
@@ -1812,7 +1917,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
if (BaseOpcode->Atomic)
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
- if (CPol & ~AMDGPU::CPol::ALL)
+ if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
return false;
int NumVAddrRegs = 0;
@@ -1847,7 +1952,10 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
++NumVDataDwords;
int Opcode = -1;
- if (IsGFX11Plus) {
+ if (IsGFX12Plus) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
+ NumVDataDwords, NumVAddrDwords);
+ } else if (IsGFX11Plus) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx11NSA
: AMDGPU::MIMGEncGfx11Default,
@@ -1920,7 +2028,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
if (IsGFX10Plus)
MIB.addImm(DimInfo->Encoding);
- MIB.addImm(Unorm);
+ if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::unorm))
+ MIB.addImm(Unorm);
MIB.addImm(CPol);
MIB.addImm(IsA16 && // a16 or r128
@@ -1935,7 +2044,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
return false;
}
- MIB.addImm(LWE); // lwe
+ if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::lwe))
+ MIB.addImm(LWE); // lwe
if (!IsGFX10Plus)
MIB.addImm(DimInfo->DA ? -1 : 0);
if (BaseOpcode->HasD16)
@@ -2008,7 +2118,7 @@ bool AMDGPUInstructionSelector::selectDSBvhStackIntrinsic(
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
MachineInstr &I) const {
- unsigned IntrinsicID = I.getIntrinsicID();
+ unsigned IntrinsicID = cast<GIntrinsic>(I).getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_end_cf:
return selectEndCfIntrinsic(I);
@@ -2046,6 +2156,16 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
break;
case Intrinsic::amdgcn_ds_bvh_stack_rtn:
return selectDSBvhStackIntrinsic(I);
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return selectNamedBarrierInst(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+ return selectSBarrierSignalIsfirst(I, IntrinsicID);
+ case Intrinsic::amdgcn_s_barrier_leave:
+ return selectSBarrierLeave(I);
}
return selectImpl(I, *CoverageInfo);
}
@@ -2194,7 +2314,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
} else {
BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
.addReg(HiReg)
- .addImm(16);
+ .addImm(16)
+ .setOperandDead(3); // Dead scc
}
unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
@@ -2203,12 +2324,17 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
.addImm(0xffff);
- BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
+ auto And = BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
.addReg(LoReg)
.addReg(ImmReg);
- BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
+ auto Or = BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
.addReg(TmpReg0)
.addReg(TmpReg1);
+
+ if (!IsVALU) {
+ And.setOperandDead(3); // Dead scc
+ Or.setOperandDead(3); // Dead scc
+ }
}
I.eraseFromParent();
@@ -2353,7 +2479,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
if (Signed) {
BuildMI(MBB, I, DL, TII.get(AMDGPU::S_ASHR_I32), HiReg)
.addReg(SrcReg, 0, SubReg)
- .addImm(31);
+ .addImm(31)
+ .setOperandDead(3); // Dead scc
} else {
BuildMI(MBB, I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg)
.addImm(0);
@@ -2397,7 +2524,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
if (!Signed && shouldUseAndMask(SrcSize, Mask)) {
BuildMI(MBB, I, DL, TII.get(AMDGPU::S_AND_B32), DstReg)
.addReg(SrcReg)
- .addImm(Mask);
+ .addImm(Mask)
+ .setOperandDead(3); // Dead scc
} else {
BuildMI(MBB, I, DL, TII.get(BFE32), DstReg)
.addReg(SrcReg)
@@ -2411,16 +2539,54 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
return false;
}
+static bool isExtractHiElt(MachineRegisterInfo &MRI, Register In,
+ Register &Out) {
+ Register LShlSrc;
+ if (mi_match(In, MRI,
+ m_GTrunc(m_GLShr(m_Reg(LShlSrc), m_SpecificICst(16))))) {
+ Out = LShlSrc;
+ return true;
+ }
+ return false;
+}
+
+bool AMDGPUInstructionSelector::selectG_FPEXT(MachineInstr &I) const {
+ if (!Subtarget->hasSALUFloatInsts())
+ return false;
+
+ Register Dst = I.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstRB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ Register Src = I.getOperand(1).getReg();
+
+ if (MRI->getType(Dst) == LLT::scalar(32) &&
+ MRI->getType(Src) == LLT::scalar(16)) {
+ if (isExtractHiElt(*MRI, Src, Src)) {
+ MachineBasicBlock *BB = I.getParent();
+ BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_CVT_HI_F32_F16), Dst)
+ .addUse(Src);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI);
+ }
+ }
+
+ return false;
+}
+
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineOperand &ImmOp = I.getOperand(1);
Register DstReg = I.getOperand(0).getReg();
unsigned Size = MRI->getType(DstReg).getSizeInBits();
+ bool IsFP = false;
// The AMDGPU backend only supports Imm operands and not CImm or FPImm.
if (ImmOp.isFPImm()) {
const APInt &Imm = ImmOp.getFPImm()->getValueAPF().bitcastToAPInt();
ImmOp.ChangeToImmediate(Imm.getZExtValue());
+ IsFP = true;
} else if (ImmOp.isCImm()) {
ImmOp.ChangeToImmediate(ImmOp.getCImm()->getSExtValue());
} else {
@@ -2433,6 +2599,12 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
unsigned Opcode;
if (DstRB->getID() == AMDGPU::VCCRegBankID) {
Opcode = STI.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ } else if (Size == 64 &&
+ AMDGPU::isValid32BitLiteral(I.getOperand(1).getImm(), IsFP)) {
+ Opcode = IsSgpr ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::V_MOV_B64_PSEUDO;
+ I.setDesc(TII.get(Opcode));
+ I.addImplicitDefUseOperands(*MF);
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
} else {
Opcode = IsSgpr ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
@@ -2531,7 +2703,8 @@ bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
.addReg(HiReg)
- .addReg(ConstReg);
+ .addReg(ConstReg)
+ .setOperandDead(3); // Dead scc
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
.addReg(LoReg)
.addImm(AMDGPU::sub0)
@@ -2572,7 +2745,8 @@ bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
// TODO: Should this used S_BITSET0_*?
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
.addReg(HiReg)
- .addReg(ConstReg);
+ .addReg(ConstReg)
+ .setOperandDead(3); // Dead scc
BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
.addReg(LoReg)
.addImm(AMDGPU::sub0)
@@ -2689,8 +2863,8 @@ static bool isVCmpResult(Register Reg, MachineRegisterInfo &MRI) {
return isVCmpResult(MI.getOperand(1).getReg(), MRI) &&
isVCmpResult(MI.getOperand(2).getReg(), MRI);
- if (Opcode == TargetOpcode::G_INTRINSIC)
- return MI.getIntrinsicID() == Intrinsic::amdgcn_class;
+ if (auto *GI = dyn_cast<GIntrinsic>(&MI))
+ return GI->is(Intrinsic::amdgcn_class);
return Opcode == AMDGPU::G_ICMP || Opcode == AMDGPU::G_FCMP;
}
@@ -2730,7 +2904,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
Register TmpReg = MRI->createVirtualRegister(TRI.getBoolRC());
BuildMI(*BB, &I, DL, TII.get(Opcode), TmpReg)
.addReg(CondReg)
- .addReg(Exec);
+ .addReg(Exec)
+ .setOperandDead(3); // Dead scc
CondReg = TmpReg;
}
@@ -2793,7 +2968,8 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
!CanCopyLow32 && !CanCopyHi32) {
auto MIB = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_AND_B64), DstReg)
.addReg(SrcReg)
- .addReg(MaskReg);
+ .addReg(MaskReg)
+ .setOperandDead(3); // Dead scc
I.eraseFromParent();
return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
}
@@ -2816,9 +2992,12 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
assert(MaskTy.getSizeInBits() == 32 &&
"ptrmask should have been narrowed during legalize");
- BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
+ auto NewOp = BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
.addReg(SrcReg)
.addReg(MaskReg);
+
+ if (!IsVGPR)
+ NewOp.setOperandDead(3); // Dead scc
I.eraseFromParent();
return true;
}
@@ -3050,6 +3229,7 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
}
bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
+ assert(!AMDGPU::isGFX12Plus(STI));
unsigned Opc;
unsigned Size = MI.getOperand(3).getImm();
@@ -3116,8 +3296,8 @@ bool AMDGPUInstructionSelector::selectBufferLoadLds(MachineInstr &MI) const {
MIB.add(MI.getOperand(5 + OpOffset)); // soffset
MIB.add(MI.getOperand(6 + OpOffset)); // imm offset
unsigned Aux = MI.getOperand(7 + OpOffset).getImm();
- MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
- MIB.addImm((Aux >> 3) & 1); // swz
+ MIB.addImm(Aux & AMDGPU::CPol::ALL); // cpol
+ MIB.addImm(Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0); // swz
MachineMemOperand *LoadMMO = *MI.memoperands_begin();
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
@@ -3252,7 +3432,7 @@ bool AMDGPUInstructionSelector::selectBVHIntrinsic(MachineInstr &MI) const{
bool AMDGPUInstructionSelector::selectSMFMACIntrin(MachineInstr &MI) const {
unsigned Opc;
- switch (MI.getIntrinsicID()) {
+ switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
case Intrinsic::amdgcn_smfmac_f32_16x16x32_f16:
Opc = AMDGPU::V_SMFMAC_F32_16X16X32_F16_e64;
break;
@@ -3324,7 +3504,8 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
} else {
BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
.addReg(SrcReg)
- .addImm(Subtarget->getWavefrontSizeLog2());
+ .addImm(Subtarget->getWavefrontSizeLog2())
+ .setOperandDead(3); // Dead scc
}
const TargetRegisterClass &RC =
@@ -3336,6 +3517,33 @@ bool AMDGPUInstructionSelector::selectWaveAddress(MachineInstr &MI) const {
return true;
}
+bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const {
+ Register SrcReg = MI.getOperand(0).getReg();
+ if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
+ return false;
+
+ MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+ Register SP =
+ Subtarget->getTargetLowering()->getStackPointerRegisterToSaveRestore();
+ Register WaveAddr = getWaveAddress(DefMI);
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (!WaveAddr) {
+ WaveAddr = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), WaveAddr)
+ .addReg(SrcReg)
+ .addImm(Subtarget->getWavefrontSizeLog2())
+ .setOperandDead(3); // Dead scc
+ }
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), SP)
+ .addReg(WaveAddr);
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
@@ -3402,11 +3610,14 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_INSERT:
return selectG_INSERT(I);
case TargetOpcode::G_INTRINSIC:
+ case TargetOpcode::G_INTRINSIC_CONVERGENT:
return selectG_INTRINSIC(I);
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
case TargetOpcode::G_ICMP:
- if (selectG_ICMP(I))
+ case TargetOpcode::G_FCMP:
+ if (selectG_ICMP_or_FCMP(I))
return true;
return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_LOAD:
@@ -3443,6 +3654,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
selectImpl(I, *CoverageInfo))
return true;
return selectG_SZA_EXT(I);
+ case TargetOpcode::G_FPEXT:
+ if (selectG_FPEXT(I))
+ return true;
+ return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_BRCOND:
return selectG_BRCOND(I);
case TargetOpcode::G_GLOBAL_VALUE:
@@ -3457,8 +3672,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
- const AMDGPU::ImageDimIntrinsicInfo *Intr
- = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
+ const AMDGPU::ImageDimIntrinsicInfo *Intr =
+ AMDGPU::getImageDimIntrinsicInfo(AMDGPU::getIntrinsicID(I));
assert(Intr && "not an image intrinsic with image pseudo");
return selectImageIntrinsic(I, Intr);
}
@@ -3472,6 +3687,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return true;
case AMDGPU::G_AMDGPU_WAVE_ADDRESS:
return selectWaveAddress(I);
+ case AMDGPU::G_STACKRESTORE:
+ return selectStackRestore(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -3916,7 +4133,9 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
int64_t ConstOffset;
std::tie(PtrBase, ConstOffset) =
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
- if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant))
+
+ if (ConstOffset == 0 || (FlatVariant == SIInstrFlags::FlatScratch &&
+ !isFlatScratchBaseLegal(Root.getReg())))
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -4079,7 +4298,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
- if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) &&
+ if (ConstOffset != 0 && isFlatScratchBaseLegal(Addr) &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch)) {
Addr = PtrBase;
@@ -4113,7 +4332,8 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_I32), SAddr)
.addFrameIndex(FI)
- .addReg(RHSDef->Reg);
+ .addReg(RHSDef->Reg)
+ .setOperandDead(3); // Dead scc
}
}
@@ -4155,6 +4375,7 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
+ Register OrigAddr = Addr;
if (ConstOffset != 0 &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS, true)) {
Addr = PtrBase;
@@ -4172,8 +4393,13 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
Register LHS = AddrDef->MI->getOperand(1).getReg();
auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
- if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS))
- return std::nullopt;
+ if (OrigAddr != Addr) {
+ if (!isFlatScratchBaseLegalSVImm(OrigAddr))
+ return std::nullopt;
+ } else {
+ if (!isFlatScratchBaseLegalSV(OrigAddr))
+ return std::nullopt;
+ }
if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
return std::nullopt;
@@ -4211,7 +4437,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
// TODO: Should this be inside the render function? The iterator seems to
// move.
- const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset();
+ const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
HighBits)
.addImm(Offset & ~MaxOffset);
@@ -4243,7 +4469,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
int64_t ConstOffset;
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(VAddr, *MRI);
if (ConstOffset != 0) {
- if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
+ if (TII.isLegalMUBUFImmOffset(ConstOffset) &&
(!STI.privateMemoryResourceIsRangeChecked() ||
KB->signBitIsZero(PtrBase))) {
const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
@@ -4306,14 +4532,83 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
return KB->signBitIsZero(Base);
}
-bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
- Register Base, uint64_t FlatVariant) const {
- if (FlatVariant != SIInstrFlags::FlatScratch)
+// Return whether the operation has the NoUnsignedWrap property.
+static bool isNoUnsignedWrap(MachineInstr *Addr) {
+ return Addr->getOpcode() == TargetOpcode::G_OR ||
+ (Addr->getOpcode() == TargetOpcode::G_PTR_ADD &&
+ Addr->getFlag(MachineInstr::NoUWrap));
+}
+
+// Check that the base address of flat scratch load/store in the form of `base +
+// offset` is legal to be put in SGPR/VGPR (i.e. unsigned per hardware
+// requirement). We always treat the first operand as the base address here.
+bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
+ MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
+
+ if (isNoUnsignedWrap(AddrMI))
return true;
- // When value in 32-bit Base can be negative calculate scratch offset using
- // 32-bit add instruction, otherwise use Base(unsigned) + offset.
- return KB->signBitIsZero(Base);
+ // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+ // values.
+ if (AMDGPU::isGFX12Plus(STI))
+ return true;
+
+ Register LHS = AddrMI->getOperand(1).getReg();
+ Register RHS = AddrMI->getOperand(2).getReg();
+
+ if (AddrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
+ std::optional<ValueAndVReg> RhsValReg =
+ getIConstantVRegValWithLookThrough(RHS, *MRI);
+    // If the immediate offset is negative and within a certain range, the
+    // base address cannot also be negative: if it were, the sum would be
+    // either negative or much larger than the valid range of scratch memory
+    // a thread can access.
+ if (RhsValReg && RhsValReg->Value.getSExtValue() < 0 &&
+ RhsValReg->Value.getSExtValue() > -0x40000000)
+ return true;
+ }
+
+ return KB->signBitIsZero(LHS);
+}
+
+// Check that the address values in the SGPR/VGPR are legal for a flat scratch
+// access in the form: SGPR + VGPR.
+bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
+ MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
+
+ if (isNoUnsignedWrap(AddrMI))
+ return true;
+
+ Register LHS = AddrMI->getOperand(1).getReg();
+ Register RHS = AddrMI->getOperand(2).getReg();
+ return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
+}
+
+// Check that the address values in the SGPR/VGPR are legal for a flat scratch
+// access in the form: SGPR + VGPR + Imm.
+bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
+ Register Addr) const {
+ MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
+ Register Base = AddrMI->getOperand(1).getReg();
+ std::optional<DefinitionAndSourceRegister> BaseDef =
+ getDefSrcRegIgnoringCopies(Base, *MRI);
+ std::optional<ValueAndVReg> RHSOffset =
+ getIConstantVRegValWithLookThrough(AddrMI->getOperand(2).getReg(), *MRI);
+ assert(RHSOffset);
+
+  // If the immediate offset is negative and within a certain range, the base
+  // address cannot also be negative: if it were, the sum would be either
+  // negative or much larger than the valid range of scratch memory a thread
+  // can access.
+ if (isNoUnsignedWrap(BaseDef->MI) &&
+ (isNoUnsignedWrap(AddrMI) ||
+ (RHSOffset->Value.getSExtValue() < 0 &&
+ RHSOffset->Value.getSExtValue() > -0x40000000)))
+ return true;
+
+ Register LHS = BaseDef->MI->getOperand(1).getReg();
+ Register RHS = BaseDef->MI->getOperand(2).getReg();
+ return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
}
bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
@@ -4332,21 +4627,18 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
}
-// Return the wave level SGPR base address if this is a wave address.
-static Register getWaveAddress(const MachineInstr *Def) {
- return Def->getOpcode() == AMDGPU::G_AMDGPU_WAVE_ADDRESS
- ? Def->getOperand(1).getReg()
- : Register();
-}
-
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
MachineOperand &Root) const {
Register Reg = Root.getReg();
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
- const MachineInstr *Def = MRI->getVRegDef(Reg);
- if (Register WaveBase = getWaveAddress(Def)) {
+ std::optional<DefinitionAndSourceRegister> Def =
+ getDefSrcRegIgnoringCopies(Reg, *MRI);
+ assert(Def && "this shouldn't be an optional result");
+ Reg = Def->Reg;
+
+ if (Register WaveBase = getWaveAddress(Def->MI)) {
return {{
[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
@@ -4362,10 +4654,12 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
// FIXME: Copy check is a hack
Register BasePtr;
- if (mi_match(Reg, *MRI, m_GPtrAdd(m_Reg(BasePtr), m_Copy(m_ICst(Offset))))) {
- if (!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
+ if (mi_match(Reg, *MRI,
+ m_GPtrAdd(m_Reg(BasePtr),
+ m_any_of(m_ICst(Offset), m_Copy(m_ICst(Offset)))))) {
+ if (!TII.isLegalMUBUFImmOffset(Offset))
return {};
- const MachineInstr *BasePtrDef = MRI->getVRegDef(BasePtr);
+ MachineInstr *BasePtrDef = getDefIgnoringCopies(BasePtr, *MRI);
Register WaveBase = getWaveAddress(BasePtrDef);
if (!WaveBase)
return {};
@@ -4382,7 +4676,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
}
if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
- !SIInstrInfo::isLegalMUBUFImmOffset(Offset))
+ !TII.isLegalMUBUFImmOffset(Offset))
return {};
return {{
@@ -4625,7 +4919,7 @@ bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
/// component.
void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
- if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
+ if (TII.isLegalMUBUFImmOffset(ImmOffset))
return;
// Illegal offset, store it in soffset.
@@ -4734,6 +5028,8 @@ AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { // soffset
if (SOffset)
MIB.addReg(SOffset);
+ else if (STI.hasRestrictedSOffset())
+ MIB.addReg(AMDGPU::SGPR_NULL);
else
MIB.addImm(0);
},
@@ -4762,6 +5058,8 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { // soffset
if (SOffset)
MIB.addReg(SOffset);
+ else if (STI.hasRestrictedSOffset())
+ MIB.addReg(AMDGPU::SGPR_NULL);
else
MIB.addImm(0);
},
@@ -4772,6 +5070,17 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
}};
}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectBUFSOffset(MachineOperand &Root) const {
+
+ Register SOffset = Root.getReg();
+
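+  // On subtargets with a restricted SOffset operand, a known-zero soffset is
+  // selected as the SGPR_NULL register instead of the zero value.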
+ if (STI.hasRestrictedSOffset() && mi_match(SOffset, *MRI, m_ZeroInt()))
+ SOffset = AMDGPU::SGPR_NULL;
+
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(SOffset); }}};
+}
+
/// Get an immediate that must be 32-bits, and treated as zero extended.
static std::optional<uint64_t>
getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
@@ -4818,8 +5127,8 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
// an immediate offset.
Register SOffset;
unsigned Offset;
- std::tie(SOffset, Offset) =
- AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KB);
+ std::tie(SOffset, Offset) = AMDGPU::getBaseWithConstantOffset(
+ *MRI, Root.getReg(), KB, /*CheckNUW*/ true);
if (!SOffset)
return std::nullopt;
@@ -4980,6 +5289,135 @@ AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
}};
}
+bool AMDGPUInstructionSelector::selectSBarrierSignalIsfirst(
+ MachineInstr &I, Intrinsic::ID IntrID) const {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register CCReg = I.getOperand(0).getReg();
+
+ bool HasM0 = IntrID == Intrinsic::amdgcn_s_barrier_signal_isfirst_var;
+
+ if (HasM0) {
+ auto CopyMIB = BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(I.getOperand(2).getReg());
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0));
+ if (!constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI))
+ return false;
+ } else {
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM))
+ .addImm(I.getOperand(2).getImm());
+ }
+
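+  // The "is first" result is produced in SCC; copy it into the intrinsic's
+  // destination register.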
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
+
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
+ *MRI);
+}
+
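+// Map a named-barrier intrinsic to the immediate or M0 form of the
+// corresponding machine opcode, depending on whether the barrier id can be
+// encoded as an inline constant.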
+unsigned getNamedBarrierOp(bool HasInlineConst, Intrinsic::ID IntrID) {
+ if (HasInlineConst) {
+ switch (IntrID) {
+ default:
+ llvm_unreachable("not a named barrier op");
+ case Intrinsic::amdgcn_s_barrier_init:
+ return AMDGPU::S_BARRIER_INIT_IMM;
+ case Intrinsic::amdgcn_s_barrier_join:
+ return AMDGPU::S_BARRIER_JOIN_IMM;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ return AMDGPU::S_WAKEUP_BARRIER_IMM;
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return AMDGPU::S_GET_BARRIER_STATE_IMM;
+ };
+ } else {
+ switch (IntrID) {
+ default:
+ llvm_unreachable("not a named barrier op");
+ case Intrinsic::amdgcn_s_barrier_init:
+ return AMDGPU::S_BARRIER_INIT_M0;
+ case Intrinsic::amdgcn_s_barrier_join:
+ return AMDGPU::S_BARRIER_JOIN_M0;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ return AMDGPU::S_WAKEUP_BARRIER_M0;
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ return AMDGPU::S_GET_BARRIER_STATE_M0;
+ };
+ }
+}
+
+bool AMDGPUInstructionSelector::selectNamedBarrierInst(
+ MachineInstr &I, Intrinsic::ID IntrID) const {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ MachineOperand BarOp = IntrID == Intrinsic::amdgcn_s_get_barrier_state
+ ? I.getOperand(2)
+ : I.getOperand(1);
+ std::optional<int64_t> BarValImm =
+ getIConstantVRegSExtVal(BarOp.getReg(), *MRI);
+ Register M0Val;
+ Register TmpReg0;
+
+ // For S_BARRIER_INIT, member count will always be read from M0[16:22]
+ if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+ Register MemberCount = I.getOperand(2).getReg();
+ TmpReg0 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+    // TODO: This should be expanded during legalization so that the S_LSHL
+ // and S_OR can be constant-folded
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+ .addImm(16)
+ .addReg(MemberCount);
+ M0Val = TmpReg0;
+ }
+
+ // If not inlinable, get reference to barrier depending on the instruction
+ if (!BarValImm) {
+ if (IntrID == Intrinsic::amdgcn_s_barrier_init) {
+ // If reference to barrier id is not an inlinable constant then it must be
+ // referenced with M0[4:0]. Perform an OR with the member count to include
+ // it in M0 for S_BARRIER_INIT.
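+      // e.g. with member count 32 and barrier id 5 this yields
+      // M0 = (32 << 16) | 5.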
+ Register TmpReg1 = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::S_OR_B32), TmpReg1)
+ .addReg(BarOp.getReg())
+ .addReg(TmpReg0);
+ M0Val = TmpReg1;
+ } else {
+ M0Val = BarOp.getReg();
+ }
+ }
+
+ // Build copy to M0 if needed. For S_BARRIER_INIT, M0 is always required.
+ if (M0Val) {
+ auto CopyMIB =
+ BuildMI(*MBB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::M0).addReg(M0Val);
+ constrainSelectedInstRegOperands(*CopyMIB, TII, TRI, RBI);
+ }
+
+ MachineInstrBuilder MIB;
+ unsigned Opc = getNamedBarrierOp(BarValImm.has_value(), IntrID);
+ MIB = BuildMI(*MBB, &I, DL, TII.get(Opc));
+
+ if (IntrID == Intrinsic::amdgcn_s_get_barrier_state)
+ MIB.addDef(I.getOperand(0).getReg());
+
+ if (BarValImm)
+ MIB.addImm(*BarValImm);
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register CCReg = I.getOperand(0).getReg();
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_BARRIER_LEAVE));
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CCReg).addReg(AMDGPU::SCC);
+
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32_XM0_XEXECRegClass,
+ *MRI);
+}
+
void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
@@ -5037,14 +5475,19 @@ void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm(MI.getOperand(OpIdx).getImm() & AMDGPU::CPol::ALL);
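+  // Keep only the cache-policy bits that exist on the subtarget; GFX12+ uses
+  // a different set of CPol bits than earlier generations.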
+ MIB.addImm(MI.getOperand(OpIdx).getImm() &
+ (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
+ : AMDGPU::CPol::ALL_pregfx12));
}
void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
+ const bool Swizzle = MI.getOperand(OpIdx).getImm() &
+ (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::SWZ
+ : AMDGPU::CPol::SWZ_pregfx12);
+ MIB.addImm(Swizzle);
}
void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
@@ -5057,7 +5500,16 @@ void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
- MIB.addFrameIndex((MI.getOperand(1).getIndex()));
+ MIB.addFrameIndex(MI.getOperand(1).getIndex());
+}
+
+void AMDGPUInstructionSelector::renderFPPow2ToExponent(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
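+  // The FP immediate is known to be an exact power of two; emit its signed
+  // base-2 exponent (of the absolute value) as the immediate operand.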
+ const APFloat &APF = MI.getOperand(1).getFPImm()->getValueAPF();
+ int ExpVal = APF.getExactLog2Abs();
+ assert(ExpVal != INT_MIN);
+ MIB.addImm(ExpVal);
}
bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 243ff72e2979..ab7cc0a6beb8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -90,6 +90,7 @@ private:
bool selectPHI(MachineInstr &I) const;
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;
+ bool selectG_FPEXT(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
bool selectG_FNEG(MachineInstr &I) const;
bool selectG_FABS(MachineInstr &I) const;
@@ -129,7 +130,7 @@ private:
const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const;
int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const;
- bool selectG_ICMP(MachineInstr &I) const;
+ bool selectG_ICMP_or_FCMP(MachineInstr &I) const;
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
SmallVectorImpl<GEPInfo> &AddrInfo) const;
@@ -147,6 +148,10 @@ private:
bool selectBVHIntrinsic(MachineInstr &I) const;
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectWaveAddress(MachineInstr &I) const;
+ bool selectStackRestore(MachineInstr &MI) const;
+ bool selectNamedBarrierInst(MachineInstr &I, Intrinsic::ID IID) const;
+ bool selectSBarrierSignalIsfirst(MachineInstr &I, Intrinsic::ID IID) const;
+ bool selectSBarrierLeave(MachineInstr &I) const;
std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
bool IsCanonicalizing = true,
@@ -241,8 +246,9 @@ private:
bool isDSOffsetLegal(Register Base, int64_t Offset) const;
bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
unsigned Size) const;
- bool isFlatScratchBaseLegal(
- Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+ bool isFlatScratchBaseLegal(Register Addr) const;
+ bool isFlatScratchBaseLegalSV(Register Addr) const;
+ bool isFlatScratchBaseLegalSVImm(Register Addr) const;
std::pair<Register, unsigned>
selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
@@ -287,6 +293,9 @@ private:
Register &SOffset, int64_t &Offset) const;
InstructionSelector::ComplexRendererFns
+ selectBUFSOffset(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
selectMUBUFAddr64(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
@@ -328,6 +337,9 @@ private:
void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderFPPow2ToExponent(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
bool isInlineImmediate16(int64_t Imm) const;
bool isInlineImmediate32(int64_t Imm) const;
bool isInlineImmediate64(int64_t Imm) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 2305097e3f94..eaf72d7157ee 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -125,11 +125,11 @@ def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
def i1imm_0 : OperandWithDefaultOps<i1, (ops (i1 0))>;
-class CustomOperandClass<string name, bit optional, string parserMethod,
- string defaultMethod>
+class CustomOperandClass<string name, bit optional, string predicateMethod,
+ string parserMethod, string defaultMethod>
: AsmOperandClass {
let Name = name;
- let PredicateMethod = "is"#name;
+ let PredicateMethod = predicateMethod;
let ParserMethod = parserMethod;
let RenderMethod = "addImmOperands";
let IsOptional = optional;
@@ -138,6 +138,7 @@ class CustomOperandClass<string name, bit optional, string parserMethod,
class CustomOperandProps<bit optional = 0, string name = NAME> {
string ImmTy = "ImmTy"#name;
+ string PredicateMethod = "is"#name;
string ParserMethod = "parse"#name;
string DefaultValue = "0";
string DefaultMethod = "[this]() { return "#
@@ -145,7 +146,8 @@ class CustomOperandProps<bit optional = 0, string name = NAME> {
"AMDGPUOperand::"#ImmTy#"); }";
string PrintMethod = "print"#name;
AsmOperandClass ParserMatchClass =
- CustomOperandClass<name, optional, ParserMethod, DefaultMethod>;
+ CustomOperandClass<name, optional, PredicateMethod, ParserMethod,
+ DefaultMethod>;
string OperandType = "OPERAND_IMMEDIATE";
}
@@ -163,6 +165,20 @@ class ImmOperand<ValueType type, string name = NAME, bit optional = 0,
def s16imm : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">;
def u16imm : ImmOperand<i16, "U16Imm", 0, "printU16ImmOperand">;
+class ValuePredicatedOperand<CustomOperand op, string valuePredicate,
+ bit optional = 0>
+ : CustomOperand<op.Type, optional> {
+ let ImmTy = op.ImmTy;
+ defvar OpPredicate = op.ParserMatchClass.PredicateMethod;
+ let PredicateMethod =
+ "getPredicate([](const AMDGPUOperand &Op) -> bool { "#
+ "return Op."#OpPredicate#"() && "#valuePredicate#"; })";
+ let ParserMethod = op.ParserMatchClass.ParserMethod;
+ let DefaultValue = op.DefaultValue;
+ let DefaultMethod = op.DefaultMethod;
+ let PrintMethod = op.PrintMethod;
+}
+
//===--------------------------------------------------------------------===//
// Custom Operands
//===--------------------------------------------------------------------===//
@@ -236,6 +252,8 @@ def umin_oneuse : HasOneUseBinOp<umin>;
def fminnum_oneuse : HasOneUseBinOp<fminnum>;
def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+def fminimum_oneuse : HasOneUseBinOp<fminimum>;
+def fmaximum_oneuse : HasOneUseBinOp<fmaximum>;
def fminnum_ieee_oneuse : HasOneUseBinOp<fminnum_ieee>;
def fmaxnum_ieee_oneuse : HasOneUseBinOp<fmaxnum_ieee>;
@@ -544,19 +562,18 @@ def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr),
def store_hi16_#as : StoreHi16 <truncstorei16, i16>;
def truncstorei8_hi16_#as : StoreHi16<truncstorei8, i8>;
def truncstorei16_hi16_#as : StoreHi16<truncstorei16, i16>;
-
} // End let IsStore = 1, AddressSpaces = ...
let IsAtomic = 1, AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in {
-def atomic_store_8_#as : PatFrag<(ops node:$ptr, node:$val),
- (atomic_store_8 node:$ptr, node:$val)>;
-def atomic_store_16_#as : PatFrag<(ops node:$ptr, node:$val),
- (atomic_store_16 node:$ptr, node:$val)>;
-def atomic_store_32_#as : PatFrag<(ops node:$ptr, node:$val),
- (atomic_store_32 node:$ptr, node:$val)>;
-def atomic_store_64_#as : PatFrag<(ops node:$ptr, node:$val),
- (atomic_store_64 node:$ptr, node:$val)>;
-}
+def atomic_store_8_#as : PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store_8 node:$val, node:$ptr)>;
+def atomic_store_16_#as : PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store_16 node:$val, node:$ptr)>;
+def atomic_store_32_#as : PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store_32 node:$val, node:$ptr)>;
+def atomic_store_64_#as : PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store_64 node:$val, node:$ptr)>;
+} // End let IsAtomic = 1, AddressSpaces = ...
} // End foreach as
multiclass noret_op {
@@ -622,8 +639,13 @@ defm int_amdgcn_flat_atomic_fadd : global_addr_space_atomic_op;
defm int_amdgcn_global_atomic_fadd_v2bf16 : noret_op;
defm int_amdgcn_global_atomic_fmin : noret_op;
defm int_amdgcn_global_atomic_fmax : noret_op;
+defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_flat_atomic_fadd : local_addr_space_atomic_op;
defm int_amdgcn_ds_fadd_v2bf16 : noret_op;
+defm int_amdgcn_flat_atomic_fmin_num : noret_op;
+defm int_amdgcn_flat_atomic_fmax_num : noret_op;
+defm int_amdgcn_global_atomic_fmin_num : noret_op;
+defm int_amdgcn_global_atomic_fmax_num : noret_op;
multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
let HasNoUse = true in
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index bbf4db12f5ab..fbee28889451 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -17,14 +17,19 @@
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
+#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
@@ -455,8 +460,8 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
return false;
// If we have 96-bit memory operations, we shouldn't touch them. Note we may
- // end up widening these for a scalar load during RegBankSelect, since there
- // aren't 96-bit scalar loads.
+ // end up widening these for a scalar load during RegBankSelect, if we don't
+ // have 96-bit scalar loads.
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
return false;
@@ -628,6 +633,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
+ const LLT BufferStridedPtr =
+ GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
const LLT CodePtr = FlatPtr;
@@ -676,13 +683,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
// Full set of gfx9 features.
- getActionDefinitionsBuilder({G_ADD, G_SUB})
- .legalFor({S32, S16, V2S16})
- .clampMaxNumElementsStrict(0, S16, 2)
- .scalarize(0)
- .minScalar(0, S16)
- .widenScalarToNextMultipleOf(0, 32)
- .maxScalar(0, S32);
+ if (ST.hasScalarAddSub64()) {
+ getActionDefinitionsBuilder({G_ADD, G_SUB})
+ .legalFor({S64, S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32);
+ } else {
+ getActionDefinitionsBuilder({G_ADD, G_SUB})
+ .legalFor({S32, S16, V2S16})
+ .clampMaxNumElementsStrict(0, S16, 2)
+ .scalarize(0)
+ .minScalar(0, S16)
+ .widenScalarToNextMultipleOf(0, 32)
+ .maxScalar(0, S32);
+ }
getActionDefinitionsBuilder(G_MUL)
.legalFor({S32, S16, V2S16})
@@ -842,6 +859,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_DYN_STACKALLOC)
.legalFor({{PrivatePtr, S32}});
+ getActionDefinitionsBuilder(G_STACKSAVE)
+ .customFor({PrivatePtr});
+ getActionDefinitionsBuilder(G_STACKRESTORE)
+ .legalFor({PrivatePtr});
+
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
.customIf(typeIsNot(0, PrivatePtr));
@@ -866,6 +888,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
FDIVActions.customFor({S16});
}
+ if (ST.hasPackedFP32Ops()) {
+ FPOpActions.legalFor({V2S32});
+ FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
+ }
+
auto &MinNumMaxNum = getActionDefinitionsBuilder({
G_FMINNUM, G_FMAXNUM, G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
@@ -908,10 +935,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder(G_FSQRT)
- .legalFor({S32, S16})
- .customFor({S64})
+ .legalFor({S16})
+ .customFor({S32, S64})
.scalarize(0)
- .clampScalar(0, S16, S64);
+ .unsupported();
getActionDefinitionsBuilder(G_FFLOOR)
.legalFor({S32, S64, S16})
.scalarize(0)
@@ -930,10 +957,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.lower();
} else {
getActionDefinitionsBuilder(G_FSQRT)
- .legalFor({S32})
- .customFor({S64})
+ .customFor({S32, S64, S16})
.scalarize(0)
- .clampScalar(0, S32, S64);
+ .unsupported();
+
if (ST.hasFractBug()) {
getActionDefinitionsBuilder(G_FFLOOR)
@@ -1061,31 +1088,34 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
- // Lower roundeven into G_FRINT
- getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
- .scalarize(0)
- .lower();
+ // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
+ getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
+ .scalarize(0)
+ .lower();
if (ST.has16BitInsts()) {
- getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
- .legalFor({S16, S32, S64})
- .clampScalar(0, S16, S64)
- .scalarize(0);
+ getActionDefinitionsBuilder(
+ {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
+ .legalFor({S16, S32, S64})
+ .clampScalar(0, S16, S64)
+ .scalarize(0);
} else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
- getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
- .legalFor({S32, S64})
- .clampScalar(0, S32, S64)
- .scalarize(0);
+ getActionDefinitionsBuilder(
+ {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
+ .legalFor({S32, S64})
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
} else {
- getActionDefinitionsBuilder({G_INTRINSIC_TRUNC, G_FCEIL, G_FRINT})
- .legalFor({S32})
- .customFor({S64})
- .clampScalar(0, S32, S64)
- .scalarize(0);
+ getActionDefinitionsBuilder(
+ {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
+ .legalFor({S32})
+ .customFor({S64})
+ .clampScalar(0, S32, S64)
+ .scalarize(0);
}
getActionDefinitionsBuilder(G_PTR_ADD)
- .unsupportedFor({BufferFatPtr, RsrcPtr})
+ .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
.legalIf(all(isPointer(0), sameSize(0, 1)))
.scalarize(0)
.scalarSameSizeAs(1, 0);
@@ -1121,8 +1151,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
- getActionDefinitionsBuilder(G_FCMP)
- .legalForCartesianProduct({S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase)
+ auto &FCmpBuilder =
+ getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
+ {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
+
+ if (ST.hasSALUFloatInsts())
+ FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
+
+ FCmpBuilder
.widenScalarToNextPow2(1)
.clampScalar(1, S32, S64)
.scalarize(0);
@@ -1149,7 +1185,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Log2Ops.scalarize(0)
.lower();
- auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP});
+ auto &LogOps =
+ getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
LogOps.customFor({S32, S16});
LogOps.clampScalar(0, MinScalarFPTy, S32)
.scalarize(0);
@@ -1219,7 +1256,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
.legalFor({S32, S16, V2S16})
- .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.clampMaxNumElements(0, S16, 2)
.minScalar(0, S16)
.widenScalarToNextPow2(0)
@@ -1369,7 +1405,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// The custom pointers (fat pointers, buffer resources) don't work with load
// and store at this level. Fat pointers should have been lowered to
// intrinsics before the translation to MIR.
- Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));
+ Actions.unsupportedIf(
+ typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
// Address space 8 pointers are handled by a 4xs32 load, bitcast, and
// ptrtoint. This is needed to account for the fact that we can't have i128
@@ -1925,20 +1962,25 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0)
.scalarize(0);
- getActionDefinitionsBuilder({
- // TODO: Verify V_BFI_B32 is generated from expanded bit ops
- G_FCOPYSIGN,
+ getActionDefinitionsBuilder(
+ {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
+ G_FCOPYSIGN,
- G_ATOMIC_CMPXCHG_WITH_SUCCESS,
- G_ATOMICRMW_NAND,
- G_ATOMICRMW_FSUB,
- G_READ_REGISTER,
- G_WRITE_REGISTER,
+ G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
+ G_READ_REGISTER, G_WRITE_REGISTER,
- G_SADDO, G_SSUBO,
+ G_SADDO, G_SSUBO})
+ .lower();
- // TODO: Implement
- G_FMINIMUM, G_FMAXIMUM}).lower();
+ if (ST.hasIEEEMinMax()) {
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
+ .legalFor(FPTypesPK16)
+ .clampMaxNumElements(0, S16, 2)
+ .scalarize(0);
+ } else {
+ // TODO: Implement
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
+ }
getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
.lower();
@@ -1948,6 +1990,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
.unsupported();
+ getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
+
getLegacyLegalizerInfo().computeTables();
verify(*ST.getInstrInfo());
}
@@ -1960,8 +2004,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
return legalizeAddrSpaceCast(MI, MRI, B);
- case TargetOpcode::G_FRINT:
- return legalizeFrint(MI, MRI, B);
+ case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
+ return legalizeFroundeven(MI, MRI, B);
case TargetOpcode::G_FCEIL:
return legalizeFceil(MI, MRI, B);
case TargetOpcode::G_FREM:
@@ -2022,6 +2066,7 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
case TargetOpcode::G_FEXP2:
return legalizeFExp2(MI, B);
case TargetOpcode::G_FEXP:
+ case TargetOpcode::G_FEXP10:
return legalizeFExp(MI, B);
case TargetOpcode::G_FPOW:
return legalizeFPow(MI, B);
@@ -2037,6 +2082,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeCTLZ_CTTZ(MI, MRI, B);
case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
return legalizeFPTruncRound(MI, B);
+ case TargetOpcode::G_STACKSAVE:
+ return legalizeStackSave(MI, B);
default:
return false;
}
@@ -2264,9 +2311,9 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
return true;
}
-bool AMDGPULegalizerInfo::legalizeFrint(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
+bool AMDGPULegalizerInfo::legalizeFroundeven(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
Register Src = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(Src);
assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
@@ -2345,10 +2392,10 @@ static MachineInstrBuilder extractF64Exponent(Register Hi,
auto Const0 = B.buildConstant(S32, FractBits - 32);
auto Const1 = B.buildConstant(S32, ExpBits);
- auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
- .addUse(Hi)
- .addUse(Const0.getReg(0))
- .addUse(Const1.getReg(0));
+ auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
+ .addUse(Hi)
+ .addUse(Const0.getReg(0))
+ .addUse(Const1.getReg(0));
return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
}
@@ -2436,8 +2483,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
- auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32},
- /*HasSideEffects=*/false)
+ auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
.addUse(Unmerge.getReg(1));
auto LS2 = B.buildSub(S32, LS, One);
ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
@@ -2670,15 +2716,16 @@ bool AMDGPULegalizerInfo::legalizeSinCos(
auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
if (ST.hasTrigReducedRange()) {
auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
- TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
- .addUse(MulVal.getReg(0))
- .setMIFlags(Flags).getReg(0);
+ TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
+ .addUse(MulVal.getReg(0))
+ .setMIFlags(Flags)
+ .getReg(0);
} else
TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
- B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg), false)
+ B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
.addUse(TrigVal)
.setMIFlags(Flags);
MI.eraseFromParent();
@@ -2714,15 +2761,6 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
// $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
// which is a 64-bit pc-relative offset from the encoding of the $symbol
// operand to the global variable.
- //
- // What we want here is an offset from the value returned by s_getpc
- // (which is the address of the s_add_u32 instruction) to the global
- // variable, but since the encoding of $symbol starts 4 bytes after the start
- // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
- // small. This requires us to add 4 to the global variable offset in order to
- // compute the correct address. Similarly for the s_addc_u32 instruction, the
- // encoding of $symbol starts 12 bytes after the start of the s_add_u32
- // instruction.
LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
@@ -2732,11 +2770,11 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
.addDef(PCReg);
- MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
+ MIB.addGlobalAddress(GV, Offset, GAFlags);
if (GAFlags == SIInstrInfo::MO_NONE)
MIB.addImm(0);
else
- MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
+ MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
if (!B.getMRI()->getRegClassOrNull(PCReg))
B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
@@ -2744,7 +2782,63 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
if (PtrTy.getSizeInBits() == 32)
B.buildExtract(DstReg, PCReg, 0);
return true;
- }
+}
+
+// Emit an ABS32_LO / ABS32_HI relocation stub.
+void AMDGPULegalizerInfo::buildAbsGlobalAddress(
+ Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
+ MachineRegisterInfo &MRI) const {
+ bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
+
+ LLT S32 = LLT::scalar(32);
+
+ // Use the destination directly, if and only if we store the lower address
+ // part only and we don't have a register class being set.
+ Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
+ ? DstReg
+ : MRI.createGenericVirtualRegister(S32);
+
+ if (!MRI.getRegClassOrNull(AddrLo))
+ MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
+
+ // Write the lower half.
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(AddrLo)
+ .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
+
+ // If required, write the upper half as well.
+ if (RequiresHighHalf) {
+ assert(PtrTy.getSizeInBits() == 64 &&
+ "Must provide a 64-bit pointer type!");
+
+ Register AddrHi = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
+
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(AddrHi)
+ .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
+
+ // Use the destination directly, if and only if we don't have a register
+ // class being set.
+ Register AddrDst = !MRI.getRegClassOrNull(DstReg)
+ ? DstReg
+ : MRI.createGenericVirtualRegister(LLT::scalar(64));
+
+ if (!MRI.getRegClassOrNull(AddrDst))
+ MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
+
+ B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
+
+ // If we created a new register for the destination, cast the result into
+ // the final output.
+ if (AddrDst != DstReg)
+ B.buildCast(DstReg, AddrDst);
+ } else if (AddrLo != DstReg) {
+ // If we created a new register for the destination, cast the result into
+ // the final output.
+ B.buildCast(DstReg, AddrLo);
+ }
+}
bool AMDGPULegalizerInfo::legalizeGlobalValue(
MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -2771,7 +2865,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
// functions that use local objects. However, if these dead functions are
// not eliminated, we don't want a compile time error. Just emit a warning
// and a trap, since there should be no callable path here.
- B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
+ B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>());
B.buildUndef(DstReg);
MI.eraseFromParent();
return true;
@@ -2797,8 +2891,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
// Adjust alignment for that dynamic shared memory array.
MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
LLT S32 = LLT::scalar(32);
- auto Sz =
- B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
+ auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
B.buildIntToPtr(DstReg, Sz);
MI.eraseFromParent();
return true;
@@ -2811,6 +2904,12 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
return true;
}
+ if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
+ buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
+ MI.eraseFromParent();
+ return true;
+ }
+
const SITargetLowering *TLI = ST.getTargetLowering();
if (TLI->shouldEmitFixup(GV)) {
@@ -2973,10 +3072,10 @@ bool AMDGPULegalizerInfo::legalizeFMad(
// TODO: Always legal with future ftz flag.
// FIXME: Do we need just output?
- if (Ty == LLT::scalar(32) &&
+ if (Ty == LLT::float32() &&
MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
return true;
- if (Ty == LLT::scalar(16) &&
+ if (Ty == LLT::float16() &&
MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
return true;
@@ -3014,9 +3113,30 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
/// Return true if it's known that \p Src can never be an f32 denormal value.
static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
Register Src) {
- Register ExtSrc;
- if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc))))
- return MRI.getType(ExtSrc) == LLT::scalar(16);
+ const MachineInstr *DefMI = MRI.getVRegDef(Src);
+ switch (DefMI->getOpcode()) {
+ case TargetOpcode::G_INTRINSIC: {
+ switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
+ case Intrinsic::amdgcn_frexp_mant:
+ return true;
+ default:
+ break;
+ }
+
+ break;
+ }
+ case TargetOpcode::G_FFREXP: {
+ if (DefMI->getOperand(0).getReg() == Src)
+ return true;
+ break;
+ }
+ case TargetOpcode::G_FPEXT: {
+ return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
+ }
+ default:
+ return false;
+ }
+
return false;
}
@@ -3072,9 +3192,9 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
const LLT F32 = LLT::scalar(32);
// Nothing in half is a denormal when promoted to f32.
auto Ext = B.buildFPExt(F32, Src, Flags);
- auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false)
- .addUse(Ext.getReg(0))
- .setMIFlags(Flags);
+ auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
+ .addUse(Ext.getReg(0))
+ .setMIFlags(Flags);
B.buildFPTrunc(Dst, Log2, Flags);
MI.eraseFromParent();
return true;
@@ -3084,14 +3204,14 @@ bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
if (!ScaledInput) {
- B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}, false)
+ B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
.addUse(Src)
.setMIFlags(Flags);
MI.eraseFromParent();
return true;
}
- auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
+ auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
.addUse(ScaledInput)
.setMIFlags(Flags);
@@ -3148,9 +3268,8 @@ bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
if (ScaledInput)
X = ScaledInput;
- auto Y = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
- .addUse(X)
- .setMIFlags(Flags);
+ auto Y =
+ B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
Register R;
if (ST.hasFastFMAF32()) {
@@ -3231,7 +3350,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
if (Ty == LLT::scalar(32)) {
auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
if (ScaledInput) {
- auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
+ auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
.addUse(Src)
.setMIFlags(Flags);
auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
@@ -3253,7 +3372,7 @@ bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
auto Log2Operand = Ty == LLT::scalar(16)
? B.buildFLog2(Ty, Src, Flags)
- : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
+ : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
.addUse(Src)
.setMIFlags(Flags);
auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
@@ -3276,9 +3395,9 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
if (Ty == F16) {
// Nothing in half is a denormal when promoted to f32.
auto Ext = B.buildFPExt(F32, Src, Flags);
- auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false)
- .addUse(Ext.getReg(0))
- .setMIFlags(Flags);
+ auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
+ .addUse(Ext.getReg(0))
+ .setMIFlags(Flags);
B.buildFPTrunc(Dst, Log2, Flags);
MI.eraseFromParent();
return true;
@@ -3287,7 +3406,7 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
assert(Ty == F32);
if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
- B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
+ B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
.addUse(Src)
.setMIFlags(Flags);
MI.eraseFromParent();
@@ -3307,7 +3426,7 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
- auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
+ auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
.addUse(AddInput.getReg(0))
.setMIFlags(Flags);
@@ -3320,20 +3439,42 @@ bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
}
bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
- Register Src,
- unsigned Flags) const {
+ Register X, unsigned Flags) const {
LLT Ty = B.getMRI()->getType(Dst);
- auto K = B.buildFConstant(Ty, numbers::log2e);
- auto Mul = B.buildFMul(Ty, Src, K, Flags);
+ LLT F32 = LLT::scalar(32);
- if (Ty == LLT::scalar(32)) {
- B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
- .addUse(Mul.getReg(0))
- .setMIFlags(Flags);
- } else {
- B.buildFExp2(Dst, Mul.getReg(0), Flags);
+ if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
+ auto Log2E = B.buildFConstant(Ty, numbers::log2e);
+ auto Mul = B.buildFMul(Ty, X, Log2E, Flags);
+
+ if (Ty == F32) {
+ B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
+ .addUse(Mul.getReg(0))
+ .setMIFlags(Flags);
+ } else {
+ B.buildFExp2(Dst, Mul.getReg(0), Flags);
+ }
+
+ return true;
}
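+  // exp(x) underflows to an f32 denormal once x drops below roughly -87.3
+  // (about -126 * ln 2), which is what the threshold constant encodes: shift
+  // such inputs up by 64, evaluate exp there, and rescale the result by
+  // exp(-64) afterwards.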
+ auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
+ auto NeedsScaling =
+ B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
+ auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
+ auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
+ auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
+
+ auto Log2E = B.buildFConstant(Ty, numbers::log2e);
+ auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
+
+ auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
+ .addUse(ExpInput.getReg(0))
+ .setMIFlags(Flags);
+
+ auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
+ auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
+ B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
return true;
}
@@ -3347,7 +3488,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
LLT Ty = MRI.getType(Dst);
const LLT F16 = LLT::scalar(16);
const LLT F32 = LLT::scalar(32);
- const bool IsExp10 = false; // TODO: For some reason exp10 is missing
+ const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
if (Ty == F16) {
// v_exp_f16 (fmul x, log2e)
@@ -3374,7 +3515,7 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
// TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
// library behavior. Also, is known-not-daz source sufficient?
- if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) {
+ if (allowApproxFunc(MF, Flags)) {
legalizeFExpUnsafe(B, Dst, X, Flags);
MI.eraseFromParent();
return true;
@@ -3442,14 +3583,14 @@ bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
}
- auto E = B.buildFRint(Ty, PH, Flags);
+ auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
// It is unsafe to contract this fsub into the PH multiply.
auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
- auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
+ auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
.addUse(A.getReg(0))
.setMIFlags(Flags);
auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
@@ -3486,27 +3627,26 @@ bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
Register Src1 = MI.getOperand(2).getReg();
unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);
- const LLT S16 = LLT::scalar(16);
- const LLT S32 = LLT::scalar(32);
-
- if (Ty == S32) {
- auto Log = B.buildFLog2(S32, Src0, Flags);
- auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
- .addUse(Log.getReg(0))
- .addUse(Src1)
- .setMIFlags(Flags);
+ const LLT F16 = LLT::float16();
+ const LLT F32 = LLT::float32();
+
+ if (Ty == F32) {
+ auto Log = B.buildFLog2(F32, Src0, Flags);
+ auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
+ .addUse(Log.getReg(0))
+ .addUse(Src1)
+ .setMIFlags(Flags);
B.buildFExp2(Dst, Mul, Flags);
- } else if (Ty == S16) {
+ } else if (Ty == F16) {
// There's no f16 fmul_legacy, so we need to convert for it.
- auto Log = B.buildFLog2(S16, Src0, Flags);
- auto Ext0 = B.buildFPExt(S32, Log, Flags);
- auto Ext1 = B.buildFPExt(S32, Src1, Flags);
- auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
- .addUse(Ext0.getReg(0))
- .addUse(Ext1.getReg(0))
- .setMIFlags(Flags);
-
- B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
+ auto Log = B.buildFLog2(F16, Src0, Flags);
+ auto Ext0 = B.buildFPExt(F32, Log, Flags);
+ auto Ext1 = B.buildFPExt(F32, Src1, Flags);
+ auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
+ .addUse(Ext0.getReg(0))
+ .addUse(Ext1.getReg(0))
+ .setMIFlags(Flags);
+ B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
} else
return false;
@@ -3531,11 +3671,11 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
MachineIRBuilder &B) const {
const LLT S1 = LLT::scalar(1);
- const LLT S64 = LLT::scalar(64);
+ const LLT F64 = LLT::float64();
Register Dst = MI.getOperand(0).getReg();
Register OrigSrc = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
- assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
+ assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
"this should not have been custom lowered");
// V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
@@ -3546,9 +3686,9 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
//
// Convert floor(x) to (x - fract(x))
- auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
- .addUse(OrigSrc)
- .setMIFlags(Flags);
+ auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
+ .addUse(OrigSrc)
+ .setMIFlags(Flags);
// Give source modifier matching some assistance before obscuring a foldable
// pattern.
@@ -3558,9 +3698,9 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
auto Const =
- B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff));
+ B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
- Register Min = MRI.createGenericVirtualRegister(S64);
+ Register Min = MRI.createGenericVirtualRegister(F64);
// We don't need to concern ourselves with the snan handling difference, so
// use the one which will directly select.
@@ -3573,10 +3713,10 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
Register CorrectedFract = Min;
if (!MI.getFlag(MachineInstr::FmNoNans)) {
auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
- CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
+ CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
}
- auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
+ auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
MI.eraseFromParent();
@@ -4497,38 +4637,36 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
// 1 / x -> RCP(x)
if (CLHS->isExactlyValue(1.0)) {
- B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
- .addUse(RHS)
- .setMIFlags(Flags);
+ B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
+ .addUse(RHS)
+ .setMIFlags(Flags);
MI.eraseFromParent();
return true;
}
- // TODO: Match rsq
-
// -1 / x -> RCP( FNEG(x) )
if (CLHS->isExactlyValue(-1.0)) {
auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
- B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
- .addUse(FNeg.getReg(0))
- .setMIFlags(Flags);
+ B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
+ .addUse(FNeg.getReg(0))
+ .setMIFlags(Flags);
MI.eraseFromParent();
return true;
}
}
- // For f16 require arcp only.
- // For f32 require afn+arcp.
+ // For f16 require afn or arcp.
+ // For f32 require afn.
if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
!MI.getFlag(MachineInstr::FmArcp)))
return false;
// x / y -> x * (1.0 / y)
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
- .addUse(RHS)
- .setMIFlags(Flags);
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
+ .addUse(RHS)
+ .setMIFlags(Flags);
B.buildFMul(Res, LHS, RCP, Flags);
MI.eraseFromParent();
@@ -4554,9 +4692,9 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
auto NegY = B.buildFNeg(ResTy, Y);
auto One = B.buildFConstant(ResTy, 1.0);
- auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
- .addUse(Y)
- .setMIFlags(Flags);
+ auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
+ .addUse(Y)
+ .setMIFlags(Flags);
auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
R = B.buildFMA(ResTy, Tmp0, R, R);
@@ -4590,23 +4728,27 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
auto LHSExt = B.buildFPExt(S32, LHS, Flags);
auto RHSExt = B.buildFPExt(S32, RHS, Flags);
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
- .addUse(RHSExt.getReg(0))
- .setMIFlags(Flags);
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ .addUse(RHSExt.getReg(0))
+ .setMIFlags(Flags);
auto QUOT = B.buildFMul(S32, LHSExt, RCP, Flags);
auto RDst = B.buildFPTrunc(S16, QUOT, Flags);
- B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
- .addUse(RDst.getReg(0))
- .addUse(RHS)
- .addUse(LHS)
- .setMIFlags(Flags);
+ B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
+ .addUse(RDst.getReg(0))
+ .addUse(RHS)
+ .addUse(LHS)
+ .setMIFlags(Flags);
MI.eraseFromParent();
return true;
}
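+// Hardware register descriptor selecting the FP32 denormal-mode bits of the
+// MODE register (offset 4, width 2), shared by the S_GETREG/S_SETREG uses
+// below.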
+static const unsigned SPDenormModeBitField =
+ AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+ (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
@@ -4625,11 +4767,6 @@ static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
.addImm(NewDenormModeValue);
} else {
- // Select FP32 bit field in mode register.
- unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
- (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
- (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
-
B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
.addImm(SPDenormMode)
.addImm(SPDenormModeBitField);
@@ -4656,27 +4793,38 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
auto One = B.buildFConstant(S32, 1.0f);
auto DenominatorScaled =
- B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
- .addUse(LHS)
- .addUse(RHS)
- .addImm(0)
- .setMIFlags(Flags);
+ B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
+ .addUse(LHS)
+ .addUse(RHS)
+ .addImm(0)
+ .setMIFlags(Flags);
auto NumeratorScaled =
- B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
- .addUse(LHS)
- .addUse(RHS)
- .addImm(1)
- .setMIFlags(Flags);
-
- auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
- .addUse(DenominatorScaled.getReg(0))
- .setMIFlags(Flags);
+ B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
+ .addUse(LHS)
+ .addUse(RHS)
+ .addImm(1)
+ .setMIFlags(Flags);
+
+ auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ .addUse(DenominatorScaled.getReg(0))
+ .setMIFlags(Flags);
auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
- // FIXME: Doesn't correctly model the FP mode switch, and the FP operations
- // aren't modeled as reading it.
- if (Mode.FP32Denormals != DenormalMode::getIEEE())
+ const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
+ const bool HasDynamicDenormals =
+ (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
+ (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
+
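+  // With a dynamic FP32 denormal mode there is no fixed value to toggle back
+  // to, so save the current MODE bits up front and restore them afterwards.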
+ Register SavedSPDenormMode;
+ if (!PreservesDenormals) {
+ if (HasDynamicDenormals) {
+ SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ B.buildInstr(AMDGPU::S_GETREG_B32)
+ .addDef(SavedSPDenormMode)
+ .addImm(SPDenormModeBitField);
+ }
toggleSPDenormMode(true, B, ST, Mode);
+ }
auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
@@ -4685,23 +4833,28 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
- // FIXME: This mishandles dynamic denormal mode. We need to query the
- // current mode and restore the original.
- if (Mode.FP32Denormals != DenormalMode::getIEEE())
- toggleSPDenormMode(false, B, ST, Mode);
+ if (!PreservesDenormals) {
+ if (HasDynamicDenormals) {
+ assert(SavedSPDenormMode);
+ B.buildInstr(AMDGPU::S_SETREG_B32)
+ .addReg(SavedSPDenormMode)
+ .addImm(SPDenormModeBitField);
+ } else
+ toggleSPDenormMode(false, B, ST, Mode);
+ }
- auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
- .addUse(Fma4.getReg(0))
- .addUse(Fma1.getReg(0))
- .addUse(Fma3.getReg(0))
- .addUse(NumeratorScaled.getReg(1))
- .setMIFlags(Flags);
+ auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
+ .addUse(Fma4.getReg(0))
+ .addUse(Fma1.getReg(0))
+ .addUse(Fma3.getReg(0))
+ .addUse(NumeratorScaled.getReg(1))
+ .setMIFlags(Flags);
- B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res, false)
- .addUse(Fmas.getReg(0))
- .addUse(RHS)
- .addUse(LHS)
- .setMIFlags(Flags);
+ B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
+ .addUse(Fmas.getReg(0))
+ .addUse(RHS)
+ .addUse(LHS)
+ .setMIFlags(Flags);
MI.eraseFromParent();
return true;
@@ -4724,27 +4877,27 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
auto One = B.buildFConstant(S64, 1.0);
- auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
- .addUse(LHS)
- .addUse(RHS)
- .addImm(0)
- .setMIFlags(Flags);
+ auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
+ .addUse(LHS)
+ .addUse(RHS)
+ .addImm(0)
+ .setMIFlags(Flags);
auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
- auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64}, false)
- .addUse(DivScale0.getReg(0))
- .setMIFlags(Flags);
+ auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
+ .addUse(DivScale0.getReg(0))
+ .setMIFlags(Flags);
auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
- auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
- .addUse(LHS)
- .addUse(RHS)
- .addImm(1)
- .setMIFlags(Flags);
+ auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
+ .addUse(LHS)
+ .addUse(RHS)
+ .addImm(1)
+ .setMIFlags(Flags);
auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
@@ -4771,14 +4924,14 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
Scale = DivScale1.getReg(1);
}
- auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64}, false)
- .addUse(Fma4.getReg(0))
- .addUse(Fma3.getReg(0))
- .addUse(Mul.getReg(0))
- .addUse(Scale)
- .setMIFlags(Flags);
+ auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
+ .addUse(Fma4.getReg(0))
+ .addUse(Fma3.getReg(0))
+ .addUse(Mul.getReg(0))
+ .addUse(Scale)
+ .setMIFlags(Flags);
- B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res), false)
+ B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
.addUse(Fmas.getReg(0))
.addUse(RHS)
.addUse(LHS)
@@ -4799,10 +4952,10 @@ bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
LLT Ty = MRI.getType(Res0);
LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
- auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}, false)
+ auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
.addUse(Val)
.setMIFlags(Flags);
- auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}, false)
+ auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
.addUse(Val)
.setMIFlags(Flags);
@@ -4846,9 +4999,9 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
- .addUse(Mul0.getReg(0))
- .setMIFlags(Flags);
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
+ .addUse(Mul0.getReg(0))
+ .setMIFlags(Flags);
auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
@@ -4858,9 +5011,107 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
return true;
}
-bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
+bool AMDGPULegalizerInfo::legalizeFSQRTF16(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+  // Bypass the correct expansion that a standard promotion through G_FSQRT
+  // would get. The f32 op is accurate enough for the f16 case.
+ unsigned Flags = MI.getFlags();
+ assert(!ST.has16BitInsts());
+ const LLT F32 = LLT::scalar(32);
+ auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
+ auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
+ .addUse(Ext.getReg(0))
+ .setMIFlags(Flags);
+ B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFSQRTF32(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ MachineFunction &MF = B.getMF();
+ Register Dst = MI.getOperand(0).getReg();
+ Register X = MI.getOperand(1).getReg();
+ const unsigned Flags = MI.getFlags();
+ const LLT S1 = LLT::scalar(1);
+ const LLT F32 = LLT::scalar(32);
+ const LLT I32 = LLT::scalar(32);
+
+ if (allowApproxFunc(MF, Flags)) {
+ B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
+ .addUse(X)
+ .setMIFlags(Flags);
+ MI.eraseFromParent();
+ return true;
+ }
+
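+  // Inputs below 2^-96 are scaled up by 2^32 so the expansion operates on a
+  // normal value; the result is then 2^16 too large and is scaled back down
+  // before the final zero/inf fixup.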
+ auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
+ auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
+ auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
+ auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
+ auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
+
+ Register SqrtS = MRI.createGenericVirtualRegister(F32);
+ if (needsDenormHandlingF32(MF, X, Flags)) {
+ B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
+ .addUse(SqrtX.getReg(0))
+ .setMIFlags(Flags);
+
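+    // Refine the rounded result by probing its one-ulp neighbours: adding +1
+    // or -1 to the 32-bit value steps it by one ulp, and the sign of the FMA
+    // residual x - neighbour * s decides whether to step the result down or
+    // up.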
+ auto NegOne = B.buildConstant(I32, -1);
+ auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
+
+ auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
+ auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
+
+ auto PosOne = B.buildConstant(I32, 1);
+ auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
+
+ auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
+ auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
+
+ auto Zero = B.buildFConstant(F32, 0.0f);
+ auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
+
+ SqrtS =
+ B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
+
+ auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
+ SqrtS =
+ B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
+ } else {
+ auto SqrtR =
+ B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
+ B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
+
+ auto Half = B.buildFConstant(F32, 0.5f);
+ auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
+ auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
+ auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
+ SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
+ SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
+ auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
+ auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
+ SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
+ }
+
+ auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
+
+ auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
+
+ SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
+
+ auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
+ B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFSQRTF64(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
// For double type, the SQRT and RSQ instructions don't have required
// precision, we apply Goldschmidt's algorithm to improve the result:
//
@@ -4901,8 +5152,8 @@ bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
- auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false)
- .addReg(SqrtX.getReg(0));
+ auto SqrtY =
+ B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX.getReg(0));
auto Half = B.buildFConstant(F64, 0.5);
auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
@@ -4942,6 +5193,19 @@ bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
return true;
}
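
For reference, a plain-C++ rendering of one common Goldschmidt refinement scheme of the kind the comment above refers to. This is an illustrative sketch with my own variable names and iteration count, not the exact instruction sequence the legalizer emits:

#include <cmath>

static double goldschmidt_sqrt(double x) {
  double y = 1.0 / std::sqrt(x);     // stand-in for the hardware rsq estimate
  double g = x * y;                  // g converges to sqrt(x)
  double h = 0.5 * y;                // h converges to 0.5 / sqrt(x)
  for (int i = 0; i < 2; ++i) {      // a couple of refinement steps
    double r = std::fma(-g, h, 0.5); // r = 0.5 - g*h
    g = std::fma(g, r, g);
    h = std::fma(h, r, h);
  }
  double d = std::fma(-g, g, x);     // final Newton-style correction
  return std::fma(d, h, g);
}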
+bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ if (Ty == LLT::scalar(32))
+ return legalizeFSQRTF32(MI, MRI, B);
+ if (Ty == LLT::scalar(64))
+ return legalizeFSQRTF64(MI, MRI, B);
+ if (Ty == LLT::scalar(16))
+ return legalizeFSQRTF16(MI, MRI, B);
+ return false;
+}
+
// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
@@ -4968,9 +5232,9 @@ bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
else
return false;
- auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
- .addUse(Src)
- .setMIFlags(Flags);
+ auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
+ .addUse(Src)
+ .setMIFlags(Flags);
// We don't need to concern ourselves with the snan handling difference
// (whether or not the rsq quiets it), so use the one which will directly
// select.
@@ -5153,7 +5417,7 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
Register OrigOffset) const {
- const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
+ const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
Register BaseReg;
unsigned ImmOffset;
const LLT S32 = LLT::scalar(32);
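
splitBufferOffsets divides a byte offset into a piece that fits the MUBUF immediate field (whose maximum now depends on the subtarget) and a remainder that goes into a register. A simplified sketch of that split, under the assumption that any overflow simply moves wholesale to the register part (the real code also keeps the register part nicely aligned):

#include <cstdint>
#include <utility>

// Returns {register part, immediate part}. Hypothetical helper, not LLVM's.
static std::pair<uint32_t, uint32_t> splitOffset(uint32_t Off, uint32_t MaxImm) {
  uint32_t Imm = Off <= MaxImm ? Off : Off % (MaxImm + 1);
  return {Off - Imm, Imm};
}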
@@ -5631,31 +5895,23 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
- const bool HasReturn = MI.getNumExplicitDefs() != 0;
-
- Register Dst;
-
- int OpOffset = 0;
- if (HasReturn) {
- // A few FP atomics do not support return values.
- Dst = MI.getOperand(0).getReg();
- } else {
- OpOffset = -1;
- }
+ Register Dst = MI.getOperand(0).getReg();
// Since we don't have 128-bit atomics, we don't need to handle the case of
// p8 arguments to the atomic itself
- Register VData = MI.getOperand(2 + OpOffset).getReg();
+ Register VData = MI.getOperand(2).getReg();
+
Register CmpVal;
+ int OpOffset = 0;
if (IsCmpSwap) {
- CmpVal = MI.getOperand(3 + OpOffset).getReg();
+ CmpVal = MI.getOperand(3).getReg();
++OpOffset;
}
castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
Register RSrc = MI.getOperand(3 + OpOffset).getReg();
- const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
+ const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
// The struct intrinsic variants add one additional operand over raw.
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
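
With the return value now always present, the operand layout the code above indexes into is fixed: operand 0 is the destination, 1 the intrinsic ID, 2 the value, an optional compare value at 3 for cmpswap, and the resource descriptor after that. A tiny helper (mine, purely for illustration) makes the `3 + OpOffset` indexing explicit:

// Index of the resource-descriptor operand in the intrinsic form, assuming the
// layout described above (dst, id, vdata, [cmp], rsrc, ...).
static unsigned rsrcOperandIndex(bool IsCmpSwap) {
  return IsCmpSwap ? 4 : 3;
}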
@@ -5676,12 +5932,9 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
unsigned ImmOffset;
std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
- auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
-
- if (HasReturn)
- MIB.addDef(Dst);
-
- MIB.addUse(VData); // vdata
+ auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
+ .addDef(Dst)
+ .addUse(VData); // vdata
if (IsCmpSwap)
MIB.addReg(CmpVal);
@@ -5903,53 +6156,52 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
return false;
}
- const unsigned NSAMaxSize = ST.getNSAMaxSize();
+ const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
if (IsA16 || IsG16) {
- if (Intr->NumVAddrs > 1) {
- SmallVector<Register, 4> PackedRegs;
+ // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
+ // instructions expect VGPR_32
+ SmallVector<Register, 4> PackedRegs;
- packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16,
- IsG16);
-
- // See also below in the non-a16 branch
- const bool UseNSA = ST.hasNSAEncoding() &&
- PackedRegs.size() >= ST.getNSAThreshold(MF) &&
- (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
- const bool UsePartialNSA =
- UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
-
- if (UsePartialNSA) {
- // Pack registers that would go over NSAMaxSize into last VAddr register
- LLT PackedAddrTy =
- LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
- auto Concat = B.buildConcatVectors(
- PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
- PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
- PackedRegs.resize(NSAMaxSize);
- } else if (!UseNSA && PackedRegs.size() > 1) {
- LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
- auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
- PackedRegs[0] = Concat.getReg(0);
- PackedRegs.resize(1);
- }
+ packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
- const unsigned NumPacked = PackedRegs.size();
- for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
- MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
- if (!SrcOp.isReg()) {
- assert(SrcOp.isImm() && SrcOp.getImm() == 0);
- continue;
- }
+ // See also below in the non-a16 branch
+ const bool UseNSA = ST.hasNSAEncoding() &&
+ PackedRegs.size() >= ST.getNSAThreshold(MF) &&
+ (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
+ const bool UsePartialNSA =
+ UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
- assert(SrcOp.getReg() != AMDGPU::NoRegister);
+ if (UsePartialNSA) {
+ // Pack registers that would go over NSAMaxSize into last VAddr register
+ LLT PackedAddrTy =
+ LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
+ auto Concat = B.buildConcatVectors(
+ PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
+ PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
+ PackedRegs.resize(NSAMaxSize);
+ } else if (!UseNSA && PackedRegs.size() > 1) {
+ LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
+ auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
+ PackedRegs[0] = Concat.getReg(0);
+ PackedRegs.resize(1);
+ }
- if (I - Intr->VAddrStart < NumPacked)
- SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
- else
- SrcOp.setReg(AMDGPU::NoRegister);
+ const unsigned NumPacked = PackedRegs.size();
+ for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
+ MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
+ if (!SrcOp.isReg()) {
+ assert(SrcOp.isImm() && SrcOp.getImm() == 0);
+ continue;
}
+
+ assert(SrcOp.getReg() != AMDGPU::NoRegister);
+
+ if (I - Intr->VAddrStart < NumPacked)
+ SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
+ else
+ SrcOp.setReg(AMDGPU::NoRegister);
}
} else {
// If the register allocator cannot place the address registers contiguously
@@ -5964,7 +6216,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
//
- // Partial NSA is allowed on GFX11 where the final register is a contiguous
+ // Partial NSA is allowed on GFX11+ where the final register is a contiguous
// set of the remaining addresses.
const bool UseNSA = ST.hasNSAEncoding() &&
CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
@@ -6195,13 +6447,11 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
// Handle needing to s.buffer.load() a p8 value.
if (hasBufferRsrcWorkaround(Ty)) {
Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
- Dst = MI.getOperand(0).getReg();
B.setInsertPt(B.getMBB(), MI);
}
if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
Ty = getBitcastRegisterType(Ty);
Helper.bitcastDst(MI, Ty, 0);
- Dst = MI.getOperand(0).getReg();
B.setInsertPt(B.getMBB(), MI);
}
@@ -6222,10 +6472,10 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
MemSize, MemAlign);
MI.addMemOperand(MF, MMO);
- // There are no 96-bit result scalar loads, but widening to 128-bit should
+ // If we don't have 96-bit result scalar loads, widening to 128-bit should
// always be legal. We may need to restore this to a 96-bit result if it turns
// out this needs to be converted to a vector load during RegBankSelect.
- if (!isPowerOf2_32(Size)) {
+ if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
@@ -6244,11 +6494,6 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
return legalizeTrapEndpgm(MI, MRI, B);
- const Module *M = B.getMF().getFunction().getParent();
- unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
- if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
- return legalizeTrapHsaQueuePtr(MI, MRI, B);
-
return ST.supportsGetDoorbellID() ?
legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
}
@@ -6395,13 +6640,17 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
return false;
}
+ const bool IsGFX11 = AMDGPU::isGFX11(ST);
const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
+ const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
const unsigned NumVDataDwords = 4;
const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
- const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
+ const bool UseNSA =
+ IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
+
const unsigned BaseOpcodes[2][2] = {
{AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
{AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
@@ -6409,14 +6658,16 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
int Opcode;
if (UseNSA) {
Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
- IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
+ IsGFX12Plus ? AMDGPU::MIMGEncGfx12
+ : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
: AMDGPU::MIMGEncGfx10NSA,
NumVDataDwords, NumVAddrDwords);
} else {
- Opcode = AMDGPU::getMIMGOpcode(
- BaseOpcodes[Is64][IsA16],
- IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
- NumVDataDwords, NumVAddrDwords);
+ assert(!IsGFX12Plus);
+ Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
+ IsGFX11 ? AMDGPU::MIMGEncGfx11Default
+ : AMDGPU::MIMGEncGfx10Default,
+ NumVDataDwords, NumVAddrDwords);
}
assert(Opcode != -1);
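
The encoding choice above reduces to a small decision tree: GFX12+ always uses the new encoding (and always NSA-style addressing), GFX11 picks between its NSA and default encodings, and everything else falls back to the GFX10 forms. A hedged sketch of that tree (the enum and helper are mine, not LLVM's):

enum class MIMGEnc { Gfx10Default, Gfx10NSA, Gfx11Default, Gfx11NSA, Gfx12 };

static MIMGEnc pickBVHEncoding(bool IsGFX11, bool IsGFX12Plus, bool UseNSA) {
  if (IsGFX12Plus)
    return MIMGEnc::Gfx12; // GFX12+ always uses the new encoding
  if (UseNSA)
    return IsGFX11 ? MIMGEnc::Gfx11NSA : MIMGEnc::Gfx10NSA;
  return IsGFX11 ? MIMGEnc::Gfx11Default : MIMGEnc::Gfx10Default;
}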
@@ -6539,13 +6790,23 @@ bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeStackSave(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ const SITargetLowering *TLI = ST.getTargetLowering();
+ Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
+ Register DstReg = MI.getOperand(0).getReg();
+ B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();
// Replace the use G_BRCOND with the exec manipulate and branch pseudos.
- auto IntrID = MI.getIntrinsicID();
+ auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else: {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 534bb2c87ea3..855fa0ddc214 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -43,8 +43,8 @@ public:
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
- bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
+ bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -68,6 +68,10 @@ public:
const GlobalValue *GV, int64_t Offset,
unsigned GAFlags = SIInstrInfo::MO_NONE) const;
+ void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B,
+ const GlobalValue *GV,
+ MachineRegisterInfo &MRI) const;
+
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
@@ -157,6 +161,12 @@ public:
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
@@ -201,6 +211,7 @@ public:
bool legalizeBVHIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPTruncRound(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeImageIntrinsic(
MachineInstr &MI, MachineIRBuilder &B,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 44ce1e15f0ef..0c21382e5c22 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -14,18 +14,22 @@
#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
#include "GCNSubtarget.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/AttributeMask.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Target/TargetMachine.h"
#include <cmath>
#define DEBUG_TYPE "amdgpu-simplifylib"
using namespace llvm;
+using namespace llvm::PatternMatch;
static cl::opt<bool> EnablePreLink("amdgpu-prelink",
cl::desc("Enable pre-link mode optimizations"),
@@ -46,10 +50,13 @@ namespace llvm {
class AMDGPULibCalls {
private:
+ const TargetLibraryInfo *TLInfo = nullptr;
+ AssumptionCache *AC = nullptr;
+ DominatorTree *DT = nullptr;
typedef llvm::AMDGPULibFunc FuncInfo;
- const TargetMachine *TM;
+ bool UnsafeFPMath = false;
// -fuse-native.
bool AllNative = false;
@@ -66,64 +73,76 @@ private:
/* Specialized optimizations */
- // recip (half or native)
- bool fold_recip(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // divide (half or native)
- bool fold_divide(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
// pow/powr/pown
- bool fold_pow(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+ bool fold_pow(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
// rootn
- bool fold_rootn(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
-
- // fma/mad
- bool fold_fma_mad(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+ bool fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
// -fuse-native for sincos
bool sincosUseNative(CallInst *aCI, const FuncInfo &FInfo);
// evaluate calls if calls' arguments are constants.
- bool evaluateScalarMathFunc(const FuncInfo &FInfo, double& Res0,
- double& Res1, Constant *copr0, Constant *copr1, Constant *copr2);
+ bool evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0, double &Res1,
+ Constant *copr0, Constant *copr1);
bool evaluateCall(CallInst *aCI, const FuncInfo &FInfo);
// sqrt
- bool fold_sqrt(CallInst *CI, IRBuilder<> &B, const FuncInfo &FInfo);
+ bool fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
+
+ /// Insert a call to the sincos function \p Fsincos. Returns (value of sin,
+ /// value of cos, sincos call).
+ std::tuple<Value *, Value *, Value *> insertSinCos(Value *Arg,
+ FastMathFlags FMF,
+ IRBuilder<> &B,
+ FunctionCallee Fsincos);
// sin/cos
- bool fold_sincos(CallInst * CI, IRBuilder<> &B, AliasAnalysis * AA);
+ bool fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B, const FuncInfo &FInfo);
// __read_pipe/__write_pipe
bool fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
const FuncInfo &FInfo);
- // llvm.amdgcn.wavefrontsize
- bool fold_wavefrontsize(CallInst *CI, IRBuilder<> &B);
-
- // Get insertion point at entry.
- BasicBlock::iterator getEntryIns(CallInst * UI);
- // Insert an Alloc instruction.
- AllocaInst* insertAlloca(CallInst * UI, IRBuilder<> &B, const char *prefix);
// Get a scalar native builtin single argument FP function
FunctionCallee getNativeFunction(Module *M, const FuncInfo &FInfo);
+ /// Substitute a call to a known libcall with an intrinsic call. If \p
+ /// AllowMinSizeF32 is true, allow the f32 replacement in a minsize function.
+ bool shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
+ bool AllowMinSizeF32 = false,
+ bool AllowF64 = false,
+ bool AllowStrictFP = false);
+ void replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
+ Intrinsic::ID IntrID);
+
+ bool tryReplaceLibcallWithSimpleIntrinsic(IRBuilder<> &B, CallInst *CI,
+ Intrinsic::ID IntrID,
+ bool AllowMinSizeF32 = false,
+ bool AllowF64 = false,
+ bool AllowStrictFP = false);
+
protected:
- CallInst *CI;
+ bool isUnsafeMath(const FPMathOperator *FPOp) const;
+ bool isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const;
+
+ bool canIncreasePrecisionOfConstantFold(const FPMathOperator *FPOp) const;
- bool isUnsafeMath(const CallInst *CI) const;
+ static void replaceCall(Instruction *I, Value *With) {
+ I->replaceAllUsesWith(With);
+ I->eraseFromParent();
+ }
- void replaceCall(Value *With) {
- CI->replaceAllUsesWith(With);
- CI->eraseFromParent();
+ static void replaceCall(FPMathOperator *I, Value *With) {
+ replaceCall(cast<Instruction>(I), With);
}
public:
- AMDGPULibCalls(const TargetMachine *TM_ = nullptr) : TM(TM_) {}
+ AMDGPULibCalls() {}
- bool fold(CallInst *CI, AliasAnalysis *AA = nullptr);
+ bool fold(CallInst *CI);
+ void initFunction(Function &F, FunctionAnalysisManager &FAM);
void initNativeFuncs();
// Replace a normal math function call with its native version
@@ -132,57 +151,6 @@ public:
} // end llvm namespace
-namespace {
-
- class AMDGPUSimplifyLibCalls : public FunctionPass {
-
- AMDGPULibCalls Simplifier;
-
- public:
- static char ID; // Pass identification
-
- AMDGPUSimplifyLibCalls(const TargetMachine *TM = nullptr)
- : FunctionPass(ID), Simplifier(TM) {
- initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AAResultsWrapperPass>();
- }
-
- bool runOnFunction(Function &M) override;
- };
-
- class AMDGPUUseNativeCalls : public FunctionPass {
-
- AMDGPULibCalls Simplifier;
-
- public:
- static char ID; // Pass identification
-
- AMDGPUUseNativeCalls() : FunctionPass(ID) {
- initializeAMDGPUUseNativeCallsPass(*PassRegistry::getPassRegistry());
- Simplifier.initNativeFuncs();
- }
-
- bool runOnFunction(Function &F) override;
- };
-
-} // end anonymous namespace.
-
-char AMDGPUSimplifyLibCalls::ID = 0;
-char AMDGPUUseNativeCalls::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib",
- "Simplify well-known AMD library calls", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(AMDGPUSimplifyLibCalls, "amdgpu-simplifylib",
- "Simplify well-known AMD library calls", false, false)
-
-INITIALIZE_PASS(AMDGPUUseNativeCalls, "amdgpu-usenative",
- "Replace builtin math calls with that native versions.",
- false, false)
-
template <typename IRB>
static CallInst *CreateCallEx(IRB &B, FunctionCallee Callee, Value *Arg,
const Twine &Name = "") {
@@ -201,6 +169,15 @@ static CallInst *CreateCallEx2(IRB &B, FunctionCallee Callee, Value *Arg1,
return R;
}
+static FunctionType *getPownType(FunctionType *FT) {
+ Type *PowNExpTy = Type::getInt32Ty(FT->getContext());
+ if (VectorType *VecTy = dyn_cast<VectorType>(FT->getReturnType()))
+ PowNExpTy = VectorType::get(PowNExpTy, VecTy->getElementCount());
+
+ return FunctionType::get(FT->getReturnType(),
+ {FT->getParamType(0), PowNExpTy}, false);
+}
+
// Data structures for table-driven optimizations.
// FuncTbl works for both f32 and f64 functions with 1 input argument
@@ -444,13 +421,26 @@ bool AMDGPULibCalls::parseFunctionName(const StringRef &FMangledName,
return AMDGPULibFunc::parse(FMangledName, FInfo);
}
-bool AMDGPULibCalls::isUnsafeMath(const CallInst *CI) const {
- if (auto Op = dyn_cast<FPMathOperator>(CI))
- if (Op->isFast())
- return true;
- const Function *F = CI->getParent()->getParent();
- Attribute Attr = F->getFnAttribute("unsafe-fp-math");
- return Attr.getValueAsBool();
+bool AMDGPULibCalls::isUnsafeMath(const FPMathOperator *FPOp) const {
+ return UnsafeFPMath || FPOp->isFast();
+}
+
+bool AMDGPULibCalls::isUnsafeFiniteOnlyMath(const FPMathOperator *FPOp) const {
+ return UnsafeFPMath ||
+ (FPOp->hasApproxFunc() && FPOp->hasNoNaNs() && FPOp->hasNoInfs());
+}
+
+bool AMDGPULibCalls::canIncreasePrecisionOfConstantFold(
+ const FPMathOperator *FPOp) const {
+ // TODO: Refine to approxFunc or contract
+ return isUnsafeMath(FPOp);
+}
+
+void AMDGPULibCalls::initFunction(Function &F, FunctionAnalysisManager &FAM) {
+ UnsafeFPMath = F.getFnAttribute("unsafe-fp-math").getValueAsBool();
+ AC = &FAM.getResult<AssumptionAnalysis>(F);
+ TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
+ DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
}
bool AMDGPULibCalls::useNativeFunc(const StringRef F) const {
@@ -490,7 +480,7 @@ bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
DEBUG_WITH_TYPE("usenative", dbgs() << "<useNative> replace " << *aCI
<< " with native version of sin/cos");
- replaceCall(sinval);
+ replaceCall(aCI, sinval);
return true;
}
}
@@ -498,8 +488,9 @@ bool AMDGPULibCalls::sincosUseNative(CallInst *aCI, const FuncInfo &FInfo) {
}
bool AMDGPULibCalls::useNative(CallInst *aCI) {
- CI = aCI;
Function *Callee = aCI->getCalledFunction();
+ if (!Callee || aCI->isNoBuiltin())
+ return false;
FuncInfo FInfo;
if (!parseFunctionName(Callee->getName(), FInfo) || !FInfo.isMangled() ||
@@ -538,29 +529,25 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
auto *M = Callee->getParent();
- auto &Ctx = M->getContext();
std::string Name = std::string(Callee->getName());
auto NumArg = CI->arg_size();
if (NumArg != 4 && NumArg != 6)
return false;
- auto *PacketSize = CI->getArgOperand(NumArg - 2);
- auto *PacketAlign = CI->getArgOperand(NumArg - 1);
- if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign))
+ ConstantInt *PacketSize =
+ dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 2));
+ ConstantInt *PacketAlign =
+ dyn_cast<ConstantInt>(CI->getArgOperand(NumArg - 1));
+ if (!PacketSize || !PacketAlign)
return false;
- unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue();
- Align Alignment = cast<ConstantInt>(PacketAlign)->getAlignValue();
+
+ unsigned Size = PacketSize->getZExtValue();
+ Align Alignment = PacketAlign->getAlignValue();
if (Alignment != Size)
return false;
- Type *PtrElemTy;
- if (Size <= 8)
- PtrElemTy = Type::getIntNTy(Ctx, Size * 8);
- else
- PtrElemTy = FixedVectorType::get(Type::getInt64Ty(Ctx), Size / 8);
unsigned PtrArgLoc = CI->arg_size() - 3;
- auto PtrArg = CI->getArgOperand(PtrArgLoc);
- unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace();
- auto *PtrTy = llvm::PointerType::get(PtrElemTy, PtrArgAS);
+ Value *PtrArg = CI->getArgOperand(PtrArgLoc);
+ Type *PtrTy = PtrArg->getType();
SmallVector<llvm::Type *, 6> ArgTys;
for (unsigned I = 0; I != PtrArgLoc; ++I)
@@ -575,11 +562,10 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
if (!F)
return false;
- auto *BCast = B.CreatePointerCast(PtrArg, PtrTy);
SmallVector<Value *, 6> Args;
for (unsigned I = 0; I != PtrArgLoc; ++I)
Args.push_back(CI->getArgOperand(I));
- Args.push_back(BCast);
+ Args.push_back(PtrArg);
auto *NCI = B.CreateCall(F, Args);
NCI->setAttributes(CI->getAttributes());
@@ -590,99 +576,242 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
return true;
}
-// This function returns false if no change; return true otherwise.
-bool AMDGPULibCalls::fold(CallInst *CI, AliasAnalysis *AA) {
- this->CI = CI;
- Function *Callee = CI->getCalledFunction();
+static bool isKnownIntegral(const Value *V, const DataLayout &DL,
+ FastMathFlags FMF) {
+ if (isa<UndefValue>(V))
+ return true;
- // Ignore indirect calls.
- if (Callee == nullptr)
- return false;
+ if (const ConstantFP *CF = dyn_cast<ConstantFP>(V))
+ return CF->getValueAPF().isInteger();
- BasicBlock *BB = CI->getParent();
- LLVMContext &Context = CI->getParent()->getContext();
- IRBuilder<> B(Context);
+ if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(V)) {
+ for (unsigned i = 0, e = CDV->getNumElements(); i != e; ++i) {
+ Constant *ConstElt = CDV->getElementAsConstant(i);
+ if (isa<UndefValue>(ConstElt))
+ continue;
+ const ConstantFP *CFP = dyn_cast<ConstantFP>(ConstElt);
+ if (!CFP || !CFP->getValue().isInteger())
+ return false;
+ }
- // Set the builder to the instruction after the call.
- B.SetInsertPoint(BB, CI->getIterator());
+ return true;
+ }
- // Copy fast flags from the original call.
- if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(CI))
- B.setFastMathFlags(FPOp->getFastMathFlags());
+ const Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ // TODO: Could check nofpclass(inf) on incoming argument
+ if (FMF.noInfs())
+ return true;
- switch (Callee->getIntrinsicID()) {
+ // Need to check int size cannot produce infinity, which computeKnownFPClass
+ // knows how to do already.
+ return isKnownNeverInfinity(I, DL);
+ case Instruction::Call: {
+ const CallInst *CI = cast<CallInst>(I);
+ switch (CI->getIntrinsicID()) {
+ case Intrinsic::trunc:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::round:
+ case Intrinsic::roundeven:
+ return (FMF.noInfs() && FMF.noNaNs()) ||
+ isKnownNeverInfOrNaN(I, DL, nullptr);
+ default:
+ break;
+ }
+
+ break;
+ }
default:
break;
- case Intrinsic::amdgcn_wavefrontsize:
- return !EnablePreLink && fold_wavefrontsize(CI, B);
}
+ return false;
+}
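
The idea behind isKnownIntegral, restated on plain doubles (a sketch only, not the IR-level analysis above): a constant is integral if truncation is a no-op, and values produced by int-to-float conversions or the rounding intrinsics are integral by construction as long as they are finite.

#include <cmath>

static bool isIntegralDouble(double V) {
  return std::isfinite(V) && std::trunc(V) == V;
}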
+
+// This function returns false if there is no change; otherwise it returns true.
+bool AMDGPULibCalls::fold(CallInst *CI) {
+ Function *Callee = CI->getCalledFunction();
+ // Ignore indirect calls.
+ if (!Callee || Callee->isIntrinsic() || CI->isNoBuiltin())
+ return false;
+
FuncInfo FInfo;
if (!parseFunctionName(Callee->getName(), FInfo))
return false;
// Further check the number of arguments to see if they match.
- if (CI->arg_size() != FInfo.getNumArgs())
+ // TODO: Check calling convention matches too
+ if (!FInfo.isCompatibleSignature(CI->getFunctionType()))
return false;
- if (TDOFold(CI, FInfo))
- return true;
+ LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << '\n');
- // Under unsafe-math, evaluate calls if possible.
- // According to Brian Sumner, we can do this for all f32 function calls
- // using host's double function calls.
- if (isUnsafeMath(CI) && evaluateCall(CI, FInfo))
+ if (TDOFold(CI, FInfo))
return true;
- // Specialized optimizations for each function call
- switch (FInfo.getId()) {
- case AMDGPULibFunc::EI_RECIP:
- // skip vector function
- assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
- FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
- "recip must be an either native or half function");
- return (getVecSize(FInfo) != 1) ? false : fold_recip(CI, B, FInfo);
+ IRBuilder<> B(CI);
- case AMDGPULibFunc::EI_DIVIDE:
- // skip vector function
- assert ((FInfo.getPrefix() == AMDGPULibFunc::NATIVE ||
- FInfo.getPrefix() == AMDGPULibFunc::HALF) &&
- "divide must be an either native or half function");
- return (getVecSize(FInfo) != 1) ? false : fold_divide(CI, B, FInfo);
-
- case AMDGPULibFunc::EI_POW:
- case AMDGPULibFunc::EI_POWR:
- case AMDGPULibFunc::EI_POWN:
- return fold_pow(CI, B, FInfo);
+ if (FPMathOperator *FPOp = dyn_cast<FPMathOperator>(CI)) {
+ // Under unsafe-math, evaluate calls if possible.
+ // According to Brian Sumner, we can do this for all f32 function calls
+ // using the host's double-precision function calls.
+ if (canIncreasePrecisionOfConstantFold(FPOp) && evaluateCall(CI, FInfo))
+ return true;
- case AMDGPULibFunc::EI_ROOTN:
- // skip vector function
- return (getVecSize(FInfo) != 1) ? false : fold_rootn(CI, B, FInfo);
+ // Copy fast flags from the original call.
+ FastMathFlags FMF = FPOp->getFastMathFlags();
+ B.setFastMathFlags(FMF);
+
+ // Specialized optimizations for each function call.
+ //
+ // TODO: Handle other simple intrinsic wrappers. Sqrt.
+ //
+ // TODO: Handle native functions
+ switch (FInfo.getId()) {
+ case AMDGPULibFunc::EI_EXP:
+ if (FMF.none())
+ return false;
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp,
+ FMF.approxFunc());
+ case AMDGPULibFunc::EI_EXP2:
+ if (FMF.none())
+ return false;
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::exp2,
+ FMF.approxFunc());
+ case AMDGPULibFunc::EI_LOG:
+ if (FMF.none())
+ return false;
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log,
+ FMF.approxFunc());
+ case AMDGPULibFunc::EI_LOG2:
+ if (FMF.none())
+ return false;
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log2,
+ FMF.approxFunc());
+ case AMDGPULibFunc::EI_LOG10:
+ if (FMF.none())
+ return false;
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::log10,
+ FMF.approxFunc());
+ case AMDGPULibFunc::EI_FMIN:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::minnum,
+ true, true);
+ case AMDGPULibFunc::EI_FMAX:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::maxnum,
+ true, true);
+ case AMDGPULibFunc::EI_FMA:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fma, true,
+ true);
+ case AMDGPULibFunc::EI_MAD:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fmuladd,
+ true, true);
+ case AMDGPULibFunc::EI_FABS:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::fabs, true,
+ true, true);
+ case AMDGPULibFunc::EI_COPYSIGN:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::copysign,
+ true, true, true);
+ case AMDGPULibFunc::EI_FLOOR:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::floor, true,
+ true);
+ case AMDGPULibFunc::EI_CEIL:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::ceil, true,
+ true);
+ case AMDGPULibFunc::EI_TRUNC:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::trunc, true,
+ true);
+ case AMDGPULibFunc::EI_RINT:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::rint, true,
+ true);
+ case AMDGPULibFunc::EI_ROUND:
+ return tryReplaceLibcallWithSimpleIntrinsic(B, CI, Intrinsic::round, true,
+ true);
+ case AMDGPULibFunc::EI_LDEXP: {
+ if (!shouldReplaceLibcallWithIntrinsic(CI, true, true))
+ return false;
- case AMDGPULibFunc::EI_FMA:
- case AMDGPULibFunc::EI_MAD:
- case AMDGPULibFunc::EI_NFMA:
- // skip vector function
- return (getVecSize(FInfo) != 1) ? false : fold_fma_mad(CI, B, FInfo);
+ Value *Arg1 = CI->getArgOperand(1);
+ if (VectorType *VecTy = dyn_cast<VectorType>(CI->getType());
+ VecTy && !isa<VectorType>(Arg1->getType())) {
+ Value *SplatArg1 = B.CreateVectorSplat(VecTy->getElementCount(), Arg1);
+ CI->setArgOperand(1, SplatArg1);
+ }
- case AMDGPULibFunc::EI_SQRT:
- return isUnsafeMath(CI) && fold_sqrt(CI, B, FInfo);
- case AMDGPULibFunc::EI_COS:
- case AMDGPULibFunc::EI_SIN:
- if ((getArgType(FInfo) == AMDGPULibFunc::F32 ||
- getArgType(FInfo) == AMDGPULibFunc::F64)
- && (FInfo.getPrefix() == AMDGPULibFunc::NOPFX))
- return fold_sincos(CI, B, AA);
+ CI->setCalledFunction(Intrinsic::getDeclaration(
+ CI->getModule(), Intrinsic::ldexp,
+ {CI->getType(), CI->getArgOperand(1)->getType()}));
+ return true;
+ }
+ case AMDGPULibFunc::EI_POW: {
+ Module *M = Callee->getParent();
+ AMDGPULibFunc PowrInfo(AMDGPULibFunc::EI_POWR, FInfo);
+ FunctionCallee PowrFunc = getFunction(M, PowrInfo);
+ CallInst *Call = cast<CallInst>(FPOp);
+
+ // pow(x, y) -> powr(x, y) for x >= -0.0
+ // TODO: Account for flags on current call
+ if (PowrFunc &&
+ cannotBeOrderedLessThanZero(FPOp->getOperand(0), M->getDataLayout(),
+ TLInfo, 0, AC, Call, DT)) {
+ Call->setCalledFunction(PowrFunc);
+ return fold_pow(FPOp, B, PowrInfo) || true;
+ }
- break;
- case AMDGPULibFunc::EI_READ_PIPE_2:
- case AMDGPULibFunc::EI_READ_PIPE_4:
- case AMDGPULibFunc::EI_WRITE_PIPE_2:
- case AMDGPULibFunc::EI_WRITE_PIPE_4:
- return fold_read_write_pipe(CI, B, FInfo);
+ // pow(x, y) -> pown(x, y) for known integral y
+ if (isKnownIntegral(FPOp->getOperand(1), M->getDataLayout(),
+ FPOp->getFastMathFlags())) {
+ FunctionType *PownType = getPownType(CI->getFunctionType());
+ AMDGPULibFunc PownInfo(AMDGPULibFunc::EI_POWN, PownType, true);
+ FunctionCallee PownFunc = getFunction(M, PownInfo);
+ if (PownFunc) {
+ // TODO: If the incoming integral value is an sitofp/uitofp, it won't
+ // fold out without a known range. We can probably take the source
+ // value directly.
+ Value *CastedArg =
+ B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1));
+ // Have to drop any nofpclass attributes on the original call site.
+ Call->removeParamAttrs(
+ 1, AttributeFuncs::typeIncompatible(CastedArg->getType()));
+ Call->setCalledFunction(PownFunc);
+ Call->setArgOperand(1, CastedArg);
+ return fold_pow(FPOp, B, PownInfo) || true;
+ }
+ }
- default:
- break;
+ return fold_pow(FPOp, B, FInfo);
+ }
+ case AMDGPULibFunc::EI_POWR:
+ case AMDGPULibFunc::EI_POWN:
+ return fold_pow(FPOp, B, FInfo);
+ case AMDGPULibFunc::EI_ROOTN:
+ return fold_rootn(FPOp, B, FInfo);
+ case AMDGPULibFunc::EI_SQRT:
+ return fold_sqrt(FPOp, B, FInfo);
+ case AMDGPULibFunc::EI_COS:
+ case AMDGPULibFunc::EI_SIN:
+ return fold_sincos(FPOp, B, FInfo);
+ default:
+ break;
+ }
+ } else {
+ // Specialized optimizations for each function call
+ switch (FInfo.getId()) {
+ case AMDGPULibFunc::EI_READ_PIPE_2:
+ case AMDGPULibFunc::EI_READ_PIPE_4:
+ case AMDGPULibFunc::EI_WRITE_PIPE_2:
+ case AMDGPULibFunc::EI_WRITE_PIPE_4:
+ return fold_read_write_pipe(CI, B, FInfo);
+ default:
+ break;
+ }
}
return false;
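
The pow handling above boils down to two rewrites: prefer powr when the base is known non-negative, and prefer pown when the exponent is known integral. A scalar C++ sketch of that decision logic, where powr_ and pown_ are stand-in names for the library entry points rather than real APIs:

#include <cmath>

static double powr_(double x, double y) { return std::exp2(y * std::log2(x)); } // assumes x >= 0
static double pown_(double x, int n)    { return std::pow(x, n); }

static double pow_fold_sketch(double x, double y) {
  if (!std::signbit(x))                       // base cannot be ordered less than -0.0
    return powr_(x, y);
  if (std::isfinite(y) && std::trunc(y) == y) // exponent known integral
    return pown_(x, static_cast<int>(y));
  return std::pow(x, y);                      // general case left alone
}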
@@ -731,7 +860,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
nval = ConstantDataVector::get(context, tmp);
}
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
- replaceCall(nval);
+ replaceCall(CI, nval);
return true;
}
} else {
@@ -741,7 +870,7 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
if (CF->isExactlyValue(tr[i].input)) {
Value *nval = ConstantFP::get(CF->getType(), tr[i].result);
LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
- replaceCall(nval);
+ replaceCall(CI, nval);
return true;
}
}
@@ -751,45 +880,6 @@ bool AMDGPULibCalls::TDOFold(CallInst *CI, const FuncInfo &FInfo) {
return false;
}
-// [native_]half_recip(c) ==> 1.0/c
-bool AMDGPULibCalls::fold_recip(CallInst *CI, IRBuilder<> &B,
- const FuncInfo &FInfo) {
- Value *opr0 = CI->getArgOperand(0);
- if (ConstantFP *CF = dyn_cast<ConstantFP>(opr0)) {
- // Just create a normal div. Later, InstCombine will be able
- // to compute the divide into a constant (avoid check float infinity
- // or subnormal at this point).
- Value *nval = B.CreateFDiv(ConstantFP::get(CF->getType(), 1.0),
- opr0,
- "recip2div");
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *nval << "\n");
- replaceCall(nval);
- return true;
- }
- return false;
-}
-
-// [native_]half_divide(x, c) ==> x/c
-bool AMDGPULibCalls::fold_divide(CallInst *CI, IRBuilder<> &B,
- const FuncInfo &FInfo) {
- Value *opr0 = CI->getArgOperand(0);
- Value *opr1 = CI->getArgOperand(1);
- ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
- ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
-
- if ((CF0 && CF1) || // both are constants
- (CF1 && (getArgType(FInfo) == AMDGPULibFunc::F32)))
- // CF1 is constant && f32 divide
- {
- Value *nval1 = B.CreateFDiv(ConstantFP::get(opr1->getType(), 1.0),
- opr1, "__div2recip");
- Value *nval = B.CreateFMul(opr0, nval1, "__div2mul");
- replaceCall(nval);
- return true;
- }
- return false;
-}
-
namespace llvm {
static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
@@ -800,81 +890,62 @@ static double log2(double V) {
}
}
-bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
+bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
const FuncInfo &FInfo) {
assert((FInfo.getId() == AMDGPULibFunc::EI_POW ||
FInfo.getId() == AMDGPULibFunc::EI_POWR ||
FInfo.getId() == AMDGPULibFunc::EI_POWN) &&
"fold_pow: encounter a wrong function call");
- Value *opr0, *opr1;
- ConstantFP *CF;
- ConstantInt *CINT;
- ConstantAggregateZero *CZero;
- Type *eltType;
+ Module *M = B.GetInsertBlock()->getModule();
+ Type *eltType = FPOp->getType()->getScalarType();
+ Value *opr0 = FPOp->getOperand(0);
+ Value *opr1 = FPOp->getOperand(1);
- opr0 = CI->getArgOperand(0);
- opr1 = CI->getArgOperand(1);
- CZero = dyn_cast<ConstantAggregateZero>(opr1);
- if (getVecSize(FInfo) == 1) {
- eltType = opr0->getType();
- CF = dyn_cast<ConstantFP>(opr1);
- CINT = dyn_cast<ConstantInt>(opr1);
- } else {
- VectorType *VTy = dyn_cast<VectorType>(opr0->getType());
- assert(VTy && "Oprand of vector function should be of vectortype");
- eltType = VTy->getElementType();
- ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1);
-
- // Now, only Handle vector const whose elements have the same value.
- CF = CDV ? dyn_cast_or_null<ConstantFP>(CDV->getSplatValue()) : nullptr;
- CINT = CDV ? dyn_cast_or_null<ConstantInt>(CDV->getSplatValue()) : nullptr;
- }
-
- // No unsafe math , no constant argument, do nothing
- if (!isUnsafeMath(CI) && !CF && !CINT && !CZero)
- return false;
+ const APFloat *CF = nullptr;
+ const APInt *CINT = nullptr;
+ if (!match(opr1, m_APFloatAllowUndef(CF)))
+ match(opr1, m_APIntAllowUndef(CINT));
// 0x1111111 means that we don't do anything for this call.
int ci_opr1 = (CINT ? (int)CINT->getSExtValue() : 0x1111111);
- if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0) || CZero) {
+ if ((CF && CF->isZero()) || (CINT && ci_opr1 == 0)) {
// pow/powr/pown(x, 0) == 1
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1\n");
Constant *cnval = ConstantFP::get(eltType, 1.0);
if (getVecSize(FInfo) > 1) {
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
}
- replaceCall(cnval);
+ replaceCall(FPOp, cnval);
return true;
}
if ((CF && CF->isExactlyValue(1.0)) || (CINT && ci_opr1 == 1)) {
// pow/powr/pown(x, 1.0) = x
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n");
- replaceCall(opr0);
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
+ replaceCall(FPOp, opr0);
return true;
}
if ((CF && CF->isExactlyValue(2.0)) || (CINT && ci_opr1 == 2)) {
// pow/powr/pown(x, 2.0) = x*x
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * " << *opr0
- << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << " * "
+ << *opr0 << "\n");
Value *nval = B.CreateFMul(opr0, opr0, "__pow2");
- replaceCall(nval);
+ replaceCall(FPOp, nval);
return true;
}
if ((CF && CF->isExactlyValue(-1.0)) || (CINT && ci_opr1 == -1)) {
// pow/powr/pown(x, -1.0) = 1.0/x
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1 / " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1 / " << *opr0 << "\n");
Constant *cnval = ConstantFP::get(eltType, 1.0);
if (getVecSize(FInfo) > 1) {
cnval = ConstantDataVector::getSplat(getVecSize(FInfo), cnval);
}
Value *nval = B.CreateFDiv(cnval, opr0, "__powrecip");
- replaceCall(nval);
+ replaceCall(FPOp, nval);
return true;
}
- Module *M = CI->getModule();
if (CF && (CF->isExactlyValue(0.5) || CF->isExactlyValue(-0.5))) {
// pow[r](x, [-]0.5) = sqrt(x)
bool issqrt = CF->isExactlyValue(0.5);
@@ -882,16 +953,16 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
getFunction(M, AMDGPULibFunc(issqrt ? AMDGPULibFunc::EI_SQRT
: AMDGPULibFunc::EI_RSQRT,
FInfo))) {
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
- << FInfo.getName().c_str() << "(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << FInfo.getName()
+ << '(' << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, issqrt ? "__pow2sqrt"
: "__pow2rsqrt");
- replaceCall(nval);
+ replaceCall(FPOp, nval);
return true;
}
}
- if (!isUnsafeMath(CI))
+ if (!isUnsafeFiniteOnlyMath(FPOp))
return false;
// Unsafe Math optimization
@@ -899,8 +970,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
// Remember that ci_opr1 is set if opr1 is integral
if (CF) {
double dval = (getArgType(FInfo) == AMDGPULibFunc::F32)
- ? (double)CF->getValueAPF().convertToFloat()
- : CF->getValueAPF().convertToDouble();
+ ? (double)CF->convertToFloat()
+ : CF->convertToDouble();
int ival = (int)dval;
if ((double)ival == dval) {
ci_opr1 = ival;
@@ -939,31 +1010,39 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
}
nval = B.CreateFDiv(cnval, nval, "__1powprod");
}
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
<< ((ci_opr1 < 0) ? "1/prod(" : "prod(") << *opr0
<< ")\n");
- replaceCall(nval);
+ replaceCall(FPOp, nval);
return true;
}
+ // Whether we should use the generic intrinsic instead of emitting a libcall.
+ const bool ShouldUseIntrinsic = eltType->isFloatTy() || eltType->isHalfTy();
+
// powr ---> exp2(y * log2(x))
// pown/pow ---> powr(fabs(x), y) | (x & ((int)y << 31))
- FunctionCallee ExpExpr =
- getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
- if (!ExpExpr)
- return false;
+ FunctionCallee ExpExpr;
+ if (ShouldUseIntrinsic)
+ ExpExpr = Intrinsic::getDeclaration(M, Intrinsic::exp2, {FPOp->getType()});
+ else {
+ ExpExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_EXP2, FInfo));
+ if (!ExpExpr)
+ return false;
+ }
bool needlog = false;
bool needabs = false;
bool needcopysign = false;
Constant *cnval = nullptr;
if (getVecSize(FInfo) == 1) {
- CF = dyn_cast<ConstantFP>(opr0);
+ CF = nullptr;
+ match(opr0, m_APFloatAllowUndef(CF));
if (CF) {
double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
- ? (double)CF->getValueAPF().convertToFloat()
- : CF->getValueAPF().convertToDouble();
+ ? (double)CF->convertToFloat()
+ : CF->convertToDouble();
V = log2(std::abs(V));
cnval = ConstantFP::get(eltType, V);
@@ -986,9 +1065,7 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
SmallVector<double, 0> DVal;
for (int i=0; i < getVecSize(FInfo); ++i) {
- double V = (getArgType(FInfo) == AMDGPULibFunc::F32)
- ? (double)CDV->getElementAsFloat(i)
- : CDV->getElementAsDouble(i);
+ double V = CDV->getElementAsAPFloat(i).convertToDouble();
if (V < 0.0) needcopysign = true;
V = log2(std::abs(V));
DVal.push_back(V);
@@ -1010,44 +1087,27 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
if (needcopysign && (FInfo.getId() == AMDGPULibFunc::EI_POW)) {
// We cannot handle corner cases for a general pow() function, give up
// unless y is a constant integral value. Then proceed as if it were pown.
- if (getVecSize(FInfo) == 1) {
- if (const ConstantFP *CF = dyn_cast<ConstantFP>(opr1)) {
- double y = (getArgType(FInfo) == AMDGPULibFunc::F32)
- ? (double)CF->getValueAPF().convertToFloat()
- : CF->getValueAPF().convertToDouble();
- if (y != (double)(int64_t)y)
- return false;
- } else
- return false;
- } else {
- if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr1)) {
- for (int i=0; i < getVecSize(FInfo); ++i) {
- double y = (getArgType(FInfo) == AMDGPULibFunc::F32)
- ? (double)CDV->getElementAsFloat(i)
- : CDV->getElementAsDouble(i);
- if (y != (double)(int64_t)y)
- return false;
- }
- } else
- return false;
- }
+ if (!isKnownIntegral(opr1, M->getDataLayout(), FPOp->getFastMathFlags()))
+ return false;
}
Value *nval;
if (needabs) {
- FunctionCallee AbsExpr =
- getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_FABS, FInfo));
- if (!AbsExpr)
- return false;
- nval = CreateCallEx(B, AbsExpr, opr0, "__fabs");
+ nval = B.CreateUnaryIntrinsic(Intrinsic::fabs, opr0, nullptr, "__fabs");
} else {
nval = cnval ? cnval : opr0;
}
if (needlog) {
- FunctionCallee LogExpr =
- getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
- if (!LogExpr)
- return false;
+ FunctionCallee LogExpr;
+ if (ShouldUseIntrinsic) {
+ LogExpr =
+ Intrinsic::getDeclaration(M, Intrinsic::log2, {FPOp->getType()});
+ } else {
+ LogExpr = getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_LOG2, FInfo));
+ if (!LogExpr)
+ return false;
+ }
+
nval = CreateCallEx(B,LogExpr, nval, "__log2");
}
@@ -1061,14 +1121,14 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
if (needcopysign) {
Value *opr_n;
Type* rTy = opr0->getType();
- Type* nTyS = eltType->isDoubleTy() ? B.getInt64Ty() : B.getInt32Ty();
+ Type* nTyS = B.getIntNTy(eltType->getPrimitiveSizeInBits());
Type *nTy = nTyS;
if (const auto *vTy = dyn_cast<FixedVectorType>(rTy))
nTy = FixedVectorType::get(nTyS, vTy);
unsigned size = nTy->getScalarSizeInBits();
- opr_n = CI->getArgOperand(1);
+ opr_n = FPOp->getOperand(1);
if (opr_n->getType()->isIntegerTy())
- opr_n = B.CreateZExtOrBitCast(opr_n, nTy, "__ytou");
+ opr_n = B.CreateZExtOrTrunc(opr_n, nTy, "__ytou");
else
opr_n = B.CreateFPToSI(opr1, nTy, "__ytou");
@@ -1078,17 +1138,21 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
nval = B.CreateBitCast(nval, opr0->getType());
}
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
<< "exp2(" << *opr1 << " * log2(" << *opr0 << "))\n");
- replaceCall(nval);
+ replaceCall(FPOp, nval);
return true;
}
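
The `powr(fabs(x), y) | (x & ((int)y << 31))` comment above is a bit trick: compute the magnitude as exp2(y * log2(|x|)), then OR back x's sign bit exactly when the integer exponent is odd. A hedged f32 sketch of that trick in plain C++ (names mine):

#include <cmath>
#include <cstdint>
#include <cstring>

static float pown_signfix(float x, int32_t n) {
  float mag = std::exp2(static_cast<float>(n) * std::log2(std::fabs(x)));
  uint32_t magBits, xBits;
  std::memcpy(&magBits, &mag, sizeof(float));
  std::memcpy(&xBits, &x, sizeof(float));
  uint32_t signMask = static_cast<uint32_t>(n) << 31; // nonzero only for odd n
  uint32_t resBits = magBits | (xBits & signMask);    // transfer x's sign if odd
  float res;
  std::memcpy(&res, &resBits, sizeof(float));
  return res;
}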
-bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
+bool AMDGPULibCalls::fold_rootn(FPMathOperator *FPOp, IRBuilder<> &B,
const FuncInfo &FInfo) {
- Value *opr0 = CI->getArgOperand(0);
- Value *opr1 = CI->getArgOperand(1);
+ // skip vector function
+ if (getVecSize(FInfo) != 1)
+ return false;
+
+ Value *opr0 = FPOp->getOperand(0);
+ Value *opr1 = FPOp->getOperand(1);
ConstantInt *CINT = dyn_cast<ConstantInt>(opr1);
if (!CINT) {
@@ -1096,90 +1160,47 @@ bool AMDGPULibCalls::fold_rootn(CallInst *CI, IRBuilder<> &B,
}
int ci_opr1 = (int)CINT->getSExtValue();
if (ci_opr1 == 1) { // rootn(x, 1) = x
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << "\n");
- replaceCall(opr0);
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> " << *opr0 << "\n");
+ replaceCall(FPOp, opr0);
return true;
}
- if (ci_opr1 == 2) { // rootn(x, 2) = sqrt(x)
- Module *M = CI->getModule();
+
+ Module *M = B.GetInsertBlock()->getModule();
+ if (ci_opr1 == 2) { // rootn(x, 2) = sqrt(x)
if (FunctionCallee FPExpr =
getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> sqrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> sqrt(" << *opr0
+ << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2sqrt");
- replaceCall(nval);
+ replaceCall(FPOp, nval);
return true;
}
} else if (ci_opr1 == 3) { // rootn(x, 3) = cbrt(x)
- Module *M = CI->getModule();
if (FunctionCallee FPExpr =
getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_CBRT, FInfo))) {
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> cbrt(" << *opr0 << ")\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> cbrt(" << *opr0
+ << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2cbrt");
- replaceCall(nval);
+ replaceCall(FPOp, nval);
return true;
}
} else if (ci_opr1 == -1) { // rootn(x, -1) = 1.0/x
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> 1.0 / " << *opr0 << "\n");
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> 1.0 / " << *opr0 << "\n");
Value *nval = B.CreateFDiv(ConstantFP::get(opr0->getType(), 1.0),
opr0,
"__rootn2div");
- replaceCall(nval);
+ replaceCall(FPOp, nval);
return true;
- } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x)
- Module *M = CI->getModule();
+ } else if (ci_opr1 == -2) { // rootn(x, -2) = rsqrt(x)
if (FunctionCallee FPExpr =
getFunction(M, AMDGPULibFunc(AMDGPULibFunc::EI_RSQRT, FInfo))) {
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> rsqrt(" << *opr0
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> rsqrt(" << *opr0
<< ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__rootn2rsqrt");
- replaceCall(nval);
- return true;
- }
- }
- return false;
-}
-
-bool AMDGPULibCalls::fold_fma_mad(CallInst *CI, IRBuilder<> &B,
- const FuncInfo &FInfo) {
- Value *opr0 = CI->getArgOperand(0);
- Value *opr1 = CI->getArgOperand(1);
- Value *opr2 = CI->getArgOperand(2);
-
- ConstantFP *CF0 = dyn_cast<ConstantFP>(opr0);
- ConstantFP *CF1 = dyn_cast<ConstantFP>(opr1);
- if ((CF0 && CF0->isZero()) || (CF1 && CF1->isZero())) {
- // fma/mad(a, b, c) = c if a=0 || b=0
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr2 << "\n");
- replaceCall(opr2);
- return true;
- }
- if (CF0 && CF0->isExactlyValue(1.0f)) {
- // fma/mad(a, b, c) = b+c if a=1
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr1 << " + " << *opr2
- << "\n");
- Value *nval = B.CreateFAdd(opr1, opr2, "fmaadd");
- replaceCall(nval);
- return true;
- }
- if (CF1 && CF1->isExactlyValue(1.0f)) {
- // fma/mad(a, b, c) = a+c if b=1
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " + " << *opr2
- << "\n");
- Value *nval = B.CreateFAdd(opr0, opr2, "fmaadd");
- replaceCall(nval);
- return true;
- }
- if (ConstantFP *CF = dyn_cast<ConstantFP>(opr2)) {
- if (CF->isZero()) {
- // fma/mad(a, b, c) = a*b if c=0
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> " << *opr0 << " * "
- << *opr1 << "\n");
- Value *nval = B.CreateFMul(opr0, opr1, "fmamul");
- replaceCall(nval);
+ replaceCall(FPOp, nval);
return true;
}
}
-
return false;
}
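
For reference, the rootn special cases handled above, written out as a scalar C++ sketch (the default branch is the mathematical definition, which the fold deliberately leaves alone):

#include <cmath>

static double rootn_sketch(double x, int n) {
  switch (n) {
  case 1:  return x;                    // rootn(x, 1)  = x
  case 2:  return std::sqrt(x);         // rootn(x, 2)  = sqrt(x)
  case 3:  return std::cbrt(x);         // rootn(x, 3)  = cbrt(x)
  case -1: return 1.0 / x;              // rootn(x, -1) = 1/x
  case -2: return 1.0 / std::sqrt(x);   // rootn(x, -2) = rsqrt(x)
  default: return std::pow(x, 1.0 / n); // general definition, not folded here
  }
}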
@@ -1193,185 +1214,243 @@ FunctionCallee AMDGPULibCalls::getNativeFunction(Module *M,
return getFunction(M, nf);
}
+// Some library calls are just wrappers around llvm intrinsics, but compiled
+// conservatively. Preserve the flags from the original call site by replacing
+// the libcall with a direct intrinsic call that carries all of those flags.
+bool AMDGPULibCalls::shouldReplaceLibcallWithIntrinsic(const CallInst *CI,
+ bool AllowMinSizeF32,
+ bool AllowF64,
+ bool AllowStrictFP) {
+ Type *FltTy = CI->getType()->getScalarType();
+ const bool IsF32 = FltTy->isFloatTy();
+
+ // f64 intrinsics aren't implemented for most operations.
+ if (!IsF32 && !FltTy->isHalfTy() && (!AllowF64 || !FltTy->isDoubleTy()))
+ return false;
+
+ // We're implicitly inlining by replacing the libcall with the intrinsic, so
+ // don't do it for noinline call sites.
+ if (CI->isNoInline())
+ return false;
+
+ const Function *ParentF = CI->getFunction();
+ // TODO: Handle strictfp
+ if (!AllowStrictFP && ParentF->hasFnAttribute(Attribute::StrictFP))
+ return false;
+
+ if (IsF32 && !AllowMinSizeF32 && ParentF->hasMinSize())
+ return false;
+ return true;
+}
+
+void AMDGPULibCalls::replaceLibCallWithSimpleIntrinsic(IRBuilder<> &B,
+ CallInst *CI,
+ Intrinsic::ID IntrID) {
+ if (CI->arg_size() == 2) {
+ Value *Arg0 = CI->getArgOperand(0);
+ Value *Arg1 = CI->getArgOperand(1);
+ VectorType *Arg0VecTy = dyn_cast<VectorType>(Arg0->getType());
+ VectorType *Arg1VecTy = dyn_cast<VectorType>(Arg1->getType());
+ if (Arg0VecTy && !Arg1VecTy) {
+ Value *SplatRHS = B.CreateVectorSplat(Arg0VecTy->getElementCount(), Arg1);
+ CI->setArgOperand(1, SplatRHS);
+ } else if (!Arg0VecTy && Arg1VecTy) {
+ Value *SplatLHS = B.CreateVectorSplat(Arg1VecTy->getElementCount(), Arg0);
+ CI->setArgOperand(0, SplatLHS);
+ }
+ }
+
+ CI->setCalledFunction(
+ Intrinsic::getDeclaration(CI->getModule(), IntrID, {CI->getType()}));
+}
+
+bool AMDGPULibCalls::tryReplaceLibcallWithSimpleIntrinsic(
+ IRBuilder<> &B, CallInst *CI, Intrinsic::ID IntrID, bool AllowMinSizeF32,
+ bool AllowF64, bool AllowStrictFP) {
+ if (!shouldReplaceLibcallWithIntrinsic(CI, AllowMinSizeF32, AllowF64,
+ AllowStrictFP))
+ return false;
+ replaceLibCallWithSimpleIntrinsic(B, CI, IntrID);
+ return true;
+}
+
// fold sqrt -> native_sqrt (x)
-bool AMDGPULibCalls::fold_sqrt(CallInst *CI, IRBuilder<> &B,
+bool AMDGPULibCalls::fold_sqrt(FPMathOperator *FPOp, IRBuilder<> &B,
const FuncInfo &FInfo) {
+ if (!isUnsafeMath(FPOp))
+ return false;
+
if (getArgType(FInfo) == AMDGPULibFunc::F32 && (getVecSize(FInfo) == 1) &&
(FInfo.getPrefix() != AMDGPULibFunc::NATIVE)) {
+ Module *M = B.GetInsertBlock()->getModule();
+
if (FunctionCallee FPExpr = getNativeFunction(
- CI->getModule(), AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
- Value *opr0 = CI->getArgOperand(0);
- LLVM_DEBUG(errs() << "AMDIC: " << *CI << " ---> "
+ M, AMDGPULibFunc(AMDGPULibFunc::EI_SQRT, FInfo))) {
+ Value *opr0 = FPOp->getOperand(0);
+ LLVM_DEBUG(errs() << "AMDIC: " << *FPOp << " ---> "
<< "sqrt(" << *opr0 << ")\n");
Value *nval = CreateCallEx(B,FPExpr, opr0, "__sqrt");
- replaceCall(nval);
+ replaceCall(FPOp, nval);
return true;
}
}
return false;
}
-// fold sin, cos -> sincos.
-bool AMDGPULibCalls::fold_sincos(CallInst *CI, IRBuilder<> &B,
- AliasAnalysis *AA) {
- AMDGPULibFunc fInfo;
- if (!AMDGPULibFunc::parse(CI->getCalledFunction()->getName(), fInfo))
- return false;
+std::tuple<Value *, Value *, Value *>
+AMDGPULibCalls::insertSinCos(Value *Arg, FastMathFlags FMF, IRBuilder<> &B,
+ FunctionCallee Fsincos) {
+ DebugLoc DL = B.getCurrentDebugLocation();
+ Function *F = B.GetInsertBlock()->getParent();
+ B.SetInsertPointPastAllocas(F);
+ AllocaInst *Alloc = B.CreateAlloca(Arg->getType(), nullptr, "__sincos_");
+
+ if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+ // If the argument is an instruction, it must dominate all uses so put our
+ // sincos call there. Otherwise, right after the allocas works well enough
+ // if it's an argument or constant.
+
+ B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
+
+ // SetInsertPoint unwelcomely always tries to set the debug loc.
+ B.SetCurrentDebugLocation(DL);
+ }
+
+ Type *CosPtrTy = Fsincos.getFunctionType()->getParamType(1);
+
+ // The allocaInst allocates the memory in private address space. This need
+ // to be addrspacecasted to point to the address space of cos pointer type.
+ // In OpenCL 2.0 this is generic, while in 1.2 that is private.
+ Value *CastAlloc = B.CreateAddrSpaceCast(Alloc, CosPtrTy);
+
+ CallInst *SinCos = CreateCallEx2(B, Fsincos, Arg, CastAlloc);
+
+ // TODO: Is it worth trying to preserve the location for the cos calls for the
+ // load?
+
+ LoadInst *LoadCos = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
+ return {SinCos, LoadCos, SinCos};
+}
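
What insertSinCos ultimately enables is the classic sin/cos pairing: two calls sharing an argument collapse into one sincos call whose cosine result comes back through an out-pointer (the stack slot created above). In plain C++ terms, with sincos_ as a stand-in name rather than a real API:

#include <cmath>

static void sincos_(double x, double *s, double *c) {
  *s = std::sin(x);
  *c = std::cos(x);
}

static void use_both(double x, double &a, double &b) {
  double s, c;
  sincos_(x, &s, &c); // replaces separate sin(x) and cos(x) calls
  a = s;
  b = c;
}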
+
+// fold sin, cos -> sincos.
+bool AMDGPULibCalls::fold_sincos(FPMathOperator *FPOp, IRBuilder<> &B,
+ const FuncInfo &fInfo) {
assert(fInfo.getId() == AMDGPULibFunc::EI_SIN ||
fInfo.getId() == AMDGPULibFunc::EI_COS);
+
+ if ((getArgType(fInfo) != AMDGPULibFunc::F32 &&
+ getArgType(fInfo) != AMDGPULibFunc::F64) ||
+ fInfo.getPrefix() != AMDGPULibFunc::NOPFX)
+ return false;
+
bool const isSin = fInfo.getId() == AMDGPULibFunc::EI_SIN;
- Value *CArgVal = CI->getArgOperand(0);
- BasicBlock * const CBB = CI->getParent();
+ Value *CArgVal = FPOp->getOperand(0);
+ CallInst *CI = cast<CallInst>(FPOp);
- int const MaxScan = 30;
- bool Changed = false;
+ Function *F = B.GetInsertBlock()->getParent();
+ Module *M = F->getParent();
- { // fold in load value.
- LoadInst *LI = dyn_cast<LoadInst>(CArgVal);
- if (LI && LI->getParent() == CBB) {
- BasicBlock::iterator BBI = LI->getIterator();
- Value *AvailableVal = FindAvailableLoadedValue(LI, CBB, BBI, MaxScan, AA);
- if (AvailableVal) {
- Changed = true;
- CArgVal->replaceAllUsesWith(AvailableVal);
- if (CArgVal->getNumUses() == 0)
- LI->eraseFromParent();
- CArgVal = CI->getArgOperand(0);
- }
- }
- }
+ // Merge the sin and cos. For OpenCL 2.0, there may only be a generic pointer
+ // implementation. Prefer the private form if available.
+ AMDGPULibFunc SinCosLibFuncPrivate(AMDGPULibFunc::EI_SINCOS, fInfo);
+ SinCosLibFuncPrivate.getLeads()[0].PtrKind =
+ AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::PRIVATE_ADDRESS);
- Module *M = CI->getModule();
- fInfo.setId(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN);
- std::string const PairName = fInfo.mangle();
+ AMDGPULibFunc SinCosLibFuncGeneric(AMDGPULibFunc::EI_SINCOS, fInfo);
+ SinCosLibFuncGeneric.getLeads()[0].PtrKind =
+ AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
+
+ FunctionCallee FSinCosPrivate = getFunction(M, SinCosLibFuncPrivate);
+ FunctionCallee FSinCosGeneric = getFunction(M, SinCosLibFuncGeneric);
+ FunctionCallee FSinCos = FSinCosPrivate ? FSinCosPrivate : FSinCosGeneric;
+ if (!FSinCos)
+ return false;
+
+ SmallVector<CallInst *> SinCalls;
+ SmallVector<CallInst *> CosCalls;
+ SmallVector<CallInst *> SinCosCalls;
+ FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
+ fInfo);
+ const std::string PairName = PartnerInfo.mangle();
+
+ StringRef SinName = isSin ? CI->getCalledFunction()->getName() : PairName;
+ StringRef CosName = isSin ? PairName : CI->getCalledFunction()->getName();
+ const std::string SinCosPrivateName = SinCosLibFuncPrivate.mangle();
+ const std::string SinCosGenericName = SinCosLibFuncGeneric.mangle();
+
+  // The merged sincos call gets the intersection of the fast-math flags and
+  // fpmath metadata across all of the calls being folded.
+ FastMathFlags FMF = FPOp->getFastMathFlags();
+ MDNode *FPMath = CI->getMetadata(LLVMContext::MD_fpmath);
+
+ SmallVector<DILocation *> MergeDbgLocs = {CI->getDebugLoc()};
- CallInst *UI = nullptr;
for (User* U : CArgVal->users()) {
- CallInst *XI = dyn_cast_or_null<CallInst>(U);
- if (!XI || XI == CI || XI->getParent() != CBB)
+ CallInst *XI = dyn_cast<CallInst>(U);
+ if (!XI || XI->getFunction() != F || XI->isNoBuiltin())
continue;
Function *UCallee = XI->getCalledFunction();
- if (!UCallee || !UCallee->getName().equals(PairName))
+ if (!UCallee)
continue;
- BasicBlock::iterator BBI = CI->getIterator();
- if (BBI == CI->getParent()->begin())
- break;
- --BBI;
- for (int I = MaxScan; I > 0 && BBI != CBB->begin(); --BBI, --I) {
- if (cast<Instruction>(BBI) == XI) {
- UI = XI;
- break;
- }
+ bool Handled = true;
+
+ if (UCallee->getName() == SinName)
+ SinCalls.push_back(XI);
+ else if (UCallee->getName() == CosName)
+ CosCalls.push_back(XI);
+ else if (UCallee->getName() == SinCosPrivateName ||
+ UCallee->getName() == SinCosGenericName)
+ SinCosCalls.push_back(XI);
+ else
+ Handled = false;
+
+ if (Handled) {
+ MergeDbgLocs.push_back(XI->getDebugLoc());
+ auto *OtherOp = cast<FPMathOperator>(XI);
+ FMF &= OtherOp->getFastMathFlags();
+ FPMath = MDNode::getMostGenericFPMath(
+ FPMath, XI->getMetadata(LLVMContext::MD_fpmath));
}
- if (UI) break;
}
- if (!UI)
- return Changed;
-
- // Merge the sin and cos.
+ if (SinCalls.empty() || CosCalls.empty())
+ return false;
- // for OpenCL 2.0 we have only generic implementation of sincos
- // function.
- AMDGPULibFunc nf(AMDGPULibFunc::EI_SINCOS, fInfo);
- nf.getLeads()[0].PtrKind = AMDGPULibFunc::getEPtrKindFromAddrSpace(AMDGPUAS::FLAT_ADDRESS);
- FunctionCallee Fsincos = getFunction(M, nf);
- if (!Fsincos)
- return Changed;
+ B.setFastMathFlags(FMF);
+ B.setDefaultFPMathTag(FPMath);
+ DILocation *DbgLoc = DILocation::getMergedLocations(MergeDbgLocs);
+ B.SetCurrentDebugLocation(DbgLoc);
- BasicBlock::iterator ItOld = B.GetInsertPoint();
- AllocaInst *Alloc = insertAlloca(UI, B, "__sincos_");
- B.SetInsertPoint(UI);
+ auto [Sin, Cos, SinCos] = insertSinCos(CArgVal, FMF, B, FSinCos);
- Value *P = Alloc;
- Type *PTy = Fsincos.getFunctionType()->getParamType(1);
- // The allocaInst allocates the memory in private address space. This need
- // to be bitcasted to point to the address space of cos pointer type.
- // In OpenCL 2.0 this is generic, while in 1.2 that is private.
- if (PTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
- P = B.CreateAddrSpaceCast(Alloc, PTy);
- CallInst *Call = CreateCallEx2(B, Fsincos, UI->getArgOperand(0), P);
-
- LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *CI << ", " << *UI << ") with "
- << *Call << "\n");
-
- if (!isSin) { // CI->cos, UI->sin
- B.SetInsertPoint(&*ItOld);
- UI->replaceAllUsesWith(&*Call);
- Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
- CI->replaceAllUsesWith(Reload);
- UI->eraseFromParent();
- CI->eraseFromParent();
- } else { // CI->sin, UI->cos
- Instruction *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
- UI->replaceAllUsesWith(Reload);
- CI->replaceAllUsesWith(Call);
- UI->eraseFromParent();
- CI->eraseFromParent();
- }
- return true;
-}
-
-bool AMDGPULibCalls::fold_wavefrontsize(CallInst *CI, IRBuilder<> &B) {
- if (!TM)
- return false;
+ auto replaceTrigInsts = [](ArrayRef<CallInst *> Calls, Value *Res) {
+ for (CallInst *C : Calls)
+ C->replaceAllUsesWith(Res);
- StringRef CPU = TM->getTargetCPU();
- StringRef Features = TM->getTargetFeatureString();
- if ((CPU.empty() || CPU.equals_insensitive("generic")) &&
- (Features.empty() || !Features.contains_insensitive("wavefrontsize")))
- return false;
-
- Function *F = CI->getParent()->getParent();
- const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(*F);
- unsigned N = ST.getWavefrontSize();
+    // Leave the other dead instructions in place to avoid invalidating iterators.
+ };
- LLVM_DEBUG(errs() << "AMDIC: fold_wavefrontsize (" << *CI << ") with "
- << N << "\n");
+ replaceTrigInsts(SinCalls, Sin);
+ replaceTrigInsts(CosCalls, Cos);
+ replaceTrigInsts(SinCosCalls, SinCos);
- CI->replaceAllUsesWith(ConstantInt::get(B.getInt32Ty(), N));
+ // It's safe to delete the original now.
CI->eraseFromParent();
return true;
}
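
At the OpenCL source level, the net effect of fold_sincos is to collapse a sin/cos pair on a common operand into a single sincos call whose cosine result comes back through a private temporary. The fragment below is a purely illustrative C++ analogue rather than anything the pass emits; sincos_like is a hypothetical stand-in for the library sincos entry point, since the real fold rewrites mangled calls in IR, not source code.

#include <cmath>
#include <cstdio>

// Hypothetical stand-in for the library sincos: returns sin(x) and writes
// cos(x) through the out-pointer, which is the call shape insertSinCos builds.
static float sincos_like(float X, float *CosOut) {
  *CosOut = std::cos(X);
  return std::sin(X);
}

int main() {
  float X = 0.5f;
  // Before the fold: two independent transcendental calls on the same operand.
  float S0 = std::sin(X), C0 = std::cos(X);
  // After the fold: one combined call; the cosine is returned through a
  // private temporary, just as insertSinCos loads it back from the alloca.
  float C1;
  float S1 = sincos_like(X, &C1);
  std::printf("%d\n", S0 == S1 && C0 == C1); // prints 1
  return 0;
}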
-// Get insertion point at entry.
-BasicBlock::iterator AMDGPULibCalls::getEntryIns(CallInst * UI) {
- Function * Func = UI->getParent()->getParent();
- BasicBlock * BB = &Func->getEntryBlock();
- assert(BB && "Entry block not found!");
- BasicBlock::iterator ItNew = BB->begin();
- return ItNew;
-}
-
-// Insert a AllocsInst at the beginning of function entry block.
-AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
- const char *prefix) {
- BasicBlock::iterator ItNew = getEntryIns(UI);
- Function *UCallee = UI->getCalledFunction();
- Type *RetType = UCallee->getReturnType();
- B.SetInsertPoint(&*ItNew);
- AllocaInst *Alloc =
- B.CreateAlloca(RetType, nullptr, std::string(prefix) + UI->getName());
- Alloc->setAlignment(
- Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
- return Alloc;
-}
-
-bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
- double& Res0, double& Res1,
- Constant *copr0, Constant *copr1,
- Constant *copr2) {
+bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo, double &Res0,
+ double &Res1, Constant *copr0,
+ Constant *copr1) {
// By default, opr0/opr1 hold values of float/double type.
// If they are not float/double, each function has to handle its
// operands separately.
- double opr0=0.0, opr1=0.0, opr2=0.0;
+ double opr0 = 0.0, opr1 = 0.0;
ConstantFP *fpopr0 = dyn_cast_or_null<ConstantFP>(copr0);
ConstantFP *fpopr1 = dyn_cast_or_null<ConstantFP>(copr1);
- ConstantFP *fpopr2 = dyn_cast_or_null<ConstantFP>(copr2);
if (fpopr0) {
opr0 = (getArgType(FInfo) == AMDGPULibFunc::F64)
? fpopr0->getValueAPF().convertToDouble()
@@ -1384,12 +1463,6 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
: (double)fpopr1->getValueAPF().convertToFloat();
}
- if (fpopr2) {
- opr2 = (getArgType(FInfo) == AMDGPULibFunc::F64)
- ? fpopr2->getValueAPF().convertToDouble()
- : (double)fpopr2->getValueAPF().convertToFloat();
- }
-
switch (FInfo.getId()) {
default : return false;
@@ -1460,10 +1533,6 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
Res0 = pow(10.0, opr0);
return true;
- case AMDGPULibFunc::EI_EXPM1:
- Res0 = exp(opr0) - 1.0;
- return true;
-
case AMDGPULibFunc::EI_LOG:
Res0 = log(opr0);
return true;
@@ -1492,10 +1561,6 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
Res0 = sin(MATH_PI * opr0);
return true;
- case AMDGPULibFunc::EI_SQRT:
- Res0 = sqrt(opr0);
- return true;
-
case AMDGPULibFunc::EI_TAN:
Res0 = tan(opr0);
return true;
@@ -1508,15 +1573,7 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
Res0 = tan(MATH_PI * opr0);
return true;
- case AMDGPULibFunc::EI_RECIP:
- Res0 = 1.0 / opr0;
- return true;
-
// two-arg functions
- case AMDGPULibFunc::EI_DIVIDE:
- Res0 = opr0 / opr1;
- return true;
-
case AMDGPULibFunc::EI_POW:
case AMDGPULibFunc::EI_POWR:
Res0 = pow(opr0, opr1);
@@ -1545,12 +1602,6 @@ bool AMDGPULibCalls::evaluateScalarMathFunc(const FuncInfo &FInfo,
Res0 = sin(opr0);
Res1 = cos(opr0);
return true;
-
- // three-arg functions
- case AMDGPULibFunc::EI_FMA:
- case AMDGPULibFunc::EI_MAD:
- Res0 = opr0 * opr1 + opr2;
- return true;
}
return false;
@@ -1563,7 +1614,6 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
Constant *copr0 = nullptr;
Constant *copr1 = nullptr;
- Constant *copr2 = nullptr;
if (numArgs > 0) {
if ((copr0 = dyn_cast<Constant>(aCI->getArgOperand(0))) == nullptr)
return false;
@@ -1576,11 +1626,6 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
}
}
- if (numArgs > 2) {
- if ((copr2 = dyn_cast<Constant>(aCI->getArgOperand(2))) == nullptr)
- return false;
- }
-
// At this point, all arguments to aCI are constants.
// max vector size is 16, and sincos will generate two results.
@@ -1588,31 +1633,27 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
int FuncVecSize = getVecSize(FInfo);
bool hasTwoResults = (FInfo.getId() == AMDGPULibFunc::EI_SINCOS);
if (FuncVecSize == 1) {
- if (!evaluateScalarMathFunc(FInfo, DVal0[0],
- DVal1[0], copr0, copr1, copr2)) {
+ if (!evaluateScalarMathFunc(FInfo, DVal0[0], DVal1[0], copr0, copr1)) {
return false;
}
} else {
ConstantDataVector *CDV0 = dyn_cast_or_null<ConstantDataVector>(copr0);
ConstantDataVector *CDV1 = dyn_cast_or_null<ConstantDataVector>(copr1);
- ConstantDataVector *CDV2 = dyn_cast_or_null<ConstantDataVector>(copr2);
for (int i = 0; i < FuncVecSize; ++i) {
Constant *celt0 = CDV0 ? CDV0->getElementAsConstant(i) : nullptr;
Constant *celt1 = CDV1 ? CDV1->getElementAsConstant(i) : nullptr;
- Constant *celt2 = CDV2 ? CDV2->getElementAsConstant(i) : nullptr;
- if (!evaluateScalarMathFunc(FInfo, DVal0[i],
- DVal1[i], celt0, celt1, celt2)) {
+ if (!evaluateScalarMathFunc(FInfo, DVal0[i], DVal1[i], celt0, celt1)) {
return false;
}
}
}
- LLVMContext &context = CI->getParent()->getParent()->getContext();
+ LLVMContext &context = aCI->getContext();
Constant *nval0, *nval1;
if (FuncVecSize == 1) {
- nval0 = ConstantFP::get(CI->getType(), DVal0[0]);
+ nval0 = ConstantFP::get(aCI->getType(), DVal0[0]);
if (hasTwoResults)
- nval1 = ConstantFP::get(CI->getType(), DVal1[0]);
+ nval1 = ConstantFP::get(aCI->getType(), DVal1[0]);
} else {
if (getArgType(FInfo) == AMDGPULibFunc::F32) {
SmallVector <float, 0> FVal0, FVal1;
@@ -1643,59 +1684,17 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, const FuncInfo &FInfo) {
new StoreInst(nval1, aCI->getArgOperand(1), aCI);
}
- replaceCall(nval0);
+ replaceCall(aCI, nval0);
return true;
}
-// Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetMachine *TM) {
- return new AMDGPUSimplifyLibCalls(TM);
-}
-
-FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
- return new AMDGPUUseNativeCalls();
-}
-
-bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- bool Changed = false;
- auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
-
- LLVM_DEBUG(dbgs() << "AMDIC: process function ";
- F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
-
- for (auto &BB : F) {
- for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) {
- // Ignore non-calls.
- CallInst *CI = dyn_cast<CallInst>(I);
- ++I;
- // Ignore intrinsics that do not become real instructions.
- if (!CI || isa<DbgInfoIntrinsic>(CI) || CI->isLifetimeStartOrEnd())
- continue;
-
- // Ignore indirect calls.
- Function *Callee = CI->getCalledFunction();
- if (Callee == nullptr)
- continue;
-
- LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
- dbgs().flush());
- if(Simplifier.fold(CI, AA))
- Changed = true;
- }
- }
- return Changed;
-}
-
PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
FunctionAnalysisManager &AM) {
- AMDGPULibCalls Simplifier(&TM);
+ AMDGPULibCalls Simplifier;
Simplifier.initNativeFuncs();
+ Simplifier.initFunction(F, AM);
bool Changed = false;
- auto AA = &AM.getResult<AAManager>(F);
LLVM_DEBUG(dbgs() << "AMDIC: process function ";
F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
@@ -1705,48 +1704,16 @@ PreservedAnalyses AMDGPUSimplifyLibCallsPass::run(Function &F,
// Ignore non-calls.
CallInst *CI = dyn_cast<CallInst>(I);
++I;
- // Ignore intrinsics that do not become real instructions.
- if (!CI || isa<DbgInfoIntrinsic>(CI) || CI->isLifetimeStartOrEnd())
- continue;
- // Ignore indirect calls.
- Function *Callee = CI->getCalledFunction();
- if (Callee == nullptr)
- continue;
-
- LLVM_DEBUG(dbgs() << "AMDIC: try folding " << *CI << "\n";
- dbgs().flush());
- if (Simplifier.fold(CI, AA))
- Changed = true;
+ if (CI) {
+ if (Simplifier.fold(CI))
+ Changed = true;
+ }
}
}
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
-bool AMDGPUUseNativeCalls::runOnFunction(Function &F) {
- if (skipFunction(F) || UseNative.empty())
- return false;
-
- bool Changed = false;
- for (auto &BB : F) {
- for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) {
- // Ignore non-calls.
- CallInst *CI = dyn_cast<CallInst>(I);
- ++I;
- if (!CI) continue;
-
- // Ignore indirect calls.
- Function *Callee = CI->getCalledFunction();
- if (Callee == nullptr)
- continue;
-
- if (Simplifier.useNative(CI))
- Changed = true;
- }
- }
- return Changed;
-}
-
PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
FunctionAnalysisManager &AM) {
if (UseNative.empty())
@@ -1754,6 +1721,7 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
AMDGPULibCalls Simplifier;
Simplifier.initNativeFuncs();
+ Simplifier.initFunction(F, AM);
bool Changed = false;
for (auto &BB : F) {
@@ -1761,15 +1729,7 @@ PreservedAnalyses AMDGPUUseNativeCallsPass::run(Function &F,
// Ignore non-calls.
CallInst *CI = dyn_cast<CallInst>(I);
++I;
- if (!CI)
- continue;
-
- // Ignore indirect calls.
- Function *Callee = CI->getCalledFunction();
- if (Callee == nullptr)
- continue;
-
- if (Simplifier.useNative(CI))
+ if (CI && Simplifier.useNative(CI))
Changed = true;
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index 169a242d74e4..3437b6dc8ae0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -478,7 +478,7 @@ static bool eatTerm(StringRef& mangledName, const char c) {
template <size_t N>
static bool eatTerm(StringRef& mangledName, const char (&str)[N]) {
- if (mangledName.startswith(StringRef(str, N-1))) {
+ if (mangledName.starts_with(StringRef(str, N - 1))) {
drop_front(mangledName, N-1);
return true;
}
@@ -527,6 +527,16 @@ AMDGPUMangledLibFunc::AMDGPUMangledLibFunc(
Leads[1] = copyFrom.Leads[1];
}
+AMDGPUMangledLibFunc::AMDGPUMangledLibFunc(EFuncId id, FunctionType *FT,
+ bool SignedInts) {
+ FuncId = id;
+ unsigned NumArgs = FT->getNumParams();
+ if (NumArgs >= 1)
+ Leads[0] = Param::getFromTy(FT->getParamType(0), SignedInts);
+ if (NumArgs >= 2)
+ Leads[1] = Param::getFromTy(FT->getParamType(1), SignedInts);
+}
+
///////////////////////////////////////////////////////////////////////////////
// Demangling
@@ -875,6 +885,50 @@ std::string AMDGPUMangledLibFunc::mangleNameItanium() const {
///////////////////////////////////////////////////////////////////////////////
// Misc
+AMDGPULibFuncBase::Param AMDGPULibFuncBase::Param::getFromTy(Type *Ty,
+ bool Signed) {
+ Param P;
+ if (FixedVectorType *VT = dyn_cast<FixedVectorType>(Ty)) {
+ P.VectorSize = VT->getNumElements();
+ Ty = VT->getElementType();
+ }
+
+ switch (Ty->getTypeID()) {
+ case Type::FloatTyID:
+ P.ArgType = AMDGPULibFunc::F32;
+ break;
+ case Type::DoubleTyID:
+ P.ArgType = AMDGPULibFunc::F64;
+ break;
+ case Type::HalfTyID:
+ P.ArgType = AMDGPULibFunc::F16;
+ break;
+ case Type::IntegerTyID:
+ switch (cast<IntegerType>(Ty)->getBitWidth()) {
+ case 8:
+ P.ArgType = Signed ? AMDGPULibFunc::I8 : AMDGPULibFunc::U8;
+ break;
+ case 16:
+ P.ArgType = Signed ? AMDGPULibFunc::I16 : AMDGPULibFunc::U16;
+ break;
+ case 32:
+ P.ArgType = Signed ? AMDGPULibFunc::I32 : AMDGPULibFunc::U32;
+ break;
+ case 64:
+ P.ArgType = Signed ? AMDGPULibFunc::I64 : AMDGPULibFunc::U64;
+ break;
+ default:
+ llvm_unreachable("unhandled libcall argument type");
+ }
+
+ break;
+ default:
+ llvm_unreachable("unhandled libcall argument type");
+ }
+
+ return P;
+}
+
static Type* getIntrinsicParamType(
LLVMContext& C,
const AMDGPULibFunc::Param& P,
@@ -945,18 +999,25 @@ std::string AMDGPUMangledLibFunc::getName() const {
return std::string(OS.str());
}
+bool AMDGPULibFunc::isCompatibleSignature(const FunctionType *FuncTy) const {
+ // TODO: Validate types make sense
+ return !FuncTy->isVarArg() && FuncTy->getNumParams() == getNumArgs();
+}
+
Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) {
std::string FuncName = fInfo.mangle();
Function *F = dyn_cast_or_null<Function>(
M->getValueSymbolTable().lookup(FuncName));
+ if (!F || F->isDeclaration())
+ return nullptr;
- // check formal with actual types conformance
- if (F && !F->isDeclaration()
- && !F->isVarArg()
- && F->arg_size() == fInfo.getNumArgs()) {
- return F;
- }
- return nullptr;
+ if (F->hasFnAttribute(Attribute::NoBuiltin))
+ return nullptr;
+
+ if (!fInfo.isCompatibleSignature(F->getFunctionType()))
+ return nullptr;
+
+ return F;
}
FunctionCallee AMDGPULibFunc::getOrInsertFunction(Module *M,
@@ -965,11 +1026,12 @@ FunctionCallee AMDGPULibFunc::getOrInsertFunction(Module *M,
Function *F = dyn_cast_or_null<Function>(
M->getValueSymbolTable().lookup(FuncName));
- // check formal with actual types conformance
- if (F && !F->isDeclaration()
- && !F->isVarArg()
- && F->arg_size() == fInfo.getNumArgs()) {
- return F;
+ if (F) {
+ if (F->hasFnAttribute(Attribute::NoBuiltin))
+ return nullptr;
+ if (!F->isDeclaration() &&
+ fInfo.isCompatibleSignature(F->getFunctionType()))
+ return F;
}
FunctionType *FuncTy = fInfo.getFunctionType(*M);
@@ -1043,6 +1105,10 @@ AMDGPULibFunc::AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom) {
Id, *cast<AMDGPUMangledLibFunc>(CopyFrom.Impl.get())));
}
+AMDGPULibFunc::AMDGPULibFunc(EFuncId Id, FunctionType *FT, bool SignedInts) {
+ Impl.reset(new AMDGPUMangledLibFunc(Id, FT, SignedInts));
+}
+
AMDGPULibFunc::AMDGPULibFunc(StringRef Name, FunctionType *FT) {
Impl.reset(new AMDGPUUnmangledLibFunc(Name, FT));
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
index bf0fda25b2c0..10551bee3fa8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -18,6 +18,7 @@ class FunctionCallee;
class FunctionType;
class Function;
class Module;
+class Type;
class AMDGPULibFuncBase {
public:
@@ -290,18 +291,23 @@ public:
};
struct Param {
- unsigned char ArgType;
- unsigned char VectorSize;
- unsigned char PtrKind;
+ unsigned char ArgType = 0;
+ unsigned char VectorSize = 1;
+ unsigned char PtrKind = 0;
- unsigned char Reserved;
+ unsigned char Reserved = 0;
void reset() {
ArgType = 0;
VectorSize = 1;
PtrKind = 0;
}
- Param() { reset(); }
+
+ static Param getIntN(unsigned char NumElts) {
+ return Param{I32, NumElts, 0, 0};
+ }
+
+ static Param getFromTy(Type *Ty, bool Signed);
template <typename Stream>
void mangleItanium(Stream& os);
@@ -351,7 +357,7 @@ public:
protected:
EFuncId FuncId;
std::string Name;
- ENamePrefix FKind;
+ ENamePrefix FKind = NOPFX;
};
/// Wrapper class for AMDGPULibFuncImpl
@@ -362,6 +368,8 @@ public:
/// Clone a mangled library func with the Id \p Id and argument info from \p
/// CopyFrom.
explicit AMDGPULibFunc(EFuncId Id, const AMDGPULibFunc &CopyFrom);
+ explicit AMDGPULibFunc(EFuncId Id, FunctionType *FT, bool SignedInts);
+
/// Construct an unmangled library function on the fly.
explicit AMDGPULibFunc(StringRef FName, FunctionType *FT);
@@ -383,6 +391,9 @@ public:
return Impl->parseFuncName(MangledName);
}
+ // Validate the call type matches the expected libfunc type.
+ bool isCompatibleSignature(const FunctionType *FuncTy) const;
+
/// \return The mangled function name for mangled library functions
/// and unmangled function name for unmangled library functions.
std::string mangle() const { return Impl->mangle(); }
@@ -412,6 +423,8 @@ public:
explicit AMDGPUMangledLibFunc();
explicit AMDGPUMangledLibFunc(EFuncId id,
const AMDGPUMangledLibFunc &copyFrom);
+ explicit AMDGPUMangledLibFunc(EFuncId id, FunctionType *FT,
+ bool SignedInts = true);
std::string getName() const override;
unsigned getNumArgs() const override;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index f5323725250f..c32303defe7f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -14,17 +14,59 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetMachine.h"
+
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
using namespace llvm;
namespace {
-class AMDGPULowerKernelArguments : public FunctionPass{
+class PreloadKernelArgInfo {
+private:
+ Function &F;
+ const GCNSubtarget &ST;
+ unsigned NumFreeUserSGPRs;
+
+public:
+ SmallVector<llvm::Metadata *, 8> KernelArgMetadata;
+
+ PreloadKernelArgInfo(Function &F, const GCNSubtarget &ST) : F(F), ST(ST) {
+ setInitialFreeUserSGPRsCount();
+ }
+
+  // Compute how many user SGPRs are still available for preloading arguments.
+ void setInitialFreeUserSGPRsCount() {
+ const unsigned MaxUserSGPRs = ST.getMaxNumUserSGPRs();
+ GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);
+
+ NumFreeUserSGPRs = MaxUserSGPRs - UserSGPRInfo.getNumUsedUserSGPRs();
+ }
+
+ bool tryAllocPreloadSGPRs(unsigned AllocSize, uint64_t ArgOffset,
+ uint64_t LastExplicitArgOffset) {
+ // Check if this argument may be loaded into the same register as the
+ // previous argument.
+ if (!isAligned(Align(4), ArgOffset) && AllocSize < 4)
+ return true;
+
+ // Pad SGPRs for kernarg alignment.
+ unsigned Padding = ArgOffset - LastExplicitArgOffset;
+ unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+ unsigned NumPreloadSGPRs = alignTo(AllocSize, 4) / 4;
+ if (NumPreloadSGPRs + PaddingSGPRs > NumFreeUserSGPRs)
+ return false;
+
+ NumFreeUserSGPRs -= (NumPreloadSGPRs + PaddingSGPRs);
+ return true;
+ }
+};
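
The dword rounding in tryAllocPreloadSGPRs is easiest to see with a worked example. The helper below is a minimal standalone sketch that re-implements the same accounting outside the pass; the function name and the sample kernarg layout are invented for illustration only.

#include <cstdint>
#include <cstdio>

// Mirrors tryAllocPreloadSGPRs: dwords of alignment padding plus dwords of
// argument data, with the shares-the-previous-SGPR early-out.
static unsigned preloadSGPRs(unsigned AllocSize, std::uint64_t ArgOffset,
                             std::uint64_t LastExplicitArgOffset) {
  // A sub-dword argument that is not 4-byte aligned is packed into the
  // previous argument's register, so it consumes no additional user SGPRs.
  if ((ArgOffset % 4) != 0 && AllocSize < 4)
    return 0;
  unsigned Padding = static_cast<unsigned>(ArgOffset - LastExplicitArgOffset);
  unsigned PaddingSGPRs = (Padding + 3) / 4;      // alignTo(Padding, 4) / 4
  unsigned NumPreloadSGPRs = (AllocSize + 3) / 4; // alignTo(AllocSize, 4) / 4
  return PaddingSGPRs + NumPreloadSGPRs;
}

int main() {
  // Hypothetical layout: i32 at offset 0, i16 at 4, i16 at 6, i64 aligned to 8.
  std::printf("%u\n", preloadSGPRs(4, 0, 0)); // 1 SGPR
  std::printf("%u\n", preloadSGPRs(2, 4, 4)); // 1 SGPR (i16 in its own dword)
  std::printf("%u\n", preloadSGPRs(2, 6, 6)); // 0, shares the previous SGPR
  std::printf("%u\n", preloadSGPRs(8, 8, 8)); // 2 SGPRs, no padding needed
  return 0;
}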
+
+class AMDGPULowerKernelArguments : public FunctionPass {
public:
static char ID;
@@ -55,14 +97,11 @@ static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
return InsPt;
}
-bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
+static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
return false;
- auto &TPC = getAnalysis<TargetPassConfig>();
-
- const TargetMachine &TM = TPC.getTM<TargetMachine>();
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
LLVMContext &Ctx = F.getParent()->getContext();
const DataLayout &DL = F.getParent()->getDataLayout();
@@ -87,6 +126,9 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
uint64_t ExplicitArgOffset = 0;
+  // Preloaded kernel arguments must form a contiguous prefix of the argument
+  // list.
+ bool InPreloadSequence = true;
+ PreloadKernelArgInfo PreloadInfo(F, ST);
for (Argument &Arg : F.args()) {
const bool IsByRef = Arg.hasByRefAttr();
@@ -98,8 +140,19 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
+ uint64_t LastExplicitArgOffset = ExplicitArgOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
+ // Try to preload this argument into user SGPRs.
+ if (Arg.hasInRegAttr() && InPreloadSequence && ST.hasKernargPreload() &&
+ !ST.needsKernargPreloadBackwardsCompatibility() &&
+ !Arg.getType()->isAggregateType())
+ if (PreloadInfo.tryAllocPreloadSGPRs(AllocSize, EltOffset,
+ LastExplicitArgOffset))
+ continue;
+
+ InPreloadSequence = false;
+
if (Arg.use_empty())
continue;
@@ -232,6 +285,12 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
return true;
}
+bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ return lowerKernelArguments(F, TM);
+}
+
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
"AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
@@ -242,3 +301,16 @@ char AMDGPULowerKernelArguments::ID = 0;
FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
return new AMDGPULowerKernelArguments();
}
+
+PreservedAnalyses
+AMDGPULowerKernelArgumentsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ bool Changed = lowerKernelArguments(F, TM);
+ if (Changed) {
+ // TODO: Preserves a lot more.
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 26074cf06071..097722157d41 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -14,6 +14,7 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -286,8 +287,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
if (HasReqdWorkGroupSize) {
ConstantInt *KnownSize
= mdconst::extract<ConstantInt>(MD->getOperand(I));
- UMin->replaceAllUsesWith(ConstantExpr::getIntegerCast(
- KnownSize, UMin->getType(), false));
+ UMin->replaceAllUsesWith(ConstantFoldIntegerCast(
+ KnownSize, UMin->getType(), false, DL));
} else {
UMin->replaceAllUsesWith(ZextGroupSize);
}
@@ -310,7 +311,7 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(I));
GroupSize->replaceAllUsesWith(
- ConstantExpr::getIntegerCast(KnownSize, GroupSize->getType(), false));
+ ConstantFoldIntegerCast(KnownSize, GroupSize->getType(), false, DL));
MadeChange = true;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index e3a645977f92..d2a02143e4e7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -177,6 +177,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/ADT/BitVector.h"
@@ -184,8 +185,8 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
@@ -204,7 +205,6 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <tuple>
#include <vector>
#include <cstdio>
@@ -252,7 +252,8 @@ template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
return {std::move(V)};
}
-class AMDGPULowerModuleLDS : public ModulePass {
+class AMDGPULowerModuleLDS {
+ const AMDGPUTargetMachine &TM;
static void
removeLocalVarsFromUsedLists(Module &M,
@@ -291,7 +292,8 @@ class AMDGPULowerModuleLDS : public ModulePass {
// equivalent target specific intrinsic which lasts until immediately after
// codegen would suffice for that, but one would still need to ensure that
// the variables are allocated in the anticipated order.
- IRBuilder<> Builder(Func->getEntryBlock().getFirstNonPHI());
+ BasicBlock *Entry = &Func->getEntryBlock();
+ IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
Function *Decl =
Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
@@ -326,11 +328,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
}
public:
- static char ID;
-
- AMDGPULowerModuleLDS() : ModulePass(ID) {
- initializeAMDGPULowerModuleLDSPass(*PassRegistry::getPassRegistry());
- }
+ AMDGPULowerModuleLDS(const AMDGPUTargetMachine &TM_) : TM(TM_) {}
using FunctionVariableMap = DenseMap<Function *, DenseSet<GlobalVariable *>>;
@@ -854,7 +852,7 @@ public:
appendToCompilerUsed(M, {static_cast<GlobalValue *>(
ConstantExpr::getPointerBitCastOrAddrSpaceCast(
cast<Constant>(ModuleScopeReplacement.SGV),
- Type::getInt8PtrTy(Ctx)))});
+ PointerType::getUnqual(Ctx)))});
// module.lds will be allocated at zero in any kernel that allocates it
recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0);
@@ -1089,7 +1087,7 @@ public:
return KernelToCreatedDynamicLDS;
}
- bool runOnModule(Module &M) override {
+ bool runOnModule(Module &M) {
CallGraph CG = CallGraph(M);
bool Changed = superAlignLDSGlobals(M);
@@ -1241,6 +1239,7 @@ public:
}
if (Offset != 0) {
+ (void)TM; // TODO: Account for target maximum LDS
std::string Buffer;
raw_string_ostream SS{Buffer};
SS << format("%u", Offset);
@@ -1367,9 +1366,9 @@ private:
Type *ATy = ArrayType::get(Type::getInt8Ty(Ctx), Padding);
LocalVars.push_back(new GlobalVariable(
- M, ATy, false, GlobalValue::InternalLinkage, UndefValue::get(ATy),
- "", nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
- false));
+ M, ATy, false, GlobalValue::InternalLinkage,
+ PoisonValue::get(ATy), "", nullptr, GlobalValue::NotThreadLocal,
+ AMDGPUAS::LOCAL_ADDRESS, false));
IsPaddingField.push_back(true);
CurrentOffset += Padding;
}
@@ -1391,7 +1390,7 @@ private:
Align StructAlign = AMDGPU::getAlign(DL, LocalVars[0]);
GlobalVariable *SGV = new GlobalVariable(
- M, LDSTy, false, GlobalValue::InternalLinkage, UndefValue::get(LDSTy),
+ M, LDSTy, false, GlobalValue::InternalLinkage, PoisonValue::get(LDSTy),
VarName, nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
false);
SGV->setAlignment(StructAlign);
@@ -1530,21 +1529,51 @@ private:
}
};
+class AMDGPULowerModuleLDSLegacy : public ModulePass {
+public:
+ const AMDGPUTargetMachine *TM;
+ static char ID;
+
+ AMDGPULowerModuleLDSLegacy(const AMDGPUTargetMachine *TM_ = nullptr)
+ : ModulePass(ID), TM(TM_) {
+ initializeAMDGPULowerModuleLDSLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (!TM)
+ AU.addRequired<TargetPassConfig>();
+ }
+
+ bool runOnModule(Module &M) override {
+ if (!TM) {
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ TM = &TPC.getTM<AMDGPUTargetMachine>();
+ }
+
+ return AMDGPULowerModuleLDS(*TM).runOnModule(M);
+ }
+};
+
} // namespace
-char AMDGPULowerModuleLDS::ID = 0;
+char AMDGPULowerModuleLDSLegacy::ID = 0;
-char &llvm::AMDGPULowerModuleLDSID = AMDGPULowerModuleLDS::ID;
+char &llvm::AMDGPULowerModuleLDSLegacyPassID = AMDGPULowerModuleLDSLegacy::ID;
-INITIALIZE_PASS(AMDGPULowerModuleLDS, DEBUG_TYPE,
- "Lower uses of LDS variables from non-kernel functions", false,
- false)
+INITIALIZE_PASS_BEGIN(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
+ "Lower uses of LDS variables from non-kernel functions",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_END(AMDGPULowerModuleLDSLegacy, DEBUG_TYPE,
+ "Lower uses of LDS variables from non-kernel functions",
+ false, false)
-ModulePass *llvm::createAMDGPULowerModuleLDSPass() {
- return new AMDGPULowerModuleLDS();
+ModulePass *
+llvm::createAMDGPULowerModuleLDSLegacyPass(const AMDGPUTargetMachine *TM) {
+ return new AMDGPULowerModuleLDSLegacy(TM);
}
PreservedAnalyses AMDGPULowerModuleLDSPass::run(Module &M,
ModuleAnalysisManager &) {
- return AMDGPULowerModuleLDS().runOnModule(M) ? PreservedAnalyses::none()
- : PreservedAnalyses::all();
+ return AMDGPULowerModuleLDS(TM).runOnModule(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 44bbfe6f13d9..323462e60a29 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -24,6 +24,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
: IsEntryFunction(AMDGPU::isEntryFunctionCC(F.getCallingConv())),
IsModuleEntryFunction(
AMDGPU::isModuleEntryFunctionCC(F.getCallingConv())),
+ IsChainFunction(AMDGPU::isChainCC(F.getCallingConv())),
NoSignedZerosFPMath(false) {
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 5780fa64a7e4..248ee26a47eb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -20,7 +20,6 @@
namespace llvm {
class AMDGPUSubtarget;
-class GCNSubtarget;
class AMDGPUMachineFunction : public MachineFunctionInfo {
/// A map to keep track of local memory objects and their offsets within the
@@ -54,6 +53,9 @@ protected:
// Entry points called by other functions instead of directly by the hardware.
bool IsModuleEntryFunction = false;
+ // Functions with the amdgpu_cs_chain or amdgpu_cs_chain_preserve CC.
+ bool IsChainFunction = false;
+
bool NoSignedZerosFPMath = false;
// Function may be memory bound.
@@ -85,6 +87,13 @@ public:
bool isModuleEntryFunction() const { return IsModuleEntryFunction; }
+ bool isChainFunction() const { return IsChainFunction; }
+
+ // The stack is empty upon entry to this function.
+ bool isBottomOfStack() const {
+ return isEntryFunction() || isChainFunction();
+ }
+
bool hasNoSignedZerosFPMath() const {
return NoSignedZerosFPMath;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index c15c94ee17f8..0cbabf3895a6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -59,7 +59,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
namespace llvm {
-std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation () {
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation() {
return createMacroFusionDAGMutation(shouldScheduleAdjacent);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index 2092707c8a3f..4f5ca08b46c1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -92,9 +92,9 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
auto RuntimeHandle = (F.getName() + ".runtime_handle").str();
if (!HandleTy) {
Type *Int32 = Type::getInt32Ty(C);
- HandleTy = StructType::create(
- C, {Type::getInt8Ty(C)->getPointerTo(0), Int32, Int32},
- "block.runtime.handle.t");
+ HandleTy =
+ StructType::create(C, {PointerType::getUnqual(C), Int32, Int32},
+ "block.runtime.handle.t");
}
auto *GV = new GlobalVariable(
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 536fb02cb4ec..7b18e1f805d8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -19,9 +19,9 @@
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
-#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -42,27 +42,26 @@ namespace {
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES
-class AMDGPUPostLegalizerCombinerImpl : public GIMatchTableExecutor {
+class AMDGPUPostLegalizerCombinerImpl : public Combiner {
protected:
const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
-
- MachineIRBuilder &B;
- MachineFunction &MF;
- MachineRegisterInfo &MRI;
const GCNSubtarget &STI;
const SIInstrInfo &TII;
- AMDGPUCombinerHelper &Helper;
- GISelChangeObserver &Observer;
+ // TODO: Make CombinerHelper methods const.
+ mutable AMDGPUCombinerHelper Helper;
public:
AMDGPUPostLegalizerCombinerImpl(
+ MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
+ GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
- MachineIRBuilder &B, AMDGPUCombinerHelper &Helper,
- GISelChangeObserver &Observer);
+ const GCNSubtarget &STI, MachineDominatorTree *MDT,
+ const LegalizerInfo *LI);
static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }
- bool tryCombineAll(MachineInstr &I) const;
+ bool tryCombineAllImpl(MachineInstr &I) const;
+ bool tryCombineAll(MachineInstr &I) const override;
struct FMinFMaxLegacyInfo {
Register LHS;
@@ -120,18 +119,36 @@ private:
#undef GET_GICOMBINER_IMPL
AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
+ MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
+ GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
- MachineIRBuilder &B, AMDGPUCombinerHelper &Helper,
- GISelChangeObserver &Observer)
- : RuleConfig(RuleConfig), B(B), MF(B.getMF()), MRI(*B.getMRI()),
- STI(MF.getSubtarget<GCNSubtarget>()), TII(*STI.getInstrInfo()),
- Helper(Helper), Observer(Observer),
+ const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
+ : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
+ TII(*STI.getInstrInfo()),
+ Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPostLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}
+bool AMDGPUPostLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
+ if (tryCombineAllImpl(MI))
+ return true;
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR:
+ // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+ // common case, splitting this into a move and a 32-bit shift is faster and
+ // the same code size.
+ return Helper.tryCombineShiftToUnmerge(MI, 32);
+ }
+
+ return false;
+}
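
The shift fallback above rests on the fact that a 64-bit shift by a large amount can be rebuilt from a 32-bit shift on one half of the value, which is why splitting into an unmerge plus a narrow shift is profitable on subtargets where 64-bit shifts are quarter rate. The snippet below checks that identity for logical right shifts by amounts of at least 32; it illustrates the arithmetic only, not the MIR rewrite itself.

#include <cassert>
#include <cstdint>

// For Amt in [32, 64), a 64-bit logical shift right only needs the high
// 32-bit half and a narrower shift; the high half of the result is zero.
static std::uint64_t lshr64ViaHighHalf(std::uint64_t X, unsigned Amt) {
  assert(Amt >= 32 && Amt < 64);
  std::uint32_t Hi = static_cast<std::uint32_t>(X >> 32);
  return static_cast<std::uint64_t>(Hi >> (Amt - 32));
}

int main() {
  const std::uint64_t X = 0x123456789abcdef0ULL;
  for (unsigned Amt = 32; Amt < 64; ++Amt)
    assert(lshr64ViaHighHalf(X, Amt) == (X >> Amt));
  return 0;
}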
+
bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
MachineInstr &MI, FMinFMaxLegacyInfo &Info) const {
// FIXME: Type predicate on pattern
@@ -265,17 +282,20 @@ void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
MachineInstr &MI,
std::function<void(MachineIRBuilder &)> &MatchInfo) const {
-
- auto getRcpSrc = [=](const MachineInstr &MI) {
- MachineInstr *ResMI = nullptr;
- if (MI.getOpcode() == TargetOpcode::G_INTRINSIC &&
- MI.getIntrinsicID() == Intrinsic::amdgcn_rcp)
- ResMI = MRI.getVRegDef(MI.getOperand(2).getReg());
-
- return ResMI;
+ auto getRcpSrc = [=](const MachineInstr &MI) -> MachineInstr * {
+ if (!MI.getFlag(MachineInstr::FmContract))
+ return nullptr;
+
+ if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
+ if (GI->is(Intrinsic::amdgcn_rcp))
+ return MRI.getVRegDef(MI.getOperand(2).getReg());
+ }
+ return nullptr;
};
- auto getSqrtSrc = [=](const MachineInstr &MI) {
+ auto getSqrtSrc = [=](const MachineInstr &MI) -> MachineInstr * {
+ if (!MI.getFlag(MachineInstr::FmContract))
+ return nullptr;
MachineInstr *SqrtSrcMI = nullptr;
auto Match =
mi_match(MI.getOperand(0).getReg(), MRI, m_GFSqrt(m_MInstr(SqrtSrcMI)));
@@ -287,7 +307,7 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
// rcp(sqrt(x))
if ((RcpSrcMI = getRcpSrc(MI)) && (SqrtSrcMI = getSqrtSrc(*RcpSrcMI))) {
MatchInfo = [SqrtSrcMI, &MI](MachineIRBuilder &B) {
- B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+ B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
.addUse(SqrtSrcMI->getOperand(0).getReg())
.setMIFlags(MI.getFlags());
};
@@ -297,13 +317,12 @@ bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
// sqrt(rcp(x))
if ((SqrtSrcMI = getSqrtSrc(MI)) && (RcpSrcMI = getRcpSrc(*SqrtSrcMI))) {
MatchInfo = [RcpSrcMI, &MI](MachineIRBuilder &B) {
- B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)}, false)
+ B.buildIntrinsic(Intrinsic::amdgcn_rsq, {MI.getOperand(0)})
.addUse(RcpSrcMI->getOperand(0).getReg())
.setMIFlags(MI.getFlags());
};
return true;
}
-
return false;
}
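
For reference, the match above relies on the identity rcp(sqrt(x)) == sqrt(rcp(x)) == x^-0.5, so either composition can collapse into a single rsq. Because the fused form may round differently from the two-step form, both lambdas require the contract fast-math flag before matching. The tiny host-side program below simply prints both compositions so any last-ulp difference is visible; it contains no AMDGPU-specific code and is illustrative only.

#include <cmath>
#include <cstdio>

int main() {
  const double X = 2.0;
  const double RcpOfSqrt = 1.0 / std::sqrt(X); // rcp(sqrt(x))
  const double SqrtOfRcp = std::sqrt(1.0 / X); // sqrt(rcp(x))
  std::printf("%.17g\n%.17g\n", RcpOfSqrt, SqrtOfRcp);
  return 0;
}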
@@ -400,51 +419,6 @@ void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
MI.eraseFromParent();
}
-class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
- GISelKnownBits *KB;
- MachineDominatorTree *MDT;
- AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
-
-public:
- AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
- const AMDGPULegalizerInfo *LI,
- GISelKnownBits *KB, MachineDominatorTree *MDT)
- : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
- /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
- KB(KB), MDT(MDT) {
- if (!RuleConfig.parseCommandLineOption())
- report_fatal_error("Invalid rule identifier");
- }
-
- bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
- MachineIRBuilder &B) const override;
-};
-
-bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
- MachineInstr &MI,
- MachineIRBuilder &B) const {
- AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ false, KB, MDT,
- LInfo);
- // TODO: Do not re-create the Impl on every inst, it should be per function.
- AMDGPUPostLegalizerCombinerImpl Impl(RuleConfig, B, Helper, Observer);
- Impl.setupMF(*MI.getMF(), KB);
-
- if (Impl.tryCombineAll(MI))
- return true;
-
- switch (MI.getOpcode()) {
- case TargetOpcode::G_SHL:
- case TargetOpcode::G_LSHR:
- case TargetOpcode::G_ASHR:
- // On some subtargets, 64-bit shift is a quarter rate instruction. In the
- // common case, splitting this into a move and a 32-bit shift is faster and
- // the same code size.
- return Helper.tryCombineShiftToUnmerge(MI, 32);
- }
-
- return false;
-}
-
// Pass boilerplate
// ================
@@ -461,8 +435,10 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
+
private:
bool IsOptNone;
+ AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace
@@ -482,6 +458,9 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
: MachineFunctionPass(ID), IsOptNone(IsOptNone) {
initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+
+ if (!RuleConfig.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
}
bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
@@ -491,7 +470,7 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
auto *TPC = &getAnalysis<TargetPassConfig>();
const Function &F = MF.getFunction();
bool EnableOpt =
- MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+ MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const AMDGPULegalizerInfo *LI =
@@ -500,10 +479,13 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
MachineDominatorTree *MDT =
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
- AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
- F.hasMinSize(), LI, KB, MDT);
- Combiner C(PCInfo, TPC);
- return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+
+ CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
+ LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
+
+ AMDGPUPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
+ RuleConfig, ST, MDT, LI);
+ return Impl.combineMachineInstrs();
}
char AMDGPUPostLegalizerCombiner::ID = 0;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 936ca54fcf2e..0c7e198810da 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
-#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -42,26 +41,25 @@ namespace {
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_TYPES
-class AMDGPUPreLegalizerCombinerImpl : public GIMatchTableExecutor {
+class AMDGPUPreLegalizerCombinerImpl : public Combiner {
protected:
const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig;
const GCNSubtarget &STI;
-
- GISelChangeObserver &Observer;
- MachineIRBuilder &B;
- MachineFunction &MF;
- MachineRegisterInfo &MRI;
- AMDGPUCombinerHelper &Helper;
+ // TODO: Make CombinerHelper methods const.
+ mutable AMDGPUCombinerHelper Helper;
public:
AMDGPUPreLegalizerCombinerImpl(
+ MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
+ GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
- const GCNSubtarget &STI, GISelChangeObserver &Observer,
- MachineIRBuilder &B, AMDGPUCombinerHelper &Helper);
+ const GCNSubtarget &STI, MachineDominatorTree *MDT,
+ const LegalizerInfo *LI);
static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl"; }
- bool tryCombineAll(MachineInstr &I) const;
+ bool tryCombineAllImpl(MachineInstr &MI) const;
+ bool tryCombineAll(MachineInstr &I) const override;
struct ClampI64ToI16MatchInfo {
int64_t Cmp1 = 0;
@@ -91,17 +89,32 @@ private:
#undef GET_GICOMBINER_IMPL
AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl(
+ MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
+ GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
- const GCNSubtarget &STI, GISelChangeObserver &Observer, MachineIRBuilder &B,
- AMDGPUCombinerHelper &Helper)
- : RuleConfig(RuleConfig), STI(STI), Observer(Observer), B(B), MF(B.getMF()),
- MRI(*B.getMRI()), Helper(Helper),
+ const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
+ : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
+ Helper(Observer, B, /*IsPreLegalize*/ true, &KB, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenPreLegalizeGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
{
}
+bool AMDGPUPreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
+ if (tryCombineAllImpl(MI))
+ return true;
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return Helper.tryCombineConcatVectors(MI);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return Helper.tryCombineShuffleVector(MI);
+ }
+
+ return false;
+}
+
bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16(
MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF,
ClampI64ToI16MatchInfo &MatchInfo) const {
@@ -199,49 +212,6 @@ void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
MI.eraseFromParent();
}
-class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
- GISelKnownBits *KB;
- MachineDominatorTree *MDT;
- AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig;
-
-public:
- AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
- GISelKnownBits *KB, MachineDominatorTree *MDT)
- : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
- /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
- KB(KB), MDT(MDT) {
- if (!RuleConfig.parseCommandLineOption())
- report_fatal_error("Invalid rule identifier");
- }
-
- bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
- MachineIRBuilder &B) const override;
-};
-
-bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
- MachineInstr &MI,
- MachineIRBuilder &B) const {
- const auto *LI = MI.getMF()->getSubtarget().getLegalizerInfo();
- AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ true, KB, MDT, LI);
-
- const GCNSubtarget &STI = MI.getMF()->getSubtarget<GCNSubtarget>();
- // TODO: Do not re-create the Impl on every inst, it should be per function.
- AMDGPUPreLegalizerCombinerImpl Impl(RuleConfig, STI, Observer, B, Helper);
- Impl.setupMF(*MI.getMF(), KB);
-
- if (Impl.tryCombineAll(MI))
- return true;
-
- switch (MI.getOpcode()) {
- case TargetOpcode::G_CONCAT_VECTORS:
- return Helper.tryCombineConcatVectors(MI);
- case TargetOpcode::G_SHUFFLE_VECTOR:
- return Helper.tryCombineShuffleVector(MI);
- }
-
- return false;
-}
-
// Pass boilerplate
// ================
@@ -261,6 +231,7 @@ public:
private:
bool IsOptNone;
+ AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace
@@ -283,6 +254,9 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
: MachineFunctionPass(ID), IsOptNone(IsOptNone) {
initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+
+ if (!RuleConfig.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
}
bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
@@ -292,19 +266,22 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
auto *TPC = &getAnalysis<TargetPassConfig>();
const Function &F = MF.getFunction();
bool EnableOpt =
- MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+ MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
- MachineDominatorTree *MDT =
- IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
- AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
- F.hasMinSize(), KB, MDT);
+
// Enable CSE.
GISelCSEAnalysisWrapper &Wrapper =
getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
- Combiner C(PCInfo, TPC);
- return C.combineMachineInstrs(MF, CSEInfo);
+ const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>();
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
+ nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize());
+ AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig,
+ STI, MDT, STI.getLegalizerInfo());
+ return Impl.combineMachineInstrs();
}
char AMDGPUPreLegalizerCombiner::ID = 0;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 13f83e298cf4..7b5dc3795b02 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -102,7 +102,7 @@ void AMDGPUPrintfRuntimeBindingImpl::getConversionSpecifiers(
bool ArgDump = false;
StringRef CurFmt = Fmt.substr(PrevFmtSpecifierIdx,
CurFmtSpecifierIdx - PrevFmtSpecifierIdx);
- size_t pTag = CurFmt.find_last_of("%");
+ size_t pTag = CurFmt.find_last_of('%');
if (pTag != StringRef::npos) {
ArgDump = true;
while (pTag && CurFmt[--pTag] == '%') {
@@ -439,7 +439,7 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) {
for (auto &U : PrintfFunction->uses()) {
if (auto *CI = dyn_cast<CallInst>(U.getUser())) {
- if (CI->isCallee(&U))
+ if (CI->isCallee(&U) && !CI->isNoBuiltin())
Printfs.push_back(CI);
}
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 17025867c1da..1bed516fb5c7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -185,7 +185,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)
// Move LDS uses from functions to kernels before promote alloca for accurate
// estimation of LDS available
-INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDS)
+INITIALIZE_PASS_DEPENDENCY(AMDGPULowerModuleLDSLegacy)
INITIALIZE_PASS_END(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)
@@ -386,7 +386,6 @@ static Value *promoteAllocaUserToVector(
};
Type *VecEltTy = VectorTy->getElementType();
- const unsigned NumVecElts = VectorTy->getNumElements();
switch (Inst->getOpcode()) {
case Instruction::Load: {
@@ -419,11 +418,12 @@ static Value *promoteAllocaUserToVector(
auto *SubVecTy = FixedVectorType::get(VecEltTy, NumLoadedElts);
assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
- unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue();
Value *SubVec = PoisonValue::get(SubVecTy);
for (unsigned K = 0; K < NumLoadedElts; ++K) {
+ Value *CurIdx =
+ Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
SubVec = Builder.CreateInsertElement(
- SubVec, Builder.CreateExtractElement(CurVal, IndexVal + K), K);
+ SubVec, Builder.CreateExtractElement(CurVal, CurIdx), K);
}
if (AccessTy->isPtrOrPtrVectorTy())
@@ -469,6 +469,7 @@ static Value *promoteAllocaUserToVector(
assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
const unsigned NumWrittenElts =
AccessSize / DL.getTypeStoreSize(VecEltTy);
+ const unsigned NumVecElts = VectorTy->getNumElements();
auto *SubVecTy = FixedVectorType::get(VecEltTy, NumWrittenElts);
assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
@@ -479,12 +480,13 @@ static Value *promoteAllocaUserToVector(
Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
- unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue();
Value *CurVec = GetOrLoadCurrentVectorValue();
- for (unsigned K = 0; K < NumWrittenElts && ((IndexVal + K) < NumVecElts);
- ++K) {
+ for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
+ K < NumElts; ++K) {
+ Value *CurIdx =
+ Builder.CreateAdd(Index, ConstantInt::get(Index->getType(), K));
CurVec = Builder.CreateInsertElement(
- CurVec, Builder.CreateExtractElement(Val, K), IndexVal + K);
+ CurVec, Builder.CreateExtractElement(Val, K), CurIdx);
}
return CurVec;
}
@@ -679,6 +681,12 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
return RejectUser(Inst, "unsupported load/store as aggregate");
assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
+ // Check that this is a simple access of a vector element.
+ bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
+ : cast<StoreInst>(Inst)->isSimple();
+ if (!IsSimple)
+ return RejectUser(Inst, "not a simple load or store");
+
Ptr = Ptr->stripPointerCasts();
// Alloca already accessed as vector.
@@ -688,11 +696,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
continue;
}
- // Check that this is a simple access of a vector element.
- bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
- : cast<StoreInst>(Inst)->isSimple();
- if (!IsSimple)
- return RejectUser(Inst, "not a simple load or store");
if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
return RejectUser(Inst, "not a supported access type");
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index c935e384da8e..20e1aaa5419a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -20,7 +20,6 @@
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
-#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -43,29 +42,27 @@ namespace {
#include "AMDGPUGenRegBankGICombiner.inc"
#undef GET_GICOMBINER_TYPES
-class AMDGPURegBankCombinerImpl : public GIMatchTableExecutor {
+class AMDGPURegBankCombinerImpl : public Combiner {
protected:
const AMDGPURegBankCombinerImplRuleConfig &RuleConfig;
-
- MachineIRBuilder &B;
- MachineFunction &MF;
- MachineRegisterInfo &MRI;
const GCNSubtarget &STI;
const RegisterBankInfo &RBI;
const TargetRegisterInfo &TRI;
const SIInstrInfo &TII;
- CombinerHelper &Helper;
- GISelChangeObserver &Observer;
+ // TODO: Make CombinerHelper methods const.
+ mutable CombinerHelper Helper;
public:
AMDGPURegBankCombinerImpl(
+ MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
+ GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
- MachineIRBuilder &B, CombinerHelper &Helper,
- GISelChangeObserver &Observer);
+ const GCNSubtarget &STI, MachineDominatorTree *MDT,
+ const LegalizerInfo *LI);
static const char *getName() { return "AMDGPURegBankCombinerImpl"; }
- bool tryCombineAll(MachineInstr &I) const;
+ bool tryCombineAll(MachineInstr &I) const override;
bool isVgprRegBank(Register Reg) const;
Register getAsVgpr(Register Reg) const;
@@ -114,12 +111,14 @@ private:
#undef GET_GICOMBINER_IMPL
AMDGPURegBankCombinerImpl::AMDGPURegBankCombinerImpl(
- const AMDGPURegBankCombinerImplRuleConfig &RuleConfig, MachineIRBuilder &B,
- CombinerHelper &Helper, GISelChangeObserver &Observer)
- : RuleConfig(RuleConfig), B(B), MF(B.getMF()), MRI(*B.getMRI()),
- STI(MF.getSubtarget<GCNSubtarget>()), RBI(*STI.getRegBankInfo()),
- TRI(*STI.getRegisterInfo()), TII(*STI.getInstrInfo()), Helper(Helper),
- Observer(Observer),
+ MachineFunction &MF, CombinerInfo &CInfo, const TargetPassConfig *TPC,
+ GISelKnownBits &KB, GISelCSEInfo *CSEInfo,
+ const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
+ const GCNSubtarget &STI, MachineDominatorTree *MDT, const LegalizerInfo *LI)
+ : Combiner(MF, CInfo, TPC, &KB, CSEInfo), RuleConfig(RuleConfig), STI(STI),
+ RBI(*STI.getRegBankInfo()), TRI(*STI.getRegisterInfo()),
+ TII(*STI.getInstrInfo()),
+ Helper(Observer, B, /*IsPreLegalize*/ false, &KB, MDT, LI),
#define GET_GICOMBINER_CONSTRUCTOR_INITS
#include "AMDGPUGenRegBankGICombiner.inc"
#undef GET_GICOMBINER_CONSTRUCTOR_INITS
@@ -396,36 +395,6 @@ bool AMDGPURegBankCombinerImpl::isClampZeroToOne(MachineInstr *K0,
return false;
}
-class AMDGPURegBankCombinerInfo final : public CombinerInfo {
- GISelKnownBits *KB;
- MachineDominatorTree *MDT;
- AMDGPURegBankCombinerImplRuleConfig RuleConfig;
-
-public:
- AMDGPURegBankCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
- const AMDGPULegalizerInfo *LI, GISelKnownBits *KB,
- MachineDominatorTree *MDT)
- : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
- /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
- KB(KB), MDT(MDT) {
- if (!RuleConfig.parseCommandLineOption())
- report_fatal_error("Invalid rule identifier");
- }
-
- bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
- MachineIRBuilder &B) const override;
-};
-
-bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer,
- MachineInstr &MI,
- MachineIRBuilder &B) const {
- CombinerHelper Helper(Observer, B, /* IsPreLegalize*/ false, KB, MDT);
- // TODO: Do not re-create the Impl on every inst, it should be per function.
- AMDGPURegBankCombinerImpl Impl(RuleConfig, B, Helper, Observer);
- Impl.setupMF(*MI.getMF(), KB);
- return Impl.tryCombineAll(MI);
-}
-
// Pass boilerplate
// ================
@@ -440,8 +409,10 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
+
private:
bool IsOptNone;
+ AMDGPURegBankCombinerImplRuleConfig RuleConfig;
};
} // end anonymous namespace
@@ -461,6 +432,9 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone)
: MachineFunctionPass(ID), IsOptNone(IsOptNone) {
initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry());
+
+ if (!RuleConfig.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
}
bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
@@ -470,19 +444,20 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
auto *TPC = &getAnalysis<TargetPassConfig>();
const Function &F = MF.getFunction();
bool EnableOpt =
- MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+ MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const AMDGPULegalizerInfo *LI =
- static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
-
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+
+ const auto *LI = ST.getLegalizerInfo();
MachineDominatorTree *MDT =
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
- AMDGPURegBankCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), F.hasMinSize(),
- LI, KB, MDT);
- Combiner C(PCInfo, TPC);
- return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+
+ CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
+ LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
+ AMDGPURegBankCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
+ RuleConfig, ST, MDT, LI);
+ return Impl.combineMachineInstrs();
}
char AMDGPURegBankCombiner::ID = 0;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0203af32e389..c9412f720c62 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -97,19 +97,25 @@ namespace {
// Observer to apply a register bank to new registers created by LegalizerHelper.
class ApplyRegBankMapping final : public GISelChangeObserver {
private:
+ MachineIRBuilder &B;
const AMDGPURegisterBankInfo &RBI;
MachineRegisterInfo &MRI;
const RegisterBank *NewBank;
SmallVector<MachineInstr *, 4> NewInsts;
public:
- ApplyRegBankMapping(const AMDGPURegisterBankInfo &RBI_,
+ ApplyRegBankMapping(MachineIRBuilder &B, const AMDGPURegisterBankInfo &RBI_,
MachineRegisterInfo &MRI_, const RegisterBank *RB)
- : RBI(RBI_), MRI(MRI_), NewBank(RB) {}
+ : B(B), RBI(RBI_), MRI(MRI_), NewBank(RB) {
+ assert(!B.isObservingChanges());
+ B.setChangeObserver(*this);
+ }
~ApplyRegBankMapping() {
for (MachineInstr *MI : NewInsts)
applyBank(*MI);
+
+ B.stopObservingChanges();
}
/// Set any registers that don't have a set register class or bank to SALU.
@@ -131,7 +137,8 @@ public:
// Replace the extension with a select, which really uses the boolean
// source.
- MachineIRBuilder B(MI);
+ B.setInsertPt(*MI.getParent(), MI);
+
auto True = B.buildConstant(S32, Opc == AMDGPU::G_SEXT ? -1 : 1);
auto False = B.buildConstant(S32, 0);
B.buildSelect(DstReg, SrcReg, True, False);
@@ -193,6 +200,7 @@ public:
};
}
+
AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
: Subtarget(ST), TRI(Subtarget.getRegisterInfo()),
TII(Subtarget.getInstrInfo()) {
@@ -221,7 +229,7 @@ bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
const RegisterBank &Src,
- unsigned Size) const {
+ TypeSize Size) const {
// TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
if (Dst.getID() == AMDGPU::SGPRRegBankID &&
(isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
@@ -337,7 +345,7 @@ AMDGPURegisterBankInfo::addMappingFromTable(
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
- switch (MI.getIntrinsicID()) {
+ switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
case Intrinsic::amdgcn_readlane: {
static const OpRegBankEntry<3> Table[2] = {
// Perfectly legal.
@@ -378,7 +386,7 @@ RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
- switch (MI.getIntrinsicID()) {
+ switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
case Intrinsic::amdgcn_s_buffer_load: {
static const OpRegBankEntry<2> Table[4] = {
// Perfectly legal.
@@ -632,8 +640,10 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT:
return getInstrAlternativeMappingsIntrinsic(MI, MRI);
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+ case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
return getInstrAlternativeMappingsIntrinsicWSideEffects(MI, MRI);
default:
break;
@@ -758,11 +768,8 @@ Register AMDGPURegisterBankInfo::buildReadFirstLane(MachineIRBuilder &B,
/// There is additional complexity to try for compare values to identify the
/// unique values used.
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
- MachineIRBuilder &B,
- iterator_range<MachineBasicBlock::iterator> Range,
- SmallSet<Register, 4> &SGPROperandRegs,
- MachineRegisterInfo &MRI) const {
-
+ MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs) const {
// Track use registers which have already been expanded with a readfirstlane
// sequence. This may have multiple uses if moving a sequence.
DenseMap<Register, Register> WaterfalledRegMap;
@@ -787,6 +794,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
const int OrigRangeSize = std::distance(Range.begin(), Range.end());
#endif
+ MachineRegisterInfo &MRI = *B.getMRI();
Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
@@ -922,8 +930,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
// The ballot becomes a no-op during instruction selection.
CondReg = B.buildIntrinsic(Intrinsic::amdgcn_ballot,
- {LLT::scalar(Subtarget.isWave32() ? 32 : 64)},
- false)
+ {LLT::scalar(Subtarget.isWave32() ? 32 : 64)})
.addReg(CondReg)
.getReg(0);
MRI.setRegClass(CondReg, WaveRC);
@@ -986,37 +993,28 @@ bool AMDGPURegisterBankInfo::collectWaterfallOperands(
}
bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
- MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
- ArrayRef<unsigned> OpIndices) const {
+ MachineIRBuilder &B, MachineInstr &MI, ArrayRef<unsigned> OpIndices) const {
// Use a set to avoid extra readfirstlanes in the case where multiple operands
// are the same register.
SmallSet<Register, 4> SGPROperandRegs;
- if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
+ if (!collectWaterfallOperands(SGPROperandRegs, MI, *B.getMRI(), OpIndices))
return false;
MachineBasicBlock::iterator I = MI.getIterator();
return executeInWaterfallLoop(B, make_range(I, std::next(I)),
- SGPROperandRegs, MRI);
-}
-
-bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- ArrayRef<unsigned> OpIndices) const {
- MachineIRBuilder B(MI);
- return executeInWaterfallLoop(B, MI, MRI, OpIndices);
+ SGPROperandRegs);
}
// Legalize an operand that must be an SGPR by inserting a readfirstlane.
void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
- MachineInstr &MI, MachineRegisterInfo &MRI, unsigned OpIdx) const {
+ MachineIRBuilder &B, MachineInstr &MI, unsigned OpIdx) const {
Register Reg = MI.getOperand(OpIdx).getReg();
+ MachineRegisterInfo &MRI = *B.getMRI();
const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
if (Bank == &AMDGPU::SGPRRegBank)
return;
- MachineIRBuilder B(MI);
-
Reg = buildReadFirstLane(B, MRI, Reg);
MI.getOperand(OpIdx).setReg(Reg);
}
@@ -1048,9 +1046,11 @@ static LLT widen96To128(LLT Ty) {
return LLT::fixed_vector(128 / EltTy.getSizeInBits(), EltTy);
}
-bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
- const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
- MachineRegisterInfo &MRI) const {
+bool AMDGPURegisterBankInfo::applyMappingLoad(
+ MachineIRBuilder &B,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineInstr &MI) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
Register DstReg = MI.getOperand(0).getReg();
const LLT LoadTy = MRI.getType(DstReg);
unsigned LoadSize = LoadTy.getSizeInBits();
@@ -1061,7 +1061,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
if (DstBank == &AMDGPU::SGPRRegBank) {
// There are some special cases that we need to look at for 32 bit and 96
// bit SGPR loads otherwise we have nothing to do.
- if (LoadSize != 32 && LoadSize != 96)
+ if (LoadSize != 32 && (LoadSize != 96 || Subtarget.hasScalarDwordx3Loads()))
return false;
MachineMemOperand *MMO = *MI.memoperands_begin();
@@ -1076,8 +1076,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
Register PtrReg = MI.getOperand(1).getReg();
- ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
- MachineIRBuilder B(MI, O);
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
if (LoadSize == 32) {
// This is an extending load from a sub-dword size. Widen the memory
@@ -1098,10 +1097,7 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
// 96-bit loads are only available for vector loads. We need to split this
// into a 64-bit part, and 32 (unless we can widen to a 128-bit load).
if (MMO->getAlign() < Align(16)) {
- MachineFunction *MF = MI.getParent()->getParent();
- ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
- MachineIRBuilder B(MI, ApplyBank);
- LegalizerHelper Helper(*MF, ApplyBank, B);
+ LegalizerHelper Helper(B.getMF(), ApplyBank, B);
LLT Part64, Part32;
std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
if (Helper.reduceLoadStoreWidth(cast<GAnyLoad>(MI), 0, Part64) !=
@@ -1144,9 +1140,8 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
- ApplyRegBankMapping Observer(*this, MRI, &AMDGPU::VGPRRegBank);
- MachineIRBuilder B(MI, Observer);
- LegalizerHelper Helper(B.getMF(), Observer, B);
+ ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
+ LegalizerHelper Helper(B.getMF(), O, B);
if (LoadTy.isVector()) {
if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
@@ -1161,10 +1156,11 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
}
bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
- MachineInstr &MI,
- const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
- MachineRegisterInfo &MRI) const {
- const MachineFunction &MF = *MI.getMF();
+ MachineIRBuilder &B,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineInstr &MI) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ const MachineFunction &MF = B.getMF();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const auto &TFI = *ST.getFrameLowering();
@@ -1188,8 +1184,7 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Register SPReg = Info->getStackPtrOffsetReg();
- ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
- MachineIRBuilder B(MI, ApplyBank);
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
@@ -1208,8 +1203,9 @@ bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
}
bool AMDGPURegisterBankInfo::applyMappingImage(
- MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
- MachineRegisterInfo &MRI, int RsrcIdx) const {
+ MachineIRBuilder &B, MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ int RsrcIdx) const {
const int NumDefs = MI.getNumExplicitDefs();
// The reported argument index is relative to the IR intrinsic call arguments,
@@ -1230,7 +1226,7 @@ bool AMDGPURegisterBankInfo::applyMappingImage(
SGPRIndexes.push_back(I);
}
- executeInWaterfallLoop(MI, MRI, SGPRIndexes);
+ executeInWaterfallLoop(B, MI, SGPRIndexes);
return true;
}
@@ -1320,7 +1316,7 @@ unsigned AMDGPURegisterBankInfo::setBufferOffsets(
}
bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
- const OperandsMapper &OpdMapper) const {
+ MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
@@ -1350,7 +1346,6 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
// immediate offsets.
const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
- MachineIRBuilder B(MI);
MachineFunction &MF = B.getMF();
Register SOffset;
@@ -1421,7 +1416,7 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
OpsToWaterfall.insert(RSrc);
executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
- OpsToWaterfall, MRI);
+ OpsToWaterfall);
}
if (NumLoads != 1) {
@@ -1438,7 +1433,8 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
return true;
}
-bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
+bool AMDGPURegisterBankInfo::applyMappingBFE(MachineIRBuilder &B,
+ const OperandsMapper &OpdMapper,
bool Signed) const {
MachineInstr &MI = OpdMapper.getMI();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
@@ -1451,7 +1447,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
const LLT S32 = LLT::scalar(32);
- unsigned FirstOpnd = MI.getOpcode() == AMDGPU::G_INTRINSIC ? 2 : 1;
+ unsigned FirstOpnd = isa<GIntrinsic>(MI) ? 2 : 1;
Register SrcReg = MI.getOperand(FirstOpnd).getReg();
Register OffsetReg = MI.getOperand(FirstOpnd + 1).getReg();
Register WidthReg = MI.getOperand(FirstOpnd + 2).getReg();
@@ -1464,8 +1460,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
    // There is no 64-bit vgpr bitfield extract instruction, so the operation
// is expanded to a sequence of instructions that implement the operation.
- ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::VGPRRegBank);
- MachineIRBuilder B(MI, ApplyBank);
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::VGPRRegBank);
const LLT S64 = LLT::scalar(64);
// Shift the source operand so that extracted bits start at bit 0.
@@ -1517,8 +1512,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
// The scalar form packs the offset and width in a single operand.
- ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
- MachineIRBuilder B(MI, ApplyBank);
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, &AMDGPU::SGPRRegBank);
// Ensure the high bits are clear to insert the offset.
auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
@@ -1546,7 +1540,7 @@ bool AMDGPURegisterBankInfo::applyMappingBFE(const OperandsMapper &OpdMapper,
}
bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
- const OperandsMapper &OpdMapper) const {
+ MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
@@ -1575,8 +1569,6 @@ bool AMDGPURegisterBankInfo::applyMappingMAD_64_32(
}
// Keep the multiplication on the SALU.
- MachineIRBuilder B(MI);
-
Register DstHi;
Register DstLo = B.buildMul(S32, Src0, Src1).getReg(0);
bool MulHiInVgpr = false;
@@ -1792,7 +1784,7 @@ getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
Register OrigOffset) const {
- const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
+ const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(Subtarget);
Register BaseReg;
unsigned ImmOffset;
const LLT S32 = LLT::scalar(32);
@@ -1916,8 +1908,9 @@ static void extendLow32IntoHigh32(MachineIRBuilder &B,
}
bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- const OperandsMapper &OpdMapper) const {
+ MachineIRBuilder &B, MachineInstr &MI,
+ const OperandsMapper &OpdMapper) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
Register VecReg = MI.getOperand(1).getReg();
Register Idx = MI.getOperand(2).getReg();
@@ -1935,7 +1928,6 @@ bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
IsDivergentIdx, &Subtarget))
return false;
- MachineIRBuilder B(MI);
LLT S32 = LLT::scalar(32);
const RegisterBank &DstBank =
@@ -2014,9 +2006,10 @@ static Register constrainRegToBank(MachineRegisterInfo &MRI,
}
bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- const OperandsMapper &OpdMapper) const {
+ MachineIRBuilder &B, MachineInstr &MI,
+ const OperandsMapper &OpdMapper) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
Register VecReg = MI.getOperand(1).getReg();
Register Idx = MI.getOperand(3).getReg();
@@ -2033,7 +2026,6 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
IsDivergentIdx, &Subtarget))
return false;
- MachineIRBuilder B(MI);
LLT S32 = LLT::scalar(32);
const RegisterBank &DstBank =
@@ -2103,8 +2095,9 @@ bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
}
void AMDGPURegisterBankInfo::applyMappingImpl(
- const OperandsMapper &OpdMapper) const {
+ MachineIRBuilder &B, const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
+ B.setInstrAndDebugLoc(MI);
unsigned Opc = MI.getOpcode();
MachineRegisterInfo &MRI = OpdMapper.getMRI();
switch (Opc) {
@@ -2123,7 +2116,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (DefRegs.empty())
DefRegs.push_back(DstReg);
- MachineIRBuilder B(MI);
B.setInsertPt(*MI.getParent(), ++MI.getIterator());
Register NewDstReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
@@ -2156,8 +2148,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// produce an invalid copy. We can only copy with some kind of compare to
// get a vector boolean result. Insert a register bank copy that will be
// correctly lowered to a compare.
- MachineIRBuilder B(*MI.getParent()->getParent());
-
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
Register SrcReg = MI.getOperand(I).getReg();
const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
@@ -2179,16 +2169,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
substituteSimpleCopyRegs(OpdMapper, 0);
// Promote SGPR/VGPR booleans to s32
- MachineFunction *MF = MI.getParent()->getParent();
- ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
- MachineIRBuilder B(MI, ApplyBank);
- LegalizerHelper Helper(*MF, ApplyBank, B);
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
+ B.setInsertPt(B.getMBB(), MI);
+ LegalizerHelper Helper(B.getMF(), ApplyBank, B);
if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
llvm_unreachable("widen scalar should have succeeded");
return;
}
+ case AMDGPU::G_FCMP:
+ if (!Subtarget.hasSALUFloatInsts())
+ break;
+ LLVM_FALLTHROUGH;
case AMDGPU::G_ICMP:
case AMDGPU::G_UADDO:
case AMDGPU::G_USUBO:
@@ -2196,7 +2189,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE: {
- unsigned BoolDstOp = Opc == AMDGPU::G_ICMP ? 0 : 1;
+ unsigned BoolDstOp =
+ (Opc == AMDGPU::G_ICMP || Opc == AMDGPU::G_FCMP) ? 0 : 1;
Register DstReg = MI.getOperand(BoolDstOp).getReg();
const RegisterBank *DstBank =
@@ -2212,7 +2206,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
Register NewDstReg = MRI.createGenericVirtualRegister(S32);
MRI.setRegBank(NewDstReg, AMDGPU::SGPRRegBank);
MI.getOperand(BoolDstOp).setReg(NewDstReg);
- MachineIRBuilder B(MI);
if (HasCarryIn) {
Register NewSrcReg = MRI.createGenericVirtualRegister(S32);
@@ -2245,7 +2238,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
const RegisterBank *CondBank = getRegBank(CondRegs[0], MRI, *TRI);
if (CondBank == &AMDGPU::SGPRRegBank) {
- MachineIRBuilder B(MI);
const LLT S32 = LLT::scalar(32);
Register NewCondReg = MRI.createGenericVirtualRegister(S32);
MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
@@ -2257,7 +2249,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (DstTy.getSizeInBits() != 64)
break;
- MachineIRBuilder B(MI);
LLT HalfTy = getHalfSizedType(DstTy);
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
@@ -2297,7 +2288,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
if (CondBank == &AMDGPU::SGPRRegBank) {
- MachineIRBuilder B(MI);
const LLT S32 = LLT::scalar(32);
Register NewCondReg = MRI.createGenericVirtualRegister(S32);
MRI.setRegBank(NewCondReg, AMDGPU::SGPRRegBank);
@@ -2324,8 +2314,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
break;
MachineFunction *MF = MI.getParent()->getParent();
- ApplyRegBankMapping ApplyBank(*this, MRI, DstBank);
- MachineIRBuilder B(MI, ApplyBank);
+ ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
LegalizerHelper Helper(*MF, ApplyBank, B);
if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
@@ -2355,7 +2344,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Depending on where the source registers came from, the generic code may
// have decided to split the inputs already or not. If not, we still need to
// extract the values.
- MachineIRBuilder B(MI);
if (Src0Regs.empty())
split64BitValueForMapping(B, Src0Regs, HalfTy, MI.getOperand(1).getReg());
@@ -2384,8 +2372,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// max combination.
if (SrcBank && SrcBank == &AMDGPU::VGPRRegBank) {
MachineFunction *MF = MI.getParent()->getParent();
- ApplyRegBankMapping Apply(*this, MRI, &AMDGPU::VGPRRegBank);
- MachineIRBuilder B(MI, Apply);
+ ApplyRegBankMapping Apply(B, *this, MRI, &AMDGPU::VGPRRegBank);
LegalizerHelper Helper(*MF, Apply, B);
if (Helper.lowerAbsToMaxNeg(MI) != LegalizerHelper::Legalized)
@@ -2420,8 +2407,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
const LLT S32 = LLT::scalar(32);
MachineBasicBlock *MBB = MI.getParent();
MachineFunction *MF = MBB->getParent();
- ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
- MachineIRBuilder B(MI, ApplySALU);
+ ApplyRegBankMapping ApplySALU(B, *this, MRI, &AMDGPU::SGPRRegBank);
+
+ if (DstTy.isVector() && Opc == AMDGPU::G_ABS) {
+ Register WideSrcLo, WideSrcHi;
+
+ std::tie(WideSrcLo, WideSrcHi) =
+ unpackV2S16ToS32(B, MI.getOperand(1).getReg(), TargetOpcode::G_SEXT);
+ auto Lo = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcLo});
+ auto Hi = B.buildInstr(AMDGPU::G_ABS, {S32}, {WideSrcHi});
+ B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
+ MI.eraseFromParent();
+ return;
+ }
if (DstTy.isVector()) {
Register WideSrc0Lo, WideSrc0Hi;
@@ -2459,10 +2457,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
break; // Nothing to repair
const LLT S32 = LLT::scalar(32);
- MachineIRBuilder B(MI);
- ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
- GISelObserverWrapper Observer(&O);
- B.setChangeObserver(Observer);
+ ApplyRegBankMapping O(B, *this, MRI, &AMDGPU::VGPRRegBank);
// Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
// we would need to further expand, and doesn't let us directly set the
@@ -2508,8 +2503,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (Ty == S32)
break;
- ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
- MachineIRBuilder B(MI, ApplyVALU);
+ ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
MachineFunction &MF = B.getMF();
LegalizerHelper Helper(MF, ApplyVALU, B);
@@ -2539,8 +2533,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
// (ffbh hi:lo) -> (umin (ffbh hi), (uaddsat (ffbh lo), 32))
// (ffbl hi:lo) -> (umin (uaddsat (ffbh hi), 32), (ffbh lo))
- ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
- MachineIRBuilder B(MI, ApplyVALU);
+ ApplyRegBankMapping ApplyVALU(B, *this, MRI, &AMDGPU::VGPRRegBank);
SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
unsigned NewOpc = Opc == AMDGPU::G_CTLZ_ZERO_UNDEF
? (unsigned)AMDGPU::G_AMDGPU_FFBH_U32
@@ -2569,7 +2562,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(OpdMapper.getVRegs(1).empty());
- MachineIRBuilder B(MI);
const RegisterBank *SrcBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
@@ -2654,11 +2646,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
LLT DstTy = MRI.getType(DstReg);
LLT SrcTy = MRI.getType(SrcReg);
- if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
+ if (foldExtractEltToCmpSelect(B, MI, OpdMapper))
return;
- MachineIRBuilder B(MI);
-
const ValueMapping &DstMapping
= OpdMapper.getInstrMapping().getOperandMapping(0);
const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
@@ -2693,7 +2683,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (DstRegs.empty()) {
applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, { 2 });
+ executeInWaterfallLoop(B, MI, {2});
if (NeedCopyToVGPR) {
// We don't want a phi for this temporary reg.
@@ -2752,7 +2742,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
B.setInstr(*Span.begin());
MI.eraseFromParent();
executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
- OpsToWaterfall, MRI);
+ OpsToWaterfall);
if (NeedCopyToVGPR) {
MachineBasicBlock *LoopBB = Extract1->getParent();
@@ -2787,7 +2777,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (substituteSimpleCopyRegs(OpdMapper, 1))
MRI.setType(MI.getOperand(1).getReg(), VecTy);
- if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
+ if (foldInsertEltToCmpSelect(B, MI, OpdMapper))
return;
const RegisterBank *IdxBank =
@@ -2817,24 +2807,21 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (InsRegs.empty()) {
- executeInWaterfallLoop(MI, MRI, { 3 });
+ executeInWaterfallLoop(B, MI, {3});
// Re-insert the constant offset add inside the waterfall loop.
if (ShouldMoveIndexIntoLoop) {
- MachineIRBuilder B(MI);
reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
}
return;
}
-
assert(InsTy.getSizeInBits() == 64);
const LLT S32 = LLT::scalar(32);
LLT Vec32 = LLT::fixed_vector(2 * VecTy.getNumElements(), 32);
- MachineIRBuilder B(MI);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
auto One = B.buildConstant(S32, 1);
@@ -2881,7 +2868,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Figure out the point after the waterfall loop before mangling the control
// flow.
executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
- OpsToWaterfall, MRI);
+ OpsToWaterfall);
// The insertion point is now right after the original instruction.
//
@@ -2913,7 +2900,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, {1, 4});
+ executeInWaterfallLoop(B, MI, {1, 4});
return;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
@@ -2929,27 +2916,28 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, {2, 5});
+ executeInWaterfallLoop(B, MI, {2, 5});
return;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, {2, 5});
+ executeInWaterfallLoop(B, MI, {2, 5});
return;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, {3, 6});
+ executeInWaterfallLoop(B, MI, {3, 6});
return;
}
case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
- applyMappingSBufferLoad(OpdMapper);
+ applyMappingSBufferLoad(B, OpdMapper);
return;
}
- case AMDGPU::G_INTRINSIC: {
- switch (MI.getIntrinsicID()) {
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT: {
+ switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
case Intrinsic::amdgcn_readlane: {
substituteSimpleCopyRegs(OpdMapper, 2);
@@ -2958,7 +2946,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Make sure the index is an SGPR. It doesn't make sense to run this in a
// waterfall loop, so assume it's a uniform value.
- constrainOpWithReadfirstlane(MI, MRI, 3); // Index
+ constrainOpWithReadfirstlane(B, MI, 3); // Index
return;
}
case Intrinsic::amdgcn_writelane: {
@@ -2967,8 +2955,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(OpdMapper.getVRegs(3).empty());
substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
- constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
- constrainOpWithReadfirstlane(MI, MRI, 3); // Index
+ constrainOpWithReadfirstlane(B, MI, 2); // Source value
+ constrainOpWithReadfirstlane(B, MI, 3); // Index
return;
}
case Intrinsic::amdgcn_interp_p1:
@@ -2981,7 +2969,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Readlane for m0 value, which is always the last operand.
// FIXME: Should this be a waterfall loop instead?
- constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
+ constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
return;
}
case Intrinsic::amdgcn_interp_inreg_p10:
@@ -2995,19 +2983,22 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Doing a waterfall loop over these wouldn't make any sense.
substituteSimpleCopyRegs(OpdMapper, 2);
substituteSimpleCopyRegs(OpdMapper, 3);
- constrainOpWithReadfirstlane(MI, MRI, 4);
- constrainOpWithReadfirstlane(MI, MRI, 5);
+ constrainOpWithReadfirstlane(B, MI, 4);
+ constrainOpWithReadfirstlane(B, MI, 5);
return;
}
case Intrinsic::amdgcn_sbfe:
- applyMappingBFE(OpdMapper, true);
+ applyMappingBFE(B, OpdMapper, true);
return;
case Intrinsic::amdgcn_ubfe:
- applyMappingBFE(OpdMapper, false);
+ applyMappingBFE(B, OpdMapper, false);
return;
case Intrinsic::amdgcn_inverse_ballot:
+ case Intrinsic::amdgcn_s_bitreplicate:
+ case Intrinsic::amdgcn_s_quadmask:
+ case Intrinsic::amdgcn_s_wqm:
applyDefaultMapping(OpdMapper);
- constrainOpWithReadfirstlane(MI, MRI, 2); // Mask
+ constrainOpWithReadfirstlane(B, MI, 2); // Mask
return;
case Intrinsic::amdgcn_ballot:
// Use default handling and insert copy to vcc source.
@@ -3019,30 +3010,31 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
- const AMDGPU::RsrcIntrinsic *RSrcIntrin
- = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
+ const AMDGPU::RsrcIntrinsic *RSrcIntrin =
+ AMDGPU::lookupRsrcIntrinsic(AMDGPU::getIntrinsicID(MI));
assert(RSrcIntrin && RSrcIntrin->IsImage);
// Non-images can have complications from operands that allow both SGPR
// and VGPR. For now it's too complicated to figure out the final opcode
// to derive the register bank from the MCInstrDesc.
- applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
+ applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
return;
}
case AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY: {
unsigned N = MI.getNumExplicitOperands() - 2;
applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, { N });
+ executeInWaterfallLoop(B, MI, {N});
return;
}
- case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
- auto IntrID = MI.getIntrinsicID();
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+ case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
+ auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
// This is only allowed to execute with 1 lane, so readfirstlane is safe.
assert(OpdMapper.getVRegs(0).empty());
substituteSimpleCopyRegs(OpdMapper, 3);
- constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ constrainOpWithReadfirstlane(B, MI, 2); // M0
return;
}
case Intrinsic::amdgcn_ds_gws_init:
@@ -3050,62 +3042,85 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case Intrinsic::amdgcn_ds_gws_sema_br: {
      // Only the first lane executes, so readfirstlane is safe.
substituteSimpleCopyRegs(OpdMapper, 1);
- constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ constrainOpWithReadfirstlane(B, MI, 2); // M0
return;
}
case Intrinsic::amdgcn_ds_gws_sema_v:
case Intrinsic::amdgcn_ds_gws_sema_p:
case Intrinsic::amdgcn_ds_gws_sema_release_all: {
      // Only the first lane executes, so readfirstlane is safe.
- constrainOpWithReadfirstlane(MI, MRI, 1); // M0
+ constrainOpWithReadfirstlane(B, MI, 1); // M0
return;
}
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume: {
- constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ constrainOpWithReadfirstlane(B, MI, 2); // M0
return;
}
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// FIXME: Should this use a waterfall loop?
- constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ constrainOpWithReadfirstlane(B, MI, 2); // M0
return;
}
case Intrinsic::amdgcn_s_setreg: {
- constrainOpWithReadfirstlane(MI, MRI, 2);
+ constrainOpWithReadfirstlane(B, MI, 2);
return;
}
+ case Intrinsic::amdgcn_s_ttracedata:
+ constrainOpWithReadfirstlane(B, MI, 1); // M0
+ return;
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
applyDefaultMapping(OpdMapper);
- constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
- constrainOpWithReadfirstlane(MI, MRI, 2); // M0
- constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
+ constrainOpWithReadfirstlane(B, MI, 1); // rsrc
+ constrainOpWithReadfirstlane(B, MI, 2); // M0
+ constrainOpWithReadfirstlane(B, MI, 5); // soffset
return;
}
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
applyDefaultMapping(OpdMapper);
- constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
- constrainOpWithReadfirstlane(MI, MRI, 2); // M0
- constrainOpWithReadfirstlane(MI, MRI, 6); // soffset
+ constrainOpWithReadfirstlane(B, MI, 1); // rsrc
+ constrainOpWithReadfirstlane(B, MI, 2); // M0
+ constrainOpWithReadfirstlane(B, MI, 6); // soffset
return;
}
case Intrinsic::amdgcn_global_load_lds: {
applyDefaultMapping(OpdMapper);
- constrainOpWithReadfirstlane(MI, MRI, 2);
+ constrainOpWithReadfirstlane(B, MI, 2);
return;
}
case Intrinsic::amdgcn_lds_direct_load: {
applyDefaultMapping(OpdMapper);
// Readlane for m0 value, which is always the last operand.
- constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // Index
+ constrainOpWithReadfirstlane(B, MI, MI.getNumOperands() - 1); // Index
return;
}
case Intrinsic::amdgcn_exp_row:
applyDefaultMapping(OpdMapper);
- constrainOpWithReadfirstlane(MI, MRI, 8); // M0
+ constrainOpWithReadfirstlane(B, MI, 8); // M0
+ return;
+ case Intrinsic::amdgcn_s_sleep_var:
+ assert(OpdMapper.getVRegs(1).empty());
+ constrainOpWithReadfirstlane(B, MI, 1);
+ return;
+ case Intrinsic::amdgcn_s_barrier_signal_var:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ constrainOpWithReadfirstlane(B, MI, 1);
return;
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+ constrainOpWithReadfirstlane(B, MI, 2);
+ return;
+ case Intrinsic::amdgcn_s_barrier_init:
+ constrainOpWithReadfirstlane(B, MI, 1);
+ constrainOpWithReadfirstlane(B, MI, 2);
+ return;
+ case Intrinsic::amdgcn_s_get_barrier_state: {
+ constrainOpWithReadfirstlane(B, MI, 2);
+ return;
+ }
default: {
if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
@@ -3113,7 +3128,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// and VGPR. For now it's too complicated to figure out the final opcode
// to derive the register bank from the MCInstrDesc.
if (RSrcIntrin->IsImage) {
- applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
+ applyMappingImage(B, MI, OpdMapper, RSrcIntrin->RsrcArg);
return;
}
}
@@ -3214,30 +3229,53 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
}
++End;
- MachineIRBuilder B(*Start);
- executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs, MRI);
+ B.setInsertPt(B.getMBB(), Start);
+ executeInWaterfallLoop(B, make_range(Start, End), SGPROperandRegs);
break;
}
case AMDGPU::G_LOAD:
case AMDGPU::G_ZEXTLOAD:
case AMDGPU::G_SEXTLOAD: {
- if (applyMappingLoad(MI, OpdMapper, MRI))
+ if (applyMappingLoad(B, OpdMapper, MI))
return;
break;
}
case AMDGPU::G_DYN_STACKALLOC:
- applyMappingDynStackAlloc(MI, OpdMapper, MRI);
+ applyMappingDynStackAlloc(B, OpdMapper, MI);
+ return;
+ case AMDGPU::G_STACKRESTORE: {
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(B, MI, 0);
return;
+ }
case AMDGPU::G_SBFX:
- applyMappingBFE(OpdMapper, /*Signed*/ true);
+ applyMappingBFE(B, OpdMapper, /*Signed*/ true);
return;
case AMDGPU::G_UBFX:
- applyMappingBFE(OpdMapper, /*Signed*/ false);
+ applyMappingBFE(B, OpdMapper, /*Signed*/ false);
return;
case AMDGPU::G_AMDGPU_MAD_U64_U32:
case AMDGPU::G_AMDGPU_MAD_I64_I32:
- applyMappingMAD_64_32(OpdMapper);
+ applyMappingMAD_64_32(B, OpdMapper);
return;
+ case AMDGPU::G_PREFETCH: {
+ if (!Subtarget.hasPrefetch()) {
+ MI.eraseFromParent();
+ return;
+ }
+ unsigned PtrBank =
+ getRegBankID(MI.getOperand(0).getReg(), MRI, AMDGPU::SGPRRegBankID);
+ if (PtrBank == AMDGPU::VGPRRegBankID) {
+ MI.eraseFromParent();
+ return;
+ }
+ // FIXME: There is currently no support for prefetch in global isel.
+ // There is no node equivalence and what's worse there is no MMO produced
+ // for a prefetch on global isel path.
+ // Prefetch does not affect execution so erase it for now.
+ MI.eraseFromParent();
+ return;
+ }
default:
break;
}
@@ -3542,7 +3580,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
if (MI.getOpcode() != AMDGPU::G_FREEZE &&
- cannotCopy(*DstBank, *SrcBank, Size))
+ cannotCopy(*DstBank, *SrcBank, TypeSize::getFixed(Size)))
return getInvalidInstructionMapping();
const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
@@ -3717,40 +3755,68 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_UBFX:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
- [[fallthrough]];
-
- case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
- case AMDGPU::G_SSUBSAT:
- case AMDGPU::G_UADDSAT:
- case AMDGPU::G_USUBSAT:
+ return getDefaultMappingVOP(MI);
case AMDGPU::G_FADD:
case AMDGPU::G_FSUB:
- case AMDGPU::G_FPTOSI:
- case AMDGPU::G_FPTOUI:
case AMDGPU::G_FMUL:
case AMDGPU::G_FMA:
- case AMDGPU::G_FMAD:
- case AMDGPU::G_FSQRT:
case AMDGPU::G_FFLOOR:
case AMDGPU::G_FCEIL:
- case AMDGPU::G_FRINT:
+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
+ case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_STRICT_FADD:
+ case AMDGPU::G_STRICT_FSUB:
+ case AMDGPU::G_STRICT_FMUL:
+ case AMDGPU::G_STRICT_FMA: {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ unsigned Size = Ty.getSizeInBits();
+ if (Subtarget.hasSALUFloatInsts() && Ty.isScalar() &&
+ (Size == 32 || Size == 16) && isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
+ case AMDGPU::G_FPTOSI:
+ case AMDGPU::G_FPTOUI:
case AMDGPU::G_SITOFP:
- case AMDGPU::G_UITOFP:
+ case AMDGPU::G_UITOFP: {
+ unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ if (Subtarget.hasSALUFloatInsts() && SizeDst == 32 && SizeSrc == 32 &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case AMDGPU::G_FPTRUNC:
- case AMDGPU::G_FPEXT:
+ case AMDGPU::G_FPEXT: {
+ unsigned SizeDst = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SizeSrc = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ if (Subtarget.hasSALUFloatInsts() && SizeDst != 64 && SizeSrc != 64 &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
+ case AMDGPU::G_FSQRT:
case AMDGPU::G_FEXP2:
- case AMDGPU::G_FLOG2:
+ case AMDGPU::G_FLOG2: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
+ case AMDGPU::G_SADDSAT: // FIXME: Could lower sat ops for SALU
+ case AMDGPU::G_SSUBSAT:
+ case AMDGPU::G_UADDSAT:
+ case AMDGPU::G_USUBSAT:
+ case AMDGPU::G_FMAD:
case AMDGPU::G_FLDEXP:
- case AMDGPU::G_FMINNUM:
- case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FCANONICALIZE:
- case AMDGPU::G_INTRINSIC_TRUNC:
- case AMDGPU::G_STRICT_FADD:
- case AMDGPU::G_STRICT_FSUB:
- case AMDGPU::G_STRICT_FMUL:
- case AMDGPU::G_STRICT_FMA:
case AMDGPU::G_STRICT_FLDEXP:
case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
case AMDGPU::G_FSHR: // TODO: Expand for scalar
@@ -3845,9 +3911,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// This case is weird because we expect a physical register in the source,
// but need to set a bank anyway.
//
- // We could select the result to SGPR or VGPR, but for the one current use
- // it's more practical to always use VGPR.
- OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+ // TODO: We could select the result to SGPR or VGPR
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
break;
}
@@ -3971,14 +4036,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
SrcSize);
break;
}
- case AMDGPU::G_FCMP: {
- unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
- OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
- OpdsMapping[1] = nullptr; // Predicate Operand.
- OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
- OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
- break;
- }
case AMDGPU::G_IS_FPCLASS: {
Register SrcReg = MI.getOperand(1).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
@@ -3999,8 +4056,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
break;
}
- case AMDGPU::G_ICMP: {
- auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ case AMDGPU::G_ICMP:
+ case AMDGPU::G_FCMP: {
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
// See if the result register has already been constrained to vcc, which may
@@ -4010,12 +4067,23 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI);
unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI);
+ auto canUseSCCICMP = [&]() {
+ auto Pred =
+ static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
+ return Size == 32 ||
+ (Size == 64 &&
+ (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
+ Subtarget.hasScalarCompareEq64());
+ };
+ auto canUseSCCFCMP = [&]() {
+ return Subtarget.hasSALUFloatInsts() && (Size == 32 || Size == 16);
+ };
+
+ bool isICMP = MI.getOpcode() == AMDGPU::G_ICMP;
bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
Op2Bank == AMDGPU::SGPRRegBankID &&
Op3Bank == AMDGPU::SGPRRegBankID &&
- (Size == 32 || (Size == 64 &&
- (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
- Subtarget.hasScalarCompareEq64()));
+ (isICMP ? canUseSCCICMP() : canUseSCCFCMP());
DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
@@ -4025,6 +4093,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const unsigned ResultSize = 1;
OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
+ OpdsMapping[1] = nullptr; // Predicate Operand.
OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
break;
@@ -4197,8 +4266,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
break;
}
- case AMDGPU::G_INTRINSIC: {
- switch (MI.getIntrinsicID()) {
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT: {
+ switch (cast<GIntrinsic>(MI).getIntrinsicID()) {
default:
return getInvalidInstructionMapping();
case Intrinsic::amdgcn_div_fmas:
@@ -4207,12 +4277,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_sin:
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_log_clamp:
- case Intrinsic::amdgcn_log:
- case Intrinsic::amdgcn_exp2:
- case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
- case Intrinsic::amdgcn_sqrt:
- case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_fmul_legacy:
@@ -4220,7 +4285,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp:
case Intrinsic::amdgcn_fract:
- case Intrinsic::amdgcn_cvt_pkrtz:
case Intrinsic::amdgcn_cvt_pknorm_i16:
case Intrinsic::amdgcn_cvt_pknorm_u16:
case Intrinsic::amdgcn_cvt_pk_i16:
@@ -4263,11 +4327,24 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_sudot8:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
+ case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
+ case Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied:
case Intrinsic::amdgcn_wmma_f32_16x16x16_bf16:
case Intrinsic::amdgcn_wmma_f32_16x16x16_f16:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4:
case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8:
return getDefaultMappingVOP(MI);
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_exp2:
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_sqrt: {
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ if (Subtarget.hasPseudoScalarTrans() && (Size == 16 || Size == 32) &&
+ isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
+ }
case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_ubfe:
if (isSALUMapping(MI))
@@ -4285,8 +4362,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wqm:
case Intrinsic::amdgcn_softwqm:
case Intrinsic::amdgcn_set_inactive:
+ case Intrinsic::amdgcn_set_inactive_chain_arg:
case Intrinsic::amdgcn_permlane64:
return getDefaultMappingAllVGPR(MI);
+ case Intrinsic::amdgcn_cvt_pkrtz:
+ if (Subtarget.hasSALUFloatInsts() && isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
+ return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_kernarg_segment_ptr:
case Intrinsic::amdgcn_s_getpc:
case Intrinsic::amdgcn_groupstaticsize:
@@ -4387,6 +4469,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_permlane16_var:
+ case Intrinsic::amdgcn_permlanex16_var: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ break;
+ }
case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
@@ -4514,6 +4605,15 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
break;
}
+ case Intrinsic::amdgcn_s_quadmask:
+ case Intrinsic::amdgcn_s_wqm: {
+ Register MaskReg = MI.getOperand(2).getReg();
+ unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
+ unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, MaskSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
+ break;
+ }
case Intrinsic::amdgcn_wave_reduce_umin:
case Intrinsic::amdgcn_wave_reduce_umax: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -4524,6 +4624,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
break;
}
+ case Intrinsic::amdgcn_s_bitreplicate:
+ Register MaskReg = MI.getOperand(2).getReg();
+ unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64);
+ OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, 32);
}
break;
}
@@ -4531,7 +4636,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16:
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE:
case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16: {
- auto IntrID = MI.getIntrinsicID();
+ auto IntrID = AMDGPU::getIntrinsicID(MI);
const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
// Non-images can have complications from operands that allow both SGPR
@@ -4559,8 +4664,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
- case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
- auto IntrID = MI.getIntrinsicID();
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+ case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: {
+ auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_s_getreg:
case Intrinsic::amdgcn_s_memtime:
@@ -4575,9 +4681,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
return getDefaultMappingAllVGPR(MI);
@@ -4632,6 +4742,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_s_ttracedata: {
+ // This must be an SGPR, but accept a VGPR.
+ unsigned Bank =
+ getRegBankID(MI.getOperand(1).getReg(), MRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
+ break;
+ }
case Intrinsic::amdgcn_end_cf: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
@@ -4779,7 +4896,37 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI); // %data1
break;
}
-
+ case Intrinsic::amdgcn_s_sleep_var:
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ break;
+ case Intrinsic::amdgcn_s_barrier_signal_var:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ break;
+ case Intrinsic::amdgcn_s_barrier_init:
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst_var: {
+ const unsigned ResultSize = 1;
+ OpdsMapping[0] =
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_leave: {
+ const unsigned ResultSize = 1;
+ OpdsMapping[0] =
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, ResultSize);
+ break;
+ }
+ case Intrinsic::amdgcn_s_get_barrier_state: {
+ OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
default:
return getInvalidInstructionMapping();
}
@@ -4887,6 +5034,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FPTRUNC_ROUND_UPWARD:
case AMDGPU::G_FPTRUNC_ROUND_DOWNWARD:
return getDefaultMappingVOP(MI);
+ case AMDGPU::G_PREFETCH:
+ OpdsMapping[0] = getSGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ break;
}
return getInstructionMapping(/*ID*/1, /*Cost*/1,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 78214d7a1058..b5d16e70ab23 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -53,43 +53,36 @@ public:
MachineRegisterInfo &MRI,
ArrayRef<unsigned> OpIndices) const;
- bool executeInWaterfallLoop(
- MachineIRBuilder &B,
- iterator_range<MachineBasicBlock::iterator> Range,
- SmallSet<Register, 4> &SGPROperandRegs,
- MachineRegisterInfo &MRI) const;
+ bool executeInWaterfallLoop(MachineIRBuilder &B,
+ iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs) const;
Register buildReadFirstLane(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Src) const;
- bool executeInWaterfallLoop(MachineIRBuilder &B,
- MachineInstr &MI,
- MachineRegisterInfo &MRI,
- ArrayRef<unsigned> OpIndices) const;
- bool executeInWaterfallLoop(MachineInstr &MI,
- MachineRegisterInfo &MRI,
+ bool executeInWaterfallLoop(MachineIRBuilder &B, MachineInstr &MI,
ArrayRef<unsigned> OpIndices) const;
- void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI,
+ void constrainOpWithReadfirstlane(MachineIRBuilder &B, MachineInstr &MI,
unsigned OpIdx) const;
- bool applyMappingDynStackAlloc(MachineInstr &MI,
+ bool applyMappingDynStackAlloc(MachineIRBuilder &B,
const OperandsMapper &OpdMapper,
- MachineRegisterInfo &MRI) const;
- bool applyMappingLoad(MachineInstr &MI,
- const OperandsMapper &OpdMapper,
- MachineRegisterInfo &MRI) const;
- bool
- applyMappingImage(MachineInstr &MI,
- const OperandsMapper &OpdMapper,
- MachineRegisterInfo &MRI, int RSrcIdx) const;
+ MachineInstr &MI) const;
+ bool applyMappingLoad(MachineIRBuilder &B, const OperandsMapper &OpdMapper,
+ MachineInstr &MI) const;
+ bool applyMappingImage(MachineIRBuilder &B, MachineInstr &MI,
+ const OperandsMapper &OpdMapper, int RSrcIdx) const;
unsigned setBufferOffsets(MachineIRBuilder &B, Register CombinedOffset,
Register &VOffsetReg, Register &SOffsetReg,
int64_t &InstOffsetVal, Align Alignment) const;
- bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
+ bool applyMappingSBufferLoad(MachineIRBuilder &B,
+ const OperandsMapper &OpdMapper) const;
- bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const;
+ bool applyMappingBFE(MachineIRBuilder &B, const OperandsMapper &OpdMapper,
+ bool Signed) const;
- bool applyMappingMAD_64_32(const OperandsMapper &OpdMapper) const;
+ bool applyMappingMAD_64_32(MachineIRBuilder &B,
+ const OperandsMapper &OpdMapper) const;
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;
@@ -98,7 +91,8 @@ public:
splitBufferOffsets(MachineIRBuilder &B, Register Offset) const;
/// See RegisterBankInfo::applyMapping.
- void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+ void applyMappingImpl(MachineIRBuilder &Builder,
+ const OperandsMapper &OpdMapper) const override;
const ValueMapping *getValueMappingForPtr(const MachineRegisterInfo &MRI,
Register Ptr) const;
@@ -171,7 +165,7 @@ public:
bool isDivergentRegBank(const RegisterBank *RB) const override;
unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
- unsigned Size) const override;
+ TypeSize Size) const override;
unsigned getBreakDownCost(const ValueMapping &ValMapping,
const RegisterBank *CurBank = nullptr) const override;
@@ -186,12 +180,9 @@ public:
getInstrMapping(const MachineInstr &MI) const override;
private:
-
- bool foldExtractEltToCmpSelect(MachineInstr &MI,
- MachineRegisterInfo &MRI,
+ bool foldExtractEltToCmpSelect(MachineIRBuilder &B, MachineInstr &MI,
const OperandsMapper &OpdMapper) const;
- bool foldInsertEltToCmpSelect(MachineInstr &MI,
- MachineRegisterInfo &MRI,
+ bool foldInsertEltToCmpSelect(MachineIRBuilder &B, MachineInstr &MI,
const OperandsMapper &OpdMapper) const;
};
} // End llvm namespace.
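
The refactored helper declarations above all take the MachineIRBuilder up front and no longer carry a separate MachineRegisterInfo parameter. Below is a minimal sketch of the resulting call pattern, assuming a builder B already positioned at the instruction; the wrapper function name and the operand index 2 are illustrative only and are not part of the patch.

    #include "AMDGPURegisterBankInfo.h"
    #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
    using namespace llvm;

    // Illustrative wrapper only; OpIdx 2 is an arbitrary example operand.
    static void constrainExampleOperand(const AMDGPURegisterBankInfo &RBI,
                                        MachineIRBuilder &B, MachineInstr &MI,
                                        bool OperandIsUniform) {
      if (OperandIsUniform) {
        // A uniform value that landed in a VGPR can be moved into an SGPR
        // with a single readfirstlane copy.
        RBI.constrainOpWithReadfirstlane(B, MI, /*OpIdx=*/2);
      } else {
        // A potentially divergent value has to be handled by re-executing MI
        // inside a waterfall loop over the listed operand indices.
        RBI.executeInWaterfallLoop(B, MI, {2});
      }
    }
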
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
index 580352fb8cf4..552380d54dfd 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
@@ -89,15 +89,23 @@ const SubtargetSubTypeKV *getGPUInfo(const GCNSubtarget &ST,
return nullptr;
}
-constexpr unsigned FeaturesToCheck[] = {
- AMDGPU::FeatureGFX11Insts, AMDGPU::FeatureGFX10Insts,
- AMDGPU::FeatureGFX9Insts, AMDGPU::FeatureGFX8Insts,
- AMDGPU::FeatureDPP, AMDGPU::Feature16BitInsts,
- AMDGPU::FeatureDot1Insts, AMDGPU::FeatureDot2Insts,
- AMDGPU::FeatureDot3Insts, AMDGPU::FeatureDot4Insts,
- AMDGPU::FeatureDot5Insts, AMDGPU::FeatureDot6Insts,
- AMDGPU::FeatureDot7Insts, AMDGPU::FeatureDot8Insts,
-};
+constexpr unsigned FeaturesToCheck[] = {AMDGPU::FeatureGFX11Insts,
+ AMDGPU::FeatureGFX10Insts,
+ AMDGPU::FeatureGFX9Insts,
+ AMDGPU::FeatureGFX8Insts,
+ AMDGPU::FeatureDPP,
+ AMDGPU::Feature16BitInsts,
+ AMDGPU::FeatureDot1Insts,
+ AMDGPU::FeatureDot2Insts,
+ AMDGPU::FeatureDot3Insts,
+ AMDGPU::FeatureDot4Insts,
+ AMDGPU::FeatureDot5Insts,
+ AMDGPU::FeatureDot6Insts,
+ AMDGPU::FeatureDot7Insts,
+ AMDGPU::FeatureDot8Insts,
+ AMDGPU::FeatureExtendedImageInsts,
+ AMDGPU::FeatureSMemRealTime,
+ AMDGPU::FeatureSMemTimeInst};
FeatureBitset expandImpliedFeatures(const FeatureBitset &Features) {
FeatureBitset Result = Features;
@@ -120,7 +128,6 @@ void reportFunctionRemoved(Function &F, unsigned Feature) {
<< getFeatureName(Feature)
<< " is not supported on the current target";
});
- return;
}
} // end anonymous namespace
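
The extended FeaturesToCheck table above now also flags FeatureExtendedImageInsts, FeatureSMemRealTime and FeatureSMemTimeInst. The sketch below shows the shape of the test the pass performs with that table, assuming it sits next to the table in this file and that expanded feature bitsets for the function and the selected GPU are available; the helper name and both parameters are placeholders, not code from the patch.

    #include "llvm/MC/MCSubtargetInfo.h" // FeatureBitset
    using namespace llvm;

    // Placeholder helper: true if FnFeatures requests any feature from
    // FeaturesToCheck that the selected GPU (GPUFeatures) does not provide.
    static bool usesUnsupportedFeature(const FeatureBitset &FnFeatures,
                                       const FeatureBitset &GPUFeatures) {
      for (unsigned Feature : FeaturesToCheck)
        if (FnFeatures.test(Feature) && !GPUFeatures.test(Feature))
          return true; // e.g. s_memrealtime on a GPU without FeatureSMemRealTime
      return false;
    }
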
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 804bf503e4f9..db5d2bbcf5bb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -185,7 +185,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
//
// If we only have implicit uses of flat_scr on flat instructions, it is not
// really needed.
- if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
+ if (Info.UsesFlatScratch && !MFI->getUserSGPRInfo().hasFlatScratchInit() &&
(!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 2fde7afc0c14..5087f1a90245 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -43,7 +43,6 @@
#include "AMDGPU.h"
#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/IR/AttributeMask.h"
@@ -331,6 +330,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
NewFunc->removeRetAttrs(RetAttrs);
// TODO: How to preserve metadata?
+ NewFunc->setIsNewDbgInfoFormat(F.IsNewDbgInfoFormat);
+
// Move the body of the function into the new rewritten function, and replace
// this function with a stub.
NewFunc->splice(NewFunc->begin(), &F);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
index 9c07851243c9..459400e3359c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
@@ -69,11 +69,11 @@ using namespace llvm;
namespace {
-class AMDGPURewriteUndefForPHI : public FunctionPass {
+class AMDGPURewriteUndefForPHILegacy : public FunctionPass {
public:
static char ID;
- AMDGPURewriteUndefForPHI() : FunctionPass(ID) {
- initializeAMDGPURewriteUndefForPHIPass(*PassRegistry::getPassRegistry());
+ AMDGPURewriteUndefForPHILegacy() : FunctionPass(ID) {
+ initializeAMDGPURewriteUndefForPHILegacyPass(*PassRegistry::getPassRegistry());
}
bool runOnFunction(Function &F) override;
StringRef getPassName() const override {
@@ -91,13 +91,13 @@ public:
};
} // end anonymous namespace
-char AMDGPURewriteUndefForPHI::ID = 0;
+char AMDGPURewriteUndefForPHILegacy::ID = 0;
-INITIALIZE_PASS_BEGIN(AMDGPURewriteUndefForPHI, DEBUG_TYPE,
+INITIALIZE_PASS_BEGIN(AMDGPURewriteUndefForPHILegacy, DEBUG_TYPE,
"Rewrite undef for PHI", false, false)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(AMDGPURewriteUndefForPHI, DEBUG_TYPE,
+INITIALIZE_PASS_END(AMDGPURewriteUndefForPHILegacy, DEBUG_TYPE,
"Rewrite undef for PHI", false, false)
bool rewritePHIs(Function &F, UniformityInfo &UA, DominatorTree *DT) {
@@ -170,13 +170,27 @@ bool rewritePHIs(Function &F, UniformityInfo &UA, DominatorTree *DT) {
return Changed;
}
-bool AMDGPURewriteUndefForPHI::runOnFunction(Function &F) {
+bool AMDGPURewriteUndefForPHILegacy::runOnFunction(Function &F) {
UniformityInfo &UA =
getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
return rewritePHIs(F, UA, DT);
}
-FunctionPass *llvm::createAMDGPURewriteUndefForPHIPass() {
- return new AMDGPURewriteUndefForPHI();
+PreservedAnalyses
+AMDGPURewriteUndefForPHIPass::run(Function &F, FunctionAnalysisManager &AM) {
+ UniformityInfo &UA = AM.getResult<UniformityInfoAnalysis>(F);
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ bool Changed = rewritePHIs(F, UA, DT);
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+FunctionPass *llvm::createAMDGPURewriteUndefForPHILegacyPass() {
+ return new AMDGPURewriteUndefForPHILegacy();
}
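
With the legacy pass renamed and a new-pass-manager wrapper added, the transform is now reachable from a FunctionPassManager as well. A small usage sketch follows, assuming the pass class is declared in AMDGPU.h (the pipeline hook registered later in AMDGPUTargetMachine.cpp constructs it the same way); the helper function name is made up for illustration.

    #include "AMDGPU.h"
    #include "llvm/IR/PassManager.h"
    using namespace llvm;

    // Made-up helper: schedule the new-PM pass. Its run() method above
    // reports that only the CFG analyses are preserved when it rewrites PHIs.
    static void scheduleRewriteUndefForPHI(FunctionPassManager &FPM) {
      FPM.addPass(AMDGPURewriteUndefForPHIPass());
    }
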
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 317f3f21d240..beb670669581 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -241,9 +241,13 @@ def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmin_num>;
+def : SourceOfDivergence<int_amdgcn_flat_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_flat_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
@@ -333,6 +337,8 @@ def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_swap>;
def : SourceOfDivergence<int_amdgcn_permlane16>;
def : SourceOfDivergence<int_amdgcn_permlanex16>;
+def : SourceOfDivergence<int_amdgcn_permlane16_var>;
+def : SourceOfDivergence<int_amdgcn_permlanex16_var>;
def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 9b50f4fa53ac..f19c57668564 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -17,6 +17,7 @@
#include "AMDGPULegalizerInfo.h"
#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
#include "R600Subtarget.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -166,6 +167,10 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) : TargetTriple(TT) {}
+bool AMDGPUSubtarget::useRealTrue16Insts() const {
+ return hasTrue16BitInsts() && EnableRealTrue16Insts;
+}
+
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
const GCNTargetMachine &TM)
: // clang-format off
@@ -196,14 +201,18 @@ unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
case AMDGPU::V_LSHLREV_B64_e64:
case AMDGPU::V_LSHLREV_B64_gfx10:
case AMDGPU::V_LSHLREV_B64_e64_gfx11:
+ case AMDGPU::V_LSHLREV_B64_e32_gfx12:
+ case AMDGPU::V_LSHLREV_B64_e64_gfx12:
case AMDGPU::V_LSHL_B64_e64:
case AMDGPU::V_LSHRREV_B64_e64:
case AMDGPU::V_LSHRREV_B64_gfx10:
case AMDGPU::V_LSHRREV_B64_e64_gfx11:
+ case AMDGPU::V_LSHRREV_B64_e64_gfx12:
case AMDGPU::V_LSHR_B64_e64:
case AMDGPU::V_ASHRREV_I64_e64:
case AMDGPU::V_ASHRREV_I64_gfx10:
case AMDGPU::V_ASHRREV_I64_e64_gfx11:
+ case AMDGPU::V_ASHRREV_I64_e64_gfx12:
case AMDGPU::V_ASHR_I64_e64:
return 1;
}
@@ -692,7 +701,7 @@ GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
- return getBaseReservedNumSGPRs(MFI.hasFlatScratchInit());
+ return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit());
}
unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const {
@@ -771,24 +780,26 @@ unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
}
static unsigned getMaxNumPreloadedSGPRs() {
+ using USI = GCNUserSGPRUsageInfo;
// Max number of user SGPRs
- unsigned MaxUserSGPRs = 4 + // private segment buffer
- 2 + // Dispatch ptr
- 2 + // queue ptr
- 2 + // kernel segment ptr
- 2 + // dispatch ID
- 2 + // flat scratch init
- 2; // Implicit buffer ptr
+ const unsigned MaxUserSGPRs =
+ USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
+ USI::getNumUserSGPRForField(USI::DispatchPtrID) +
+ USI::getNumUserSGPRForField(USI::QueuePtrID) +
+ USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
+ USI::getNumUserSGPRForField(USI::DispatchIdID) +
+ USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
+ USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);
// Max number of system SGPRs
- unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
- 1 + // WorkGroupIDY
- 1 + // WorkGroupIDZ
- 1 + // WorkGroupInfo
- 1; // private segment wave byte offset
+ const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
+ 1 + // WorkGroupIDY
+ 1 + // WorkGroupIDZ
+ 1 + // WorkGroupInfo
+ 1; // private segment wave byte offset
// Max number of synthetic SGPRs
- unsigned SyntheticSGPRs = 1; // LDSKernelId
+ const unsigned SyntheticSGPRs = 1; // LDSKernelId
return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
}
@@ -994,6 +1005,9 @@ GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const {
}
unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const {
+ if (getGeneration() >= AMDGPUSubtarget::GFX12)
+ return 0; // Not MIMG encoding.
+
if (NSAThreshold.getNumOccurrences() > 0)
return std::max(NSAThreshold.getValue(), 2u);
@@ -1018,3 +1032,79 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct
else
return static_cast<const AMDGPUSubtarget&>(TM.getSubtarget<R600Subtarget>(F));
}
+
+GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
+ const GCNSubtarget &ST)
+ : ST(ST) {
+ const CallingConv::ID CC = F.getCallingConv();
+ const bool IsKernel =
+ CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
+ // FIXME: Should have analysis or something rather than attribute to detect
+ // calls.
+ const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
+ // FIXME: This attribute is a hack, we just need an analysis on the function
+ // to look for allocas.
+ const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
+
+ if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
+ KernargSegmentPtr = true;
+
+ bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
+ if (IsAmdHsaOrMesa && !ST.enableFlatScratch())
+ PrivateSegmentBuffer = true;
+ else if (ST.isMesaGfxShader(F))
+ ImplicitBufferPtr = true;
+
+ if (!AMDGPU::isGraphics(CC)) {
+ if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
+ DispatchPtr = true;
+
+ // FIXME: Can this always be disabled with < COv5?
+ if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
+ QueuePtr = true;
+
+ if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
+ DispatchID = true;
+ }
+
+ // TODO: This could be refined a lot. The attribute is a poor way of
+ // detecting calls or stack objects that may require it before argument
+ // lowering.
+ if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
+ (IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
+ (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
+ !ST.flatScratchIsArchitected()) {
+ FlatScratchInit = true;
+ }
+
+ if (hasImplicitBufferPtr())
+ NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
+
+ if (hasPrivateSegmentBuffer())
+ NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID);
+
+ if (hasDispatchPtr())
+ NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
+
+ if (hasQueuePtr())
+ NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
+
+ if (hasKernargSegmentPtr())
+ NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
+
+ if (hasDispatchID())
+ NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
+
+ if (hasFlatScratchInit())
+ NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
+}
+
+void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) {
+ assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
+ NumKernargPreloadSGPRs += NumSGPRs;
+ NumUsedUserSGPRs += NumSGPRs;
+}
+
+unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() {
+ return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
+}
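
The new GCNUserSGPRUsageInfo tracks, per function, which fixed user-SGPR fields are in use and how many user SGPRs remain for kernarg preloading. A minimal usage sketch, assuming the class is declared in the newly included GCNSubtarget.h; the helper name and the NumWanted parameter are illustrative only.

    #include "GCNSubtarget.h"
    #include "llvm/IR/Function.h"
    #include <algorithm>
    using namespace llvm;

    // Illustrative only: reserve up to NumWanted user SGPRs for kernarg
    // preload, but never more than the accounting says is still free.
    static unsigned reservePreloadSGPRs(const Function &F,
                                        const GCNSubtarget &ST,
                                        unsigned NumWanted) {
      GCNUserSGPRUsageInfo Info(F, ST);
      unsigned NumFree = Info.getNumFreeUserSGPRs();
      unsigned NumToAlloc = std::min(NumWanted, NumFree);
      Info.allocKernargPreloadSGPRs(NumToAlloc); // asserts it stays within the HW limit
      return NumToAlloc;
    }
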
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 10ce00fe68ca..b72697973be7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -39,7 +39,8 @@ public:
VOLCANIC_ISLANDS = 7,
GFX9 = 8,
GFX10 = 9,
- GFX11 = 10
+ GFX11 = 10,
+ GFX12 = 11,
};
private:
@@ -49,6 +50,7 @@ protected:
bool GCN3Encoding = false;
bool Has16BitInsts = false;
bool HasTrue16BitInsts = false;
+ bool EnableRealTrue16Insts = false;
bool HasMadMixInsts = false;
bool HasMadMacF32Insts = false;
bool HasDsSrc2Insts = false;
@@ -153,8 +155,17 @@ public:
return Has16BitInsts;
}
+ /// Return true if the subtarget supports True16 instructions.
bool hasTrue16BitInsts() const { return HasTrue16BitInsts; }
+ /// Return true if real (non-fake) variants of True16 instructions using
+ /// 16-bit registers should be code-generated. Fake True16 instructions are
+ /// identical to non-fake ones except that they take 32-bit registers as
+ /// operands and always use their low halves.
+ // TODO: Remove and use hasTrue16BitInsts() instead once True16 is fully
+ // supported and the support for fake True16 instructions is removed.
+ bool useRealTrue16Insts() const;
+
bool hasMadMixInsts() const {
return HasMadMixInsts;
}
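
The comments above give two related queries: hasTrue16BitInsts() says the target has True16 instructions at all, while useRealTrue16Insts() additionally requires that the real 16-bit-register encodings be emitted rather than the fake 32-bit-register variants. A tiny sketch of that decision, with a made-up function name:

    #include "AMDGPUSubtarget.h"
    using namespace llvm;

    // Made-up query: pick the register width used for a 16-bit value.
    static unsigned regBitsFor16BitValue(const AMDGPUSubtarget &ST) {
      if (ST.useRealTrue16Insts())
        return 16; // real True16: genuine 16-bit registers
      if (ST.hasTrue16BitInsts())
        return 32; // fake True16: 32-bit registers, only the low half is used
      return 32;   // no True16 support at all
    }
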
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 87ef2333e2ea..e8c04ecf39ba 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -50,6 +50,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Transforms/HipStdPar/HipStdPar.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
@@ -173,12 +174,6 @@ static VGPRRegisterRegAlloc fastRegAllocVGPR(
"fast", "fast register allocator", createFastVGPRRegisterAllocator);
}
-static cl::opt<bool> EnableSROA(
- "amdgpu-sroa",
- cl::desc("Run SROA after promote alloca pass"),
- cl::ReallyHidden,
- cl::init(true));
-
static cl::opt<bool>
EnableEarlyIfConversion("amdgpu-early-ifcvt", cl::Hidden,
cl::desc("Run early if-conversion"),
@@ -291,6 +286,12 @@ static cl::opt<bool> EnableSIModeRegisterPass(
cl::init(true),
cl::Hidden);
+// Enable GFX11.5+ s_singleuse_vdst insertion
+static cl::opt<bool>
+ EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
+ cl::desc("Enable s_singleuse_vdst insertion"),
+ cl::init(false), cl::Hidden);
+
// Enable GFX11+ s_delay_alu insertion
static cl::opt<bool>
EnableInsertDelayAlu("amdgpu-enable-delay-alu",
@@ -339,6 +340,11 @@ static cl::opt<bool> EnablePromoteKernelArguments(
cl::desc("Enable promotion of flat kernel pointer arguments to global"),
cl::Hidden, cl::init(true));
+static cl::opt<bool> EnableImageIntrinsicOptimizer(
+ "amdgpu-enable-image-intrinsic-optimizer",
+ cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
+ cl::Hidden);
+
static cl::opt<bool> EnableMaxIlpSchedStrategy(
"amdgpu-enable-max-ilp-scheduling-strategy",
cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
@@ -346,9 +352,14 @@ static cl::opt<bool> EnableMaxIlpSchedStrategy(
static cl::opt<bool> EnableRewritePartialRegUses(
"amdgpu-enable-rewrite-partial-reg-uses",
- cl::desc("Enable rewrite partial reg uses pass"), cl::init(false),
+ cl::desc("Enable rewrite partial reg uses pass"), cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnableHipStdPar(
+ "amdgpu-enable-hipstdpar",
+ cl::desc("Enable HIP Standard Parallelism Offload support"), cl::init(false),
+ cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
@@ -364,6 +375,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeGCNDPPCombinePass(*PR);
initializeSILowerI1CopiesPass(*PR);
+ initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
+ initializeSILowerWWMCopiesPass(*PR);
initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
@@ -375,7 +388,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUCtorDtorLoweringLegacyPass(*PR);
initializeAMDGPUAlwaysInlinePass(*PR);
- initializeAMDGPUAttributorPass(*PR);
+ initializeAMDGPUAttributorLegacyPass(*PR);
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
@@ -393,11 +406,12 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPULateCodeGenPreparePass(*PR);
initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
- initializeAMDGPULowerModuleLDSPass(*PR);
+ initializeAMDGPULowerModuleLDSLegacyPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
- initializeAMDGPURewriteUndefForPHIPass(*PR);
+ initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
+ initializeAMDGPUInsertSingleUseVDSTPass(*PR);
initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
@@ -415,14 +429,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
- initializeAMDGPUUseNativeCallsPass(*PR);
- initializeAMDGPUSimplifyLibCallsPass(*PR);
+ initializeAMDGPUImageIntrinsicOptimizerPass(*PR);
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeAMDGPUResourceUsageAnalysisPass(*PR);
initializeGCNNSAReassignPass(*PR);
initializeGCNPreRAOptimizationsPass(*PR);
initializeGCNPreRALongBranchRegPass(*PR);
initializeGCNRewritePartialRegUsesPass(*PR);
+ initializeGCNRegPressurePrinterPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -441,7 +455,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
- DAG->addMutation(createIGroupLPDAGMutation());
+ DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
@@ -451,7 +465,7 @@ static ScheduleDAGInstrs *
createGCNMaxILPMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxILPSchedStrategy>(C));
- DAG->addMutation(createIGroupLPDAGMutation());
+ DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
return DAG;
}
@@ -525,9 +539,10 @@ static StringRef computeDataLayout(const Triple &TT) {
// space 8) which cannot be non-trivially accessed by LLVM memory operations
// like getelementptr.
return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
- "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
+ "-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-"
+ "v32:32-v48:64-v96:"
"128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
- "G1-ni:7:8";
+ "G1-ni:7:8:9";
}
LLVM_READNONE
@@ -553,7 +568,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
TargetOptions Options,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM,
- CodeGenOpt::Level OptLevel)
+ CodeGenOptLevel OptLevel)
: LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
FS, Options, getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
@@ -588,8 +603,8 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
/// Predicate for Internalize pass.
static bool mustPreserveGV(const GlobalValue &GV) {
if (const Function *F = dyn_cast<Function>(&GV))
- return F->isDeclaration() || F->getName().startswith("__asan_") ||
- F->getName().startswith("__sanitizer_") ||
+ return F->isDeclaration() || F->getName().starts_with("__asan_") ||
+ F->getName().starts_with("__sanitizer_") ||
AMDGPU::isEntryFunctionCC(F->getCallingConv());
GV.removeDeadConstantUsers();
@@ -602,8 +617,12 @@ void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerPipelineParsingCallback(
- [](StringRef PassName, ModulePassManager &PM,
- ArrayRef<PassBuilder::PipelineElement>) {
+ [this](StringRef PassName, ModulePassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
+ if (PassName == "amdgpu-attributor") {
+ PM.addPass(AMDGPUAttributorPass(*this));
+ return true;
+ }
if (PassName == "amdgpu-unify-metadata") {
PM.addPass(AMDGPUUnifyMetadataPass());
return true;
@@ -617,7 +636,7 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
return true;
}
if (PassName == "amdgpu-lower-module-lds") {
- PM.addPass(AMDGPULowerModuleLDSPass());
+ PM.addPass(AMDGPULowerModuleLDSPass(*this));
return true;
}
if (PassName == "amdgpu-lower-ctor-dtor") {
@@ -630,7 +649,11 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
[this](StringRef PassName, FunctionPassManager &PM,
ArrayRef<PassBuilder::PipelineElement>) {
if (PassName == "amdgpu-simplifylib") {
- PM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+ PM.addPass(AMDGPUSimplifyLibCallsPass());
+ return true;
+ }
+ if (PassName == "amdgpu-image-intrinsic-opt") {
+ PM.addPass(AMDGPUImageIntrinsicOptimizerPass(*this));
return true;
}
if (PassName == "amdgpu-usenative") {
@@ -666,6 +689,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(AMDGPUCodeGenPreparePass(*this));
return true;
}
+ if (PassName == "amdgpu-lower-kernel-arguments") {
+ PM.addPass(AMDGPULowerKernelArgumentsPass(*this));
+ return true;
+ }
+ if (PassName == "amdgpu-rewrite-undef-for-phi") {
+ PM.addPass(AMDGPURewriteUndefForPHIPass());
+ return true;
+ }
return false;
});
@@ -682,12 +713,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
PB.registerPipelineStartEPCallback(
- [this](ModulePassManager &PM, OptimizationLevel Level) {
+ [](ModulePassManager &PM, OptimizationLevel Level) {
FunctionPassManager FPM;
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
- FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
+ FPM.addPass(AMDGPUSimplifyLibCallsPass());
PM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
+ if (EnableHipStdPar)
+ PM.addPass(HipStdParAcceleratorCodeSelectionPass());
});
PB.registerPipelineEarlySimplificationEPCallback(
@@ -826,7 +859,7 @@ GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT,
TargetOptions Options,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM,
- CodeGenOpt::Level OL, bool JIT)
+ CodeGenOptLevel OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {}
const TargetSubtargetInfo *
@@ -894,8 +927,8 @@ public:
if (ST.shouldClusterStores())
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
- DAG->addMutation(createIGroupLPDAGMutation());
- if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
+ DAG->addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/true));
+ if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
DAG->addMutation(createVOPDPairingMutation());
return DAG;
}
@@ -942,7 +975,7 @@ AMDGPUPassConfig::AMDGPUPassConfig(LLVMTargetMachine &TM, PassManagerBase &PM)
}
void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
- if (getOptLevel() == CodeGenOpt::Aggressive)
+ if (getOptLevel() == CodeGenOptLevel::Aggressive)
addPass(createGVNPass());
else
addPass(createEarlyCSEPass());
@@ -966,6 +999,10 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
void AMDGPUPassConfig::addIRPasses() {
const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+ Triple::ArchType Arch = TM.getTargetTriple().getArch();
+ if (RemoveIncompatibleFunctions && Arch == Triple::amdgcn)
+ addPass(createAMDGPURemoveIncompatibleFunctionsPass(&TM));
+
// There is no reason to run these.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
@@ -975,12 +1012,15 @@ void AMDGPUPassConfig::addIRPasses() {
if (LowerCtorDtor)
addPass(createAMDGPUCtorDtorLoweringLegacyPass());
+ if (isPassEnabled(EnableImageIntrinsicOptimizer))
+ addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
+
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
- if (TM.getTargetTriple().getArch() == Triple::r600)
+ if (Arch == Triple::r600)
addPass(createR600OpenCLImageTypeLoweringPass());
// Replace OpenCL enqueued block function pointers with global variables.
@@ -988,24 +1028,29 @@ void AMDGPUPassConfig::addIRPasses() {
// Runs before PromoteAlloca so the latter can account for function uses
if (EnableLowerModuleLDS) {
- addPass(createAMDGPULowerModuleLDSPass());
+ addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
}
// AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
// after their introduction
- if (TM.getOptLevel() > CodeGenOpt::None)
- addPass(createAMDGPUAttributorPass());
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
+ addPass(createAMDGPUAttributorLegacyPass());
- if (TM.getOptLevel() > CodeGenOpt::None)
+ if (TM.getOptLevel() > CodeGenOptLevel::None)
addPass(createInferAddressSpacesPass());
+ // Run atomic optimizer before Atomic Expand
+ if ((TM.getTargetTriple().getArch() == Triple::amdgcn) &&
+ (TM.getOptLevel() >= CodeGenOptLevel::Less) &&
+ (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
+ addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
+ }
+
addPass(createAtomicExpandPass());
- if (TM.getOptLevel() > CodeGenOpt::None) {
+ if (TM.getOptLevel() > CodeGenOptLevel::None) {
addPass(createAMDGPUPromoteAlloca());
- if (EnableSROA)
- addPass(createSROAPass());
if (isPassEnabled(EnableScalarIRPasses))
addStraightLineScalarOptimizationPasses();
@@ -1025,7 +1070,7 @@ void AMDGPUPassConfig::addIRPasses() {
// Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
// have expanded.
- if (TM.getOptLevel() > CodeGenOpt::Less)
+ if (TM.getOptLevel() > CodeGenOptLevel::Less)
addPass(createLICMPass());
}
@@ -1049,9 +1094,6 @@ void AMDGPUPassConfig::addIRPasses() {
void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
- if (RemoveIncompatibleFunctions)
- addPass(createAMDGPURemoveIncompatibleFunctionsPass(TM));
-
// FIXME: This pass adds 2 hacky attributes that can be replaced with an
// analysis, and should be removed.
addPass(createAMDGPUAnnotateKernelFeaturesPass());
@@ -1074,7 +1116,7 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
}
bool AMDGPUPassConfig::addPreISel() {
- if (TM->getOptLevel() > CodeGenOpt::None)
+ if (TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createFlattenCFGPass());
return false;
}
@@ -1125,15 +1167,10 @@ ScheduleDAGInstrs *GCNPassConfig::createMachineScheduler(
bool GCNPassConfig::addPreISel() {
AMDGPUPassConfig::addPreISel();
- if (TM->getOptLevel() > CodeGenOpt::None)
+ if (TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createAMDGPULateCodeGenPreparePass());
- if ((TM->getOptLevel() >= CodeGenOpt::Less) &&
- (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
- addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
- }
-
- if (TM->getOptLevel() > CodeGenOpt::None)
+ if (TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createSinkingPass());
// Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
@@ -1152,11 +1189,11 @@ bool GCNPassConfig::addPreISel() {
// TODO: Move this right after structurizeCFG to avoid extra divergence
// analysis. This depends on stopping SIAnnotateControlFlow from making
// control flow modifications.
- addPass(createAMDGPURewriteUndefForPHIPass());
+ addPass(createAMDGPURewriteUndefForPHILegacyPass());
}
addPass(createLCSSAPass());
- if (TM->getOptLevel() > CodeGenOpt::Less)
+ if (TM->getOptLevel() > CodeGenOptLevel::Less)
addPass(&AMDGPUPerfHintAnalysisID);
return false;
@@ -1207,7 +1244,7 @@ bool GCNPassConfig::addIRTranslator() {
}
void GCNPassConfig::addPreLegalizeMachineIR() {
- bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
addPass(new Localizer());
}
@@ -1218,8 +1255,9 @@ bool GCNPassConfig::addLegalizeMachineIR() {
}
void GCNPassConfig::addPreRegBankSelect() {
- bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
+ addPass(createAMDGPUGlobalISelDivergenceLoweringPass());
}
bool GCNPassConfig::addRegBankSelect() {
@@ -1228,7 +1266,7 @@ bool GCNPassConfig::addRegBankSelect() {
}
void GCNPassConfig::addPreGlobalInstructionSelect() {
- bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ bool IsOptNone = getOptLevel() == CodeGenOptLevel::None;
addPass(createAMDGPURegBankCombiner(IsOptNone));
}
@@ -1253,7 +1291,6 @@ void GCNPassConfig::addFastRegAlloc() {
insertPass(&PHIEliminationID, &SILowerControlFlowID);
insertPass(&TwoAddressInstructionPassID, &SIWholeQuadModeID);
- insertPass(&TwoAddressInstructionPassID, &SIPreAllocateWWMRegsID);
TargetPassConfig::addFastRegAlloc();
}
@@ -1262,7 +1299,6 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// Allow the scheduler to run before SIWholeQuadMode inserts exec manipulation
// instructions that cause scheduling barriers.
insertPass(&MachineSchedulerID, &SIWholeQuadModeID);
- insertPass(&MachineSchedulerID, &SIPreAllocateWWMRegsID);
if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
@@ -1275,7 +1311,7 @@ void GCNPassConfig::addOptimizedRegAlloc() {
// This is not an essential optimization and it has a noticeable impact on
// compilation time, so we only enable it from O2.
- if (TM->getOptLevel() > CodeGenOpt::Less)
+ if (TM->getOptLevel() > CodeGenOptLevel::Less)
insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
// FIXME: when an instruction has a Killed operand, and the instruction is
@@ -1296,6 +1332,7 @@ void GCNPassConfig::addOptimizedRegAlloc() {
}
bool GCNPassConfig::addPreRewrite() {
+ addPass(&SILowerWWMCopiesID);
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
return true;
@@ -1348,8 +1385,11 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
+ addPass(&SIPreAllocateWWMRegsID);
addPass(createVGPRAllocPass(false));
+
+ addPass(&SILowerWWMCopiesID);
return true;
}
@@ -1369,6 +1409,7 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
+ addPass(&SIPreAllocateWWMRegsID);
addPass(createVGPRAllocPass(true));
@@ -1380,32 +1421,32 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
void GCNPassConfig::addPostRegAlloc() {
addPass(&SIFixVGPRCopiesID);
- if (getOptLevel() > CodeGenOpt::None)
+ if (getOptLevel() > CodeGenOptLevel::None)
addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();
}
void GCNPassConfig::addPreSched2() {
- if (TM->getOptLevel() > CodeGenOpt::None)
+ if (TM->getOptLevel() > CodeGenOptLevel::None)
addPass(createSIShrinkInstructionsPass());
addPass(&SIPostRABundlerID);
}
void GCNPassConfig::addPreEmitPass() {
- if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
+ if (isPassEnabled(EnableVOPD, CodeGenOptLevel::Less))
addPass(&GCNCreateVOPDID);
addPass(createSIMemoryLegalizerPass());
addPass(createSIInsertWaitcntsPass());
addPass(createSIModeRegisterPass());
- if (getOptLevel() > CodeGenOpt::None)
+ if (getOptLevel() > CodeGenOptLevel::None)
addPass(&SIInsertHardClausesID);
addPass(&SILateBranchLoweringPassID);
- if (isPassEnabled(EnableSetWavePriority, CodeGenOpt::Less))
+ if (isPassEnabled(EnableSetWavePriority, CodeGenOptLevel::Less))
addPass(createAMDGPUSetWavePriorityPass());
- if (getOptLevel() > CodeGenOpt::None)
+ if (getOptLevel() > CodeGenOptLevel::None)
addPass(&SIPreEmitPeepholeID);
// The hazard recognizer that runs as part of the post-ra scheduler does not
// guarantee to be able to handle all hazards correctly. This is because if there
@@ -1417,7 +1458,10 @@ void GCNPassConfig::addPreEmitPass() {
// cases.
addPass(&PostRAHazardRecognizerID);
- if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
+ if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less))
+ addPass(&AMDGPUInsertSingleUseVDSTID);
+
+ if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less))
addPass(&AMDGPUInsertDelayAluID);
addPass(&BranchRelaxationPassID);
@@ -1458,13 +1502,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
static_cast<const yaml::SIMachineFunctionInfo &>(MFI_);
MachineFunction &MF = PFS.MF;
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (MFI->initializeBaseYamlFields(YamlMFI, MF, PFS, Error, SourceRange))
return true;
if (MFI->Occupancy == 0) {
// Fixup the subtarget dependent default value.
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MFI->Occupancy = ST.computeOccupancy(MF.getFunction(), MFI->getLDSSize());
}
@@ -1618,8 +1662,10 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->ArgInfo.WorkItemIDZ, 0, 0)))
return true;
- MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
- MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
+ if (ST.hasIEEEMode())
+ MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
+ if (ST.hasDX10ClampMode())
+ MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
// FIXME: Move proper support for denormal-fp-math into base MachineFunction
MFI->Mode.FP32Denormals.Input = YamlMFI.Mode.FP32InputDenormals
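
The pipeline-parsing callbacks registered above expose several new textual pass names ("amdgpu-attributor", "amdgpu-image-intrinsic-opt", "amdgpu-lower-kernel-arguments", "amdgpu-rewrite-undef-for-phi"). A hedged sketch of driving them through a PassBuilder-built pipeline; the pipeline string and helper name are examples, not mandated by the patch.

    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    using namespace llvm;

    // Example only: parse a pipeline using the newly registered AMDGPU pass
    // names. PB must have had registerPassBuilderCallbacks() run on it by the
    // AMDGPUTargetMachine, or the names will not be recognized.
    static Error buildExamplePipeline(PassBuilder &PB, ModulePassManager &MPM) {
      return PB.parsePassPipeline(
          MPM, "amdgpu-attributor,function(amdgpu-image-intrinsic-opt)");
    }
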
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 2426be405a65..9051a61e6557 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -41,7 +41,7 @@ public:
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
std::optional<Reloc::Model> RM,
- std::optional<CodeModel::Model> CM, CodeGenOpt::Level OL);
+ std::optional<CodeModel::Model> CM, CodeGenOptLevel OL);
~AMDGPUTargetMachine() override;
const TargetSubtargetInfo *getSubtargetImpl() const;
@@ -79,7 +79,7 @@ public:
GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
std::optional<Reloc::Model> RM,
- std::optional<CodeModel::Model> CM, CodeGenOpt::Level OL,
+ std::optional<CodeModel::Model> CM, CodeGenOptLevel OL,
bool JIT);
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
@@ -137,7 +137,7 @@ public:
/// be used given that a pass shall work at an optimization \p Level
/// minimum.
bool isPassEnabled(const cl::opt<bool> &Opt,
- CodeGenOpt::Level Level = CodeGenOpt::Default) const {
+ CodeGenOptLevel Level = CodeGenOptLevel::Default) const {
if (Opt.getNumOccurrences())
return Opt;
if (TM->getOptLevel() < Level)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index f854c8c16e5a..584e41bfd546 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -30,7 +30,7 @@ MCSection *AMDGPUTargetObjectFile::getExplicitSectionGlobal(
const GlobalObject *GO, SectionKind SK, const TargetMachine &TM) const {
// Set metadata access for the explicit section
StringRef SectionName = GO->getSection();
- if (SectionName.startswith(".AMDGPU.comment."))
+ if (SectionName.starts_with(".AMDGPU.comment."))
SK = SectionKind::getMetadata();
return TargetLoweringObjectFileELF::getExplicitSectionGlobal(GO, SK, TM);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 81d083c1c88a..f1da1a61bf4d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -296,7 +296,7 @@ GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()), CommonTTI(TM, F),
IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
- SIModeRegisterDefaults Mode(F);
+ SIModeRegisterDefaults Mode(F, *ST);
HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
HasFP64FP16Denormals =
Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
@@ -368,7 +368,8 @@ unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
- AddrSpace == AMDGPUAS::BUFFER_RESOURCE) {
+ AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
+ AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
return 512;
}
@@ -650,6 +651,15 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
return LT.first * Cost * NElts;
}
+ if (SLT == MVT::f32 && ((CxtI && CxtI->hasApproxFunc()) ||
+ TLI->getTargetMachine().Options.UnsafeFPMath)) {
+ // Fast unsafe fdiv lowering:
+ // f32 rcp
+ // f32 fmul
+ int Cost = getQuarterRateInstrCost(CostKind) + getFullRateInstrCost();
+ return LT.first * Cost * NElts;
+ }
+
if (SLT == MVT::f32 || SLT == MVT::f16) {
// 4 more v_cvt_* insts without f16 insts support
int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
@@ -883,7 +893,7 @@ bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
return true;
// Special case scalar registers that start with 'v'.
- if (RegName.startswith("vcc") || RegName.empty())
+ if (RegName.starts_with("vcc") || RegName.empty())
return false;
// VGPR or AGPR is divergent. There aren't any specially named vector
@@ -1017,6 +1027,8 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
OpIndexes.push_back(0);
return true;
default:
@@ -1091,7 +1103,9 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
}
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmax:
- case Intrinsic::amdgcn_flat_atomic_fmin: {
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num: {
Type *DestTy = II->getType();
Type *SrcTy = NewV->getType();
unsigned NewAS = SrcTy->getPointerAddressSpace();
@@ -1114,7 +1128,8 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
TTI::TargetCostKind CostKind,
int Index, VectorType *SubTp,
ArrayRef<const Value *> Args) {
- Kind = improveShuffleKindFromMask(Kind, Mask);
+ Kind = improveShuffleKindFromMask(Kind, Mask, VT, Index, SubTp);
+
if (ST->hasVOP3PInsts()) {
if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
@@ -1153,8 +1168,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
// FIXME: dx10_clamp can just take the caller setting, but there seems to be
// no way to support merge for backend defined attributes.
- SIModeRegisterDefaults CallerMode(*Caller);
- SIModeRegisterDefaults CalleeMode(*Callee);
+ SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
+ SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
if (!CallerMode.isInlineCompatible(CalleeMode))
return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 9ad841c3c8a5..9bc3ba161c9e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -46,6 +46,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -114,8 +115,6 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
// We preserve the non-critical-edgeness property
AU.addPreservedID(BreakCriticalEdgesID);
- // This is a cluster of orthogonal Transforms
- AU.addPreservedID(LowerSwitchID);
FunctionPass::getAnalysisUsage(AU);
AU.addRequired<TargetTransformInfoWrapperPass>();
@@ -192,6 +191,8 @@ BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
const PostDominatorTree &PDT,
const UniformityInfo &UA) {
+ assert(hasOnlySimpleTerminator(F) && "Unsupported block terminator.");
+
if (PDT.root_size() == 0 ||
(PDT.root_size() == 1 &&
!isa<BranchInst>(PDT.getRoot()->getTerminator())))
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index b9443559132f..3b69a37728ea 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -75,6 +75,7 @@ public:
bool Abs = false;
bool Neg = false;
bool Sext = false;
+ bool Lit = false;
bool hasFPModifiers() const { return Abs || Neg; }
bool hasIntModifiers() const { return Sext; }
@@ -273,6 +274,10 @@ public:
return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i16);
}
+ bool isRegOrImmWithIntT16InputMods() const {
+ return isRegOrImmWithInputMods(AMDGPU::VS_16RegClassID, MVT::i16);
+ }
+
bool isRegOrImmWithInt32InputMods() const {
return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::i32);
}
@@ -293,6 +298,10 @@ public:
return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f16);
}
+ bool isRegOrImmWithFPT16InputMods() const {
+ return isRegOrImmWithInputMods(AMDGPU::VS_16RegClassID, MVT::f16);
+ }
+
bool isRegOrImmWithFP32InputMods() const {
return isRegOrImmWithInputMods(AMDGPU::VS_32RegClassID, MVT::f32);
}
@@ -347,29 +356,24 @@ public:
return isImm() && Imm.Type == ImmT;
}
+ template <ImmTy Ty> bool isImmTy() const { return isImmTy(Ty); }
+
bool isImmLiteral() const { return isImmTy(ImmTyNone); }
bool isImmModifier() const {
return isImm() && Imm.Type != ImmTyNone;
}
- bool isClampSI() const { return isImmTy(ImmTyClampSI); }
bool isOModSI() const { return isImmTy(ImmTyOModSI); }
bool isDMask() const { return isImmTy(ImmTyDMask); }
bool isDim() const { return isImmTy(ImmTyDim); }
- bool isUNorm() const { return isImmTy(ImmTyUNorm); }
- bool isDA() const { return isImmTy(ImmTyDA); }
bool isR128A16() const { return isImmTy(ImmTyR128A16); }
- bool isA16() const { return isImmTy(ImmTyA16); }
- bool isLWE() const { return isImmTy(ImmTyLWE); }
bool isOff() const { return isImmTy(ImmTyOff); }
bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
- bool isExpVM() const { return isImmTy(ImmTyExpVM); }
- bool isExpCompr() const { return isImmTy(ImmTyExpCompr); }
bool isOffen() const { return isImmTy(ImmTyOffen); }
bool isIdxen() const { return isImmTy(ImmTyIdxen); }
bool isAddr64() const { return isImmTy(ImmTyAddr64); }
- bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); }
+ bool isOffset() const { return isImmTy(ImmTyOffset); }
bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); }
bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
bool isSMEMOffsetMod() const { return isImmTy(ImmTySMEMOffsetMod); }
@@ -378,7 +382,6 @@ public:
bool isLDS() const { return isImmTy(ImmTyLDS); }
bool isCPol() const { return isImmTy(ImmTyCPol); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
- bool isD16() const { return isImmTy(ImmTyD16); }
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); }
bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); }
bool isDppRowMask() const { return isImmTy(ImmTyDppRowMask); }
@@ -395,7 +398,6 @@ public:
bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
bool isNegLo() const { return isImmTy(ImmTyNegLo); }
bool isNegHi() const { return isImmTy(ImmTyNegHi); }
- bool isHigh() const { return isImmTy(ImmTyHigh); }
bool isRegOrImm() const {
return isReg() || isImm();
@@ -512,7 +514,15 @@ public:
return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::i64);
}
+ bool isVCSrcTB16() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::i16);
+ }
+
bool isVCSrcTB16_Lo128() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::i16);
+ }
+
+ bool isVCSrcFake16B16_Lo128() const {
return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::i16);
}
@@ -532,7 +542,15 @@ public:
return isRegOrInlineNoMods(AMDGPU::VS_64RegClassID, MVT::f64);
}
+ bool isVCSrcTF16() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_16RegClassID, MVT::f16);
+ }
+
bool isVCSrcTF16_Lo128() const {
+ return isRegOrInlineNoMods(AMDGPU::VS_16_Lo128RegClassID, MVT::f16);
+ }
+
+ bool isVCSrcFake16F16_Lo128() const {
return isRegOrInlineNoMods(AMDGPU::VS_32_Lo128RegClassID, MVT::f16);
}
@@ -552,10 +570,16 @@ public:
return isVCSrcF64() || isLiteralImm(MVT::i64);
}
+ bool isVSrcTB16() const { return isVCSrcTB16() || isLiteralImm(MVT::i16); }
+
bool isVSrcTB16_Lo128() const {
return isVCSrcTB16_Lo128() || isLiteralImm(MVT::i16);
}
+ bool isVSrcFake16B16_Lo128() const {
+ return isVCSrcFake16B16_Lo128() || isLiteralImm(MVT::i16);
+ }
+
bool isVSrcB16() const {
return isVCSrcB16() || isLiteralImm(MVT::i16);
}
@@ -588,10 +612,16 @@ public:
return isVCSrcF64() || isLiteralImm(MVT::f64);
}
+ bool isVSrcTF16() const { return isVCSrcTF16() || isLiteralImm(MVT::f16); }
+
bool isVSrcTF16_Lo128() const {
return isVCSrcTF16_Lo128() || isLiteralImm(MVT::f16);
}
+ bool isVSrcFake16F16_Lo128() const {
+ return isVCSrcFake16F16_Lo128() || isLiteralImm(MVT::f16);
+ }
+
bool isVSrcF16() const {
return isVCSrcF16() || isLiteralImm(MVT::f16);
}
@@ -863,6 +893,7 @@ public:
bool isSDelayALU() const;
bool isHwreg() const;
bool isSendMsg() const;
+ bool isSplitBarrier() const;
bool isSwizzle() const;
bool isSMRDOffset8() const;
bool isSMEMOffset() const;
@@ -879,6 +910,10 @@ public:
bool isWaitVDST() const;
bool isWaitEXP() const;
+ auto getPredicate(std::function<bool(const AMDGPUOperand &Op)> P) const {
+ return std::bind(P, *this);
+ }
+
StringRef getToken() const {
assert(isToken());
return StringRef(Tok.Data, Tok.Length);
@@ -1344,7 +1379,7 @@ public:
// AsmParser::parseDirectiveSet() cannot be specialized for a specific target.
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
MCContext &Ctx = getContext();
- if (ISA.Major >= 6 && isHsaAbiVersion3AndAbove(&getSTI())) {
+ if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
MCSymbol *Sym =
Ctx.getOrCreateSymbol(Twine(".amdgcn.gfx_generation_number"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
@@ -1361,7 +1396,7 @@ public:
Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
}
- if (ISA.Major >= 6 && isHsaAbiVersion3AndAbove(&getSTI())) {
+ if (ISA.Major >= 6 && isHsaAbi(getSTI())) {
initializeGprCountSymbol(IS_VGPR);
initializeGprCountSymbol(IS_SGPR);
} else
@@ -1381,6 +1416,8 @@ public:
bool hasG16() const { return AMDGPU::hasG16(getSTI()); }
+ bool hasGDS() const { return AMDGPU::hasGDS(getSTI()); }
+
bool isSI() const {
return AMDGPU::isSI(getSTI());
}
@@ -1424,6 +1461,10 @@ public:
return AMDGPU::isGFX11Plus(getSTI());
}
+ bool isGFX12() const { return AMDGPU::isGFX12(getSTI()); }
+
+ bool isGFX12Plus() const { return AMDGPU::isGFX12Plus(getSTI()); }
+
bool isGFX10_AEncoding() const { return AMDGPU::isGFX10_AEncoding(getSTI()); }
bool isGFX10_BEncoding() const {
@@ -1456,10 +1497,16 @@ public:
return getFeatureBits()[AMDGPU::FeaturePartialNSAEncoding];
}
- unsigned getNSAMaxSize() const {
- return AMDGPU::getNSAMaxSize(getSTI());
+ unsigned getNSAMaxSize(bool HasSampler = false) const {
+ return AMDGPU::getNSAMaxSize(getSTI(), HasSampler);
+ }
+
+ unsigned getMaxNumUserSGPRs() const {
+ return AMDGPU::getMaxNumUserSGPRs(getSTI());
}
+ bool hasKernargPreload() const { return AMDGPU::hasKernargPreload(getSTI()); }
+
AMDGPUTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
return static_cast<AMDGPUTargetStreamer &>(TS);
@@ -1493,10 +1540,9 @@ public:
std::unique_ptr<AMDGPUOperand> parseRegister(bool RestoreOnFailure = false);
bool ParseRegister(MCRegister &RegNo, SMLoc &StartLoc, SMLoc &EndLoc,
bool RestoreOnFailure);
- bool parseRegister(MCRegister &RegNo, SMLoc &StartLoc,
- SMLoc &EndLoc) override;
- OperandMatchResultTy tryParseRegister(MCRegister &RegNo, SMLoc &StartLoc,
- SMLoc &EndLoc) override;
+ bool parseRegister(MCRegister &Reg, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ ParseStatus tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
unsigned checkTargetMatchPredicate(MCInst &Inst) override;
unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
@@ -1531,6 +1577,8 @@ public:
AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
unsigned getCPolKind(StringRef Id, StringRef Mnemo, bool &Disabling) const;
ParseStatus parseCPol(OperandVector &Operands);
+ ParseStatus parseScope(OperandVector &Operands, int64_t &Scope);
+ ParseStatus parseTH(OperandVector &Operands, int64_t &TH);
ParseStatus parseStringWithPrefix(StringRef Prefix, StringRef &Value,
SMLoc &StringLoc);
@@ -1540,9 +1588,11 @@ public:
bool isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
bool isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const;
bool parseSP3NegModifier();
- ParseStatus parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false);
+ ParseStatus parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false,
+ bool HasLit = false);
ParseStatus parseReg(OperandVector &Operands);
- ParseStatus parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false);
+ ParseStatus parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false,
+ bool HasLit = false);
ParseStatus parseRegOrImmWithFPInputMods(OperandVector &Operands,
bool AllowImm = true);
ParseStatus parseRegOrImmWithIntInputMods(OperandVector &Operands,
@@ -1616,6 +1666,7 @@ private:
SMLoc getInstLoc(const OperandVector &Operands) const;
bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands);
+ bool validateOffset(const MCInst &Inst, const OperandVector &Operands);
bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands);
bool validateSMEMOffset(const MCInst &Inst, const OperandVector &Operands);
bool validateSOPLiteral(const MCInst &Inst) const;
@@ -1640,11 +1691,14 @@ private:
bool validateAGPRLdSt(const MCInst &Inst) const;
bool validateVGPRAlign(const MCInst &Inst) const;
bool validateBLGP(const MCInst &Inst, const OperandVector &Operands);
+ bool validateDS(const MCInst &Inst, const OperandVector &Operands);
bool validateGWS(const MCInst &Inst, const OperandVector &Operands);
bool validateDivScale(const MCInst &Inst);
bool validateWaitCnt(const MCInst &Inst, const OperandVector &Operands);
bool validateCoherencyBits(const MCInst &Inst, const OperandVector &Operands,
const SMLoc &IDLoc);
+ bool validateTHAndScopeBits(const MCInst &Inst, const OperandVector &Operands,
+ const unsigned CPol);
bool validateExeczVcczOperands(const OperandVector &Operands);
bool validateTFE(const MCInst &Inst, const OperandVector &Operands);
std::optional<StringRef> validateLdsDirect(const MCInst &Inst);
@@ -1733,7 +1787,6 @@ public:
void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands);
void cvtVINTERP(MCInst &Inst, const OperandVector &Operands);
- void cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands);
bool parseDimId(unsigned &Encoding);
ParseStatus parseDim(OperandVector &Operands);
@@ -1805,6 +1858,7 @@ static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_KIMM32:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
return &APFloat::IEEEsingle();
case AMDGPU::OPERAND_REG_IMM_INT64:
case AMDGPU::OPERAND_REG_IMM_FP64:
@@ -1987,7 +2041,7 @@ bool AMDGPUOperand::isVRegWithInputMods() const {
return isRegClass(AMDGPU::VGPR_32RegClassID) ||
// GFX90A allows DPP on 64-bit operands.
(isRegClass(AMDGPU::VReg_64RegClassID) &&
- AsmParser->getFeatureBits()[AMDGPU::Feature64BitDPP]);
+ AsmParser->getFeatureBits()[AMDGPU::FeatureDPALU_DPP]);
}
bool AMDGPUOperand::isT16VRegWithInputMods() const {
@@ -2096,9 +2150,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(),
"Can't encode literal as exact 64-bit floating-point operand. "
"Low 32-bits will be set to zero");
+ Val &= 0xffffffff00000000u;
}
- Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
+ Inst.addOperand(MCOperand::createImm(Val));
setImmKindLiteral();
return;
}
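With this change an inexact 64-bit floating-point literal keeps its full 64-bit encoding with the low half cleared (matching the warning emitted just above), rather than the previous behaviour of adding only the shifted-down high word. A minimal sketch of the masking, in plain C++ with illustrative names:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
  double D = 1.2345678901234567; // low 32 bits of the encoding are nonzero
  uint64_t Bits;
  std::memcpy(&Bits, &D, sizeof(Bits));
  uint64_t Encoded = Bits & 0xffffffff00000000u; // low 32 bits forced to zero
  std::printf("original %016llx\nencoded  %016llx\n",
              (unsigned long long)Bits, (unsigned long long)Encoded);
}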
@@ -2133,7 +2188,8 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_KIMM32:
- case AMDGPU::OPERAND_KIMM16: {
+ case AMDGPU::OPERAND_KIMM16:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
bool lost;
APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
// Convert literal to single precision
@@ -2174,6 +2230,7 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
if (isSafeTruncation(Val, 32) &&
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
AsmParser->hasInv2PiInlineImm())) {
@@ -2197,7 +2254,10 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
return;
}
- Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
+ Val = AMDGPU::isSISrcFPOperand(InstDesc, OpNum) ? (uint64_t)Val << 32
+ : Lo_32(Val);
+
+ Inst.addOperand(MCOperand::createImm(Val));
setImmKindLiteral();
return;
@@ -2424,23 +2484,21 @@ bool AMDGPUAsmParser::ParseRegister(MCRegister &RegNo, SMLoc &StartLoc,
return false;
}
-bool AMDGPUAsmParser::parseRegister(MCRegister &RegNo, SMLoc &StartLoc,
+bool AMDGPUAsmParser::parseRegister(MCRegister &Reg, SMLoc &StartLoc,
SMLoc &EndLoc) {
- return ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/false);
+ return ParseRegister(Reg, StartLoc, EndLoc, /*RestoreOnFailure=*/false);
}
-OperandMatchResultTy AMDGPUAsmParser::tryParseRegister(MCRegister &RegNo,
- SMLoc &StartLoc,
- SMLoc &EndLoc) {
- bool Result =
- ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/true);
+ParseStatus AMDGPUAsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ bool Result = ParseRegister(Reg, StartLoc, EndLoc, /*RestoreOnFailure=*/true);
bool PendingErrors = getParser().hasPendingError();
getParser().clearPendingErrors();
if (PendingErrors)
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
if (Result)
- return MatchOperand_NoMatch;
- return MatchOperand_Success;
+ return ParseStatus::NoMatch;
+ return ParseStatus::Success;
}
bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
@@ -2517,7 +2575,7 @@ static bool isRegularReg(RegisterKind Kind) {
static const RegInfo* getRegularRegInfo(StringRef Str) {
for (const RegInfo &Reg : RegularRegisters)
- if (Str.startswith(Reg.Name))
+ if (Str.starts_with(Reg.Name))
return &Reg;
return nullptr;
}
@@ -2577,7 +2635,7 @@ AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
// SGPR and TTMP registers must be aligned.
// Max required alignment is 4 dwords.
- AlignSize = std::min(RegWidth / 32, 4u);
+ AlignSize = std::min(llvm::bit_ceil(RegWidth / 32), 4u);
}
if (RegNum % AlignSize != 0) {
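bit_ceil rounds the dword count up to a power of two before capping it at 4, so a 96-bit (3-dword) SGPR or TTMP tuple now requires 4-dword alignment instead of 3. A worked sketch using the standard-library equivalent of llvm::bit_ceil:

#include <algorithm>
#include <bit>
#include <cstdio>

int main() {
  for (unsigned RegWidth : {32u, 64u, 96u, 128u, 256u}) {
    unsigned AlignSize = std::min(std::bit_ceil(RegWidth / 32), 4u);
    std::printf("width %3u bits -> align %u dwords\n", RegWidth, AlignSize);
  }
}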
@@ -2855,7 +2913,7 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) {
if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
return nullptr;
}
- if (isHsaAbiVersion3AndAbove(&getSTI())) {
+ if (isHsaAbi(getSTI())) {
if (!updateGprCountSymbols(RegKind, RegNum, RegWidth))
return nullptr;
} else
@@ -2864,13 +2922,26 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) {
}
ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands,
- bool HasSP3AbsModifier) {
+ bool HasSP3AbsModifier, bool HasLit) {
// TODO: add syntactic sugar for 1/(2*PI)
if (isRegister())
return ParseStatus::NoMatch;
assert(!isModifier());
+ if (!HasLit) {
+ HasLit = trySkipId("lit");
+ if (HasLit) {
+ if (!skipToken(AsmToken::LParen, "expected left paren after lit"))
+ return ParseStatus::Failure;
+ ParseStatus S = parseImm(Operands, HasSP3AbsModifier, HasLit);
+ if (S.isSuccess() &&
+ !skipToken(AsmToken::RParen, "expected closing parentheses"))
+ return ParseStatus::Failure;
+ return S;
+ }
+ }
+
const auto& Tok = getToken();
const auto& NextTok = peekToken();
bool IsReal = Tok.is(AsmToken::Real);
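The lit(...) wrapper added above is handled by consuming the keyword, requiring an opening parenthesis, recursing into the same immediate parser with the HasLit flag set, and then requiring the closing parenthesis. A toy, self-contained sketch of that shape (the tokenizer and helper names are stand-ins, not the MC parser):

#include <cctype>
#include <cstddef>
#include <cstdio>
#include <optional>
#include <string>
#include <string_view>

static void skipSpaces(std::string_view &S) {
  while (!S.empty() && std::isspace((unsigned char)S.front()))
    S.remove_prefix(1);
}

static std::optional<long> parseImm(std::string_view &S) {
  skipSpaces(S);
  if (S.substr(0, 3) == "lit") { // lit(<imm>); the real parser also threads HasLit
    S.remove_prefix(3);
    skipSpaces(S);
    if (S.empty() || S.front() != '(')
      return std::nullopt; // "expected left paren after lit"
    S.remove_prefix(1);
    std::optional<long> Inner = parseImm(S);
    skipSpaces(S);
    if (!Inner || S.empty() || S.front() != ')')
      return std::nullopt; // "expected closing parentheses"
    S.remove_prefix(1);
    return Inner;
  }
  std::size_t Used = 0;
  long Val = std::stol(std::string(S), &Used); // plain integer immediate
  S.remove_prefix(Used);
  return Val;
}

int main() {
  std::string_view In = "lit(123)";
  if (std::optional<long> V = parseImm(In))
    std::printf("parsed %ld\n", *V);
}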
@@ -2883,6 +2954,9 @@ ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands,
Negate = true;
}
+ AMDGPUOperand::Modifiers Mods;
+ Mods.Lit = HasLit;
+
if (IsReal) {
// Floating-point expressions are not supported.
// Can only allow floating-point literals with an
@@ -2901,6 +2975,8 @@ ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands,
Operands.push_back(
AMDGPUOperand::CreateImm(this, RealVal.bitcastToAPInt().getZExtValue(), S,
AMDGPUOperand::ImmTyNone, true));
+ AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+ Op.setModifiers(Mods);
return ParseStatus::Success;
@@ -2927,7 +3003,11 @@ ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands,
if (Expr->evaluateAsAbsolute(IntVal)) {
Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S));
+ AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
+ Op.setModifiers(Mods);
} else {
+ if (HasLit)
+ return ParseStatus::NoMatch;
Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
}
@@ -2950,13 +3030,13 @@ ParseStatus AMDGPUAsmParser::parseReg(OperandVector &Operands) {
}
ParseStatus AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands,
- bool HasSP3AbsMod) {
+ bool HasSP3AbsMod, bool HasLit) {
ParseStatus Res = parseReg(Operands);
if (!Res.isNoMatch())
return Res;
if (isModifier())
return ParseStatus::NoMatch;
- return parseImm(Operands, HasSP3AbsMod);
+ return parseImm(Operands, HasSP3AbsMod, HasLit);
}
bool
@@ -3052,6 +3132,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
bool AllowImm) {
bool Neg, SP3Neg;
bool Abs, SP3Abs;
+ bool Lit;
SMLoc Loc;
// Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead.
@@ -3071,6 +3152,10 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
if (Abs && !skipToken(AsmToken::LParen, "expected left paren after abs"))
return ParseStatus::Failure;
+ Lit = trySkipId("lit");
+ if (Lit && !skipToken(AsmToken::LParen, "expected left paren after lit"))
+ return ParseStatus::Failure;
+
Loc = getLoc();
SP3Abs = trySkipToken(AsmToken::Pipe);
if (Abs && SP3Abs)
@@ -3078,12 +3163,15 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
ParseStatus Res;
if (AllowImm) {
- Res = parseRegOrImm(Operands, SP3Abs);
+ Res = parseRegOrImm(Operands, SP3Abs, Lit);
} else {
Res = parseReg(Operands);
}
if (!Res.isSuccess())
- return (SP3Neg || Neg || SP3Abs || Abs) ? ParseStatus::Failure : Res;
+ return (SP3Neg || Neg || SP3Abs || Abs || Lit) ? ParseStatus::Failure : Res;
+
+ if (Lit && !Operands.back()->isImm())
+ Error(Loc, "expected immediate with lit modifier");
if (SP3Abs && !skipToken(AsmToken::Pipe, "expected vertical bar"))
return ParseStatus::Failure;
@@ -3091,12 +3179,15 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
return ParseStatus::Failure;
if (Neg && !skipToken(AsmToken::RParen, "expected closing parentheses"))
return ParseStatus::Failure;
+ if (Lit && !skipToken(AsmToken::RParen, "expected closing parentheses"))
+ return ParseStatus::Failure;
AMDGPUOperand::Modifiers Mods;
Mods.Abs = Abs || SP3Abs;
Mods.Neg = Neg || SP3Neg;
+ Mods.Lit = Lit;
- if (Mods.hasFPModifiers()) {
+ if (Mods.hasFPModifiers() || Lit) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
if (Op.isExpr())
return Error(Op.getStartLoc(), "expected an absolute expression");
@@ -3325,12 +3416,16 @@ unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const {
case AMDGPU::V_LSHLREV_B64_e64:
case AMDGPU::V_LSHLREV_B64_gfx10:
case AMDGPU::V_LSHLREV_B64_e64_gfx11:
+ case AMDGPU::V_LSHLREV_B64_e32_gfx12:
+ case AMDGPU::V_LSHLREV_B64_e64_gfx12:
case AMDGPU::V_LSHRREV_B64_e64:
case AMDGPU::V_LSHRREV_B64_gfx10:
case AMDGPU::V_LSHRREV_B64_e64_gfx11:
+ case AMDGPU::V_LSHRREV_B64_e64_gfx12:
case AMDGPU::V_ASHRREV_I64_e64:
case AMDGPU::V_ASHRREV_I64_gfx10:
case AMDGPU::V_ASHRREV_I64_e64_gfx11:
+ case AMDGPU::V_ASHRREV_I64_e64_gfx12:
case AMDGPU::V_LSHL_B64_e64:
case AMDGPU::V_LSHR_B64_e64:
case AMDGPU::V_ASHR_I64_e64:
@@ -3485,8 +3580,12 @@ bool AMDGPUAsmParser::validateVOPDRegBankConstraints(
: MCRegister::NoRegister;
};
+ // On GFX12, if both OpX and OpY are V_MOV_B32 then OpY uses the SRC2 source cache.
+ bool SkipSrc = Opcode == AMDGPU::V_DUAL_MOV_B32_e32_X_MOV_B32_e32_gfx12;
+
const auto &InstInfo = getVOPDInstInfo(Opcode, &MII);
- auto InvalidCompOprIdx = InstInfo.getInvalidCompOperandIndex(getVRegIdx);
+ auto InvalidCompOprIdx =
+ InstInfo.getInvalidCompOperandIndex(getVRegIdx, SkipSrc);
if (!InvalidCompOprIdx)
return true;
@@ -3522,13 +3621,16 @@ bool AMDGPUAsmParser::validateIntClampSupported(const MCInst &Inst) {
return true;
}
+constexpr uint64_t MIMGFlags =
+ SIInstrFlags::MIMG | SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE;
+
bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst,
const SMLoc &IDLoc) {
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
- if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ if ((Desc.TSFlags & MIMGFlags) == 0)
return true;
int VDataIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
@@ -3574,7 +3676,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst,
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
- if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0 || !isGFX10Plus())
+ if ((Desc.TSFlags & MIMGFlags) == 0 || !isGFX10Plus())
return true;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
@@ -3582,7 +3684,9 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst,
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
- int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ int RSrcOpName = Desc.TSFlags & SIInstrFlags::MIMG ? AMDGPU::OpName::srsrc
+ : AMDGPU::OpName::rsrc;
+ int SrsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RSrcOpName);
int DimIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dim);
int A16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::a16);
@@ -3590,7 +3694,7 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst,
assert(SrsrcIdx != -1);
assert(SrsrcIdx > VAddr0Idx);
- bool IsA16 = Inst.getOperand(A16Idx).getImm();
+ bool IsA16 = (A16Idx != -1 && Inst.getOperand(A16Idx).getImm());
if (BaseOpcode->BVH) {
if (IsA16 == BaseOpcode->A16)
return true;
@@ -3609,7 +3713,9 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst,
AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16());
if (IsNSA) {
- if (hasPartialNSAEncoding() && ExpectedAddrSize > getNSAMaxSize()) {
+ if (hasPartialNSAEncoding() &&
+ ExpectedAddrSize >
+ getNSAMaxSize(Desc.TSFlags & SIInstrFlags::VSAMPLE)) {
int VAddrLastIdx = SrsrcIdx - 1;
unsigned VAddrLastSize =
AMDGPU::getRegOperandSize(getMRI(), Desc, VAddrLastIdx) / 4;
@@ -3639,7 +3745,7 @@ bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
- if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ if ((Desc.TSFlags & MIMGFlags) == 0)
return true;
if (!Desc.mayLoad() || !Desc.mayStore())
return true; // Not atomic
@@ -3677,7 +3783,7 @@ bool AMDGPUAsmParser::validateMIMGMSAA(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
- if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ if ((Desc.TSFlags & MIMGFlags) == 0)
return true;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
@@ -3854,7 +3960,7 @@ bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
- if ((Desc.TSFlags & SIInstrFlags::MIMG) == 0)
+ if ((Desc.TSFlags & MIMGFlags) == 0)
return true;
int D16Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::d16);
@@ -4038,6 +4144,40 @@ SMLoc AMDGPUAsmParser::getFlatOffsetLoc(const OperandVector &Operands) const {
return getLoc();
}
+bool AMDGPUAsmParser::validateOffset(const MCInst &Inst,
+ const OperandVector &Operands) {
+ auto Opcode = Inst.getOpcode();
+ auto OpNum = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset);
+ if (OpNum == -1)
+ return true;
+
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & SIInstrFlags::FLAT))
+ return validateFlatOffset(Inst, Operands);
+
+ if ((TSFlags & SIInstrFlags::SMRD))
+ return validateSMEMOffset(Inst, Operands);
+
+ const auto &Op = Inst.getOperand(OpNum);
+ if (isGFX12Plus() &&
+ (TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF))) {
+ const unsigned OffsetSize = 24;
+ if (!isIntN(OffsetSize, Op.getImm())) {
+ Error(getFlatOffsetLoc(Operands),
+ Twine("expected a ") + Twine(OffsetSize) + "-bit signed offset");
+ return false;
+ }
+ } else {
+ const unsigned OffsetSize = 16;
+ if (!isUIntN(OffsetSize, Op.getImm())) {
+ Error(getFlatOffsetLoc(Operands),
+ Twine("expected a ") + Twine(OffsetSize) + "-bit unsigned offset");
+ return false;
+ }
+ }
+ return true;
+}
+
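validateOffset() now routes FLAT and SMEM offsets to their existing checks and range-checks buffer offsets directly: 24-bit signed on GFX12+, 16-bit unsigned on earlier targets. A minimal sketch of those range checks; isIntN/isUIntN below re-implement the LLVM helpers of the same name for illustration:

#include <cstdint>
#include <cstdio>

static bool isIntN(unsigned N, int64_t V) {  // fits in an N-bit signed field
  return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
}
static bool isUIntN(unsigned N, int64_t V) { // fits in an N-bit unsigned field
  return V >= 0 && V < (int64_t(1) << N);
}

int main() {
  int64_t Off = -4096;
  std::printf("gfx12 buffer:  %s\n", isIntN(24, Off) ? "ok" : "out of range");
  std::printf("legacy buffer: %s\n", isUIntN(16, Off) ? "ok" : "out of range");
}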
bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
const OperandVector &Operands) {
uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
@@ -4055,11 +4195,12 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
return false;
}
- // For FLAT segment the offset must be positive;
+ // For pre-GFX12 FLAT instructions the offset must be positive;
// MSB is ignored and forced to zero.
unsigned OffsetSize = AMDGPU::getNumFlatOffsetBits(getSTI());
bool AllowNegative =
- TSFlags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch);
+ (TSFlags & (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch)) ||
+ isGFX12Plus();
if (!isIntN(OffsetSize, Op.getImm()) || (!AllowNegative && Op.getImm() < 0)) {
Error(getFlatOffsetLoc(Operands),
Twine("expected a ") +
@@ -4106,8 +4247,9 @@ bool AMDGPUAsmParser::validateSMEMOffset(const MCInst &Inst,
return true;
Error(getSMEMOffsetLoc(Operands),
- (isVI() || IsBuffer) ? "expected a 20-bit unsigned offset" :
- "expected a 21-bit signed offset");
+ isGFX12Plus() ? "expected a 24-bit signed offset"
+ : (isVI() || IsBuffer) ? "expected a 20-bit unsigned offset"
+ : "expected a 21-bit signed offset");
return false;
}
@@ -4189,21 +4331,35 @@ bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
const OperandVector &Operands) {
const unsigned Opc = Inst.getOpcode();
int DppCtrlIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp_ctrl);
- if (DppCtrlIdx < 0)
- return true;
- unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
+ if (DppCtrlIdx >= 0) {
+ unsigned DppCtrl = Inst.getOperand(DppCtrlIdx).getImm();
- if (!AMDGPU::isLegal64BitDPPControl(DppCtrl)) {
- // DPP64 is supported for row_newbcast only.
- int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
- if (Src0Idx >= 0 &&
- getMRI()->getSubReg(Inst.getOperand(Src0Idx).getReg(), AMDGPU::sub1)) {
+ if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl) &&
+ AMDGPU::isDPALU_DPP(MII.get(Opc))) {
+ // DP ALU DPP is supported for row_newbcast only on GFX9*
SMLoc S = getImmLoc(AMDGPUOperand::ImmTyDppCtrl, Operands);
- Error(S, "64 bit dpp only supports row_newbcast");
+ Error(S, "DP ALU dpp only supports row_newbcast");
return false;
}
}
+ int Dpp8Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::dpp8);
+ bool IsDPP = DppCtrlIdx >= 0 || Dpp8Idx >= 0;
+
+ if (IsDPP && !hasDPPSrc1SGPR(getSTI())) {
+ int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
+ if (Src1Idx >= 0) {
+ const MCOperand &Src1 = Inst.getOperand(Src1Idx);
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (Src1.isImm() ||
+ (Src1.isReg() && isSGPR(mc2PseudoReg(Src1.getReg()), TRI))) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[Src1Idx]);
+ Error(Op.getStartLoc(), "invalid operand for instruction");
+ return false;
+ }
+ }
+ }
+
return true;
}
@@ -4241,7 +4397,19 @@ bool AMDGPUAsmParser::validateVOPLiteral(const MCInst &Inst,
continue;
if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) {
- uint32_t Value = static_cast<uint32_t>(MO.getImm());
+ uint64_t Value = static_cast<uint64_t>(MO.getImm());
+ bool IsFP64 = AMDGPU::isSISrcFPOperand(Desc, OpIdx) &&
+ AMDGPU::getOperandSize(Desc.operands()[OpIdx]) == 8;
+ bool IsValid32Op = AMDGPU::isValid32BitLiteral(Value, IsFP64);
+
+ if (!IsValid32Op && !isInt<32>(Value) && !isUInt<32>(Value)) {
+ Error(getLitLoc(Operands), "invalid operand for instruction");
+ return false;
+ }
+
+ if (IsFP64 && IsValid32Op)
+ Value = Hi_32(Value);
+
if (NumLiterals == 0 || LiteralValue != Value) {
LiteralValue = Value;
++NumLiterals;
@@ -4359,7 +4527,7 @@ bool AMDGPUAsmParser::validateBLGP(const MCInst &Inst,
SMLoc BLGPLoc = getBLGPLoc(Operands);
if (!BLGPLoc.isValid())
return true;
- bool IsNeg = StringRef(BLGPLoc.getPointer()).startswith("neg:");
+ bool IsNeg = StringRef(BLGPLoc.getPointer()).starts_with("neg:");
auto FB = getFeatureBits();
bool UsesNeg = false;
if (FB[AMDGPU::FeatureGFX940Insts]) {
@@ -4405,6 +4573,29 @@ bool AMDGPUAsmParser::validateWaitCnt(const MCInst &Inst,
return false;
}
+bool AMDGPUAsmParser::validateDS(const MCInst &Inst,
+ const OperandVector &Operands) {
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & SIInstrFlags::DS) == 0)
+ return true;
+ if (TSFlags & SIInstrFlags::GWS)
+ return validateGWS(Inst, Operands);
+ // Only validate GDS for non-GWS instructions.
+ if (hasGDS())
+ return true;
+ int GDSIdx =
+ AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::gds);
+ if (GDSIdx < 0)
+ return true;
+ unsigned GDS = Inst.getOperand(GDSIdx).getImm();
+ if (GDS) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyGDS, Operands);
+ Error(S, "gds modifier is not supported on this GPU");
+ return false;
+ }
+ return true;
+}
+
// gfx90a has an undocumented limitation:
// DS_GWS opcodes must use even aligned registers.
bool AMDGPUAsmParser::validateGWS(const MCInst &Inst,
@@ -4443,6 +4634,9 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
unsigned CPol = Inst.getOperand(CPolPos).getImm();
+ if (isGFX12Plus())
+ return validateTHAndScopeBits(Inst, Operands, CPol);
+
uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
if (TSFlags & SIInstrFlags::SMRD) {
if (CPol && (isSI() || isCI())) {
@@ -4457,11 +4651,17 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
}
if (isGFX90A() && !isGFX940() && (CPol & CPol::SCC)) {
- SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
- StringRef CStr(S.getPointer());
- S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scc")]);
- Error(S, "scc is not supported on this GPU");
- return false;
+ const uint64_t AllowSCCModifier = SIInstrFlags::MUBUF |
+ SIInstrFlags::MTBUF | SIInstrFlags::MIMG |
+ SIInstrFlags::FLAT;
+ if (!(TSFlags & AllowSCCModifier)) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ StringRef CStr(S.getPointer());
+ S = SMLoc::getFromPointer(&CStr.data()[CStr.find("scc")]);
+ Error(S,
+ "scc modifier is not supported for this instruction on this GPU");
+ return false;
+ }
}
if (!(TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet)))
@@ -4488,6 +4688,60 @@ bool AMDGPUAsmParser::validateCoherencyBits(const MCInst &Inst,
return true;
}
+bool AMDGPUAsmParser::validateTHAndScopeBits(const MCInst &Inst,
+ const OperandVector &Operands,
+ const unsigned CPol) {
+ const unsigned TH = CPol & AMDGPU::CPol::TH;
+ const unsigned Scope = CPol & AMDGPU::CPol::SCOPE;
+
+ const unsigned Opcode = Inst.getOpcode();
+ const MCInstrDesc &TID = MII.get(Opcode);
+
+ auto PrintError = [&](StringRef Msg) {
+ SMLoc S = getImmLoc(AMDGPUOperand::ImmTyCPol, Operands);
+ Error(S, Msg);
+ return false;
+ };
+
+ if ((TID.TSFlags & SIInstrFlags::IsAtomicRet) &&
+ (TID.TSFlags & (SIInstrFlags::FLAT | SIInstrFlags::MUBUF)) &&
+ (!(TH & AMDGPU::CPol::TH_ATOMIC_RETURN)))
+ return PrintError("instruction must use th:TH_ATOMIC_RETURN");
+
+ if (TH == 0)
+ return true;
+
+ if ((TID.TSFlags & SIInstrFlags::SMRD) &&
+ ((TH == AMDGPU::CPol::TH_NT_RT) || (TH == AMDGPU::CPol::TH_RT_NT) ||
+ (TH == AMDGPU::CPol::TH_NT_HT)))
+ return PrintError("invalid th value for SMEM instruction");
+
+ if (TH == AMDGPU::CPol::TH_BYPASS) {
+ if ((Scope != AMDGPU::CPol::SCOPE_SYS &&
+ CPol & AMDGPU::CPol::TH_REAL_BYPASS) ||
+ (Scope == AMDGPU::CPol::SCOPE_SYS &&
+ !(CPol & AMDGPU::CPol::TH_REAL_BYPASS)))
+ return PrintError("scope and th combination is not valid");
+ }
+
+ bool IsStore = TID.mayStore();
+ bool IsAtomic =
+ TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
+
+ if (IsAtomic) {
+ if (!(CPol & AMDGPU::CPol::TH_TYPE_ATOMIC))
+ return PrintError("invalid th value for atomic instructions");
+ } else if (IsStore) {
+ if (!(CPol & AMDGPU::CPol::TH_TYPE_STORE))
+ return PrintError("invalid th value for store instructions");
+ } else {
+ if (!(CPol & AMDGPU::CPol::TH_TYPE_LOAD))
+ return PrintError("invalid th value for load instructions");
+ }
+
+ return true;
+}
+
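Among the GFX12 rules enforced above, the temporal-hint type encoded in th: must match the instruction class: atomics need a TH_ATOMIC_* value, stores a TH_STORE_* value, and loads a TH_LOAD_* value. A minimal sketch of that classification check; the flag values are illustrative placeholders, not the real CPol bit layout:

#include <cstdio>

enum : unsigned { // placeholder stand-ins for AMDGPU::CPol::TH_TYPE_*
  TH_TYPE_LOAD = 1u << 4,
  TH_TYPE_STORE = 1u << 5,
  TH_TYPE_ATOMIC = 1u << 6,
};

static const char *checkTH(unsigned TH, bool IsStore, bool IsAtomic) {
  if (TH == 0)
    return "ok (default policy)";
  if (IsAtomic)
    return (TH & TH_TYPE_ATOMIC) ? "ok" : "invalid th value for atomic instructions";
  if (IsStore)
    return (TH & TH_TYPE_STORE) ? "ok" : "invalid th value for store instructions";
  return (TH & TH_TYPE_LOAD) ? "ok" : "invalid th value for load instructions";
}

int main() {
  // A load-type hint on a store instruction is rejected.
  std::printf("%s\n", checkTH(TH_TYPE_LOAD, /*IsStore=*/true, /*IsAtomic=*/false));
}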
bool AMDGPUAsmParser::validateExeczVcczOperands(const OperandVector &Operands) {
if (!isGFX11Plus())
return true;
@@ -4582,10 +4836,7 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateMovrels(Inst, Operands)) {
return false;
}
- if (!validateFlatOffset(Inst, Operands)) {
- return false;
- }
- if (!validateSMEMOffset(Inst, Operands)) {
+ if (!validateOffset(Inst, Operands)) {
return false;
}
if (!validateMAIAccWrite(Inst, Operands)) {
@@ -4613,7 +4864,7 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"invalid register class: vgpr tuples must be 64 bit aligned");
return false;
}
- if (!validateGWS(Inst, Operands)) {
+ if (!validateDS(Inst, Operands)) {
return false;
}
@@ -4888,7 +5139,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
return TokError("directive only supported for amdgcn architecture");
- if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA)
+ if (!isHsaAbi(getSTI()))
return TokError("directive only supported for amdhsa OS");
StringRef KernelName;
@@ -4905,6 +5156,8 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
uint64_t NextFreeVGPR = 0;
uint64_t AccumOffset = 0;
uint64_t SharedVGPRCount = 0;
+ uint64_t PreloadLength = 0;
+ uint64_t PreloadOffset = 0;
SMRange SGPRRange;
uint64_t NextFreeSGPR = 0;
@@ -4973,6 +5226,28 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
Val, ValRange);
if (Val)
ImpliedUserSGPRCount += 4;
+ } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_length") {
+ if (!hasKernargPreload())
+ return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
+
+ if (Val > getMaxNumUserSGPRs())
+ return OutOfRangeError(ValRange);
+ PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_LENGTH, Val,
+ ValRange);
+ if (Val) {
+ ImpliedUserSGPRCount += Val;
+ PreloadLength = Val;
+ }
+ } else if (ID == ".amdhsa_user_sgpr_kernarg_preload_offset") {
+ if (!hasKernargPreload())
+ return Error(IDRange.Start, "directive requires gfx90a+", IDRange);
+
+ if (Val >= 1024)
+ return OutOfRangeError(ValRange);
+ PARSE_BITS_ENTRY(KD.kernarg_preload, KERNARG_PRELOAD_SPEC_OFFSET, Val,
+ ValRange);
+ if (Val)
+ PreloadOffset = Val;
} else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
@@ -5104,15 +5379,21 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64, Val,
ValRange);
} else if (ID == ".amdhsa_dx10_clamp") {
+ if (IVersion.Major >= 12)
+ return Error(IDRange.Start, "directive unsupported on gfx12+", IDRange);
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
- COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, Val, ValRange);
+ COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, Val,
+ ValRange);
} else if (ID == ".amdhsa_ieee_mode") {
- PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE,
- Val, ValRange);
+ if (IVersion.Major >= 12)
+ return Error(IDRange.Start, "directive unsupported on gfx12+", IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, Val,
+ ValRange);
} else if (ID == ".amdhsa_fp16_overflow") {
if (IVersion.Major < 9)
return Error(IDRange.Start, "directive requires gfx9+", IDRange);
- PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FP16_OVFL, Val,
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL, Val,
ValRange);
} else if (ID == ".amdhsa_tg_split") {
if (!isGFX90A())
@@ -5122,17 +5403,17 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
} else if (ID == ".amdhsa_workgroup_processor_mode") {
if (IVersion.Major < 10)
return Error(IDRange.Start, "directive requires gfx10+", IDRange);
- PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_WGP_MODE, Val,
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE, Val,
ValRange);
} else if (ID == ".amdhsa_memory_ordered") {
if (IVersion.Major < 10)
return Error(IDRange.Start, "directive requires gfx10+", IDRange);
- PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_MEM_ORDERED, Val,
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, Val,
ValRange);
} else if (ID == ".amdhsa_forward_progress") {
if (IVersion.Major < 10)
return Error(IDRange.Start, "directive requires gfx10+", IDRange);
- PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_FWD_PROGRESS, Val,
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1, COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS, Val,
ValRange);
} else if (ID == ".amdhsa_shared_vgpr_count") {
if (IVersion.Major < 10)
@@ -5171,6 +5452,12 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.compute_pgm_rsrc2,
COMPUTE_PGM_RSRC2_ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO,
Val, ValRange);
+ } else if (ID == ".amdhsa_round_robin_scheduling") {
+ if (IVersion.Major < 12)
+ return Error(IDRange.Start, "directive requires gfx12+", IDRange);
+ PARSE_BITS_ENTRY(KD.compute_pgm_rsrc1,
+ COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, Val,
+ ValRange);
} else {
return Error(IDRange.Start, "unknown .amdhsa_kernel directive", IDRange);
}
@@ -5218,6 +5505,11 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
AMDHSA_BITS_SET(KD.compute_pgm_rsrc2, COMPUTE_PGM_RSRC2_USER_SGPR_COUNT,
UserSGPRCount);
+ if (PreloadLength && KD.kernarg_size &&
+ (PreloadLength * 4 + PreloadOffset * 4 > KD.kernarg_size))
+ return TokError("Kernarg preload length + offset is larger than the "
+ "kernarg segment size");
+
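The new kernarg-preload directives are cross-checked against the kernarg segment: both the preload length (in SGPRs) and the preload offset are dword quantities, so their combined byte extent must not exceed kernarg_size. A minimal sketch of that arithmetic (preloadFits is an illustrative helper, not part of the parser):

#include <cstdint>
#include <cstdio>

static bool preloadFits(uint64_t PreloadLength, uint64_t PreloadOffset,
                        uint64_t KernargSize) {
  return !(PreloadLength && KernargSize &&
           PreloadLength * 4 + PreloadOffset * 4 > KernargSize);
}

int main() {
  std::printf("%d\n", preloadFits(8, 2, 64)); // 1: 32 + 8 bytes fit in 64
  std::printf("%d\n", preloadFits(8, 8, 56)); // 0: 32 + 32 bytes exceed 56
}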
if (isGFX90A()) {
if (!Seen.contains(".amdhsa_accum_offset"))
return TokError(".amdhsa_accum_offset directive is required");
@@ -5319,6 +5611,18 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
}
Lex();
+ if (ID == "enable_dx10_clamp") {
+ if (G_00B848_DX10_CLAMP(Header.compute_pgm_resource_registers) &&
+ isGFX12Plus())
+ return TokError("enable_dx10_clamp=1 is not allowed on GFX12+");
+ }
+
+ if (ID == "enable_ieee_mode") {
+ if (G_00B848_IEEE_MODE(Header.compute_pgm_resource_registers) &&
+ isGFX12Plus())
+ return TokError("enable_ieee_mode=1 is not allowed on GFX12+");
+ }
+
if (ID == "enable_wavefront_size32") {
if (Header.code_properties & AMD_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32) {
if (!isGFX10Plus())
@@ -5419,33 +5723,15 @@ bool AMDGPUAsmParser::ParseDirectiveISAVersion() {
}
bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
- const char *AssemblerDirectiveBegin;
- const char *AssemblerDirectiveEnd;
- std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
- isHsaAbiVersion3AndAbove(&getSTI())
- ? std::pair(HSAMD::V3::AssemblerDirectiveBegin,
- HSAMD::V3::AssemblerDirectiveEnd)
- : std::pair(HSAMD::AssemblerDirectiveBegin,
- HSAMD::AssemblerDirectiveEnd);
-
- if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) {
- return Error(getLoc(),
- (Twine(AssemblerDirectiveBegin) + Twine(" directive is "
- "not available on non-amdhsa OSes")).str());
- }
+ assert(isHsaAbi(getSTI()));
std::string HSAMetadataString;
- if (ParseToEndDirective(AssemblerDirectiveBegin, AssemblerDirectiveEnd,
- HSAMetadataString))
+ if (ParseToEndDirective(HSAMD::V3::AssemblerDirectiveBegin,
+ HSAMD::V3::AssemblerDirectiveEnd, HSAMetadataString))
return true;
- if (isHsaAbiVersion3AndAbove(&getSTI())) {
- if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
- return Error(getLoc(), "invalid HSA metadata");
- } else {
- if (!getTargetStreamer().EmitHSAMetadataV2(HSAMetadataString))
- return Error(getLoc(), "invalid HSA metadata");
- }
+ if (!getTargetStreamer().EmitHSAMetadataV3(HSAMetadataString))
+ return Error(getLoc(), "invalid HSA metadata");
return false;
}
@@ -5588,7 +5874,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
StringRef IDVal = DirectiveID.getString();
- if (isHsaAbiVersion3AndAbove(&getSTI())) {
+ if (isHsaAbi(getSTI())) {
if (IDVal == ".amdhsa_kernel")
return ParseDirectiveAMDHSAKernel();
@@ -5611,8 +5897,12 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".amd_amdgpu_isa")
return ParseDirectiveISAVersion();
- if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin)
- return ParseDirectiveHSAMetadata();
+ if (IDVal == AMDGPU::HSAMD::AssemblerDirectiveBegin) {
+ return Error(getLoc(), (Twine(HSAMD::AssemblerDirectiveBegin) +
+ Twine(" directive is "
+ "not available on non-amdhsa OSes"))
+ .str());
+ }
}
if (IDVal == ".amdgcn_target")
@@ -5753,20 +6043,20 @@ StringRef AMDGPUAsmParser::parseMnemonicSuffix(StringRef Name) {
setForcedDPP(false);
setForcedSDWA(false);
- if (Name.endswith("_e64_dpp")) {
+ if (Name.ends_with("_e64_dpp")) {
setForcedDPP(true);
setForcedEncodingSize(64);
return Name.substr(0, Name.size() - 8);
- } else if (Name.endswith("_e64")) {
+ } else if (Name.ends_with("_e64")) {
setForcedEncodingSize(64);
return Name.substr(0, Name.size() - 4);
- } else if (Name.endswith("_e32")) {
+ } else if (Name.ends_with("_e32")) {
setForcedEncodingSize(32);
return Name.substr(0, Name.size() - 4);
- } else if (Name.endswith("_dpp")) {
+ } else if (Name.ends_with("_dpp")) {
setForcedDPP(true);
return Name.substr(0, Name.size() - 4);
- } else if (Name.endswith("_sdwa")) {
+ } else if (Name.ends_with("_sdwa")) {
setForcedSDWA(true);
return Name.substr(0, Name.size() - 5);
}
@@ -5789,7 +6079,7 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
Operands.push_back(AMDGPUOperand::CreateToken(this, Name, NameLoc));
- bool IsMIMG = Name.startswith("image_");
+ bool IsMIMG = Name.starts_with("image_");
while (!trySkipToken(AsmToken::EndOfStatement)) {
OperandMode Mode = OperandMode_Default;
@@ -5929,7 +6219,7 @@ unsigned AMDGPUAsmParser::getCPolKind(StringRef Id, StringRef Mnemo,
bool &Disabling) const {
Disabling = Id.consume_front("no");
- if (isGFX940() && !Mnemo.startswith("s_")) {
+ if (isGFX940() && !Mnemo.starts_with("s_")) {
return StringSwitch<unsigned>(Id)
.Case("nt", AMDGPU::CPol::NT)
.Case("sc0", AMDGPU::CPol::SC0)
@@ -5946,6 +6236,47 @@ unsigned AMDGPUAsmParser::getCPolKind(StringRef Id, StringRef Mnemo,
}
ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
+ if (isGFX12Plus()) {
+ SMLoc StringLoc = getLoc();
+
+ int64_t CPolVal = 0;
+ ParseStatus ResTH = ParseStatus::NoMatch;
+ ParseStatus ResScope = ParseStatus::NoMatch;
+
+ for (;;) {
+ if (ResTH.isNoMatch()) {
+ int64_t TH;
+ ResTH = parseTH(Operands, TH);
+ if (ResTH.isFailure())
+ return ResTH;
+ if (ResTH.isSuccess()) {
+ CPolVal |= TH;
+ continue;
+ }
+ }
+
+ if (ResScope.isNoMatch()) {
+ int64_t Scope;
+ ResScope = parseScope(Operands, Scope);
+ if (ResScope.isFailure())
+ return ResScope;
+ if (ResScope.isSuccess()) {
+ CPolVal |= Scope;
+ continue;
+ }
+ }
+
+ break;
+ }
+
+ if (ResTH.isNoMatch() && ResScope.isNoMatch())
+ return ParseStatus::NoMatch;
+
+ Operands.push_back(AMDGPUOperand::CreateImm(this, CPolVal, StringLoc,
+ AMDGPUOperand::ImmTyCPol));
+ return ParseStatus::Success;
+ }
+
StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken();
SMLoc OpLoc = getLoc();
unsigned Enabled = 0, Seen = 0;
@@ -5981,6 +6312,95 @@ ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
return ParseStatus::Success;
}
+ParseStatus AMDGPUAsmParser::parseScope(OperandVector &Operands,
+ int64_t &Scope) {
+ Scope = AMDGPU::CPol::SCOPE_CU; // default
+
+ StringRef Value;
+ SMLoc StringLoc;
+ ParseStatus Res;
+
+ Res = parseStringWithPrefix("scope", Value, StringLoc);
+ if (!Res.isSuccess())
+ return Res;
+
+ Scope = StringSwitch<int64_t>(Value)
+ .Case("SCOPE_CU", AMDGPU::CPol::SCOPE_CU)
+ .Case("SCOPE_SE", AMDGPU::CPol::SCOPE_SE)
+ .Case("SCOPE_DEV", AMDGPU::CPol::SCOPE_DEV)
+ .Case("SCOPE_SYS", AMDGPU::CPol::SCOPE_SYS)
+ .Default(0xffffffff);
+
+ if (Scope == 0xffffffff)
+ return Error(StringLoc, "invalid scope value");
+
+ return ParseStatus::Success;
+}
+
+ParseStatus AMDGPUAsmParser::parseTH(OperandVector &Operands, int64_t &TH) {
+ TH = AMDGPU::CPol::TH_RT; // default
+
+ StringRef Value;
+ SMLoc StringLoc;
+ ParseStatus Res = parseStringWithPrefix("th", Value, StringLoc);
+ if (!Res.isSuccess())
+ return Res;
+
+ if (Value == "TH_DEFAULT")
+ TH = AMDGPU::CPol::TH_RT;
+ else if (Value == "TH_STORE_LU" || Value == "TH_LOAD_RT_WB" ||
+ Value == "TH_LOAD_NT_WB") {
+ return Error(StringLoc, "invalid th value");
+ } else if (Value.starts_with("TH_ATOMIC_")) {
+ Value = Value.drop_front(10);
+ TH = AMDGPU::CPol::TH_TYPE_ATOMIC;
+ } else if (Value.starts_with("TH_LOAD_")) {
+ Value = Value.drop_front(8);
+ TH = AMDGPU::CPol::TH_TYPE_LOAD;
+ } else if (Value.starts_with("TH_STORE_")) {
+ Value = Value.drop_front(9);
+ TH = AMDGPU::CPol::TH_TYPE_STORE;
+ } else {
+ return Error(StringLoc, "invalid th value");
+ }
+
+ if (Value == "BYPASS")
+ TH |= AMDGPU::CPol::TH_REAL_BYPASS;
+
+ if (TH != 0) {
+ if (TH & AMDGPU::CPol::TH_TYPE_ATOMIC)
+ TH |= StringSwitch<int64_t>(Value)
+ .Case("RETURN", AMDGPU::CPol::TH_ATOMIC_RETURN)
+ .Case("RT", AMDGPU::CPol::TH_RT)
+ .Case("RT_RETURN", AMDGPU::CPol::TH_ATOMIC_RETURN)
+ .Case("NT", AMDGPU::CPol::TH_ATOMIC_NT)
+ .Case("NT_RETURN", AMDGPU::CPol::TH_ATOMIC_NT |
+ AMDGPU::CPol::TH_ATOMIC_RETURN)
+ .Case("CASCADE_RT", AMDGPU::CPol::TH_ATOMIC_CASCADE)
+ .Case("CASCADE_NT", AMDGPU::CPol::TH_ATOMIC_CASCADE |
+ AMDGPU::CPol::TH_ATOMIC_NT)
+ .Default(0xffffffff);
+ else
+ TH |= StringSwitch<int64_t>(Value)
+ .Case("RT", AMDGPU::CPol::TH_RT)
+ .Case("NT", AMDGPU::CPol::TH_NT)
+ .Case("HT", AMDGPU::CPol::TH_HT)
+ .Case("LU", AMDGPU::CPol::TH_LU)
+ .Case("RT_WB", AMDGPU::CPol::TH_RT_WB)
+ .Case("NT_RT", AMDGPU::CPol::TH_NT_RT)
+ .Case("RT_NT", AMDGPU::CPol::TH_RT_NT)
+ .Case("NT_HT", AMDGPU::CPol::TH_NT_HT)
+ .Case("NT_WB", AMDGPU::CPol::TH_NT_WB)
+ .Case("BYPASS", AMDGPU::CPol::TH_BYPASS)
+ .Default(0xffffffff);
+ }
+
+ if (TH == 0xffffffff)
+ return Error(StringLoc, "invalid th value");
+
+ return ParseStatus::Success;
+}
+
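parseTH() first classifies the th: value by its TH_ATOMIC_ / TH_LOAD_ / TH_STORE_ prefix and then maps the remaining suffix separately. A minimal sketch of the prefix splitting (the enum and names are placeholders for the real CPol values):

#include <cstdio>
#include <string_view>

enum THType { None, Load, Store, Atomic };

static THType classifyTH(std::string_view &V) {
  auto Consume = [&V](std::string_view Prefix) {
    if (V.substr(0, Prefix.size()) != Prefix)
      return false;
    V.remove_prefix(Prefix.size());
    return true;
  };
  if (Consume("TH_ATOMIC_")) return Atomic;
  if (Consume("TH_LOAD_"))   return Load;
  if (Consume("TH_STORE_"))  return Store;
  return None;
}

int main() {
  std::string_view V = "TH_ATOMIC_NT_RETURN";
  THType T = classifyTH(V); // T == Atomic, V == "NT_RETURN"
  std::printf("type=%d suffix=%.*s\n", int(T), int(V.size()), V.data());
}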
static void addOptionalImmOperand(
MCInst& Inst, const OperandVector& Operands,
AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx,
@@ -6382,7 +6802,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
bool Failed = true;
- bool Sat = CntName.endswith("_sat");
+ bool Sat = CntName.ends_with("_sat");
if (CntName == "vmcnt" || CntName == "vmcnt_sat") {
Failed = encodeCnt(ISA, IntVal, CntVal, Sat, encodeVmcnt, decodeVmcnt);
@@ -6855,7 +7275,7 @@ ParseStatus AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
if (!parseId(Str))
return ParseStatus::NoMatch;
- if (!Str.startswith("attr"))
+ if (!Str.starts_with("attr"))
return Error(S, "invalid interpolation attribute");
StringRef Chan = Str.take_back(2);
@@ -6946,7 +7366,7 @@ bool
AMDGPUAsmParser::trySkipId(const StringRef Pref, const StringRef Id) {
if (isToken(AsmToken::Identifier)) {
StringRef Tok = getTokenStr();
- if (Tok.startswith(Pref) && Tok.drop_front(Pref.size()) == Id) {
+ if (Tok.starts_with(Pref) && Tok.drop_front(Pref.size()) == Id) {
lex();
return true;
}
@@ -7578,66 +7998,6 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
}
//===----------------------------------------------------------------------===//
-// SMEM
-//===----------------------------------------------------------------------===//
-
-void AMDGPUAsmParser::cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands) {
- OptionalImmIndexMap OptionalIdx;
- bool IsAtomicReturn = false;
-
- for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
- if (!Op.isCPol())
- continue;
- IsAtomicReturn = Op.getImm() & AMDGPU::CPol::GLC;
- break;
- }
-
- if (!IsAtomicReturn) {
- int NewOpc = AMDGPU::getAtomicNoRetOp(Inst.getOpcode());
- if (NewOpc != -1)
- Inst.setOpcode(NewOpc);
- }
-
- IsAtomicReturn = MII.get(Inst.getOpcode()).TSFlags &
- SIInstrFlags::IsAtomicRet;
-
- for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
-
- // Add the register arguments
- if (Op.isReg()) {
- Op.addRegOperands(Inst, 1);
- if (IsAtomicReturn && i == 1)
- Op.addRegOperands(Inst, 1);
- continue;
- }
-
- // Handle the case where soffset is an immediate
- if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
- Op.addImmOperands(Inst, 1);
- continue;
- }
-
- // Handle tokens like 'offen' which are sometimes hard-coded into the
- // asm string. There are no MCInst operands for these.
- if (Op.isToken()) {
- continue;
- }
- assert(Op.isImm());
-
- // Handle optional arguments
- OptionalIdx[Op.getImmTy()] = i;
- }
-
- if ((int)Inst.getNumOperands() <=
- AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset))
- addOptionalImmOperand(Inst, Operands, OptionalIdx,
- AMDGPUOperand::ImmTySMEMOffsetMod);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
-}
-
-//===----------------------------------------------------------------------===//
// smrd
//===----------------------------------------------------------------------===//
@@ -7704,7 +8064,7 @@ void AMDGPUAsmParser::onBeginOfFile() {
// TODO: Should try to check code object version from directive???
AMDGPU::getAmdhsaCodeObjectVersion());
- if (isHsaAbiVersion3AndAbove(&getSTI()))
+ if (isHsaAbi(getSTI()))
getTargetStreamer().EmitDirectiveAMDGCNTarget();
}
@@ -8155,7 +8515,7 @@ bool AMDGPUAsmParser::parseDimId(unsigned &Encoding) {
Token += Suffix;
StringRef DimId = Token;
- if (DimId.startswith("SQ_RSRC_IMG_"))
+ if (DimId.starts_with("SQ_RSRC_IMG_"))
DimId = DimId.drop_front(12);
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByAsmSuffix(DimId);
@@ -8838,3 +9198,9 @@ bool AMDGPUOperand::isWaitVDST() const {
bool AMDGPUOperand::isWaitEXP() const {
return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm());
}
+
+//===----------------------------------------------------------------------===//
+// Split Barrier
+//===----------------------------------------------------------------------===//
+
+bool AMDGPUOperand::isSplitBarrier() const { return isInlinableImm(MVT::i32); }
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
index ea1578e30ae8..43d35fa5291c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -12,6 +12,8 @@ def MUBUFOffset : ComplexPattern<iPTR, 3, "SelectMUBUFOffset">;
def MUBUFScratchOffen : ComplexPattern<iPTR, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
def MUBUFScratchOffset : ComplexPattern<iPTR, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
+def BUFSOffset : ComplexPattern<iPTR, 1, "SelectBUFSOffset">;
+
def BUFAddrKind {
int Offset = 0;
int OffEn = 1;
@@ -152,24 +154,32 @@ class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
}
class getMTBUFInsDA<list<RegisterClass> vdataList,
- list<RegisterClass> vaddrList=[]> {
+ list<RegisterClass> vaddrList=[], bit hasGFX12Enc> {
RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
- dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol, i1imm:$swz);
- dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs));
- dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs));
-}
+ dag SOffset = !if(hasGFX12Enc, (ins SReg_32:$soffset),
+ (ins SCSrc_b32:$soffset));
+ dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset,
+ (ins offset:$offset, FORMAT:$format, CPol_0:$cpol, i1imm_0:$swz));
-class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[]> {
+ dag Inputs = !if(!empty(vaddrList),
+ NonVaddrInputs,
+ !con((ins vaddrClass:$vaddr), NonVaddrInputs));
+ dag ret = !if(!empty(vdataList),
+ Inputs,
+ !con((ins vdata_op:$vdata), Inputs));
+}
+
+class getMTBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit hasGFX12Enc> {
dag ret =
- !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList>.ret,
- !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret,
- !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32]>.ret,
- !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64]>.ret,
- !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64]>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMTBUFInsDA<vdataList, [], hasGFX12Enc>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMTBUFInsDA<vdataList, [VGPR_32], hasGFX12Enc>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMTBUFInsDA<vdataList, [VGPR_32], hasGFX12Enc>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMTBUFInsDA<vdataList, [VReg_64], hasGFX12Enc>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMTBUFInsDA<vdataList, [VReg_64], hasGFX12Enc>.ret,
(ins))))));
}
@@ -204,12 +214,13 @@ class MTBUF_Load_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
int elems,
+ bit hasGFX12Enc = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MTBUF_Pseudo<opName,
(outs getLdStRegisterOperand<vdataClass>.ret:$vdata),
- getMTBUFIns<addrKindCopy>.ret,
+ getMTBUFIns<addrKindCopy, [], hasGFX12Enc>.ret,
getMTBUFAsmOps<addrKindCopy>.ret,
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
@@ -219,38 +230,45 @@ class MTBUF_Load_Pseudo <string opName,
let elements = elems;
}
-multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
- int elems> {
+multiclass MTBUF_Pseudo_Loads_Helper<string opName, RegisterClass vdataClass,
+ int elems, bit hasGFX12Enc> {
- def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>,
+ def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasGFX12Enc>,
MTBUFAddr64Table<0, NAME>;
- def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems>,
+ def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems, hasGFX12Enc>,
MTBUFAddr64Table<1, NAME>;
- def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
- def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
- def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
+ def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems, hasGFX12Enc>;
+ def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems, hasGFX12Enc>;
+ def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems, hasGFX12Enc>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>;
- def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
- def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
- def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
+ def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasGFX12Enc>;
+ def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems, hasGFX12Enc>;
+ def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems, hasGFX12Enc>;
+ def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems, hasGFX12Enc>;
}
}
+multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+ int elems> {
+ defm NAME : MTBUF_Pseudo_Loads_Helper<opName, vdataClass, elems, 0>;
+ defm _VBUFFER : MTBUF_Pseudo_Loads_Helper<opName, vdataClass, elems, 1>;
+}
+
class MTBUF_Store_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
int elems,
+ bit hasGFX12Enc = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind,
RegisterClass vdataClassCopy = vdataClass>
: MTBUF_Pseudo<opName,
(outs),
- getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
+ getMTBUFIns<addrKindCopy, [vdataClassCopy], hasGFX12Enc>.ret,
getMTBUFAsmOps<addrKindCopy>.ret,
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
@@ -260,27 +278,32 @@ class MTBUF_Store_Pseudo <string opName,
let elements = elems;
}
-multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
- int elems> {
+multiclass MTBUF_Pseudo_Stores_Helper<string opName, RegisterClass vdataClass,
+ int elems, bit hasGFX12Enc> {
- def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>,
+ def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasGFX12Enc>,
MTBUFAddr64Table<0, NAME>;
- def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems>,
+ def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems, hasGFX12Enc>,
MTBUFAddr64Table<1, NAME>;
- def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
- def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
- def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
+ def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems, hasGFX12Enc>;
+ def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems, hasGFX12Enc>;
+ def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems, hasGFX12Enc>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>;
- def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
- def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
- def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
+ def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems, hasGFX12Enc>;
+ def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems, hasGFX12Enc>;
+ def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems, hasGFX12Enc>;
+ def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems, hasGFX12Enc>;
}
}
+multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+ int elems> {
+ defm NAME : MTBUF_Pseudo_Stores_Helper<opName, vdataClass, elems, 0>;
+ defm _VBUFFER : MTBUF_Pseudo_Stores_Helper<opName, vdataClass, elems, 1>;
+}
//===----------------------------------------------------------------------===//
// MUBUF classes
@@ -381,12 +404,14 @@ class getLdStVDataRegisterOperand<RegisterClass RC, bit isTFE> {
}
class getMUBUFInsDA<list<RegisterClass> vdataList,
- list<RegisterClass> vaddrList, bit isTFE> {
+ list<RegisterClass> vaddrList, bit isTFE, bit hasGFX12Enc> {
RegisterClass vdataClass = !if(!empty(vdataList), ?, !head(vdataList));
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdataClass, isTFE>.ret;
- dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol, i1imm_0:$swz);
+ dag SOffset = !if(hasGFX12Enc, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
+ dag NonVaddrInputs = !con((ins SReg_128:$srsrc), SOffset, (ins offset:$offset, CPol_0:$cpol, i1imm_0:$swz));
+
dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs));
dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs));
}
@@ -410,13 +435,13 @@ class getMUBUFElements<ValueType vt> {
);
}
-class getMUBUFIns<int addrKind, list<RegisterClass> vdataList, bit isTFE> {
+class getMUBUFIns<int addrKind, list<RegisterClass> vdataList, bit isTFE, bit hasGFX12Enc> {
dag ret =
- !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE>.ret,
- !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE>.ret,
- !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE>.ret,
- !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isTFE>.ret,
- !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isTFE>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Offset), getMUBUFInsDA<vdataList, [], isTFE, hasGFX12Enc>.ret,
+ !if(!eq(addrKind, BUFAddrKind.OffEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasGFX12Enc>.ret,
+ !if(!eq(addrKind, BUFAddrKind.IdxEn), getMUBUFInsDA<vdataList, [VGPR_32], isTFE, hasGFX12Enc>.ret,
+ !if(!eq(addrKind, BUFAddrKind.BothEn), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasGFX12Enc>.ret,
+ !if(!eq(addrKind, BUFAddrKind.Addr64), getMUBUFInsDA<vdataList, [VReg_64], isTFE, hasGFX12Enc>.ret,
(ins))))));
}
@@ -456,6 +481,7 @@ class MUBUF_Load_Pseudo <string opName,
bit isLds = 0,
bit isLdsOpc = 0,
bit isTFE = 0,
+ bit hasGFX12Enc = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind,
@@ -463,7 +489,7 @@ class MUBUF_Load_Pseudo <string opName,
RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdata_rc, isTFE>.ret>
: MUBUF_Pseudo<opName,
!if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)),
- !con(getMUBUFIns<addrKindCopy, [], isTFE>.ret,
+ !con(getMUBUFIns<addrKindCopy, [], isTFE, hasGFX12Enc>.ret,
!if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))),
getMUBUFAsmOps<addrKindCopy, !or(isLds, isLdsOpc), isLds, isTFE>.ret,
pattern>,
@@ -485,50 +511,61 @@ class MUBUF_Load_Pseudo <string opName,
let VALU = isLds;
}
-class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat <
+class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : GCNPat <
(load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset))),
(load_vt (inst v4i32:$srsrc, i32:$soffset, i32:$offset))
>;
class MUBUF_Addr64_Load_Pat <Instruction inst,
ValueType load_vt = i32,
- SDPatternOperator ld = null_frag> : Pat <
+ SDPatternOperator ld = null_frag> : GCNPat <
(load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset))),
(load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset))
>;
-multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> {
+multiclass MUBUF_Pseudo_Load_Pats_Common<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> {
def : MUBUF_Offset_Load_Pat<!cast<Instruction>(BaseInst#"_OFFSET"), load_vt, ld>;
def : MUBUF_Addr64_Load_Pat<!cast<Instruction>(BaseInst#"_ADDR64"), load_vt, ld>;
}
+multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag>{
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst, load_vt, ld>;
+ }
+ defm : MUBUF_Pseudo_Load_Pats_Common<BaseInst # "_VBUFFER", load_vt, ld>;
+}
+
multiclass MUBUF_Pseudo_Loads_Helper<string opName, ValueType load_vt,
- bit TiedDest, bit isLds, bit isTFE> {
+ bit TiedDest, bit isLds, bit isTFE, bit hasGFX12Enc> {
defvar legal_load_vt = !if(!eq(load_vt, v3f16), v4f16, load_vt);
- def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, 0, isTFE>,
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>,
MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
- def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, legal_load_vt, TiedDest, isLds, 0, isTFE>,
+ def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>,
MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;
- def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, 0, isTFE>;
- def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, 0, isTFE>;
- def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, 0, isTFE>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, 0, isTFE>;
- def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, 0, isTFE>;
- def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, 0, isTFE>;
- def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, 0, isTFE>;
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, 0, isTFE, hasGFX12Enc>;
}
}
multiclass MUBUF_Pseudo_Loads<string opName, ValueType load_vt = i32,
bit TiedDest = 0, bit isLds = 0> {
- defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0>;
- if !not(isLds) then
- defm _TFE : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1>;
+ defm NAME : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 0>;
+ defm _VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 0, 1>;
+
+ if !not(isLds) then {
+ defm _TFE : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 0>;
+ defm _TFE_VBUFFER : MUBUF_Pseudo_Loads_Helper<opName, load_vt, TiedDest, isLds, 1, 1>;
+ }
}
multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32> {
@@ -548,18 +585,24 @@ multiclass MUBUF_Pseudo_Loads_LDSOpc<string opName,
def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, isLdsOpc>;
def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, isLdsOpc>;
def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, isLdsOpc>;
+
+ def _VBUFFER_OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, legal_load_vt, TiedDest, isLds, isLdsOpc, 0, 1>;
+ def _VBUFFER_OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, legal_load_vt, TiedDest, isLds, isLdsOpc, 0, 1>;
+ def _VBUFFER_IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, legal_load_vt, TiedDest, isLds, isLdsOpc, 0, 1>;
+ def _VBUFFER_BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, legal_load_vt, TiedDest, isLds, isLdsOpc, 0, 1>;
}
class MUBUF_Store_Pseudo <string opName,
int addrKind,
ValueType store_vt,
bit isTFE = 0,
+ bit hasGFX12Enc = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
(outs),
- getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret], isTFE>.ret,
+ getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret], isTFE, hasGFX12Enc>.ret,
getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE>.ret,
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
@@ -572,36 +615,52 @@ class MUBUF_Store_Pseudo <string opName,
let tfe = isTFE;
}
+multiclass MUBUF_Pseudo_Store_Pats_Common<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> {
+
+ def : GCNPat <
+ (st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)),
+ (!cast<MUBUF_Pseudo>(BaseInst # _OFFSET) store_vt:$vdata, v4i32:$srsrc, i32:$soffset, i32:$offset)>;
+
+ def : GCNPat <
+ (st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset)),
+ (!cast<MUBUF_Pseudo>(BaseInst # _ADDR64) store_vt:$vdata, i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset)>;
+}
+
+multiclass MUBUF_Pseudo_Store_Pats<string BaseInst, ValueType store_vt = i32, SDPatternOperator st = null_frag> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst, store_vt, st>;
+ }
+ defm : MUBUF_Pseudo_Store_Pats_Common<BaseInst # "_VBUFFER", store_vt, st>;
+}
+
multiclass MUBUF_Pseudo_Stores_Helper<string opName, ValueType store_vt,
- SDPatternOperator st, bit isTFE> {
+ bit isTFE, bit hasGFX12Enc> {
defvar legal_store_vt = !if(!eq(store_vt, v3f16), v4f16, store_vt);
- def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, isTFE,
- [(st legal_store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i32:$offset))]>,
+ def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, isTFE, hasGFX12Enc>,
MUBUFAddr64Table<0, NAME>;
- def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt, isTFE,
- [(st legal_store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i32:$offset))]>,
+ def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt, isTFE, hasGFX12Enc>,
MUBUFAddr64Table<1, NAME>;
- def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt, isTFE>;
- def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt, isTFE>;
- def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt, isTFE>;
+ def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt, isTFE, hasGFX12Enc>;
+ def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt, isTFE, hasGFX12Enc>;
+ def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt, isTFE, hasGFX12Enc>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, isTFE>;
- def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt, isTFE>;
- def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt, isTFE>;
- def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt, isTFE>;
+ def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, isTFE, hasGFX12Enc>;
+ def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt, isTFE, hasGFX12Enc>;
+ def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, legal_store_vt, isTFE, hasGFX12Enc>;
+ def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, legal_store_vt, isTFE, hasGFX12Enc>;
}
}
-multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32,
- SDPatternOperator st = null_frag> {
- defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, st, 0>;
- defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, null_frag, 1>;
+multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32> {
+ defm NAME : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 0>;
+ defm _TFE : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 0>;
+
+ defm _VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 0, 1>;
+ defm _TFE_VBUFFER : MUBUF_Pseudo_Stores_Helper<opName, store_vt, 1, 1>;
}
class MUBUF_Pseudo_Store_Lds<string opName>
@@ -623,15 +682,17 @@ class MUBUF_Pseudo_Store_Lds<string opName>
let AsmMatchConverter = "cvtMubuf";
}
-class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
+class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in, bit hasGFX12Enc,
list<RegisterClass> vaddrList=[]> {
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
dag VData = !if(vdata_in, (ins vdata_op:$vdata_in), (ins vdata_op:$vdata));
dag Data = !if(!empty(vaddrList), VData, !con(VData, (ins vaddrClass:$vaddr)));
- dag MainInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset);
- dag CPol = !if(vdata_in, (ins CPol_GLC1:$cpol), (ins CPol_0:$cpol));
+ dag SOffset = !if(hasGFX12Enc, (ins SReg_32:$soffset), (ins SCSrc_b32:$soffset));
+ dag MainInputs = !con((ins SReg_128:$srsrc), SOffset, (ins offset:$offset));
+ dag CPol = !if(vdata_in, (ins CPol_GLC_WithDefault:$cpol),
+ (ins CPol_NonGLC_WithDefault:$cpol));
dag ret = !con(Data, MainInputs, CPol);
}
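Two operand changes land here: when hasGFX12Enc is set (the VBUFFER forms), $soffset is constrained to a plain SGPR via SReg_32 instead of SCSrc_b32, and, independent of the encoding, the atomic cache-policy operand moves from the hard-wired CPol_GLC1/CPol_0 operands to the *_WithDefault variants, which presumably default to rather than force those values.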
@@ -639,19 +700,20 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
class getMUBUFAtomicIns<int addrKind,
RegisterClass vdataClass,
bit vdata_in,
+ bit hasGFX12Enc,
// Workaround bug bz30254
RegisterClass vdataClassCopy=vdataClass> {
dag ret =
!if(!eq(addrKind, BUFAddrKind.Offset),
- getMUBUFAtomicInsDA<vdataClassCopy, vdata_in>.ret,
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasGFX12Enc>.ret,
!if(!eq(addrKind, BUFAddrKind.OffEn),
- getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret,
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasGFX12Enc, [VGPR_32]>.ret,
!if(!eq(addrKind, BUFAddrKind.IdxEn),
- getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VGPR_32]>.ret,
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasGFX12Enc, [VGPR_32]>.ret,
!if(!eq(addrKind, BUFAddrKind.BothEn),
- getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret,
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasGFX12Enc, [VReg_64]>.ret,
!if(!eq(addrKind, BUFAddrKind.Addr64),
- getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, [VReg_64]>.ret,
+ getMUBUFAtomicInsDA<vdataClassCopy, vdata_in, hasGFX12Enc, [VReg_64]>.ret,
(ins))))));
}
@@ -679,13 +741,14 @@ class MUBUF_Atomic_Pseudo<string opName,
class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
RegisterClass vdataClass,
+ bit hasGFX12Enc = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind,
RegisterClass vdataClassCopy = vdataClass>
: MUBUF_Atomic_Pseudo<opName, addrKindCopy,
(outs),
- getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0>.ret,
+ getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 0, hasGFX12Enc>.ret,
getMUBUFAsmOps<addrKindCopy>.ret,
pattern>,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 0> {
@@ -698,6 +761,7 @@ class MUBUF_AtomicNoRet_Pseudo<string opName, int addrKind,
class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
RegisterClass vdataClass,
+ bit hasGFX12Enc = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind,
@@ -705,7 +769,7 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret>
: MUBUF_Atomic_Pseudo<opName, addrKindCopy,
(outs vdata_op:$vdata),
- getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1>.ret,
+ getMUBUFAtomicIns<addrKindCopy, vdataClassCopy, 1, hasGFX12Enc>.ret,
getMUBUFAsmOps<addrKindCopy>.ret,
pattern>,
AtomicNoRet<opName # "_" # getAddrName<addrKindCopy>.ret, 1> {
@@ -723,13 +787,21 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
ValueType vdataType,
bit isFP = isFloatType<vdataType>.ret> {
let FPAtomic = isFP in {
- def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
+ def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>,
MUBUFAddr64Table <0, NAME>;
- def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
+ def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>,
MUBUFAddr64Table <1, NAME>;
- def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>;
+ def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>;
+ def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>;
+
+ def _VBUFFER_OFFSET : MUBUF_AtomicNoRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1>,
+ MUBUFAddr64Table <0, NAME # "_VBUFFER">;
+ def _VBUFFER_ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1>,
+ MUBUFAddr64Table <1, NAME # "_VBUFFER">;
+ def _VBUFFER_OFFEN : MUBUF_AtomicNoRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>;
+ def _VBUFFER_IDXEN : MUBUF_AtomicNoRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>;
+ def _VBUFFER_BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>;
}
}
@@ -739,21 +811,37 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
SDPatternOperator atomic,
bit isFP = isFloatType<vdataType>.ret> {
let FPAtomic = isFP in {
- def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0,
[(set vdataType:$vdata,
(atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
vdataType:$vdata_in))]>,
MUBUFAddr64Table <0, NAME # "_RTN">;
- def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0,
[(set vdataType:$vdata,
(atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
vdataType:$vdata_in))]>,
MUBUFAddr64Table <1, NAME # "_RTN">;
- def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>;
+ def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>;
+ def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>;
+
+ def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1,
+ [(set vdataType:$vdata,
+ (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
+ vdataType:$vdata_in))]>,
+ MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">;
+
+ def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1,
+ [(set vdataType:$vdata,
+ (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
+ vdataType:$vdata_in))]>,
+ MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">;
+
+ def _VBUFFER_OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>;
+ def _VBUFFER_IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>;
+ def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>;
}
}
@@ -794,7 +882,7 @@ defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
"buffer_store_format_xyzw", v4f32
>;
-let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
+let OtherPredicates = [HasUnpackedD16VMem], D16Buf = 1 in {
let TiedSourceNotRead = 1 in {
defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads <
"buffer_load_format_d16_x", i32
@@ -821,9 +909,9 @@ let TiedSourceNotRead = 1 in {
defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores <
"buffer_store_format_d16_xyzw", v4i32
>;
-} // End HasUnpackedD16VMem.
+} // End OtherPredicates = [HasUnpackedD16VMem], D16Buf = 1.
-let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
+let OtherPredicates = [HasPackedD16VMem], D16Buf = 1 in {
let TiedSourceNotRead = 1 in {
defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
"buffer_load_format_d16_x", f16
@@ -850,7 +938,7 @@ let TiedSourceNotRead = 1 in {
defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores <
"buffer_store_format_d16_xyzw", v4f16
>;
-} // End HasPackedD16VMem.
+} // End OtherPredicates = [HasPackedD16VMem], D16Buf = 1.
defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds <
"buffer_load_ubyte", i32
@@ -906,29 +994,61 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SBYTE", i32, sextloadi8_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, extloadi16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_USHORT", i32, zextloadi16_global>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_SSHORT", i32, sextloadi16_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", i32, load_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", v2i32, load_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", v3i32, load_global>;
-defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
+
+foreach vt = Reg32Types.types in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORD", vt, load_global>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX2", vt, load_global>;
+}
+
+foreach vt = VReg_96.RegTypes in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX3", vt, load_global>;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", vt, load_global>;
+}
defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
- "buffer_store_byte", i32, truncstorei8_global
+ "buffer_store_byte", i32
>;
defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores <
- "buffer_store_short", i32, truncstorei16_global
+ "buffer_store_short", i32
>;
defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores <
- "buffer_store_dword", i32, store_global
+ "buffer_store_dword", i32
>;
defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx2", v2i32, store_global
+ "buffer_store_dwordx2", v2i32
>;
defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx3", v3i32, store_global
+ "buffer_store_dwordx3", v3i32
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx4", v4i32, store_global
+ "buffer_store_dwordx4", v4i32
>;
+
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_BYTE", i32, truncstorei8_global>;
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_SHORT", i32, truncstorei16_global>;
+
+foreach vt = Reg32Types.types in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORD", vt, store_global>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX2", vt, store_global>;
+}
+
+foreach vt = VReg_96.RegTypes in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX3", vt, store_global>;
+}
+
+foreach vt = VReg_128.RegTypes in {
+defm : MUBUF_Pseudo_Store_Pats<"BUFFER_STORE_DWORDX4", vt, store_global>;
+}
+
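The store selection patterns that previously lived inside the BUFFER_STORE_* pseudo definitions are now emitted separately through MUBUF_Pseudo_Store_Pats, so the legacy and _VBUFFER pseudos share them, and the DWORD/DWORDX2/DWORDX3/DWORDX4 load and store patterns are generalized over every type carried by the corresponding VGPR register class rather than a single i32/v2i32/v3i32/v4i32 type.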
defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
"buffer_atomic_swap", VGPR_32, i32
>;
@@ -1008,10 +1128,11 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_dec_x2", VReg_64, i64
>;
-let SubtargetPredicate = HasGFX10_BEncoding in
-defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics_RTN <
- "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub
->;
+let OtherPredicates = [HasGFX10_BEncoding] in {
+ defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub
+ >;
+}
let SubtargetPredicate = isGFX8GFX9 in {
def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
@@ -1198,10 +1319,8 @@ def BUFFER_INV : MUBUF_Invalidate<"buffer_inv"> {
let AsmOperands = "$cpol";
}
-let SubtargetPredicate = isGFX10Plus in {
- def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">;
- def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">;
-} // End SubtargetPredicate = isGFX10Plus
+def BUFFER_GL0_INV : MUBUF_Invalidate<"buffer_gl0_inv">;
+def BUFFER_GL1_INV : MUBUF_Invalidate<"buffer_gl1_inv">;
//===----------------------------------------------------------------------===//
// MUBUF Patterns
@@ -1211,33 +1330,33 @@ let SubtargetPredicate = isGFX10Plus in {
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
-multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+multiclass MUBUF_LoadIntrinsicPat_Common<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_load<name, memoryVt>);
def : GCNPat<
- (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
@@ -1246,6 +1365,14 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
}
+multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode, ValueType memoryVt = vt>{
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MUBUF_LoadIntrinsicPat_Common<name, vt, opcode, memoryVt>;
+ }
+ defm : MUBUF_LoadIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>;
+}
+
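In the intrinsic source patterns the bare i32:$soffset operand is now wrapped in BUFSOffset. Its definition is outside this hunk; judging by the surrounding changes it is the matcher for the scalar-offset operand that accompanies the restricted-soffset (GFX12 VBUFFER) handling.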
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, f32, "BUFFER_LOAD_FORMAT_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, i32, "BUFFER_LOAD_FORMAT_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v2f32, "BUFFER_LOAD_FORMAT_XY">;
@@ -1260,16 +1387,16 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_tfe, v3i32, "BUFFER_LOAD_FORM
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_tfe, v4i32, "BUFFER_LOAD_FORMAT_XYZ_TFE">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_tfe, v5i32, "BUFFER_LOAD_FORMAT_XYZW_TFE">;
-let SubtargetPredicate = HasUnpackedD16VMem in {
- defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
- defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
- defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
- defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
- defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v3i32, "BUFFER_LOAD_FORMAT_D16_XYZ_gfx80">;
- defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
+let OtherPredicates = [HasUnpackedD16VMem] in {
+ defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, v3i32, "BUFFER_LOAD_FORMAT_D16_XYZ_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat_Common<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
-let SubtargetPredicate = HasPackedD16VMem in {
+let OtherPredicates = [HasPackedD16VMem] in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">;
@@ -1298,33 +1425,33 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ubyte, i32, "BUFFER_LOAD_UBYTE">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
-multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+multiclass MUBUF_StoreIntrinsicPat_Common<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
defvar st = !if(!eq(memoryVt, vt), name, mubuf_intrinsic_store<name, memoryVt>);
def : GCNPat<
- (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
def : GCNPat<
- (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
def : GCNPat<
- (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
def : GCNPat<
- (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
getVregSrcForVT<vt>.ret:$vdata,
@@ -1334,6 +1461,14 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
}
+multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode, ValueType memoryVt = vt> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MUBUF_StoreIntrinsicPat_Common<name, vt, opcode, memoryVt>;
+ }
+ defm : MUBUF_StoreIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>;
+}
+
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, f32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, i32, "BUFFER_STORE_FORMAT_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v2f32, "BUFFER_STORE_FORMAT_XY">;
@@ -1343,16 +1478,16 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v3i32, "BUFFER_STORE_FORMA
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4f32, "BUFFER_STORE_FORMAT_XYZW">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMAT_XYZW">;
-let SubtargetPredicate = HasUnpackedD16VMem in {
- defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
- defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
- defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
- defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
- defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v3i32, "BUFFER_STORE_FORMAT_D16_XYZ_gfx80">;
- defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
+let OtherPredicates = [HasUnpackedD16VMem] in {
+ defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, v3i32, "BUFFER_STORE_FORMAT_D16_XYZ_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat_Common<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
-let SubtargetPredicate = HasPackedD16VMem in {
+let OtherPredicates = [HasPackedD16VMem] in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X">;
@@ -1383,7 +1518,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
// buffer_atomic patterns
//===----------------------------------------------------------------------===//
-multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> {
+multiclass BufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> {
foreach RtnMode = ["ret", "noret"] in {
defvar Op = !cast<SDPatternOperator>(OpPrefix
@@ -1409,11 +1544,18 @@ multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isInt
} // end foreach RtnMode
}
+multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isIntr = 0> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : BufferAtomicPat_Common<OpPrefix, vt, Inst, isIntr>;
+ }
+ defm : BufferAtomicPat_Common<OpPrefix, vt, Inst # "_VBUFFER", isIntr>;
+}
+
multiclass BufferAtomicIntrPat<string OpPrefix, ValueType vt, string Inst> {
defm : BufferAtomicPat<OpPrefix, vt, Inst, /* isIntr */ 1>;
}
-multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> {
+multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> {
foreach RtnMode = ["ret", "noret"] in {
defvar Op = !cast<SDPatternOperator>("AMDGPUatomic_cmp_swap_global"
@@ -1449,6 +1591,14 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst>
} // end foreach RtnMode
}
+multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : BufferAtomicCmpSwapPat_Common<vt, data_vt, Inst>;
+ }
+ defm : BufferAtomicCmpSwapPat_Common<vt, data_vt, Inst # "_VBUFFER">;
+}
+
+
foreach Ty = [i32, i64] in {
defvar Suffix = !if(!eq(Ty, i64), "_X2", "");
@@ -1471,7 +1621,7 @@ defm : BufferAtomicPat<"atomic_load_udec_wrap_global", Ty, "BUFFER_ATOMIC_DEC" #
defm : BufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">;
defm : BufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">;
-multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
+multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
list<string> RtnModes = ["ret", "noret"]> {
foreach RtnMode = RtnModes in {
@@ -1484,7 +1634,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
def : GCNPat<
- (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset,
+ (vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
timm:$offset, timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
@@ -1492,7 +1642,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
>;
def : GCNPat<
- (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset,
+ (vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset),
timm:$offset, timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc,
@@ -1501,7 +1651,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
- i32:$soffset, timm:$offset, timm:$cachepolicy, 0)),
+ (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc,
SCSrc_b32:$soffset, timm:$offset, CachePolicy)
@@ -1509,7 +1659,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
- i32:$soffset, timm:$offset, timm:$cachepolicy, timm)),
+ (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
@@ -1520,6 +1670,14 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
} // end foreach RtnMode
}
+multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
+ list<string> RtnModes = ["ret", "noret"]> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst, RtnModes>;
+ }
+ defm : SIBufferAtomicPat_Common<OpPrefix, vt, Inst # "_VBUFFER", RtnModes>;
+}
+
defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", i32, "BUFFER_ATOMIC_SWAP">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_swap", f32, "BUFFER_ATOMIC_SWAP">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_add", i32, "BUFFER_ATOMIC_ADD">;
@@ -1547,6 +1705,9 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">;
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
+defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
+
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
@@ -1562,11 +1723,11 @@ class NoUseBufferAtomic<SDPatternOperator Op, ValueType vt> : PatFrag <
let HasNoUse = true;
}
-multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
+multiclass BufferAtomicPatterns_NO_RTN_Common<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
- 0, i32:$soffset, timm:$offset,
+ 0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, timm:$cachepolicy)
@@ -1574,7 +1735,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, timm:$offset,
+ 0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, timm:$cachepolicy)
@@ -1582,7 +1743,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, timm:$offset,
+ i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, timm:$cachepolicy)
@@ -1590,7 +1751,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, timm:$offset,
+ i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
getVregSrcForVT<vt>.ret:$vdata_in,
@@ -1599,87 +1760,111 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
>;
}
-let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
-defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>;
+multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
+ string opcode> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : BufferAtomicPatterns_NO_RTN_Common<name, vt, opcode>;
+ }
+ defm : BufferAtomicPatterns_NO_RTN_Common<name, vt, opcode # "_VBUFFER">;
+}
-let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in
-defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>;
+let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>;
-let SubtargetPredicate = HasAtomicFaddRtnInsts in
-defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>;
+let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>;
+} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts]
-let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
-defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
+let OtherPredicates = [HasAtomicFaddRtnInsts] in
+ defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>;
-let SubtargetPredicate = isGFX90APlus in {
+let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
+} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts]
+
+let OtherPredicates = [isGFX90APlus] in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">;
} // End SubtargetPredicate = isGFX90APlus
-foreach RtnMode = ["ret", "noret"] in {
-
-defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap
- # !if(!eq(RtnMode, "ret"), "", "_noret"));
-defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
-defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
- (timm:$cachepolicy));
-
-defvar OffsetResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFSET" # InstSuffix)
- (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy);
-def : GCNPat<
- (Op
- i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset,
- timm:$offset, timm:$cachepolicy, 0),
- !if(!eq(RtnMode, "ret"),
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS OffsetResDag, VReg_64)), sub0),
- OffsetResDag)
->;
-
-defvar IdxenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_IDXEN" # InstSuffix)
- (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
- CachePolicy);
-def : GCNPat<
- (Op
- i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, timm),
- !if(!eq(RtnMode, "ret"),
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS IdxenResDag, VReg_64)), sub0),
- IdxenResDag)
->;
-
-defvar OffenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFEN" # InstSuffix)
- (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
- CachePolicy);
-def : GCNPat<
- (Op
- i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0),
- !if(!eq(RtnMode, "ret"),
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS OffenResDag, VReg_64)), sub0),
- OffenResDag)
->;
-
-defvar BothenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_BOTHEN" # InstSuffix)
- (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy);
-def : GCNPat<
- (Op
- i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, timm:$offset,
- timm:$cachepolicy, timm),
- !if(!eq(RtnMode, "ret"),
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS BothenResDag, VReg_64)), sub0),
- BothenResDag)
->;
-
-} // end foreach RtnMode
+multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string Inst> {
+ foreach RtnMode = ["ret", "noret"] in {
+ defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap
+ # !if(!eq(RtnMode, "ret"), "", "_noret"));
+ defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+ defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
+ (timm:$cachepolicy));
+ defvar SrcRC = getVregSrcForVT<vt>.ret;
+ defvar DataRC = getVregSrcForVT<data_vt>.ret;
+ defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1);
+ defvar SubHi = !if(!eq(vt, i32), sub1, sub2_sub3);
+
+ defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
+ (REG_SEQUENCE DataRC, SrcRC:$data, SubLo, SrcRC:$cmp, SubHi),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy);
+ def : GCNPat<
+ (vt (Op
+ vt:$data, vt:$cmp, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
+ timm:$offset, timm:$cachepolicy, 0)),
+ !if(!eq(RtnMode, "ret"),
+ (EXTRACT_SUBREG OffsetResDag, SubLo),
+ OffsetResDag)
+ >;
+
+ defvar IdxenResDag = (!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix)
+ (REG_SEQUENCE DataRC, SrcRC:$data, SubLo, SrcRC:$cmp, SubHi),
+ VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
+ CachePolicy);
+ def : GCNPat<
+ (vt (Op
+ vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex,
+ 0, (BUFSOffset i32:$soffset), timm:$offset,
+ timm:$cachepolicy, timm)),
+ !if(!eq(RtnMode, "ret"),
+ (EXTRACT_SUBREG IdxenResDag, SubLo),
+ IdxenResDag)
+ >;
+
+ defvar OffenResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix)
+ (REG_SEQUENCE DataRC, SrcRC:$data, SubLo, SrcRC:$cmp, SubHi),
+ VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
+ CachePolicy);
+ def : GCNPat<
+ (vt (Op
+ vt:$data, vt:$cmp, v4i32:$rsrc, 0,
+ i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
+ timm:$cachepolicy, 0)),
+ !if(!eq(RtnMode, "ret"),
+ (EXTRACT_SUBREG OffenResDag, SubLo),
+ OffenResDag)
+ >;
+
+ defvar BothenResDag = (!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix)
+ (REG_SEQUENCE DataRC, SrcRC:$data, SubLo, SrcRC:$cmp, SubHi),
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy);
+ def : GCNPat<
+ (vt (Op
+ vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex,
+ i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
+ timm:$cachepolicy, timm)),
+ !if(!eq(RtnMode, "ret"),
+ (EXTRACT_SUBREG BothenResDag, SubLo),
+ BothenResDag)
+ >;
+ } // end foreach RtnMode
+}
+
+multiclass SIBufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst>;
+ }
+ defm : SIBufferAtomicCmpSwapPat_Common<vt, data_vt, Inst # "_VBUFFER">;
+}
+
+defm : SIBufferAtomicCmpSwapPat<i32, v2i32, "BUFFER_ATOMIC_CMPSWAP">;
+defm : SIBufferAtomicCmpSwapPat<i64, v2i64, "BUFFER_ATOMIC_CMPSWAP_X2">;
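The open-coded foreach over RtnMode that emitted only the i32 BUFFER_ATOMIC_CMPSWAP patterns is replaced by the parameterized SIBufferAtomicCmpSwapPat multiclass: SubLo/SubHi select sub0/sub1 for the i32 case and sub0_sub1/sub2_sub3 for the i64 (X2) case, so one body now covers both widths and, via the wrapper, both encodings.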
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
PatFrag constant_ld> : GCNPat <
@@ -1713,105 +1898,125 @@ defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFF
defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, atomic_load_64_global>;
} // End SubtargetPredicate = isGFX6GFX7
-multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
+multiclass MUBUFLoad_PatternOffset_Common <string Instr, ValueType vt,
PatFrag ld> {
-
def : GCNPat <
(vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset))),
- (Instr_OFFSET $srsrc, $soffset, $offset)
+ (!cast<MUBUF_Pseudo>(Instr # "_OFFSET") $srsrc, $soffset, $offset)
>;
}
+multiclass MUBUFLoad_PatternOffset <string Instr, ValueType vt,
+ PatFrag ld> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MUBUFLoad_PatternOffset_Common<Instr, vt, ld>;
+ }
+ defm : MUBUFLoad_PatternOffset_Common<Instr # "_VBUFFER", vt, ld>;
+}
+
let OtherPredicates = [Has16BitInsts] in {
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_constant>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_global>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_global>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_global>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_constant>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_constant>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, zextloadi8_constant>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_SBYTE", i16, sextloadi8_global>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, extloadi8_global>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_UBYTE", i16, zextloadi8_global>;
-defm : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_OFFSET, i16, load_global>;
+defm : MUBUFLoad_PatternOffset <"BUFFER_LOAD_USHORT", i16, load_global>;
} // End OtherPredicates = [Has16BitInsts]
-multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
- MUBUF_Pseudo InstrOffset,
+multiclass MUBUFScratchLoadPat_Common <string Instr,
ValueType vt, PatFrag ld> {
def : GCNPat <
(vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, i32:$offset))),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0)
+ (!cast<MUBUF_Pseudo>(Instr # _OFFEN) $vaddr, $srsrc, $soffset, $offset, 0, 0)
>;
def : GCNPat <
(vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset))),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0)
+ (!cast<MUBUF_Pseudo>(Instr # _OFFSET) $srsrc, $soffset, $offset, 0, 0)
>;
}
+multiclass MUBUFScratchLoadPat <string Instr,
+ ValueType vt, PatFrag ld> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MUBUFScratchLoadPat_Common<Instr, vt, ld>;
+ }
+ defm : MUBUFScratchLoadPat_Common<Instr # "_VBUFFER", vt, ld>;
+}
+
// XXX - Is it possible to have a complex pattern in a PatFrag?
-multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
- MUBUF_Pseudo InstrOffset,
+multiclass MUBUFScratchLoadPat_D16_Common <string Instr,
ValueType vt, PatFrag ld_frag> {
def : GCNPat <
(ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset), vt:$in),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, $in)
+ (!cast<MUBUF_Pseudo>(Instr # _OFFEN) $vaddr, $srsrc, $soffset, $offset, $in)
>;
def : GCNPat <
(ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset), vt:$in),
- (InstrOffset $srsrc, $soffset, $offset, $in)
+ (!cast<MUBUF_Pseudo>(Instr # _OFFSET) $srsrc, $soffset, $offset, $in)
>;
}
+multiclass MUBUFScratchLoadPat_D16 <string Instr,
+ ValueType vt, PatFrag ld_frag> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MUBUFScratchLoadPat_D16_Common<Instr, vt, ld_frag>;
+ }
+ defm : MUBUFScratchLoadPat_D16_Common<Instr # "_VBUFFER", vt, ld_frag>;
+}
+
let OtherPredicates = [DisableFlatScratch] in {
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i32, sextloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, extloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i32, zextloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, extloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, BUFFER_LOAD_UBYTE_OFFSET, i16, zextloadi8_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET, i32, sextloadi16_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, zextloadi16_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i32, sextloadi8_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, extloadi8_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i32, zextloadi8_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SBYTE", i16, sextloadi8_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, extloadi8_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_UBYTE", i16, zextloadi8_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_SSHORT", i32, sextloadi16_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, extloadi16_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i32, zextloadi16_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_USHORT", i16, load_private>;
foreach vt = Reg32Types.types in {
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, vt, load_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORD", vt, load_private>;
}
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX2", v2i32, load_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX3", v3i32, load_private>;
+defm : MUBUFScratchLoadPat <"BUFFER_LOAD_DWORDX4", v4i32, load_private>;
let OtherPredicates = [D16PreservesUnusedBits, DisableFlatScratch] in {
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2i16, load_d16_hi_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2i16, az_extloadi8_d16_hi_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2i16, sextloadi8_d16_hi_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, v2f16, load_d16_hi_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, v2f16, az_extloadi8_d16_hi_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, v2f16, sextloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16_HI", v2i16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16_HI", v2i16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16_HI", v2i16, sextloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16_HI", v2f16, load_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16_HI", v2f16, az_extloadi8_d16_hi_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16_HI", v2f16, sextloadi8_d16_hi_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2i16, load_d16_lo_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2i16, az_extloadi8_d16_lo_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2i16, sextloadi8_d16_lo_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SHORT_D16_OFFEN, BUFFER_LOAD_SHORT_D16_OFFSET, v2f16, load_d16_lo_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_UBYTE_D16_OFFEN, BUFFER_LOAD_UBYTE_D16_OFFSET, v2f16, az_extloadi8_d16_lo_private>;
-defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D16_OFFSET, v2f16, sextloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16", v2i16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16", v2i16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16", v2i16, sextloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SHORT_D16", v2f16, load_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_UBYTE_D16", v2f16, az_extloadi8_d16_lo_private>;
+defm : MUBUFScratchLoadPat_D16<"BUFFER_LOAD_SBYTE_D16", v2f16, sextloadi8_d16_lo_private>;
}
} // End OtherPredicates = [DisableFlatScratch]
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {
- // Store follows atomic op convention so address is first
def : GCNPat <
- (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), vt:$val),
+ (atomic_st vt:$val, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset)),
(Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset)
>;
def : GCNPat <
- (atomic_st (MUBUFOffset v4i32:$rsrc, i32:$soffset, i32:$offset), vt:$val),
+ (atomic_st vt:$val, (MUBUFOffset v4i32:$rsrc, i32:$soffset, i32:$offset)),
(Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset))
>;
}
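The atomic-store source patterns above now list the value before the address, matching the operand order of the ordinary store patterns, which is why the old "Store follows atomic op convention so address is first" comment was dropped.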
@@ -1825,56 +2030,72 @@ defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWOR
} // End Predicates = isGFX6GFX7
-multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
- PatFrag st> {
+multiclass MUBUFStore_PatternOffset_Common <string Instr, ValueType vt,
+ PatFrag st> {
def : GCNPat <
(st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)),
- (Instr_OFFSET $vdata, $srsrc, $soffset, $offset)
+ (!cast<MUBUF_Pseudo>(Instr # "_OFFSET") $vdata, $srsrc, $soffset, $offset)
>;
}
-defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>;
-defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, store_global>;
+multiclass MUBUFStore_PatternOffset <string Instr, ValueType vt,
+ PatFrag st> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MUBUFStore_PatternOffset_Common<Instr, vt, st>;
+ }
+ defm : MUBUFStore_PatternOffset_Common<Instr # "_VBUFFER", vt, st>;
+}
+
+defm : MUBUFStore_PatternOffset <"BUFFER_STORE_BYTE", i16, truncstorei8_global>;
+defm : MUBUFStore_PatternOffset <"BUFFER_STORE_SHORT", i16, store_global>;
-multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
- MUBUF_Pseudo InstrOffset,
+multiclass MUBUFScratchStorePat_Common <string Instr,
ValueType vt, PatFrag st,
RegisterClass rc = VGPR_32> {
def : GCNPat <
(st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, i32:$offset)),
- (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0)
+ (!cast<MUBUF_Pseudo>(Instr # _OFFEN) rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0)
>;
def : GCNPat <
(st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
i32:$offset)),
- (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0)
+ (!cast<MUBUF_Pseudo>(Instr # _OFFSET) rc:$value, $srsrc, $soffset, $offset, 0, 0)
>;
}
+multiclass MUBUFScratchStorePat <string Instr,
+ ValueType vt, PatFrag st,
+ RegisterClass rc = VGPR_32> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MUBUFScratchStorePat_Common<Instr, vt, st, rc>;
+ }
+ defm : MUBUFScratchStorePat_Common<Instr # "_VBUFFER", vt, st, rc>;
+}
+
let OtherPredicates = [DisableFlatScratch] in {
-defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i32, truncstorei8_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i32, truncstorei8_private>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i32, truncstorei16_private>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE", i16, truncstorei8_private>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT", i16, store_private>;
foreach vt = Reg32Types.types in {
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, vt, store_private>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORD", vt, store_private>;
}
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private, VReg_64>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OFFSET, v3i32, store_private, VReg_96>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX2", v2i32, store_private, VReg_64>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX3", v3i32, store_private, VReg_96>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_DWORDX4", v4i32, store_private, VReg_128>;
let OtherPredicates = [HasD16LoadStore, DisableFlatScratch] in {
// Hiding the extract high pattern in the PatFrag does not seem to
// automatically increase the complexity.
let AddedComplexity = 1 in {
-defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_D16_HI_OFFEN, BUFFER_STORE_SHORT_D16_HI_OFFSET, i32, store_hi16_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D16_HI_OFFSET, i32, truncstorei8_hi16_private>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_SHORT_D16_HI", i32, store_hi16_private>;
+defm : MUBUFScratchStorePat <"BUFFER_STORE_BYTE_D16_HI", i32, truncstorei8_hi16_private>;
}
}
} // End OtherPredicates = [DisableFlatScratch]
@@ -1887,12 +2108,12 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D
// tbuffer_load/store_format patterns
//===----------------------------------------------------------------------===//
-multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+multiclass MTBUF_LoadIntrinsicPat_Common<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
defvar st = !if(!eq(memoryVt, vt), name, mtbuf_intrinsic_load<name, memoryVt>);
def : GCNPat<
- (vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(as_i8timm $format),
@@ -1900,7 +2121,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(as_i8timm $format),
@@ -1908,7 +2129,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(as_i8timm $format),
@@ -1916,7 +2137,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ (vt (st v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
@@ -1926,6 +2147,14 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
}
+multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode, ValueType memoryVt = vt> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode, memoryVt>;
+ }
+ defm : MTBUF_LoadIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>;
+}
+
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, i32, "TBUFFER_LOAD_FORMAT_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2i32, "TBUFFER_LOAD_FORMAT_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3i32, "TBUFFER_LOAD_FORMAT_XYZ">;
@@ -1935,15 +2164,15 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v2f32, "TBUFFER_LOAD_FORMAT_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v3f32, "TBUFFER_LOAD_FORMAT_XYZ">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">;
-let SubtargetPredicate = HasUnpackedD16VMem in {
- defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
- defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
- defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
- defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v3i32, "TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80">;
- defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
+let OtherPredicates = [HasUnpackedD16VMem] in {
+ defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v3i32, "TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat_Common<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
-let SubtargetPredicate = HasPackedD16VMem in {
+let OtherPredicates = [HasPackedD16VMem] in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
@@ -1951,12 +2180,12 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
-multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+multiclass MTBUF_StoreIntrinsicPat_Common<SDPatternOperator name, ValueType vt,
string opcode, ValueType memoryVt = vt> {
defvar st = !if(!eq(memoryVt, vt), name, mtbuf_intrinsic_store<name, memoryVt>);
def : GCNPat<
- (st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, (as_i8timm $format),
@@ -1964,7 +2193,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset), timm:$offset,
timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, (as_i8timm $format),
@@ -1972,7 +2201,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ (st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, (as_i8timm $format),
@@ -1980,7 +2209,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
def : GCNPat<
- (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
+ (st vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, (BUFSOffset i32:$soffset),
timm:$offset, timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
getVregSrcForVT<vt>.ret:$vdata,
@@ -1990,6 +2219,14 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
>;
}
+multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
+ string opcode, ValueType memoryVt = vt> {
+ let SubtargetPredicate = HasUnrestrictedSOffset in {
+ defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode, memoryVt>;
+ }
+ defm : MTBUF_StoreIntrinsicPat_Common<name, vt, opcode # "_VBUFFER", memoryVt>;
+}
+
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, i32, "TBUFFER_STORE_FORMAT_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2i32, "TBUFFER_STORE_FORMAT_XY">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3i32, "TBUFFER_STORE_FORMAT_XYZ">;
@@ -1999,15 +2236,15 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v2f32, "TBUFFER_STORE_FORMAT_XY"
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v3f32, "TBUFFER_STORE_FORMAT_XYZ">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZW">;
-let SubtargetPredicate = HasUnpackedD16VMem in {
- defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
- defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
- defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
- defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v3i32, "TBUFFER_STORE_FORMAT_D16_XYZ_gfx80">;
- defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
+let OtherPredicates = [HasUnpackedD16VMem] in {
+ defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v3i32, "TBUFFER_STORE_FORMAT_D16_XYZ_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat_Common<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
-let SubtargetPredicate = HasPackedD16VMem in {
+let OtherPredicates = [HasPackedD16VMem] in {
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
@@ -2044,6 +2281,7 @@ class MUBUF_Real_gfx11<bits<8> op, MUBUF_Pseudo ps,
let Inst{53} = ps.tfe;
let Inst{54} = ps.offen;
let Inst{55} = ps.idxen;
+ let SubtargetPredicate = isGFX11Only;
}
class Base_MUBUF_Real_Atomic_gfx11<bits<8> op, MUBUF_Pseudo ps,
@@ -2067,15 +2305,98 @@ class MUBUF_Real_gfx10<bits<8> op, MUBUF_Pseudo ps> :
Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.GFX10> {
let Inst{15} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlc_value);
let Inst{25} = op{7};
+ let SubtargetPredicate = isGFX10Only;
}
class MUBUF_Real_gfx6_gfx7<bits<8> op, MUBUF_Pseudo ps> :
Base_MUBUF_Real_gfx6_gfx7_gfx10<op{6-0}, ps, SIEncodingFamily.SI> {
let Inst{15} = ps.addr64;
+ let SubtargetPredicate = isGFX6GFX7;
+}
+
+//===----------------------------------------------------------------------===//
+// Base ENC_VBUFFER for GFX12.
+//===----------------------------------------------------------------------===//
+
+class VBUFFER_Real <BUF_Pseudo ps, string real_name = ps.Mnemonic> :
+ InstSI <ps.OutOperandList, ps.InOperandList, real_name # ps.AsmOperands, []>, Enc96 {
+
+ let isPseudo = 0;
+ let isCodeGenOnly = 0;
+
+ let VM_CNT = 1;
+ let EXP_CNT = 1;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let OtherPredicates = ps.OtherPredicates;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
+ let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
+ let VALU = ps.VALU;
+ let LGKM_CNT = ps.LGKM_CNT;
+
+ bits<24> offset;
+ bits<8> vaddr;
+ bits<10> vdata;
+
+ bits<7> srsrc;
+ bits<7> soffset;
+ bits<6> cpol;
+
+ let Inst{95-72} = !if(ps.has_offset, offset, ?);
+ let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{39-32} = !if(ps.has_vdata, vdata{7-0}, ?);
+
+ let Inst{47-41} = !if(ps.has_srsrc, srsrc, ?);
+ let Inst{49-48} = 0b00;
+ let Inst{6-0} = !if(ps.has_soffset, soffset, ?);
+ let Inst{22} = ps.tfe;
+ let Inst{62} = ps.offen;
+ let Inst{63} = ps.idxen;
+
+ let Inst{54-53} = cpol{2-1}; // th{2-1}
+ let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0}
+ let Inst{51-50} = cpol{4-3}; // scope
+
+ let Inst{31-26} = 0b110001;
+}
+
+class VBUFFER_MUBUF_Real_gfx12<bits<8> op, MUBUF_Pseudo ps,
+ string real_name = ps.Mnemonic> :
+ VBUFFER_Real<ps, real_name>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX12> {
+
+ let MUBUF = 1;
+
+ // Set the last bit of format to 1 to avoid round-trip issues, as some tools
+ // print BUF_FMT_INVALID for format 0.
+ let Inst{55} = 0b1;
+ let Inst{21-14} = op;
+ let SubtargetPredicate = isGFX12Only;
+}
+
+class VBUFFER_MTBUF_Real_gfx12<bits<4> op, MTBUF_Pseudo ps,
+ string real_name = ps.Mnemonic> :
+ VBUFFER_Real<ps, real_name>, SIMCInstr<ps.PseudoInstr, SIEncodingFamily.GFX12> {
+
+ let MTBUF = 1;
+
+ bits<7> format;
+
+ let Inst{17-14} = op;
+ let Inst{21-18} = 0b1000;
+ let Inst{61-55} = format;
}
//===----------------------------------------------------------------------===//
-// MUBUF - GFX11.
+// MUBUF - GFX11, GFX12.
//===----------------------------------------------------------------------===//
// Shortcut to default Mnemonic from MUBUF_Pseudo. Hides the cast to the
@@ -2085,19 +2406,43 @@ class get_MUBUF_ps<string name> {
}
// gfx11 instructions that accept both old and new assembler names.
-class Pre_gfx11_MUBUF_Name <string mnemonic, string real_name> :
+class Mnem_gfx11_gfx12 <string mnemonic, string real_name> :
MnemonicAlias<mnemonic, real_name>, Requires<[isGFX11Plus]>;
+class Mnem_gfx11 <string mnemonic, string real_name> :
+ MnemonicAlias<mnemonic, real_name>, Requires<[isGFX11Only]>;
+
+class Mnem_gfx12 <string mnemonic, string real_name> :
+ MnemonicAlias<mnemonic, real_name>, Requires<[isGFX12Plus]>;
+
class MUBUF_Real_gfx11_impl<bits<8> op, string ps_name, string real_name> :
MUBUF_Real_gfx11<op, !cast<MUBUF_Pseudo>(ps_name), real_name>;
-let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in
+
+class VBUFFER_MUBUF_Real_gfx12_impl<bits<8> op, string ps_name, string real_name> :
+ VBUFFER_MUBUF_Real_gfx12<op, !cast<MUBUF_Pseudo>(ps_name), real_name>;
+
multiclass MUBUF_Real_AllAddr_gfx11_Renamed_Impl2<bits<8> op, string real_name> {
- def _BOTHEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_BOTHEN", real_name>;
- def _IDXEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_IDXEN", real_name>;
- def _OFFEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_OFFEN", real_name>;
- def _OFFSET_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_OFFSET", real_name>;
+ let DecoderNamespace = "GFX11" in {
+ def _BOTHEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_BOTHEN", real_name>;
+ def _IDXEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_IDXEN", real_name>;
+ def _OFFEN_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_OFFEN", real_name>;
+ def _OFFSET_gfx11 : MUBUF_Real_gfx11_impl<op, NAME # "_OFFSET", real_name>;
+ }
}
+multiclass MUBUF_Real_AllAddr_gfx12_Renamed_Impl2<bits<8> op, string real_name> {
+ let DecoderNamespace = "GFX12" in {
+ def _BOTHEN_gfx12 : VBUFFER_MUBUF_Real_gfx12_impl<op, NAME # "_VBUFFER_BOTHEN", real_name>;
+ def _IDXEN_gfx12 : VBUFFER_MUBUF_Real_gfx12_impl<op, NAME # "_VBUFFER_IDXEN", real_name>;
+ def _OFFEN_gfx12 : VBUFFER_MUBUF_Real_gfx12_impl<op, NAME # "_VBUFFER_OFFEN", real_name>;
+ def _OFFSET_gfx12 : VBUFFER_MUBUF_Real_gfx12_impl<op, NAME # "_VBUFFER_OFFSET", real_name>;
+ }
+}
+
+multiclass MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl2<bits<8> op, string real_name> :
+ MUBUF_Real_AllAddr_gfx11_Renamed_Impl2<op, real_name>,
+ MUBUF_Real_AllAddr_gfx12_Renamed_Impl2<op, real_name>;
+
multiclass MUBUF_Real_AllAddr_gfx11_Renamed_Impl<bits<8> op, string real_name,
bit hasTFE = 1> {
defm NAME : MUBUF_Real_AllAddr_gfx11_Renamed_Impl2<op, real_name>;
@@ -2105,136 +2450,196 @@ multiclass MUBUF_Real_AllAddr_gfx11_Renamed_Impl<bits<8> op, string real_name,
defm _TFE : MUBUF_Real_AllAddr_gfx11_Renamed_Impl2<op, real_name>;
}
-// Non-renamed, non-atomic gfx11 mubuf instructions.
+multiclass MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<bits<8> op, string real_name,
+ bit hasTFE = 1> {
+ defm NAME : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl2<op, real_name>;
+ if hasTFE then
+ defm _TFE : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl2<op, real_name>;
+}
+
+// Non-renamed, non-atomic gfx11/gfx12 mubuf instructions.
multiclass MUBUF_Real_AllAddr_gfx11<bits<8> op, bit hasTFE = 1> :
MUBUF_Real_AllAddr_gfx11_Renamed_Impl<op, get_MUBUF_ps<NAME>.Mnemonic, hasTFE>;
-multiclass MUBUF_Real_AllAddr_gfx11_Renamed<bits<8> op, string real_name> :
- MUBUF_Real_AllAddr_gfx11_Renamed_Impl<op, real_name> {
- def : Pre_gfx11_MUBUF_Name<get_MUBUF_ps<NAME>.Mnemonic, real_name>;
+multiclass MUBUF_Real_AllAddr_gfx11_gfx12<bits<8> op, bit hasTFE = 1> :
+ MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<op, get_MUBUF_ps<NAME>.Mnemonic, hasTFE>;
+
+multiclass MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<bits<8> op, string real_name> :
+ MUBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<op, real_name> {
+ def : Mnem_gfx11_gfx12<get_MUBUF_ps<NAME>.Mnemonic, real_name>;
}
class MUBUF_Real_Atomic_gfx11_impl<bits<8> op, string ps_name,
string real_name> :
Base_MUBUF_Real_Atomic_gfx11<op, !cast<MUBUF_Pseudo>(ps_name), real_name>;
-let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in
+
+class MUBUF_Real_Atomic_gfx12_impl<bits<8> op, string ps_name,
+ string real_name> :
+ VBUFFER_MUBUF_Real_gfx12<op, !cast<MUBUF_Pseudo>(ps_name), real_name>;
+
multiclass MUBUF_Real_Atomic_gfx11_Renamed_impl<bits<8> op, bit is_return,
string real_name> {
- defvar Rtn = !if(!eq(is_return, 1), "_RTN", "");
- def _BOTHEN#Rtn#_gfx11 :
- MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_BOTHEN" # Rtn, real_name>,
- AtomicNoRet<NAME # "_BOTHEN_gfx11", is_return>;
- def _IDXEN#Rtn#_gfx11 :
- MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_IDXEN" # Rtn, real_name>,
- AtomicNoRet<NAME # "_IDXEN_gfx11", is_return>;
- def _OFFEN#Rtn#_gfx11 :
- MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_OFFEN" # Rtn, real_name>,
- AtomicNoRet<NAME # "_OFFEN_gfx11", is_return>;
- def _OFFSET#Rtn#_gfx11 :
- MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_OFFSET" # Rtn, real_name>,
- AtomicNoRet<NAME # "_OFFSET_gfx11", is_return>;
-}
-
-// Non-renamed gfx11 mubuf atomic.
-multiclass MUBUF_Real_Atomic_gfx11<bits<8> op> :
- MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 0, get_MUBUF_ps<NAME>.Mnemonic>,
- MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 1, get_MUBUF_ps<NAME>.Mnemonic>;
+ let DecoderNamespace = "GFX11" in {
+ defvar Rtn = !if(!eq(is_return, 1), "_RTN", "");
+ def _BOTHEN#Rtn#_gfx11 :
+ MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_BOTHEN" # Rtn, real_name>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx11", is_return>;
+ def _IDXEN#Rtn#_gfx11 :
+ MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_IDXEN" # Rtn, real_name>,
+ AtomicNoRet<NAME # "_IDXEN_gfx11", is_return>;
+ def _OFFEN#Rtn#_gfx11 :
+ MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_OFFEN" # Rtn, real_name>,
+ AtomicNoRet<NAME # "_OFFEN_gfx11", is_return>;
+ def _OFFSET#Rtn#_gfx11 :
+ MUBUF_Real_Atomic_gfx11_impl<op, NAME # "_OFFSET" # Rtn, real_name>,
+ AtomicNoRet<NAME # "_OFFSET_gfx11", is_return>;
+ }
+}
+
+multiclass MUBUF_Real_Atomic_gfx12_Renamed_impl<bits<8> op, bit is_return,
+ string real_name> {
+ let DecoderNamespace = "GFX12" in {
+ defvar Rtn = !if(!eq(is_return, 1), "_RTN", "");
+ def _BOTHEN#Rtn#_gfx12 :
+ MUBUF_Real_Atomic_gfx12_impl<op, NAME # "_VBUFFER_BOTHEN" # Rtn, real_name>,
+ AtomicNoRet<NAME # "_BOTHEN_gfx12", is_return>;
+ def _IDXEN#Rtn#_gfx12 :
+ MUBUF_Real_Atomic_gfx12_impl<op, NAME # "_VBUFFER_IDXEN" # Rtn, real_name>,
+ AtomicNoRet<NAME # "_IDXEN_gfx12", is_return>;
+ def _OFFEN#Rtn#_gfx12 :
+ MUBUF_Real_Atomic_gfx12_impl<op, NAME # "_VBUFFER_OFFEN" # Rtn, real_name>,
+ AtomicNoRet<NAME # "_OFFEN_gfx12", is_return>;
+ def _OFFSET#Rtn#_gfx12 :
+ MUBUF_Real_Atomic_gfx12_impl<op, NAME # "_VBUFFER_OFFSET" # Rtn, real_name>,
+ AtomicNoRet<NAME # "_OFFSET_gfx12", is_return>;
+ }
+}
+
+multiclass MUBUF_Real_Atomic_gfx11_gfx12_Renamed_impl<bits<8> op, bit is_return,
+ string real_name> :
+ MUBUF_Real_Atomic_gfx11_Renamed_impl<op, is_return, real_name>,
+ MUBUF_Real_Atomic_gfx12_Renamed_impl<op, is_return, real_name>;
+
+// Non-renamed gfx11/gfx12 mubuf atomic instructions.
+multiclass MUBUF_Real_Atomic_gfx11_gfx12<bits<8> op> :
+ MUBUF_Real_Atomic_gfx11_gfx12_Renamed_impl<op, 0, get_MUBUF_ps<NAME>.Mnemonic>,
+ MUBUF_Real_Atomic_gfx11_gfx12_Renamed_impl<op, 1, get_MUBUF_ps<NAME>.Mnemonic>;
+
+multiclass MUBUF_Real_Atomic_gfx12<bits<8> op> :
+ MUBUF_Real_Atomic_gfx12_Renamed_impl<op, 0, get_MUBUF_ps<NAME>.Mnemonic>,
+ MUBUF_Real_Atomic_gfx12_Renamed_impl<op, 1, get_MUBUF_ps<NAME>.Mnemonic>;
multiclass MUBUF_Real_Atomic_gfx11_Renamed<bits<8> op, string real_name> :
MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 0, real_name>,
- MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 1, real_name> {
- def : Pre_gfx11_MUBUF_Name<get_MUBUF_ps<NAME>.Mnemonic, real_name>;
+ MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 1, real_name> {
+ def : Mnem_gfx11_gfx12<get_MUBUF_ps<NAME>.Mnemonic, real_name>;
+}
+
+multiclass MUBUF_Real_Atomic_gfx11_gfx12_Renamed<bits<8> op, string real_name> :
+ MUBUF_Real_Atomic_gfx11_gfx12_Renamed_impl<op, 0, real_name>,
+ MUBUF_Real_Atomic_gfx11_gfx12_Renamed_impl<op, 1, real_name> {
+ def : Mnem_gfx11_gfx12<get_MUBUF_ps<NAME>.Mnemonic, real_name>;
}
-let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
+multiclass MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<bits<8> op, string gfx12_name, string gfx11_name> :
+ MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 0, gfx11_name>,
+ MUBUF_Real_Atomic_gfx11_Renamed_impl<op, 1, gfx11_name>,
+ MUBUF_Real_Atomic_gfx12_Renamed_impl<op, 0, gfx12_name>,
+ MUBUF_Real_Atomic_gfx12_Renamed_impl<op, 1, gfx12_name> {
+ def : Mnem_gfx11<get_MUBUF_ps<NAME>.Mnemonic, gfx11_name>;
+ def : Mnem_gfx12<get_MUBUF_ps<NAME>.Mnemonic, gfx12_name>;
+ def : Mnem_gfx12<gfx11_name, gfx12_name>;
+}
+
+let DecoderNamespace = "GFX11" in {
def BUFFER_GL0_INV_gfx11 : MUBUF_Real_gfx11<0x02B, BUFFER_GL0_INV>;
def BUFFER_GL1_INV_gfx11 : MUBUF_Real_gfx11<0x02C, BUFFER_GL1_INV>;
}
-defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_gfx11_Renamed<0x014, "buffer_load_b32">;
-defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx11_Renamed<0x015, "buffer_load_b64">;
-defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx11_Renamed<0x016, "buffer_load_b96">;
-defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx11_Renamed<0x017, "buffer_load_b128">;
-defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x020, "buffer_load_d16_b16">;
-defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x008, "buffer_load_d16_format_x">;
-defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_Renamed<0x009, "buffer_load_d16_format_xy">;
-defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_Renamed<0x00a, "buffer_load_d16_format_xyz">;
-defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_Renamed<0x00b, "buffer_load_d16_format_xyzw">;
-defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x023, "buffer_load_d16_hi_b16">;
-defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x026, "buffer_load_d16_hi_format_x">;
-defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x022, "buffer_load_d16_hi_i8">;
-defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x021, "buffer_load_d16_hi_u8">;
-defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01f, "buffer_load_d16_i8">;
-defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01e, "buffer_load_d16_u8">;
-defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x000>;
-defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx11<0x001>;
-defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11<0x002>;
-defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11<0x003>;
-defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x011, "buffer_load_i8">;
-defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x013, "buffer_load_i16">;
-defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x010, "buffer_load_u8">;
-defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x012, "buffer_load_u16">;
+defm BUFFER_LOAD_DWORD : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x014, "buffer_load_b32">;
+defm BUFFER_LOAD_DWORDX2 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x015, "buffer_load_b64">;
+defm BUFFER_LOAD_DWORDX3 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x016, "buffer_load_b96">;
+defm BUFFER_LOAD_DWORDX4 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x017, "buffer_load_b128">;
+defm BUFFER_LOAD_SHORT_D16 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x020, "buffer_load_d16_b16">;
+defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x008, "buffer_load_d16_format_x">;
+defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x009, "buffer_load_d16_format_xy">;
+defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00a, "buffer_load_d16_format_xyz">;
+defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00b, "buffer_load_d16_format_xyzw">;
+defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x023, "buffer_load_d16_hi_b16">;
+defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x026, "buffer_load_d16_hi_format_x">;
+defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x022, "buffer_load_d16_hi_i8">;
+defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x021, "buffer_load_d16_hi_u8">;
+defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01f, "buffer_load_d16_i8">;
+defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01e, "buffer_load_d16_u8">;
+defm BUFFER_LOAD_FORMAT_X : MUBUF_Real_AllAddr_gfx11_gfx12<0x000>;
+defm BUFFER_LOAD_FORMAT_XY : MUBUF_Real_AllAddr_gfx11_gfx12<0x001>;
+defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11_gfx12<0x002>;
+defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11_gfx12<0x003>;
+defm BUFFER_LOAD_SBYTE : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x011, "buffer_load_i8">;
+defm BUFFER_LOAD_SSHORT : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x013, "buffer_load_i16">;
+defm BUFFER_LOAD_UBYTE : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x010, "buffer_load_u8">;
+defm BUFFER_LOAD_USHORT : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x012, "buffer_load_u16">;
defm BUFFER_LOAD_LDS_B32 : MUBUF_Real_AllAddr_gfx11<0x031, 0>;
defm BUFFER_LOAD_LDS_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x032, 0>;
defm BUFFER_LOAD_LDS_I8 : MUBUF_Real_AllAddr_gfx11<0x02e, 0>;
defm BUFFER_LOAD_LDS_I16 : MUBUF_Real_AllAddr_gfx11<0x030, 0>;
defm BUFFER_LOAD_LDS_U8 : MUBUF_Real_AllAddr_gfx11<0x02d, 0>;
defm BUFFER_LOAD_LDS_U16 : MUBUF_Real_AllAddr_gfx11<0x02f, 0>;
-defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx11_Renamed<0x018, "buffer_store_b8">;
-defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx11_Renamed<0x019, "buffer_store_b16">;
-defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx11_Renamed<0x01A, "buffer_store_b32">;
-defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01B, "buffer_store_b64">;
-defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01C, "buffer_store_b96">;
-defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx11_Renamed<0x01D, "buffer_store_b128">;
-defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x00C, "buffer_store_d16_format_x">;
-defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_Renamed<0x00D, "buffer_store_d16_format_xy">;
-defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_Renamed<0x00E, "buffer_store_d16_format_xyz">;
-defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_Renamed<0x00F, "buffer_store_d16_format_xyzw">;
-defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x024, "buffer_store_d16_hi_b8">;
-defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_Renamed<0x025, "buffer_store_d16_hi_b16">;
-defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_Renamed<0x027, "buffer_store_d16_hi_format_x">;
-defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx11<0x004>;
-defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx11<0x005>;
-defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11<0x006>;
-defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11<0x007>;
-defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_gfx11<0x056>;
-defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_gfx11_Renamed<0x035, "buffer_atomic_add_u32">;
-defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x043, "buffer_atomic_add_u64">;
-defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_gfx11_Renamed<0x03C, "buffer_atomic_and_b32">;
-defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x049, "buffer_atomic_and_b64">;
-defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x034, "buffer_atomic_cmpswap_b32">;
-defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x042, "buffer_atomic_cmpswap_b64">;
+defm BUFFER_STORE_BYTE : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x018, "buffer_store_b8">;
+defm BUFFER_STORE_SHORT : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x019, "buffer_store_b16">;
+defm BUFFER_STORE_DWORD : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01A, "buffer_store_b32">;
+defm BUFFER_STORE_DWORDX2 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01B, "buffer_store_b64">;
+defm BUFFER_STORE_DWORDX3 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01C, "buffer_store_b96">;
+defm BUFFER_STORE_DWORDX4 : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x01D, "buffer_store_b128">;
+defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00C, "buffer_store_d16_format_x">;
+defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00D, "buffer_store_d16_format_xy">;
+defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00E, "buffer_store_d16_format_xyz">;
+defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00F, "buffer_store_d16_format_xyzw">;
+defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x024, "buffer_store_d16_hi_b8">;
+defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x025, "buffer_store_d16_hi_b16">;
+defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x027, "buffer_store_d16_hi_format_x">;
+defm BUFFER_STORE_FORMAT_X : MUBUF_Real_AllAddr_gfx11_gfx12<0x004>;
+defm BUFFER_STORE_FORMAT_XY : MUBUF_Real_AllAddr_gfx11_gfx12<0x005>;
+defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Real_AllAddr_gfx11_gfx12<0x006>;
+defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Real_AllAddr_gfx11_gfx12<0x007>;
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_gfx11_gfx12<0x056>;
+defm BUFFER_ATOMIC_ADD : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x035, "buffer_atomic_add_u32">;
+defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x043, "buffer_atomic_add_u64">;
+defm BUFFER_ATOMIC_AND : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03C, "buffer_atomic_and_b32">;
+defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x049, "buffer_atomic_and_b64">;
+defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x034, "buffer_atomic_cmpswap_b32">;
+defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x042, "buffer_atomic_cmpswap_b64">;
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x050, "buffer_atomic_cmpswap_f32">;
-defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomic_gfx11_Renamed_impl<0x037, 1, "buffer_atomic_csub_u32">;
-def : Pre_gfx11_MUBUF_Name<"buffer_atomic_csub", "buffer_atomic_csub_u32">;
-defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_gfx11_Renamed<0x040, "buffer_atomic_dec_u32">;
-defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x04D, "buffer_atomic_dec_u64">;
-defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_gfx11_Renamed<0x03F, "buffer_atomic_inc_u32">;
-defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x04C, "buffer_atomic_inc_u64">;
-defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_gfx11_Renamed<0x052, "buffer_atomic_max_f32">;
-defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_gfx11_Renamed<0x03A, "buffer_atomic_max_i32">;
-defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x047, "buffer_atomic_max_i64">;
-defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_gfx11_Renamed<0x03B, "buffer_atomic_max_u32">;
-defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x048, "buffer_atomic_max_u64">;
-defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_gfx11_Renamed<0x051, "buffer_atomic_min_f32">;
-defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_gfx11_Renamed<0x038, "buffer_atomic_min_i32">;
-defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x045, "buffer_atomic_min_i64">;
-defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_gfx11_Renamed<0x039, "buffer_atomic_min_u32">;
-defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x046, "buffer_atomic_min_u64">;
-defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_gfx11_Renamed<0x03D, "buffer_atomic_or_b32">;
-defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x04A, "buffer_atomic_or_b64">;
-defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_gfx11_Renamed<0x036, "buffer_atomic_sub_u32">;
-defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x044, "buffer_atomic_sub_u64">;
-defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x033, "buffer_atomic_swap_b32">;
-defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x041, "buffer_atomic_swap_b64">;
-defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_gfx11_Renamed<0x03E, "buffer_atomic_xor_b32">;
-defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_Renamed<0x04B, "buffer_atomic_xor_b64">;
+defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x037, "buffer_atomic_sub_clamp_u32", "buffer_atomic_csub_u32">;
+def : Mnem_gfx11_gfx12<"buffer_atomic_csub", "buffer_atomic_csub_u32">;
+defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x040, "buffer_atomic_dec_u32">;
+defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04D, "buffer_atomic_dec_u64">;
+defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03F, "buffer_atomic_inc_u32">;
+defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04C, "buffer_atomic_inc_u64">;
+defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x052, "buffer_atomic_max_num_f32", "buffer_atomic_max_f32">;
+defm BUFFER_ATOMIC_SMAX : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03A, "buffer_atomic_max_i32">;
+defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x047, "buffer_atomic_max_i64">;
+defm BUFFER_ATOMIC_UMAX : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03B, "buffer_atomic_max_u32">;
+defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x048, "buffer_atomic_max_u64">;
+defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x051, "buffer_atomic_min_num_f32", "buffer_atomic_min_f32">;
+defm BUFFER_ATOMIC_SMIN : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x038, "buffer_atomic_min_i32">;
+defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x045, "buffer_atomic_min_i64">;
+defm BUFFER_ATOMIC_UMIN : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x039, "buffer_atomic_min_u32">;
+defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x046, "buffer_atomic_min_u64">;
+defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03D, "buffer_atomic_or_b32">;
+defm BUFFER_ATOMIC_OR_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04A, "buffer_atomic_or_b64">;
+defm BUFFER_ATOMIC_SUB : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x036, "buffer_atomic_sub_u32">;
+defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x044, "buffer_atomic_sub_u64">;
+defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x033, "buffer_atomic_swap_b32">;
+defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x041, "buffer_atomic_swap_b64">;
+defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03E, "buffer_atomic_xor_b32">;
+defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04B, "buffer_atomic_xor_b64">;
//===----------------------------------------------------------------------===//
// MUBUF - GFX10.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
+let DecoderNamespace = "GFX10" in {
multiclass MUBUF_Real_AllAddr_Helper_gfx10<bits<8> op> {
def _BOTHEN_gfx10 :
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN")>;
@@ -2291,7 +2696,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET")>,
AtomicNoRet<NAME # "_OFFSET_gfx10", 0>;
}
-} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
+} // End DecoderNamespace = "GFX10"
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Real_AllAddr_gfx10<0x01b>;
@@ -2477,7 +2882,7 @@ defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
-defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_RTN_gfx10<0x034>;
+defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_gfx10<0x034>;
defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>;
defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>;
@@ -2524,47 +2929,59 @@ class Base_MTBUF_Real_gfx6_gfx7_gfx10<bits<3> op, MTBUF_Pseudo ps, int ef> :
// MTBUF - GFX11.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in
-multiclass MTBUF_Real_AllAddr_gfx11_Renamed_Impl<bits<4> op, string real_name> {
- def _BOTHEN_gfx11 :
- Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN"), real_name>;
- def _IDXEN_gfx11 :
- Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN"), real_name>;
- def _OFFEN_gfx11 :
- Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN"), real_name>;
- def _OFFSET_gfx11 :
- Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET"), real_name>;
+multiclass MTBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<bits<4> op, string real_name> {
+ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
+ def _BOTHEN_gfx11 :
+ Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN"), real_name>;
+ def _IDXEN_gfx11 :
+ Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_IDXEN"), real_name>;
+ def _OFFEN_gfx11 :
+ Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFEN"), real_name>;
+ def _OFFSET_gfx11 :
+ Base_MTBUF_Real_gfx11<op, !cast<MTBUF_Pseudo>(NAME#"_OFFSET"), real_name>;
+ }
+
+ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in {
+ def _BOTHEN_gfx12 :
+ VBUFFER_MTBUF_Real_gfx12<op, !cast<MTBUF_Pseudo>(NAME#"_VBUFFER_BOTHEN"), real_name>;
+ def _IDXEN_gfx12 :
+ VBUFFER_MTBUF_Real_gfx12<op, !cast<MTBUF_Pseudo>(NAME#"_VBUFFER_IDXEN"), real_name>;
+ def _OFFEN_gfx12 :
+ VBUFFER_MTBUF_Real_gfx12<op, !cast<MTBUF_Pseudo>(NAME#"_VBUFFER_OFFEN"), real_name>;
+ def _OFFSET_gfx12 :
+ VBUFFER_MTBUF_Real_gfx12<op, !cast<MTBUF_Pseudo>(NAME#"_VBUFFER_OFFSET"), real_name>;
+ }
}
-multiclass MTBUF_Real_AllAddr_gfx11_Impl<bits<4> op, MTBUF_Pseudo ps>
- : MTBUF_Real_AllAddr_gfx11_Renamed_Impl<op, ps.Mnemonic>;
-multiclass MTBUF_Real_AllAddr_gfx11<bits<4> op>
- : MTBUF_Real_AllAddr_gfx11_Impl<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
+multiclass MTBUF_Real_AllAddr_gfx11_gfx12_Impl<bits<4> op, MTBUF_Pseudo ps>
+ : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<op, ps.Mnemonic>;
+multiclass MTBUF_Real_AllAddr_gfx11_gfx12<bits<4> op>
+ : MTBUF_Real_AllAddr_gfx11_gfx12_Impl<op, !cast<MTBUF_Pseudo>(NAME#"_BOTHEN")>;
class Pre_gfx11_MTBUF_Name <MTBUF_Pseudo ps, string real_name>
: MnemonicAlias<ps.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
-multiclass MTBUF_Real_AllAddr_gfx11_Renamed<bits<4> op, string real_name>
- : MTBUF_Real_AllAddr_gfx11_Renamed_Impl<op, real_name> {
+multiclass MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<bits<4> op, string real_name>
+ : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed_Impl<op, real_name> {
def : Pre_gfx11_MTBUF_Name<!cast<MTBUF_Pseudo>(NAME#"_BOTHEN"), real_name>;
}
-defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_Renamed<0x008, "tbuffer_load_d16_format_x">;
-defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_Renamed<0x009, "tbuffer_load_d16_format_xy">;
-defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_Renamed<0x00a, "tbuffer_load_d16_format_xyz">;
-defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_Renamed<0x00b, "tbuffer_load_d16_format_xyzw">;
-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx11<0x000>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx11<0x001>;
-defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11<0x002>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11<0x003>;
-defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_Renamed<0x00c, "tbuffer_store_d16_format_x">;
-defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_Renamed<0x00d, "tbuffer_store_d16_format_xy">;
-defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_Renamed<0x00e, "tbuffer_store_d16_format_xyz">;
-defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_Renamed<0x00f, "tbuffer_store_d16_format_xyzw">;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx11<0x004>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx11<0x005>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11<0x006>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11<0x007>;
+defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x008, "tbuffer_load_d16_format_x">;
+defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x009, "tbuffer_load_d16_format_xy">;
+defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00a, "tbuffer_load_d16_format_xyz">;
+defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00b, "tbuffer_load_d16_format_xyzw">;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Real_AllAddr_gfx11_gfx12<0x000>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Real_AllAddr_gfx11_gfx12<0x001>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11_gfx12<0x002>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11_gfx12<0x003>;
+defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00c, "tbuffer_store_d16_format_x">;
+defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00d, "tbuffer_store_d16_format_xy">;
+defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00e, "tbuffer_store_d16_format_xyz">;
+defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Real_AllAddr_gfx11_gfx12_Renamed<0x00f, "tbuffer_store_d16_format_xyzw">;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Real_AllAddr_gfx11_gfx12<0x004>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Real_AllAddr_gfx11_gfx12<0x005>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Real_AllAddr_gfx11_gfx12<0x006>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Real_AllAddr_gfx11_gfx12<0x007>;
//===----------------------------------------------------------------------===//
// MTBUF - GFX10.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
index 85a3f763cd5a..3a895923fa4b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -12,6 +12,7 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let LGKM_CNT = 1;
let DS = 1;
+ let GWS = 0;
let Size = 8;
let UseNamedOperandTable = 1;
@@ -61,6 +62,7 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> :
let UseNamedOperandTable = 1;
// copy relevant pseudo op flags
+ let GWS = ps.GWS;
let SubtargetPredicate = ps.SubtargetPredicate;
let OtherPredicates = ps.OtherPredicates;
let SchedRW = ps.SchedRW;
@@ -376,6 +378,7 @@ multiclass DS_1A_mc <string opName> {
class DS_GWS <string opName, dag ins, string asmOps>
: DS_Pseudo<opName, (outs), ins, asmOps> {
+ let GWS = 1;
let has_vdst = 0;
let has_addr = 0;
@@ -708,18 +711,34 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
//===----------------------------------------------------------------------===//
-// Instruction definitions for GFX11 and newer.
+// Instruction definitions for GFX11.
//===----------------------------------------------------------------------===//
-let SubtargetPredicate = isGFX11Plus in {
+let SubtargetPredicate = isGFX11Only in {
def DS_ADD_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_add_gs_reg_rtn", VReg_64, VGPR_32>;
def DS_SUB_GS_REG_RTN : DS_0A1D_RET_GDS<"ds_sub_gs_reg_rtn", VReg_64, VGPR_32>;
+
+} // let SubtargetPredicate = isGFX11Only
+
+let SubtargetPredicate = isGFX11Plus in {
+
def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
} // let SubtargetPredicate = isGFX11Plus
//===----------------------------------------------------------------------===//
+// Instruction definitions for GFX12 and newer.
+//===----------------------------------------------------------------------===//
+
+let SubtargetPredicate = isGFX12Plus in {
+
+defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
+defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32, "ds_sub_clamp_u32">;
+
+} // let SubtargetPredicate = isGFX12Plus
+
+//===----------------------------------------------------------------------===//
// DS Patterns
//===----------------------------------------------------------------------===//
@@ -803,23 +822,6 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
}
}
-// Irritatingly, atomic_store reverses the order of operands from a
-// normal store.
-class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 0))
->;
-
-multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
- let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicWritePat<inst, vt, !cast<PatFrag>(frag#"_m0")>;
- }
-
- let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicWritePat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt, !cast<PatFrag>(frag)>;
- }
-}
-
defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
@@ -829,12 +831,12 @@ foreach vt = Reg32Types.types in {
defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
}
-defm : DSAtomicWritePat_mc <DS_WRITE_B8, i16, "atomic_store_8_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B8, i32, "atomic_store_8_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B16, i16, "atomic_store_16_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B16, i32, "atomic_store_16_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_32_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_64_local">;
+defm : DSWritePat_mc <DS_WRITE_B8, i16, "atomic_store_8_local">;
+defm : DSWritePat_mc <DS_WRITE_B8, i32, "atomic_store_8_local">;
+defm : DSWritePat_mc <DS_WRITE_B16, i16, "atomic_store_16_local">;
+defm : DSWritePat_mc <DS_WRITE_B16, i32, "atomic_store_16_local">;
+defm : DSWritePat_mc <DS_WRITE_B32, i32, "atomic_store_32_local">;
+defm : DSWritePat_mc <DS_WRITE_B64, i64, "atomic_store_64_local">;
let OtherPredicates = [HasD16LoadStore] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>;
@@ -969,8 +971,10 @@ multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
!cast<PatFrag>(frag#"_local_"#vt.Size)>;
}
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
- /* complexity */ 0, /* gds */ 1>;
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
+ }
}
multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
@@ -989,12 +993,14 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
!cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>;
}
- def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
- /* complexity */ 0, /* gds */ 1>;
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
- /* complexity */ 1, /* gds */ 1>;
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ /* complexity */ 1, /* gds */ 1>;
+ }
}
@@ -1024,10 +1030,12 @@ multiclass DSAtomicCmpXChgSwapped_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueT
/* complexity */ 1>;
}
- def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
- /* complexity */ 0, /* gds */ 1>;
- def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
- /* complexity */ 1, /* gds */ 1>;
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicCmpXChgSwapped<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
+ def : DSAtomicCmpXChgSwapped<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ /* complexity */ 1, /* gds */ 1>;
+ }
}
} // End SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10
@@ -1047,10 +1055,12 @@ multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt,
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
!cast<PatFrag>(frag#"_local_noret_"#vt.Size), /* complexity */ 1>;
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
- /* complexity */ 0, /* gds */ 1>;
- def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
- /* complexity */ 1, /* gds */ 1>;
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size),
+ /* complexity */ 0, /* gds */ 1>;
+ def : DSAtomicCmpXChg<noRetInst, vt, !cast<PatFrag>(frag#"_region_m0_noret_"#vt.Size),
+ /* complexity */ 1, /* gds */ 1>;
+ }
}
} // End SubtargetPredicate = isGFX11Plus
@@ -1175,11 +1185,12 @@ def : GCNPat <
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Base ENC_DS for GFX6, GFX7, GFX10, GFX11.
+// Base ENC_DS for GFX6, GFX7, GFX10, GFX11, GFX12.
//===----------------------------------------------------------------------===//
-class Base_DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op, DS_Pseudo ps, int ef, string opName = ps.Mnemonic> :
- DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> {
+class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
+ string opName = ps.Mnemonic>
+ : DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> {
let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
let Inst{15-8} = !if(ps.has_offset1, offset1, 0);
@@ -1193,74 +1204,117 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op, DS_Pseudo ps, int ef, strin
}
//===----------------------------------------------------------------------===//
+// GFX12.
+//===----------------------------------------------------------------------===//
+
+let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in {
+ multiclass DS_Real_gfx12<bits<8> op> {
+ defvar ps = !cast<DS_Pseudo>(NAME);
+ def _gfx12 :
+ Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX12,
+ ps.Mnemonic>;
+ }
+
+ multiclass DS_Real_Renamed_gfx12<bits<8> op, DS_Pseudo backing_pseudo,
+ string real_name> {
+ def _gfx12 :
+ Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, backing_pseudo,
+ SIEncodingFamily.GFX12,
+ real_name>,
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>,
+ Requires<[isGFX12Plus]>;
+ }
+} // End AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12"
+
+defm DS_MIN_NUM_F32 : DS_Real_Renamed_gfx12<0x012, DS_MIN_F32, "ds_min_num_f32">;
+defm DS_MAX_NUM_F32 : DS_Real_Renamed_gfx12<0x013, DS_MAX_F32, "ds_max_num_f32">;
+defm DS_MIN_NUM_RTN_F32 : DS_Real_Renamed_gfx12<0x032, DS_MIN_RTN_F32, "ds_min_num_rtn_f32">;
+defm DS_MAX_NUM_RTN_F32 : DS_Real_Renamed_gfx12<0x033, DS_MAX_RTN_F32, "ds_max_num_rtn_f32">;
+defm DS_MIN_NUM_F64 : DS_Real_Renamed_gfx12<0x052, DS_MIN_F64, "ds_min_num_f64">;
+defm DS_MAX_NUM_F64 : DS_Real_Renamed_gfx12<0x053, DS_MAX_F64, "ds_max_num_f64">;
+defm DS_MIN_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x072, DS_MIN_RTN_F64, "ds_min_num_rtn_f64">;
+defm DS_MAX_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x073, DS_MAX_RTN_F64, "ds_max_num_rtn_f64">;
+defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>;
+defm DS_SUB_CLAMP_RTN_U32 : DS_Real_gfx12<0x0a9>;
+
+//===----------------------------------------------------------------------===//
// GFX11.
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" in {
+let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
multiclass DS_Real_gfx11<bits<8> op> {
- def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME),
+ def _gfx11 :
+ Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, !cast<DS_Pseudo>(NAME),
SIEncodingFamily.GFX11>;
}
multiclass DS_Real_Renamed_gfx11<bits<8> op, DS_Pseudo backing_pseudo, string real_name> {
- def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, backing_pseudo, SIEncodingFamily.GFX11, real_name>,
- MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
+ def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, backing_pseudo, SIEncodingFamily.GFX11, real_name>,
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Only]>;
}
-} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11"
-
-defm DS_STORE_B32 : DS_Real_Renamed_gfx11<0x00d, DS_WRITE_B32, "ds_store_b32">;
-defm DS_STORE_2ADDR_B32 : DS_Real_Renamed_gfx11<0x00e, DS_WRITE2_B32, "ds_store_2addr_b32">;
-defm DS_STORE_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11<0x00f, DS_WRITE2ST64_B32, "ds_store_2addr_stride64_b32">;
-defm DS_STORE_B8 : DS_Real_Renamed_gfx11<0x01e, DS_WRITE_B8, "ds_store_b8">;
-defm DS_STORE_B16 : DS_Real_Renamed_gfx11<0x01f, DS_WRITE_B16, "ds_store_b16">;
-defm DS_STOREXCHG_RTN_B32 : DS_Real_Renamed_gfx11<0x02d, DS_WRXCHG_RTN_B32, "ds_storexchg_rtn_b32">;
-defm DS_STOREXCHG_2ADDR_RTN_B32 : DS_Real_Renamed_gfx11<0x02e, DS_WRXCHG2_RTN_B32, "ds_storexchg_2addr_rtn_b32">;
-defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32 : DS_Real_Renamed_gfx11<0x02f, DS_WRXCHG2ST64_RTN_B32, "ds_storexchg_2addr_stride64_rtn_b32">;
-defm DS_LOAD_B32 : DS_Real_Renamed_gfx11<0x036, DS_READ_B32, "ds_load_b32">;
-defm DS_LOAD_2ADDR_B32 : DS_Real_Renamed_gfx11<0x037, DS_READ2_B32, "ds_load_2addr_b32">;
-defm DS_LOAD_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11<0x038, DS_READ2ST64_B32, "ds_load_2addr_stride64_b32">;
-defm DS_LOAD_I8 : DS_Real_Renamed_gfx11<0x039, DS_READ_I8, "ds_load_i8">;
-defm DS_LOAD_U8 : DS_Real_Renamed_gfx11<0x03a, DS_READ_U8, "ds_load_u8">;
-defm DS_LOAD_I16 : DS_Real_Renamed_gfx11<0x03b, DS_READ_I16, "ds_load_i16">;
-defm DS_LOAD_U16 : DS_Real_Renamed_gfx11<0x03c, DS_READ_U16, "ds_load_u16">;
-defm DS_STORE_B64 : DS_Real_Renamed_gfx11<0x04d, DS_WRITE_B64, "ds_store_b64">;
-defm DS_STORE_2ADDR_B64 : DS_Real_Renamed_gfx11<0x04e, DS_WRITE2_B64, "ds_store_2addr_b64">;
-defm DS_STORE_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11<0x04f, DS_WRITE2ST64_B64, "ds_store_2addr_stride64_b64">;
-defm DS_STOREXCHG_RTN_B64 : DS_Real_Renamed_gfx11<0x06d, DS_WRXCHG_RTN_B64, "ds_storexchg_rtn_b64">;
-defm DS_STOREXCHG_2ADDR_RTN_B64 : DS_Real_Renamed_gfx11<0x06e, DS_WRXCHG2_RTN_B64, "ds_storexchg_2addr_rtn_b64">;
-defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64 : DS_Real_Renamed_gfx11<0x06f, DS_WRXCHG2ST64_RTN_B64, "ds_storexchg_2addr_stride64_rtn_b64">;
-defm DS_LOAD_B64 : DS_Real_Renamed_gfx11<0x076, DS_READ_B64, "ds_load_b64">;
-defm DS_LOAD_2ADDR_B64 : DS_Real_Renamed_gfx11<0x077, DS_READ2_B64, "ds_load_2addr_b64">;
-defm DS_LOAD_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11<0x078, DS_READ2ST64_B64, "ds_load_2addr_stride64_b64">;
-defm DS_STORE_B8_D16_HI : DS_Real_Renamed_gfx11<0x0a0, DS_WRITE_B8_D16_HI, "ds_store_b8_d16_hi">;
-defm DS_STORE_B16_D16_HI : DS_Real_Renamed_gfx11<0x0a1, DS_WRITE_B16_D16_HI, "ds_store_b16_d16_hi">;
-defm DS_LOAD_U8_D16 : DS_Real_Renamed_gfx11<0x0a2, DS_READ_U8_D16, "ds_load_u8_d16">;
-defm DS_LOAD_U8_D16_HI : DS_Real_Renamed_gfx11<0x0a3, DS_READ_U8_D16_HI, "ds_load_u8_d16_hi">;
-defm DS_LOAD_I8_D16 : DS_Real_Renamed_gfx11<0x0a4, DS_READ_I8_D16, "ds_load_i8_d16">;
-defm DS_LOAD_I8_D16_HI : DS_Real_Renamed_gfx11<0x0a5, DS_READ_I8_D16_HI, "ds_load_i8_d16_hi">;
-defm DS_LOAD_U16_D16 : DS_Real_Renamed_gfx11<0x0a6, DS_READ_U16_D16, "ds_load_u16_d16">;
-defm DS_LOAD_U16_D16_HI : DS_Real_Renamed_gfx11<0x0a7, DS_READ_U16_D16_HI, "ds_load_u16_d16_hi">;
-defm DS_STORE_ADDTID_B32 : DS_Real_Renamed_gfx11<0x0b0, DS_WRITE_ADDTID_B32, "ds_store_addtid_b32">;
-defm DS_LOAD_ADDTID_B32 : DS_Real_Renamed_gfx11<0x0b1, DS_READ_ADDTID_B32, "ds_load_addtid_b32">;
-defm DS_STORE_B96 : DS_Real_Renamed_gfx11<0x0de, DS_WRITE_B96, "ds_store_b96">;
-defm DS_STORE_B128 : DS_Real_Renamed_gfx11<0x0df, DS_WRITE_B128, "ds_store_b128">;
-defm DS_LOAD_B96 : DS_Real_Renamed_gfx11<0x0fe, DS_READ_B96, "ds_load_b96">;
-defm DS_LOAD_B128 : DS_Real_Renamed_gfx11<0x0ff, DS_READ_B128, "ds_load_b128">;
+} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11"
+
+multiclass DS_Real_gfx11_gfx12<bits<8> op>
+ : DS_Real_gfx11<op>, DS_Real_gfx12<op>;
+
+multiclass DS_Real_Renamed_gfx11_gfx12<bits<8> op, DS_Pseudo backing_pseudo,
+ string real_name>
+ : DS_Real_Renamed_gfx11<op, backing_pseudo, real_name>,
+ DS_Real_Renamed_gfx12<op, backing_pseudo, real_name>;
+
+defm DS_STORE_B32 : DS_Real_Renamed_gfx11_gfx12<0x00d, DS_WRITE_B32, "ds_store_b32">;
+defm DS_STORE_2ADDR_B32 : DS_Real_Renamed_gfx11_gfx12<0x00e, DS_WRITE2_B32, "ds_store_2addr_b32">;
+defm DS_STORE_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11_gfx12<0x00f, DS_WRITE2ST64_B32, "ds_store_2addr_stride64_b32">;
+defm DS_STORE_B8 : DS_Real_Renamed_gfx11_gfx12<0x01e, DS_WRITE_B8, "ds_store_b8">;
+defm DS_STORE_B16 : DS_Real_Renamed_gfx11_gfx12<0x01f, DS_WRITE_B16, "ds_store_b16">;
+defm DS_STOREXCHG_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02d, DS_WRXCHG_RTN_B32, "ds_storexchg_rtn_b32">;
+defm DS_STOREXCHG_2ADDR_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02e, DS_WRXCHG2_RTN_B32, "ds_storexchg_2addr_rtn_b32">;
+defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B32 : DS_Real_Renamed_gfx11_gfx12<0x02f, DS_WRXCHG2ST64_RTN_B32, "ds_storexchg_2addr_stride64_rtn_b32">;
+defm DS_LOAD_B32 : DS_Real_Renamed_gfx11_gfx12<0x036, DS_READ_B32, "ds_load_b32">;
+defm DS_LOAD_2ADDR_B32 : DS_Real_Renamed_gfx11_gfx12<0x037, DS_READ2_B32, "ds_load_2addr_b32">;
+defm DS_LOAD_2ADDR_STRIDE64_B32 : DS_Real_Renamed_gfx11_gfx12<0x038, DS_READ2ST64_B32, "ds_load_2addr_stride64_b32">;
+defm DS_LOAD_I8 : DS_Real_Renamed_gfx11_gfx12<0x039, DS_READ_I8, "ds_load_i8">;
+defm DS_LOAD_U8 : DS_Real_Renamed_gfx11_gfx12<0x03a, DS_READ_U8, "ds_load_u8">;
+defm DS_LOAD_I16 : DS_Real_Renamed_gfx11_gfx12<0x03b, DS_READ_I16, "ds_load_i16">;
+defm DS_LOAD_U16 : DS_Real_Renamed_gfx11_gfx12<0x03c, DS_READ_U16, "ds_load_u16">;
+defm DS_STORE_B64 : DS_Real_Renamed_gfx11_gfx12<0x04d, DS_WRITE_B64, "ds_store_b64">;
+defm DS_STORE_2ADDR_B64 : DS_Real_Renamed_gfx11_gfx12<0x04e, DS_WRITE2_B64, "ds_store_2addr_b64">;
+defm DS_STORE_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11_gfx12<0x04f, DS_WRITE2ST64_B64, "ds_store_2addr_stride64_b64">;
+defm DS_STOREXCHG_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06d, DS_WRXCHG_RTN_B64, "ds_storexchg_rtn_b64">;
+defm DS_STOREXCHG_2ADDR_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06e, DS_WRXCHG2_RTN_B64, "ds_storexchg_2addr_rtn_b64">;
+defm DS_STOREXCHG_2ADDR_STRIDE64_RTN_B64 : DS_Real_Renamed_gfx11_gfx12<0x06f, DS_WRXCHG2ST64_RTN_B64, "ds_storexchg_2addr_stride64_rtn_b64">;
+defm DS_LOAD_B64 : DS_Real_Renamed_gfx11_gfx12<0x076, DS_READ_B64, "ds_load_b64">;
+defm DS_LOAD_2ADDR_B64 : DS_Real_Renamed_gfx11_gfx12<0x077, DS_READ2_B64, "ds_load_2addr_b64">;
+defm DS_LOAD_2ADDR_STRIDE64_B64 : DS_Real_Renamed_gfx11_gfx12<0x078, DS_READ2ST64_B64, "ds_load_2addr_stride64_b64">;
+defm DS_STORE_B8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a0, DS_WRITE_B8_D16_HI, "ds_store_b8_d16_hi">;
+defm DS_STORE_B16_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a1, DS_WRITE_B16_D16_HI, "ds_store_b16_d16_hi">;
+defm DS_LOAD_U8_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a2, DS_READ_U8_D16, "ds_load_u8_d16">;
+defm DS_LOAD_U8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a3, DS_READ_U8_D16_HI, "ds_load_u8_d16_hi">;
+defm DS_LOAD_I8_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a4, DS_READ_I8_D16, "ds_load_i8_d16">;
+defm DS_LOAD_I8_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a5, DS_READ_I8_D16_HI, "ds_load_i8_d16_hi">;
+defm DS_LOAD_U16_D16 : DS_Real_Renamed_gfx11_gfx12<0x0a6, DS_READ_U16_D16, "ds_load_u16_d16">;
+defm DS_LOAD_U16_D16_HI : DS_Real_Renamed_gfx11_gfx12<0x0a7, DS_READ_U16_D16_HI, "ds_load_u16_d16_hi">;
+defm DS_STORE_ADDTID_B32 : DS_Real_Renamed_gfx11_gfx12<0x0b0, DS_WRITE_ADDTID_B32, "ds_store_addtid_b32">;
+defm DS_LOAD_ADDTID_B32 : DS_Real_Renamed_gfx11_gfx12<0x0b1, DS_READ_ADDTID_B32, "ds_load_addtid_b32">;
+defm DS_STORE_B96 : DS_Real_Renamed_gfx11_gfx12<0x0de, DS_WRITE_B96, "ds_store_b96">;
+defm DS_STORE_B128 : DS_Real_Renamed_gfx11_gfx12<0x0df, DS_WRITE_B128, "ds_store_b128">;
+defm DS_LOAD_B96 : DS_Real_Renamed_gfx11_gfx12<0x0fe, DS_READ_B96, "ds_load_b96">;
+defm DS_LOAD_B128 : DS_Real_Renamed_gfx11_gfx12<0x0ff, DS_READ_B128, "ds_load_b128">;
// DS_CMPST_* are renamed to DS_CMPSTORE_* in GFX11, and the data operands (src and cmp) are also swapped
// compared to pre-GFX11.
// Note: the mnemonic alias is not generated to avoid a potential ambiguity due to the semantics change.
-defm DS_CMPSTORE_B32 : DS_Real_gfx11<0x010>;
+defm DS_CMPSTORE_B32 : DS_Real_gfx11_gfx12<0x010>;
defm DS_CMPSTORE_F32 : DS_Real_gfx11<0x011>;
-defm DS_CMPSTORE_RTN_B32 : DS_Real_gfx11<0x030>;
+defm DS_CMPSTORE_RTN_B32 : DS_Real_gfx11_gfx12<0x030>;
defm DS_CMPSTORE_RTN_F32 : DS_Real_gfx11<0x031>;
-defm DS_CMPSTORE_B64 : DS_Real_gfx11<0x050>;
+defm DS_CMPSTORE_B64 : DS_Real_gfx11_gfx12<0x050>;
defm DS_CMPSTORE_F64 : DS_Real_gfx11<0x051>;
-defm DS_CMPSTORE_RTN_B64 : DS_Real_gfx11<0x070>;
+defm DS_CMPSTORE_RTN_B64 : DS_Real_gfx11_gfx12<0x070>;
defm DS_CMPSTORE_RTN_F64 : DS_Real_gfx11<0x071>;
-defm DS_ADD_RTN_F32 : DS_Real_gfx11<0x079>;
+defm DS_ADD_RTN_F32 : DS_Real_gfx11_gfx12<0x079>;
defm DS_ADD_GS_REG_RTN : DS_Real_gfx11<0x07a>;
defm DS_SUB_GS_REG_RTN : DS_Real_gfx11<0x07b>;
defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad>;
@@ -1271,8 +1325,8 @@ defm DS_BVH_STACK_RTN_B32 : DS_Real_gfx11<0x0ad>;
let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
multiclass DS_Real_gfx10<bits<8> op> {
- def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME),
- SIEncodingFamily.GFX10>;
+ def _gfx10 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op,
+ !cast<DS_Pseudo>(NAME), SIEncodingFamily.GFX10>;
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
@@ -1289,28 +1343,34 @@ defm DS_WRITE_ADDTID_B32 : DS_Real_gfx10<0x0b0>;
defm DS_READ_ADDTID_B32 : DS_Real_gfx10<0x0b1>;
//===----------------------------------------------------------------------===//
-// GFX10, GFX11.
+// GFX10, GFX11, GFX12.
//===----------------------------------------------------------------------===//
+multiclass DS_Real_gfx10_gfx11_gfx12<bits<8> op> :
+ DS_Real_gfx10<op>, DS_Real_gfx11<op>, DS_Real_gfx12<op>;
+
multiclass DS_Real_gfx10_gfx11<bits<8> op> :
DS_Real_gfx10<op>, DS_Real_gfx11<op>;
-defm DS_ADD_F32 : DS_Real_gfx10_gfx11<0x015>;
+defm DS_ADD_F32 : DS_Real_gfx10_gfx11_gfx12<0x015>;
defm DS_ADD_SRC2_F32 : DS_Real_gfx10<0x095>;
-defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11<0x0b2>;
-defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11<0x0b3>;
+defm DS_PERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b2>;
+defm DS_BPERMUTE_B32 : DS_Real_gfx10_gfx11_gfx12<0x0b3>;
//===----------------------------------------------------------------------===//
-// GFX7, GFX10, GFX11.
+// GFX7, GFX10, GFX11, GFX12.
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
multiclass DS_Real_gfx7<bits<8> op> {
- def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME),
- SIEncodingFamily.SI>;
+ def _gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op,
+ !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>;
}
} // End AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7"
+multiclass DS_Real_gfx7_gfx10_gfx11_gfx12<bits<8> op> :
+ DS_Real_gfx7<op>, DS_Real_gfx10_gfx11_gfx12<op>;
+
multiclass DS_Real_gfx7_gfx10_gfx11<bits<8> op> :
DS_Real_gfx7<op>, DS_Real_gfx10_gfx11<op>;
@@ -1320,7 +1380,7 @@ multiclass DS_Real_gfx7_gfx10<bits<8> op> :
// FIXME-GFX7: Add tests when upstreaming this part.
defm DS_GWS_SEMA_RELEASE_ALL : DS_Real_gfx7_gfx10_gfx11<0x018>;
defm DS_WRAP_RTN_B32 : DS_Real_gfx7_gfx10_gfx11<0x034>;
-defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10_gfx11<0x07e>;
+defm DS_CONDXCHG32_RTN_B64 : DS_Real_gfx7_gfx10_gfx11_gfx12<0x07e>;
defm DS_WRITE_B96 : DS_Real_gfx7_gfx10<0x0de>;
defm DS_WRITE_B128 : DS_Real_gfx7_gfx10<0x0df>;
defm DS_READ_B96 : DS_Real_gfx7_gfx10<0x0fe>;
@@ -1332,30 +1392,33 @@ defm DS_READ_B128 : DS_Real_gfx7_gfx10<0x0ff>;
let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
multiclass DS_Real_gfx6_gfx7<bits<8> op> {
- def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11<op, !cast<DS_Pseudo>(NAME),
- SIEncodingFamily.SI>;
+ def _gfx6_gfx7 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op,
+ !cast<DS_Pseudo>(NAME), SIEncodingFamily.SI>;
}
} // End AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7"
+multiclass DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op> :
+ DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11_gfx12<op>;
+
multiclass DS_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> :
DS_Real_gfx6_gfx7<op>, DS_Real_gfx10_gfx11<op>;
multiclass DS_Real_gfx6_gfx7_gfx10<bits<8> op> :
DS_Real_gfx6_gfx7<op>, DS_Real_gfx10<op>;
-defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x000>;
-defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x001>;
-defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x002>;
-defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x003>;
-defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x004>;
-defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x005>;
-defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x006>;
-defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x007>;
-defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x008>;
-defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x009>;
-defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00a>;
-defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00b>;
-defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x00c>;
+defm DS_ADD_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x000>;
+defm DS_SUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x001>;
+defm DS_RSUB_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x002>;
+defm DS_INC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x003>;
+defm DS_DEC_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x004>;
+defm DS_MIN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x005>;
+defm DS_MAX_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x006>;
+defm DS_MIN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x007>;
+defm DS_MAX_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x008>;
+defm DS_AND_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x009>;
+defm DS_OR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00a>;
+defm DS_XOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00b>;
+defm DS_MSKOR_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00c>;
defm DS_WRITE_B32 : DS_Real_gfx6_gfx7_gfx10<0x00d>;
defm DS_WRITE2_B32 : DS_Real_gfx6_gfx7_gfx10<0x00e>;
@@ -1365,7 +1428,7 @@ defm DS_CMPST_F32 : DS_Real_gfx6_gfx7_gfx10<0x011>;
defm DS_MIN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x012>;
defm DS_MAX_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x013>;
-defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11<0x014>;
+defm DS_NOP : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x014>;
defm DS_GWS_INIT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x019>;
defm DS_GWS_SEMA_V : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01a>;
defm DS_GWS_SEMA_BR : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01b>;
@@ -1375,19 +1438,19 @@ defm DS_GWS_BARRIER : DS_Real_gfx6_gfx7_gfx10_gfx11<0x01d>;
defm DS_WRITE_B8 : DS_Real_gfx6_gfx7_gfx10<0x01e>;
defm DS_WRITE_B16 : DS_Real_gfx6_gfx7_gfx10<0x01f>;
-defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x020>;
-defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x021>;
-defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x022>;
-defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x023>;
-defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x024>;
-defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x025>;
-defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x026>;
-defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x027>;
-defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x028>;
-defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x029>;
-defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02a>;
-defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02b>;
-defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x02c>;
+defm DS_ADD_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x020>;
+defm DS_SUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x021>;
+defm DS_RSUB_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x022>;
+defm DS_INC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x023>;
+defm DS_DEC_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x024>;
+defm DS_MIN_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x025>;
+defm DS_MAX_RTN_I32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x026>;
+defm DS_MIN_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x027>;
+defm DS_MAX_RTN_U32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x028>;
+defm DS_AND_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x029>;
+defm DS_OR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02a>;
+defm DS_XOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02b>;
+defm DS_MSKOR_RTN_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02c>;
defm DS_WRXCHG_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02d>;
defm DS_WRXCHG2_RTN_B32 : DS_Real_gfx6_gfx7_gfx10<0x02e>;
@@ -1397,7 +1460,7 @@ defm DS_CMPST_RTN_F32 : DS_Real_gfx6_gfx7_gfx10<0x031>;
defm DS_MIN_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x032>;
defm DS_MAX_RTN_F32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x033>;
-defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x035>;
+defm DS_SWIZZLE_B32 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x035>;
defm DS_READ_B32 : DS_Real_gfx6_gfx7_gfx10<0x036>;
defm DS_READ2_B32 : DS_Real_gfx6_gfx7_gfx10<0x037>;
@@ -1407,22 +1470,22 @@ defm DS_READ_U8 : DS_Real_gfx6_gfx7_gfx10<0x03a>;
defm DS_READ_I16 : DS_Real_gfx6_gfx7_gfx10<0x03b>;
defm DS_READ_U16 : DS_Real_gfx6_gfx7_gfx10<0x03c>;
-defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03d>;
-defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03e>;
+defm DS_CONSUME : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03d>;
+defm DS_APPEND : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x03e>;
defm DS_ORDERED_COUNT : DS_Real_gfx6_gfx7_gfx10_gfx11<0x03f>;
-defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x040>;
-defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x041>;
-defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x042>;
-defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x043>;
-defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x044>;
-defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x045>;
-defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x046>;
-defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x047>;
-defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x048>;
-defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x049>;
-defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04a>;
-defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04b>;
-defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x04c>;
+defm DS_ADD_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x040>;
+defm DS_SUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x041>;
+defm DS_RSUB_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x042>;
+defm DS_INC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x043>;
+defm DS_DEC_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x044>;
+defm DS_MIN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x045>;
+defm DS_MAX_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x046>;
+defm DS_MIN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x047>;
+defm DS_MAX_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x048>;
+defm DS_AND_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x049>;
+defm DS_OR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x04a>;
+defm DS_XOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x04b>;
+defm DS_MSKOR_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x04c>;
defm DS_WRITE_B64 : DS_Real_gfx6_gfx7_gfx10<0x04d>;
defm DS_WRITE2_B64 : DS_Real_gfx6_gfx7_gfx10<0x04e>;
@@ -1432,19 +1495,19 @@ defm DS_CMPST_F64 : DS_Real_gfx6_gfx7_gfx10<0x051>;
defm DS_MIN_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x052>;
defm DS_MAX_F64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x053>;
-defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x060>;
-defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x061>;
-defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x062>;
-defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x063>;
-defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x064>;
-defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x065>;
-defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x066>;
-defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x067>;
-defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x068>;
-defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x069>;
-defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06a>;
-defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06b>;
-defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11<0x06c>;
+defm DS_ADD_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x060>;
+defm DS_SUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x061>;
+defm DS_RSUB_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x062>;
+defm DS_INC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x063>;
+defm DS_DEC_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x064>;
+defm DS_MIN_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x065>;
+defm DS_MAX_RTN_I64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x066>;
+defm DS_MIN_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x067>;
+defm DS_MAX_RTN_U64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x068>;
+defm DS_AND_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x069>;
+defm DS_OR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x06a>;
+defm DS_XOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x06b>;
+defm DS_MSKOR_RTN_B64 : DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x06c>;
defm DS_WRXCHG_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06d>;
defm DS_WRXCHG2_RTN_B64 : DS_Real_gfx6_gfx7_gfx10<0x06e>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 1b05acd5c90a..ed2e7e4f189e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -91,9 +91,11 @@ static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm, uint64_t Addr,
const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
int64_t Offset;
- if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets.
+ if (DAsm->isGFX12Plus()) { // GFX12 supports 24-bit signed offsets.
+ Offset = SignExtend64<24>(Imm);
+ } else if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets.
Offset = Imm & 0xFFFFF;
- } else { // GFX9+ supports 21-bit signed offsets.
+ } else { // GFX9+ supports 21-bit signed offsets.
Offset = SignExtend64<21>(Imm);
}
return addOperand(Inst, MCOperand::createImm(Offset));
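For reference, a minimal standalone sketch (not part of this patch) of the generation-dependent SMEM offset widths handled above; signExtend64 here is a local stand-in for llvm::SignExtend64, and the function name is invented for the example:

// Illustrative sketch only, mirroring the three offset widths above.
#include <cassert>
#include <cstdint>

template <unsigned Bits> int64_t signExtend64(uint64_t X) {
  return int64_t(X << (64 - Bits)) >> (64 - Bits);
}

int64_t decodeSMEMOffsetSketch(uint64_t Imm, bool IsGFX12Plus, bool IsVI) {
  if (IsGFX12Plus)              // GFX12: 24-bit signed offset.
    return signExtend64<24>(Imm);
  if (IsVI)                     // VI: 20-bit unsigned offset.
    return int64_t(Imm & 0xFFFFF);
  return signExtend64<21>(Imm); // Otherwise (GFX9+): 21-bit signed offset.
}

int main() {
  assert(decodeSMEMOffsetSketch(0xFFFFFF, true, false) == -1);     // GFX12
  assert(decodeSMEMOffsetSketch(0xFFFFF, false, true) == 0xFFFFF); // VI
  assert(decodeSMEMOffsetSketch(0x1FFFFF, false, false) == -1);    // GFX9+
  return 0;
}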
@@ -105,6 +107,13 @@ static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr,
return addOperand(Inst, DAsm->decodeBoolReg(Val));
}
+static DecodeStatus decodeSplitBarrier(MCInst &Inst, unsigned Val,
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(Inst, DAsm->decodeSplitBarrier(Val));
+}
+
#define DECODE_OPERAND(StaticDecoderName, DecoderName) \
static DecodeStatus StaticDecoderName(MCInst &Inst, unsigned Imm, \
uint64_t /*Addr*/, \
@@ -200,10 +209,12 @@ DECODE_OPERAND_REG_8(VReg_512)
DECODE_OPERAND_REG_8(VReg_1024)
DECODE_OPERAND_REG_7(SReg_32, OPW32)
+DECODE_OPERAND_REG_7(SReg_32_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
DECODE_OPERAND_REG_7(SReg_64, OPW64)
DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
+DECODE_OPERAND_REG_7(SReg_96, OPW96)
DECODE_OPERAND_REG_7(SReg_128, OPW128)
DECODE_OPERAND_REG_7(SReg_256, OPW256)
DECODE_OPERAND_REG_7(SReg_512, OPW512)
@@ -238,6 +249,7 @@ DECODE_SRC_OPERAND_REG_AV10(AV_128, OPW128)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_64, OPW64, 64)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(SRegOrLds_32, OPW32, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32_Lo128, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 16)
@@ -259,6 +271,62 @@ DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_1024, OPW1024, 32)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32_Lo128, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW16, 16)
DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW32, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(SReg_32, OPW32, 32)
+
+static DecodeStatus DecodeVGPR_16RegisterClass(MCInst &Inst, unsigned Imm,
+ uint64_t /*Addr*/,
+ const MCDisassembler *Decoder) {
+ assert(isUInt<10>(Imm) && "10-bit encoding expected");
+ assert((Imm & (1 << 8)) == 0 && "Imm{8} should not be used");
+
+ bool IsHi = Imm & (1 << 9);
+ unsigned RegIdx = Imm & 0xff;
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
+}
+
+static DecodeStatus
+DecodeVGPR_16_Lo128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/,
+ const MCDisassembler *Decoder) {
+ assert(isUInt<8>(Imm) && "8-bit encoding expected");
+
+ bool IsHi = Imm & (1 << 7);
+ unsigned RegIdx = Imm & 0x7f;
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
+}
+
+static DecodeStatus decodeOperand_VSrcT16_Lo128(MCInst &Inst, unsigned Imm,
+ uint64_t /*Addr*/,
+ const MCDisassembler *Decoder) {
+ assert(isUInt<9>(Imm) && "9-bit encoding expected");
+
+ const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ bool IsVGPR = Imm & (1 << 8);
+ if (IsVGPR) {
+ bool IsHi = Imm & (1 << 7);
+ unsigned RegIdx = Imm & 0x7f;
+ return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
+ }
+ return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(AMDGPUDisassembler::OPW16,
+ Imm & 0xFF, false, 16));
+}
+
+static DecodeStatus decodeOperand_VSrcT16(MCInst &Inst, unsigned Imm,
+ uint64_t /*Addr*/,
+ const MCDisassembler *Decoder) {
+ assert(isUInt<10>(Imm) && "10-bit encoding expected");
+
+ const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ bool IsVGPR = Imm & (1 << 8);
+ if (IsVGPR) {
+ bool IsHi = Imm & (1 << 9);
+ unsigned RegIdx = Imm & 0xff;
+ return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
+ }
+ return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(AMDGPUDisassembler::OPW16,
+ Imm & 0xFF, false, 16));
+}
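As a rough illustration of the bit layouts these true-16 operand decoders consume (the 10-bit form uses Imm{9} for the half and Imm{7-0} for the register index, the 8-bit Lo128 form uses Imm{7} and Imm{6-0}, and the VSrcT16 forms additionally use Imm{8} to distinguish VGPR from non-VGPR sources), here is a small standalone sketch; the helper names are local to the example, not LLVM API:

// Illustrative sketch only, not part of the patch.
#include <cassert>
#include <cstdio>

struct Vgpr16 { unsigned RegIdx; bool IsHi; };

// 10-bit VGPR_16 form: Imm{9} selects the hi/lo half, Imm{7-0} is the
// register index, and Imm{8} must be clear.
Vgpr16 decodeVgpr16Sketch(unsigned Imm) {
  assert((Imm & (1u << 8)) == 0 && "Imm{8} should not be used");
  return {Imm & 0xff, (Imm & (1u << 9)) != 0};
}

// 8-bit Lo128 form: Imm{7} selects the half, Imm{6-0} indexes v0..v127.
Vgpr16 decodeVgpr16Lo128Sketch(unsigned Imm) {
  return {Imm & 0x7f, (Imm & (1u << 7)) != 0};
}

int main() {
  Vgpr16 A = decodeVgpr16Sketch(0x205);     // 0b10'0000'0101 -> v5, high half
  assert(A.RegIdx == 5 && A.IsHi);
  Vgpr16 B = decodeVgpr16Lo128Sketch(0x85); // 0b1000'0101 -> v5, high half
  assert(B.RegIdx == 5 && B.IsHi);
  std::printf("v%u.%c v%u.%c\n", A.RegIdx, A.IsHi ? 'h' : 'l', B.RegIdx,
              B.IsHi ? 'h' : 'l');
  return 0;
}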
static DecodeStatus decodeOperand_KImmFP(MCInst &Inst, unsigned Imm,
uint64_t Addr,
@@ -321,6 +389,15 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm,
return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256));
}
+static DecodeStatus decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
+ assert(Imm < (1 << 9) && "9-bit encoding");
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
+ return addOperand(
+ Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm, false, 64, true));
+}
+
static DecodeStatus
DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
const MCDisassembler *Decoder) {
@@ -371,18 +448,19 @@ DECODE_SDWA(VopcDst)
template <typename T> static inline T eatBytes(ArrayRef<uint8_t>& Bytes) {
assert(Bytes.size() >= sizeof(T));
- const auto Res = support::endian::read<T, support::endianness::little>(Bytes.data());
+ const auto Res =
+ support::endian::read<T, llvm::endianness::little>(Bytes.data());
Bytes = Bytes.slice(sizeof(T));
return Res;
}
static inline DecoderUInt128 eat12Bytes(ArrayRef<uint8_t> &Bytes) {
assert(Bytes.size() >= 12);
- uint64_t Lo = support::endian::read<uint64_t, support::endianness::little>(
- Bytes.data());
+ uint64_t Lo =
+ support::endian::read<uint64_t, llvm::endianness::little>(Bytes.data());
Bytes = Bytes.slice(8);
- uint64_t Hi = support::endian::read<uint32_t, support::endianness::little>(
- Bytes.data());
+ uint64_t Hi =
+ support::endian::read<uint32_t, llvm::endianness::little>(Bytes.data());
Bytes = Bytes.slice(4);
return DecoderUInt128(Lo, Hi);
}
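A small standalone sketch (not part of this patch) of the 12-byte read pattern above, using a plain pair of integers in place of DecoderUInt128 and assuming a little-endian host (the real code goes through support::endian::read, which handles endianness explicitly):

// Illustrative sketch only: low 8 bytes first, then 4 high bytes.
#include <cassert>
#include <cstdint>
#include <cstring>
#include <utility>

std::pair<uint64_t, uint64_t> eat12BytesSketch(const uint8_t *Bytes) {
  uint64_t Lo = 0;
  uint32_t Hi = 0;
  std::memcpy(&Lo, Bytes, 8);     // assumes a little-endian host
  std::memcpy(&Hi, Bytes + 8, 4);
  return {Lo, Hi};
}

int main() {
  const uint8_t Enc[12] = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06,
                           0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c};
  auto [Lo, Hi] = eat12BytesSketch(Enc);
  assert(Lo == 0x0807060504030201ull);
  assert(Hi == 0x0c0b0a09ull);
  return 0;
}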
@@ -418,25 +496,48 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// encodings
if (isGFX11Plus() && Bytes.size() >= 12 ) {
DecoderUInt128 DecW = eat12Bytes(Bytes);
- Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW, Address, CS);
+ Res =
+ tryDecodeInst(DecoderTableDPP8GFX1196, DecoderTableDPP8GFX11_FAKE1696,
+ MI, DecW, Address, CS);
if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
break;
MI = MCInst(); // clear
- Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, Address, CS);
- if (Res) {
- if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
+ Res =
+ tryDecodeInst(DecoderTableDPP8GFX1296, DecoderTableDPP8GFX12_FAKE1696,
+ MI, DecW, Address, CS);
+ if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+ break;
+ MI = MCInst(); // clear
+
+ const auto convertVOPDPP = [&]() {
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P) {
convertVOP3PDPPInst(MI);
- else if (AMDGPU::isVOPC64DPP(MI.getOpcode()))
+ } else if (AMDGPU::isVOPC64DPP(MI.getOpcode())) {
convertVOPCDPPInst(MI); // Special VOP3 case
- else {
+ } else {
assert(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3);
convertVOP3DPPInst(MI); // Regular VOP3 case
}
+ };
+ Res = tryDecodeInst(DecoderTableDPPGFX1196, DecoderTableDPPGFX11_FAKE1696,
+ MI, DecW, Address, CS);
+ if (Res) {
+ convertVOPDPP();
+ break;
+ }
+ Res = tryDecodeInst(DecoderTableDPPGFX1296, DecoderTableDPPGFX12_FAKE1696,
+ MI, DecW, Address, CS);
+ if (Res) {
+ convertVOPDPP();
break;
}
Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS);
if (Res)
break;
+
+ Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS);
+ if (Res)
+ break;
}
// Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
@@ -461,7 +562,14 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
break;
MI = MCInst(); // clear
- Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address, CS);
+ Res = tryDecodeInst(DecoderTableDPP8GFX1164,
+ DecoderTableDPP8GFX11_FAKE1664, MI, QW, Address, CS);
+ if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
+ break;
+ MI = MCInst(); // clear
+
+ Res = tryDecodeInst(DecoderTableDPP8GFX1264,
+ DecoderTableDPP8GFX12_FAKE1664, MI, QW, Address, CS);
if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
break;
MI = MCInst(); // clear
@@ -469,7 +577,16 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address, CS);
+ Res = tryDecodeInst(DecoderTableDPPGFX1164, DecoderTableDPPGFX11_FAKE1664,
+ MI, QW, Address, CS);
+ if (Res) {
+ if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
+ convertVOPCDPPInst(MI);
+ break;
+ }
+
+ Res = tryDecodeInst(DecoderTableDPPGFX1264, DecoderTableDPPGFX12_FAKE1664,
+ MI, QW, Address, CS);
if (Res) {
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
convertVOPCDPPInst(MI);
@@ -530,9 +647,15 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address, CS);
+ Res = tryDecodeInst(DecoderTableGFX1132, DecoderTableGFX11_FAKE1632, MI, DW,
+ Address, CS);
if (Res) break;
+ Res = tryDecodeInst(DecoderTableGFX1232, DecoderTableGFX12_FAKE1632, MI, DW,
+ Address, CS);
+ if (Res)
+ break;
+
if (Bytes.size() < 4) break;
const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
@@ -560,7 +683,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address, CS);
+ Res = tryDecodeInst(DecoderTableGFX1264, DecoderTableGFX12_FAKE1664, MI, QW,
+ Address, CS);
+ if (Res)
+ break;
+
+ Res = tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
+ Address, CS);
if (Res)
break;
@@ -640,6 +769,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = convertMIMGInst(MI);
}
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags &
+ (SIInstrFlags::VIMAGE | SIInstrFlags::VSAMPLE)))
+ Res = convertMIMGInst(MI);
+
if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::EXP))
Res = convertEXPInst(MI);
@@ -679,7 +812,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
- if (STI.hasFeature(AMDGPU::FeatureGFX11)) {
+ if (STI.hasFeature(AMDGPU::FeatureGFX11Insts)) {
// The MCInst still has these fields even though they are no longer encoded
// in the GFX11 instruction.
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
@@ -690,9 +823,13 @@ DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
if (MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx11 ||
+ MI.getOpcode() == AMDGPU::V_INTERP_P10_F16_F32_inreg_gfx12 ||
MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx11 ||
+ MI.getOpcode() == AMDGPU::V_INTERP_P10_RTZ_F16_F32_inreg_gfx12 ||
MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx11 ||
- MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11) {
+ MI.getOpcode() == AMDGPU::V_INTERP_P2_F16_F32_inreg_gfx12 ||
+ MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx11 ||
+ MI.getOpcode() == AMDGPU::V_INTERP_P2_RTZ_F16_F32_inreg_gfx12) {
// The MCInst has this field that is not directly encoded in the
// instruction.
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::op_sel);
@@ -840,6 +977,7 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
// VADDR size. Consequently, decoded instructions always show the address as if
// it has 1 dword, which may not really be the case.

DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
+ auto TSFlags = MCII->get(MI.getOpcode()).TSFlags;
int VDstIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vdst);
@@ -848,8 +986,9 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
AMDGPU::OpName::vdata);
int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
- int RsrcIdx =
- AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
+ int RsrcOpName = TSFlags & SIInstrFlags::MIMG ? AMDGPU::OpName::srsrc
+ : AMDGPU::OpName::rsrc;
+ int RsrcIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), RsrcOpName);
int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::dmask);
@@ -870,7 +1009,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
}
bool IsAtomic = (VDstIdx != -1);
- bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;
+ bool IsGather4 = TSFlags & SIInstrFlags::Gather4;
+ bool IsVSample = TSFlags & SIInstrFlags::VSAMPLE;
bool IsNSA = false;
bool IsPartialNSA = false;
unsigned AddrSize = Info->VAddrDwords;
@@ -887,10 +1027,13 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
AddrSize =
AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, AMDGPU::hasG16(STI));
+ // VSAMPLE insts that do not use vaddr3 behave the same as NSA forms.
+ // VIMAGE insts other than BVH never use vaddr4.
IsNSA = Info->MIMGEncoding == AMDGPU::MIMGEncGfx10NSA ||
- Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA;
+ Info->MIMGEncoding == AMDGPU::MIMGEncGfx11NSA ||
+ Info->MIMGEncoding == AMDGPU::MIMGEncGfx12;
if (!IsNSA) {
- if (AddrSize > 12)
+ if (!IsVSample && AddrSize > 12)
AddrSize = 16;
} else {
if (AddrSize > Info->VAddrDwords) {
@@ -1098,6 +1241,8 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
case AMDGPU::TTMP_64RegClassID:
shift = 1;
break;
+ case AMDGPU::SGPR_96RegClassID:
+ case AMDGPU::TTMP_96RegClassID:
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::TTMP_128RegClassID:
// ToDo: unclear if s[100:104] is available on VI. Can we use VCC as SGPR in
@@ -1132,6 +1277,13 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
return createRegOperand(SRegClassID, Val >> shift);
}
+MCOperand AMDGPUDisassembler::createVGPR16Operand(unsigned RegIdx,
+ bool IsHi) const {
+ unsigned RCID =
+ IsHi ? AMDGPU::VGPR_HI16RegClassID : AMDGPU::VGPR_LO16RegClassID;
+ return createRegOperand(RCID, RegIdx);
+}
+
// Decode Literals for insts which always have a literal in the encoding
MCOperand
AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
@@ -1147,7 +1299,7 @@ AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
return MCOperand::createImm(Literal);
}
-MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
+MCOperand AMDGPUDisassembler::decodeLiteralConstant(bool ExtendFP64) const {
// For now all literal constants are supposed to be unsigned integer
// ToDo: deal with signed/unsigned 64-bit integer constants
// ToDo: deal with float/double constants
@@ -1157,9 +1309,11 @@ MCOperand AMDGPUDisassembler::decodeLiteralConstant() const {
Twine(Bytes.size()));
}
HasLiteral = true;
- Literal = eatBytes<uint32_t>(Bytes);
+ Literal = Literal64 = eatBytes<uint32_t>(Bytes);
+ if (ExtendFP64)
+ Literal64 <<= 32;
}
- return MCOperand::createImm(Literal);
+ return MCOperand::createImm(ExtendFP64 ? Literal64 : Literal);
}
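A minimal sketch (not part of this patch) of why ExtendFP64 shifts the 32-bit literal into the upper half: for 64-bit floating-point operands the encoded dword holds the high half of the double's bit pattern, and the low 32 bits are implicitly zero. The function name below is invented for the example:

// Illustrative sketch only.
#include <cassert>
#include <cstdint>
#include <cstring>

double literalToF64Sketch(uint32_t EncodedLiteral) {
  uint64_t Bits = uint64_t(EncodedLiteral) << 32; // low 32 bits stay zero
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D;
}

int main() {
  // 0x3FF0000000000000 is the IEEE-754 pattern for 1.0, so the encoded
  // 32-bit literal 0x3FF00000 reconstructs 1.0; likewise 0xC0080000 is -3.0.
  assert(literalToF64Sketch(0x3FF00000u) == 1.0);
  assert(literalToF64Sketch(0xC0080000u) == -3.0);
  return 0;
}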
MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
@@ -1376,7 +1530,7 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
bool MandatoryLiteral,
- unsigned ImmWidth) const {
+ unsigned ImmWidth, bool IsFP) const {
using namespace AMDGPU::EncValues;
assert(Val < 1024); // enum10
@@ -1388,6 +1542,20 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
return createRegOperand(IsAGPR ? getAgprClassId(Width)
: getVgprClassId(Width), Val - VGPR_MIN);
}
+ return decodeNonVGPRSrcOp(Width, Val & 0xFF, MandatoryLiteral, ImmWidth,
+ IsFP);
+}
+
+MCOperand AMDGPUDisassembler::decodeNonVGPRSrcOp(const OpWidthTy Width,
+ unsigned Val,
+ bool MandatoryLiteral,
+ unsigned ImmWidth,
+ bool IsFP) const {
+ // Cases when Val{8} is 1 (vgpr, agpr or true 16 vgpr) should have been
+ // decoded earlier.
+ assert(Val < (1 << 8) && "9-bit Src encoding when Val{8} is 0");
+ using namespace AMDGPU::EncValues;
+
if (Val <= SGPR_MAX) {
// "SGPR_MIN <= Val" is always true and causes compilation warning.
static_assert(SGPR_MIN == 0);
@@ -1410,7 +1578,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
// Keep a sentinel value for deferred setting
return MCOperand::createImm(LITERAL_CONST);
else
- return decodeLiteralConstant();
+ return decodeLiteralConstant(IsFP && ImmWidth == 64);
}
switch (Width) {
@@ -1590,6 +1758,10 @@ MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
: decodeSrcOp(OPW32, Val);
}
+MCOperand AMDGPUDisassembler::decodeSplitBarrier(unsigned Val) const {
+ return decodeSrcOp(OPW32, Val);
+}
+
bool AMDGPUDisassembler::isVI() const {
return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
}
@@ -1616,11 +1788,18 @@ bool AMDGPUDisassembler::isGFX11Plus() const {
return AMDGPU::isGFX11Plus(STI);
}
+bool AMDGPUDisassembler::isGFX12Plus() const {
+ return AMDGPU::isGFX12Plus(STI);
+}
bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch);
}
+bool AMDGPUDisassembler::hasKernargPreload() const {
+ return AMDGPU::hasKernargPreload(STI);
+}
+
//===----------------------------------------------------------------------===//
// AMDGPU specific symbol handling
//===----------------------------------------------------------------------===//
@@ -1704,12 +1883,16 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
if (FourByteBuffer & COMPUTE_PGM_RSRC1_PRIV)
return MCDisassembler::Fail;
- PRINT_DIRECTIVE(".amdhsa_dx10_clamp", COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
+ if (!isGFX12Plus())
+ PRINT_DIRECTIVE(".amdhsa_dx10_clamp",
+ COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP);
if (FourByteBuffer & COMPUTE_PGM_RSRC1_DEBUG_MODE)
return MCDisassembler::Fail;
- PRINT_DIRECTIVE(".amdhsa_ieee_mode", COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
+ if (!isGFX12Plus())
+ PRINT_DIRECTIVE(".amdhsa_ieee_mode",
+ COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE);
if (FourByteBuffer & COMPUTE_PGM_RSRC1_BULKY)
return MCDisassembler::Fail;
@@ -1717,17 +1900,29 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
if (FourByteBuffer & COMPUTE_PGM_RSRC1_CDBG_USER)
return MCDisassembler::Fail;
- PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_FP16_OVFL);
+ if (isGFX9Plus())
+ PRINT_DIRECTIVE(".amdhsa_fp16_overflow", COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL);
- if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED0)
+ if (!isGFX9Plus())
+ if (FourByteBuffer & COMPUTE_PGM_RSRC1_GFX6_GFX8_RESERVED0)
+ return MCDisassembler::Fail;
+ if (FourByteBuffer & COMPUTE_PGM_RSRC1_RESERVED1)
return MCDisassembler::Fail;
+ if (!isGFX10Plus())
+ if (FourByteBuffer & COMPUTE_PGM_RSRC1_GFX6_GFX9_RESERVED2)
+ return MCDisassembler::Fail;
if (isGFX10Plus()) {
PRINT_DIRECTIVE(".amdhsa_workgroup_processor_mode",
- COMPUTE_PGM_RSRC1_WGP_MODE);
- PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_MEM_ORDERED);
- PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_FWD_PROGRESS);
+ COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE);
+ PRINT_DIRECTIVE(".amdhsa_memory_ordered", COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED);
+ PRINT_DIRECTIVE(".amdhsa_forward_progress", COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS);
}
+
+ if (isGFX12Plus())
+ PRINT_DIRECTIVE(".amdhsa_round_robin_scheduling",
+ COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN);
+
return MCDisassembler::Success;
}
@@ -1807,16 +2002,29 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
PRINT_PSEUDO_DIRECTIVE_COMMENT(
"SHARED_VGPR_COUNT", COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
}
- PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE",
- COMPUTE_PGM_RSRC3_GFX10_PLUS_INST_PREF_SIZE);
- PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START",
- COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_START);
- PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END",
- COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_END);
- if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED0)
+
+ if (isGFX11Plus()) {
+ PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE",
+ COMPUTE_PGM_RSRC3_GFX11_PLUS_INST_PREF_SIZE);
+ PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START",
+ COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START);
+ PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END",
+ COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_END);
+ } else {
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED0)
+ return MCDisassembler::Fail;
+ }
+
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED1)
return MCDisassembler::Fail;
- PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP",
- COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_START);
+
+ if (isGFX11Plus()) {
+ PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP",
+ COMPUTE_PGM_RSRC3_GFX11_PLUS_TRAP_ON_START);
+ } else {
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_RESERVED2)
+ return MCDisassembler::Fail;
+ }
} else if (FourByteBuffer) {
return MCDisassembler::Fail;
}
@@ -1945,10 +2153,24 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
return MCDisassembler::Success;
- case amdhsa::RESERVED2_OFFSET:
- // 6 bytes from here are reserved, must be 0.
- ReservedBytes = DE.getBytes(Cursor, 6);
- for (int I = 0; I < 6; ++I) {
+ case amdhsa::KERNARG_PRELOAD_OFFSET:
+ using namespace amdhsa;
+ TwoByteBuffer = DE.getU16(Cursor);
+ if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_LENGTH) {
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_length",
+ KERNARG_PRELOAD_SPEC_LENGTH);
+ }
+
+ if (TwoByteBuffer & KERNARG_PRELOAD_SPEC_OFFSET) {
+ PRINT_DIRECTIVE(".amdhsa_user_sgpr_kernarg_preload_offset",
+ KERNARG_PRELOAD_SPEC_OFFSET);
+ }
+ return MCDisassembler::Success;
+
+ case amdhsa::RESERVED3_OFFSET:
+ // 4 bytes from here are reserved, must be 0.
+ ReservedBytes = DE.getBytes(Cursor, 4);
+ for (int I = 0; I < 4; ++I) {
if (ReservedBytes[I] != 0)
return MCDisassembler::Fail;
}
@@ -1975,7 +2197,7 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor(
if (isGFX10Plus()) {
uint16_t KernelCodeProperties =
support::endian::read16(&Bytes[amdhsa::KERNEL_CODE_PROPERTIES_OFFSET],
- support::endianness::little);
+ llvm::endianness::little);
EnableWavefrontSize32 =
AMDHSA_BITS_GET(KernelCodeProperties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
@@ -2018,7 +2240,7 @@ AMDGPUDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
// Code Object V3 kernel descriptors.
StringRef Name = Symbol.Name;
- if (Symbol.Type == ELF::STT_OBJECT && Name.endswith(StringRef(".kd"))) {
+ if (Symbol.Type == ELF::STT_OBJECT && Name.ends_with(StringRef(".kd"))) {
Size = 64; // Size = 64 regardless of success or failure.
return decodeKernelDescriptor(Name.drop_back(3), Bytes, Address);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 444312473a5f..233581949d71 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -97,6 +97,7 @@ private:
const unsigned TargetMaxInstBytes;
mutable ArrayRef<uint8_t> Bytes;
mutable uint32_t Literal;
+ mutable uint64_t Literal64;
mutable bool HasLiteral;
mutable std::optional<bool> EnableWavefrontSize32;
@@ -114,6 +115,7 @@ public:
MCOperand createRegOperand(unsigned int RegId) const;
MCOperand createRegOperand(unsigned RegClassID, unsigned Val) const;
MCOperand createSRegOperand(unsigned SRegClassID, unsigned Val) const;
+ MCOperand createVGPR16Operand(unsigned RegIdx, bool IsHi) const;
MCOperand errOperand(unsigned V, const Twine& ErrMsg) const;
@@ -144,6 +146,17 @@ public:
return MCDisassembler::Fail;
}
+ template <typename InsnType>
+ DecodeStatus tryDecodeInst(const uint8_t *Table1, const uint8_t *Table2,
+ MCInst &MI, InsnType Inst, uint64_t Address,
+ raw_ostream &Comments) const {
+ for (const uint8_t *T : {Table1, Table2}) {
+ if (DecodeStatus Res = tryDecodeInst(T, MI, Inst, Address, Comments))
+ return Res;
+ }
+ return MCDisassembler::Fail;
+ }
+
std::optional<DecodeStatus>
onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size, ArrayRef<uint8_t> Bytes,
uint64_t Address, raw_ostream &CStream) const override;
@@ -217,11 +230,15 @@ public:
static MCOperand decodeFPImmed(unsigned ImmWidth, unsigned Imm);
MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const;
- MCOperand decodeLiteralConstant() const;
+ MCOperand decodeLiteralConstant(bool ExtendFP64) const;
MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val,
- bool MandatoryLiteral = false,
- unsigned ImmWidth = 0) const;
+ bool MandatoryLiteral = false, unsigned ImmWidth = 0,
+ bool IsFP = false) const;
+
+ MCOperand decodeNonVGPRSrcOp(const OpWidthTy Width, unsigned Val,
+ bool MandatoryLiteral = false,
+ unsigned ImmWidth = 0, bool IsFP = false) const;
MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
MCOperand decodeSpecialReg32(unsigned Val) const;
@@ -234,6 +251,7 @@ public:
MCOperand decodeSDWAVopcDst(unsigned Val) const;
MCOperand decodeBoolReg(unsigned Val) const;
+ MCOperand decodeSplitBarrier(unsigned Val) const;
int getTTmpIdx(unsigned Val) const;
@@ -247,8 +265,10 @@ public:
bool isGFX10Plus() const;
bool isGFX11() const;
bool isGFX11Plus() const;
+ bool isGFX12Plus() const;
bool hasArchitectedFlatScratch() const;
+ bool hasKernargPreload() const;
bool isMacDPP(MCInst &MI) const;
};
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td
index 14ba01f0d67c..ff1d661ef6fe 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/EXPInstructions.td
@@ -41,8 +41,8 @@ class EXP_Real_ComprVM<bit done, string pseudo, int subtarget>
}
// Real instruction with optional asm operand "row_en".
-class EXP_Real_Row<bit row, bit done, string pseudo, int subtarget>
- : EXPCommon<row, done, "exp$tgt $src0, $src1, $src2, $src3"
+class EXP_Real_Row<bit row, bit done, string pseudo, int subtarget, string name = "exp">
+ : EXPCommon<row, done, name#"$tgt $src0, $src1, $src2, $src3"
#!if(done, " done", "")#!if(row, " row_en", "")>,
SIMCInstr<pseudo, subtarget> {
let AsmMatchConverter = "cvtExp";
@@ -105,12 +105,12 @@ def EXP_gfx10 : EXP_Real_gfx10<0, "EXP">;
def EXP_DONE_gfx10 : EXP_Real_gfx10<1, "EXP_DONE">;
//===----------------------------------------------------------------------===//
-// GFX11+
+// GFX11
//===----------------------------------------------------------------------===//
class EXP_Real_gfx11<bit _row, bit _done, string pseudo>
: EXP_Real_Row<_row, _done, pseudo, SIEncodingFamily.GFX11>, EXPe_Row {
- let AssemblerPredicate = isGFX11Plus;
+ let AssemblerPredicate = isGFX11Only;
let DecoderNamespace = "GFX11";
let row = _row;
let done = _done;
@@ -122,6 +122,24 @@ def EXP_ROW_gfx11 : EXP_Real_gfx11<1, 0, "EXP_ROW">;
def EXP_ROW_DONE_gfx11 : EXP_Real_gfx11<1, 1, "EXP_ROW_DONE">;
//===----------------------------------------------------------------------===//
+// GFX12+
+//===----------------------------------------------------------------------===//
+
+class VEXPORT_Real_gfx12<bit _row, bit _done, string pseudo>
+ : EXP_Real_Row<_row, _done, pseudo, SIEncodingFamily.GFX12, "export">,
+ EXPe_Row, MnemonicAlias<"exp", "export">, Requires<[isGFX12Plus]> {
+ let AssemblerPredicate = isGFX12Plus;
+ let DecoderNamespace = "GFX12";
+ let row = _row;
+ let done = _done;
+}
+
+def EXPORT_gfx12 : VEXPORT_Real_gfx12<0, 0, "EXP">;
+def EXPORT_DONE_gfx12 : VEXPORT_Real_gfx12<0, 1, "EXP_DONE">;
+def EXPORT_ROW_gfx12 : VEXPORT_Real_gfx12<1, 0, "EXP_ROW">;
+def EXPORT_ROW_DONE_gfx12 : VEXPORT_Real_gfx12<1, 1, "EXP_ROW_DONE">;
+
+//===----------------------------------------------------------------------===//
// EXP Patterns
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 5c86d80e7dd2..0dd2b3f5c2c9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -144,6 +144,47 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, ?);
}
+class VFLAT_Real <bits<8> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
+ InstSI <ps.OutOperandList, ps.InOperandList, opName # ps.AsmOperands, []>,
+ Enc96 {
+
+ let FLAT = 1;
+
+ // copy relevant pseudo op flags
+ let SubtargetPredicate = ps.SubtargetPredicate;
+ let AsmMatchConverter = ps.AsmMatchConverter;
+ let OtherPredicates = ps.OtherPredicates;
+ let TSFlags = ps.TSFlags;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let SchedRW = ps.SchedRW;
+ let mayLoad = ps.mayLoad;
+ let mayStore = ps.mayStore;
+ let IsAtomicRet = ps.IsAtomicRet;
+ let IsAtomicNoRet = ps.IsAtomicNoRet;
+ let VM_CNT = ps.VM_CNT;
+ let LGKM_CNT = ps.LGKM_CNT;
+ let VALU = ps.VALU;
+
+ bits<7> saddr;
+ bits<8> vdst;
+ bits<6> cpol;
+ bits<8> vdata; // vsrc
+ bits<8> vaddr;
+ bits<24> offset;
+
+ let Inst{6-0} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7f), 0);
+ let Inst{21-14} = op;
+ let Inst{31-26} = 0x3b;
+ let Inst{39-32} = !if(ps.has_vdst, vdst, ?);
+ let Inst{49} = ps.sve;
+ let Inst{54-53} = cpol{2-1}; // th{2-1}
+ let Inst{52} = !if(ps.IsAtomicRet, 1, cpol{0}); // th{0}
+ let Inst{51-50} = cpol{4-3}; // scope
+ let Inst{62-55} = !if(ps.has_data, vdata{7-0}, ?);
+ let Inst{71-64} = !if(ps.has_vaddr, vaddr, ?);
+ let Inst{95-72} = offset;
+}
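For illustration only, a standalone sketch that packs fields at the bit positions the VFLAT_Real class above declares (unsigned __int128 is a GCC/Clang extension used to keep the sketch short, the field names are local to the example, and the IsAtomicRet override of th{0} is omitted):

// Illustrative sketch only, not part of the patch.
#include <cassert>
#include <cstdint>

typedef unsigned __int128 u128;

u128 encodeVFlatSketch(uint8_t Op, uint8_t Saddr, uint8_t Vdst, uint8_t Vdata,
                       uint8_t Vaddr, uint8_t Cpol, bool Sve,
                       uint32_t Offset24) {
  u128 Inst = 0;
  Inst |= u128(Saddr & 0x7f);              // Inst{6-0}   saddr (0x7f if unused)
  Inst |= u128(Op) << 14;                  // Inst{21-14} opcode
  Inst |= u128(0x3b) << 26;                // Inst{31-26} VFLAT encoding marker
  Inst |= u128(Vdst) << 32;                // Inst{39-32} vdst
  Inst |= u128(Sve ? 1 : 0) << 49;         // Inst{49}    sve
  Inst |= u128((Cpol >> 3) & 0x3) << 50;   // Inst{51-50} scope = cpol{4-3}
  Inst |= u128(Cpol & 0x1) << 52;          // Inst{52}    th{0} = cpol{0}
  Inst |= u128((Cpol >> 1) & 0x3) << 53;   // Inst{54-53} th{2-1} = cpol{2-1}
  Inst |= u128(Vdata) << 55;               // Inst{62-55} vdata
  Inst |= u128(Vaddr) << 64;               // Inst{71-64} vaddr
  Inst |= u128(Offset24 & 0xffffff) << 72; // Inst{95-72} 24-bit offset
  return Inst;
}

int main() {
  u128 I = encodeVFlatSketch(/*Op=*/0x10, /*Saddr=*/0x7f, /*Vdst=*/1,
                             /*Vdata=*/2, /*Vaddr=*/3, /*Cpol=*/0,
                             /*Sve=*/false, /*Offset24=*/16);
  assert(((uint32_t)(I >> 26) & 0x3f) == 0x3b); // encoding marker in place
  assert((uint32_t)(I >> 72) == 16);            // offset lands in the top dword
  return 0;
}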
+
class GlobalSaddrTable <bit is_saddr, string Name = ""> {
bit IsSaddr = is_saddr;
string SaddrOp = Name;
@@ -758,6 +799,10 @@ let SubtargetPredicate = HasFlatAtomicFaddF32Inst in {
defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>;
} // End SubtargetPredicate = HasFlatAtomicFaddF32Inst
+let SubtargetPredicate = isGFX12Plus in {
+ defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPR_32, i32>;
+} // End SubtargetPredicate = isGFX12Plus
+
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
@@ -870,9 +915,10 @@ defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
VReg_64, i64>;
-let SubtargetPredicate = HasGFX10_BEncoding in
-defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub",
- VGPR_32, i32>;
+let SubtargetPredicate = HasGFX10_BEncoding in {
+ defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo <"global_atomic_csub",
+ VGPR_32, i32>;
+}
defm GLOBAL_LOAD_LDS_UBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ubyte">;
defm GLOBAL_LOAD_LDS_SBYTE : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_sbyte">;
@@ -996,12 +1042,6 @@ class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
(inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
-class GlobalAtomicStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
- ValueType vt> : GCNPat <
- (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$data),
- (inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
->;
-
class GlobalAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), data_vt:$data)),
@@ -1024,13 +1064,6 @@ class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt
(inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
-class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- // atomic store follows atomic binop convention so the address comes
- // first.
- (node (FlatOffset i64:$vaddr, i32:$offset), vt:$data),
- (inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
->;
-
class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
// atomic store follows atomic binop convention so the address comes
@@ -1039,19 +1072,43 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
(inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
-multiclass FlatAtomicPat <string inst, string node, ValueType vt,
- ValueType data_vt = vt> {
- defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size);
- defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
-
- def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> {
+ defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_"#vt.Size));
let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
+multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> {
+ defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_"#vt.Size));
+
+ def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
+ (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+}
+
+multiclass FlatAtomicPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> :
+ FlatAtomicRtnPat<inst, node, vt, data_vt, isIntr>,
+ FlatAtomicNoRtnPat<inst, node, vt, data_vt, isIntr>;
+
+multiclass FlatAtomicIntrNoRtnPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+ defm : FlatAtomicNoRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
+multiclass FlatAtomicIntrRtnPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+ defm : FlatAtomicRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+}
+
+multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> :
+ FlatAtomicRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>,
+ FlatAtomicNoRtnPat<inst, node, vt, data_vt, /* isIntr */ 1>;
+
class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
@@ -1174,12 +1231,12 @@ def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, vt>;
def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, vt>;
}
-def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
-def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
-def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
-def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
+def : FlatStorePat <FLAT_STORE_DWORD, atomic_store_32_flat, i32>;
+def : FlatStorePat <FLAT_STORE_DWORDX2, atomic_store_64_flat, i64>;
+def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i32>;
+def : FlatStorePat <FLAT_STORE_BYTE, atomic_store_8_flat, i16>;
+def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i32>;
+def : FlatStorePat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
foreach as = [ "flat", "global" ] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
@@ -1269,24 +1326,13 @@ multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
}
}
-// Deal with swapped operands for atomic_store vs. regular store
-multiclass GlobalFLATAtomicStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
- def : FlatStoreSignedAtomicPat <inst, node, vt> {
- let AddedComplexity = 10;
- }
-
- def : GlobalAtomicStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
- let AddedComplexity = 11;
- }
-}
-
multiclass GlobalFLATAtomicPatsNoRtnBase<string inst, string node, ValueType vt,
ValueType data_vt = vt> {
let AddedComplexity = 11 in
- def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<PatFrags>(node), vt, data_vt>;
+ def : FlatSignedAtomicPatBase<!cast<FLAT_Pseudo>(inst), !cast<SDPatternOperator>(node), vt, data_vt>;
let AddedComplexity = 13 in
- def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<PatFrags>(node), vt, data_vt>;
+ def : GlobalAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node), vt, data_vt>;
}
multiclass GlobalFLATAtomicPatsRtnBase<string inst, string node, ValueType vt,
@@ -1444,12 +1490,12 @@ defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2i16>
defm : GlobalFLATLoadPats_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
}
-defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
-defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
-defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
-defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
-defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
-defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i32>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_BYTE, atomic_store_8_global, i16>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i32>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_SHORT, atomic_store_16_global, i16>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORD, atomic_store_32_global, i32>;
+defm : GlobalFLATStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global, i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
@@ -1466,6 +1512,9 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_glo
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_load_uinc_wrap_global", i64>;
@@ -1483,10 +1532,14 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i
let OtherPredicates = [isGFX10Plus] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
+}
+
+let OtherPredicates = [isGFX10GFX11] in {
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
+
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
}
@@ -1502,6 +1555,13 @@ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_f
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>;
}
+let OtherPredicates = [isGFX12Only] in {
+ defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
+ defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
+ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
+ defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
+}
+
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f32>;
@@ -1998,7 +2058,7 @@ multiclass FLAT_Real_SADDR_RTN_gfx10<bits<7> op> {
multiclass FLAT_Real_ST_gfx10<bits<7> op> {
def _ST_gfx10 :
FLAT_Real_gfx10<op, !cast<FLAT_Pseudo>(NAME#"_ST")> {
- let Inst{54-48} = !cast<int>(EXEC_HI.HWEncoding);
+ let Inst{54-48} = EXEC_HI.Index;
let OtherPredicates = [HasFlatScratchSTMode];
}
}
@@ -2126,7 +2186,7 @@ defm GLOBAL_ATOMIC_SWAP : FLAT_Real_GlblAtomics_gfx10<0x030>;
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x031>;
defm GLOBAL_ATOMIC_ADD : FLAT_Real_GlblAtomics_gfx10<0x032>;
defm GLOBAL_ATOMIC_SUB : FLAT_Real_GlblAtomics_gfx10<0x033>;
-defm GLOBAL_ATOMIC_CSUB : FLAT_Real_GlblAtomics_RTN_gfx10<0x034>;
+defm GLOBAL_ATOMIC_CSUB : FLAT_Real_GlblAtomics_gfx10<0x034>;
defm GLOBAL_ATOMIC_SMIN : FLAT_Real_GlblAtomics_gfx10<0x035>;
defm GLOBAL_ATOMIC_UMIN : FLAT_Real_GlblAtomics_gfx10<0x036>;
defm GLOBAL_ATOMIC_SMAX : FLAT_Real_GlblAtomics_gfx10<0x037>;
@@ -2201,7 +2261,7 @@ defm SCRATCH_LOAD_LDS_DWORD : FLAT_Real_ScratchAllAddr_LDS_gfx10 <0x00c>;
class FLAT_Real_gfx11 <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic> :
FLAT_Real <op, ps, opName>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX11> {
- let AssemblerPredicate = isGFX11Plus;
+ let AssemblerPredicate = isGFX11Only;
let DecoderNamespace = "GFX11";
let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, ps.dlcValue);
@@ -2213,19 +2273,19 @@ class FLAT_Real_gfx11 <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic>
multiclass FLAT_Aliases_gfx11<string ps, string opName, int renamed> {
if renamed then
- def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX11Plus]>;
+ def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX11Only]>;
}
multiclass FLAT_Real_Base_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
FLAT_Aliases_gfx11<ps, opName, renamed> {
def _gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps), opName> {
- let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+ let Inst{54-48} = SGPR_NULL_gfx11plus.Index;
}
}
multiclass FLAT_Real_RTN_gfx11<bits<7> op, string ps, string opName> {
def _RTN_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName> {
- let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+ let Inst{54-48} = SGPR_NULL_gfx11plus.Index;
}
}
@@ -2239,7 +2299,7 @@ multiclass FLAT_Real_SADDR_RTN_gfx11<bits<7> op, string ps, string opName> {
multiclass FLAT_Real_ST_gfx11<bits<7> op, string ps, string opName> {
def _ST_gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName> {
- let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+ let Inst{54-48} = SGPR_NULL_gfx11plus.Index;
let OtherPredicates = [HasFlatScratchSTMode];
}
}
@@ -2357,7 +2417,7 @@ defm GLOBAL_ATOMIC_SWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x033, "GLOBAL_ATO
defm GLOBAL_ATOMIC_CMPSWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>;
defm GLOBAL_ATOMIC_ADD_U32 : FLAT_Real_GlblAtomics_gfx11<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>;
defm GLOBAL_ATOMIC_SUB_U32 : FLAT_Real_GlblAtomics_gfx11<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>;
-defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_RTN_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32", true>;
+defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32", true>;
defm GLOBAL_ATOMIC_MIN_I32 : FLAT_Real_GlblAtomics_gfx11<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>;
defm GLOBAL_ATOMIC_MIN_U32 : FLAT_Real_GlblAtomics_gfx11<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>;
defm GLOBAL_ATOMIC_MAX_I32 : FLAT_Real_GlblAtomics_gfx11<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", true>;
@@ -2408,3 +2468,213 @@ defm SCRATCH_LOAD_D16_HI_I8 : FLAT_Real_ScratchAllAddr_gfx11<0x22, "SCRATCH_
defm SCRATCH_LOAD_D16_HI_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x23, "SCRATCH_LOAD_SHORT_D16_HI", "scratch_load_d16_hi_b16">;
defm SCRATCH_STORE_D16_HI_B8 : FLAT_Real_ScratchAllAddr_gfx11<0x24, "SCRATCH_STORE_BYTE_D16_HI", "scratch_store_d16_hi_b8">;
defm SCRATCH_STORE_D16_HI_B16 : FLAT_Real_ScratchAllAddr_gfx11<0x25, "SCRATCH_STORE_SHORT_D16_HI", "scratch_store_d16_hi_b16">;
+
+//===----------------------------------------------------------------------===//
+// GFX12
+//===----------------------------------------------------------------------===//
+
+class VFLAT_Real_gfx12 <bits<8> op, FLAT_Pseudo ps,
+ string opName = ps.Mnemonic> :
+ VFLAT_Real <op, ps, opName>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX12> {
+ let AssemblerPredicate = isGFX12Plus;
+ let DecoderNamespace = "GFX12";
+
+ let Inst{25-24} = !if(ps.is_flat_scratch, 0b01,
+ !if(ps.is_flat_global, 0b10, 0b00));
+}
+
+multiclass VFLAT_Aliases_gfx12<string ps, string opName, int renamed, string alias> {
+ if renamed then
+ def _renamed_gfx12 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX12Plus]>;
+ if !not(!empty(alias)) then
+ def _alias_gfx12 : MnemonicAlias<alias, opName>, Requires<[isGFX12Plus]>;
+}
+
+multiclass VFLAT_Real_Base_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> :
+ VFLAT_Aliases_gfx12<ps, opName, renamed, alias> {
+ def _gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps), opName> {
+ let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+ }
+}
+
+multiclass VFLAT_Real_RTN_gfx12<bits<8> op, string ps, string opName> {
+ def _RTN_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_RTN"), opName> {
+ let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+ }
+}
+
+multiclass VFLAT_Real_SADDR_gfx12<bits<8> op, string ps, string opName> {
+ def _SADDR_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SADDR"), opName>;
+}
+
+multiclass VFLAT_Real_SADDR_RTN_gfx12<bits<8> op, string ps, string opName> {
+ def _SADDR_RTN_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SADDR_RTN"), opName>;
+}
+
+multiclass VFLAT_Real_ST_gfx12<bits<8> op, string ps, string opName> {
+ def _ST_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_ST"), opName> {
+ let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
+ let OtherPredicates = [HasFlatScratchSTMode];
+ }
+}
+
+multiclass VFLAT_Real_SVS_gfx12<bits<8> op, string ps, string opName> {
+ def _SVS_gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps#"_SVS"), opName> {
+ let OtherPredicates = [HasFlatScratchSVSMode];
+ }
+}
+
+multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> :
+ VFLAT_Real_Base_gfx12<op, ps, opName, renamed, alias>,
+ VFLAT_Real_RTN_gfx12<op, ps, opName>;
+
+multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> :
+ VFLAT_Real_Base_gfx12<op, ps, opName, renamed, alias>,
+ VFLAT_Real_SADDR_gfx12<op, ps, opName>;
+
+multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> :
+ VGLOBAL_Real_AllAddr_gfx12<op, ps, opName, renamed, alias>,
+ VFLAT_Real_RTN_gfx12<op, ps, opName>,
+ VFLAT_Real_SADDR_RTN_gfx12<op, ps, opName>;
+
+multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, string ps, string opName, int renamed = false> :
+ VFLAT_Real_Base_gfx12<op, ps, opName, renamed>,
+ VFLAT_Real_SADDR_gfx12<op, ps, opName>,
+ VFLAT_Real_ST_gfx12<op, ps, opName>,
+ VFLAT_Real_SVS_gfx12<op, ps, opName>;
+
+// ENC_VFLAT.
+defm FLAT_LOAD_U8 : VFLAT_Real_Base_gfx12<0x010, "FLAT_LOAD_UBYTE", "flat_load_u8", true>;
+defm FLAT_LOAD_I8 : VFLAT_Real_Base_gfx12<0x011, "FLAT_LOAD_SBYTE", "flat_load_i8", true>;
+defm FLAT_LOAD_U16 : VFLAT_Real_Base_gfx12<0x012, "FLAT_LOAD_USHORT", "flat_load_u16", true>;
+defm FLAT_LOAD_I16 : VFLAT_Real_Base_gfx12<0x013, "FLAT_LOAD_SSHORT", "flat_load_i16", true>;
+defm FLAT_LOAD_B32 : VFLAT_Real_Base_gfx12<0x014, "FLAT_LOAD_DWORD", "flat_load_b32", true>;
+defm FLAT_LOAD_B64 : VFLAT_Real_Base_gfx12<0x015, "FLAT_LOAD_DWORDX2", "flat_load_b64", true>;
+defm FLAT_LOAD_B96 : VFLAT_Real_Base_gfx12<0x016, "FLAT_LOAD_DWORDX3", "flat_load_b96", true>;
+defm FLAT_LOAD_B128 : VFLAT_Real_Base_gfx12<0x017, "FLAT_LOAD_DWORDX4", "flat_load_b128", true>;
+defm FLAT_STORE_B8 : VFLAT_Real_Base_gfx12<0x018, "FLAT_STORE_BYTE", "flat_store_b8", true>;
+defm FLAT_STORE_B16 : VFLAT_Real_Base_gfx12<0x019, "FLAT_STORE_SHORT", "flat_store_b16", true>;
+defm FLAT_STORE_B32 : VFLAT_Real_Base_gfx12<0x01a, "FLAT_STORE_DWORD", "flat_store_b32", true>;
+defm FLAT_STORE_B64 : VFLAT_Real_Base_gfx12<0x01b, "FLAT_STORE_DWORDX2", "flat_store_b64", true>;
+defm FLAT_STORE_B96 : VFLAT_Real_Base_gfx12<0x01c, "FLAT_STORE_DWORDX3", "flat_store_b96", true>;
+defm FLAT_STORE_B128 : VFLAT_Real_Base_gfx12<0x01d, "FLAT_STORE_DWORDX4", "flat_store_b128", true>;
+defm FLAT_LOAD_D16_U8 : VFLAT_Real_Base_gfx12<0x01e, "FLAT_LOAD_UBYTE_D16", "flat_load_d16_u8">;
+defm FLAT_LOAD_D16_I8 : VFLAT_Real_Base_gfx12<0x01f, "FLAT_LOAD_SBYTE_D16", "flat_load_d16_i8">;
+defm FLAT_LOAD_D16_B16 : VFLAT_Real_Base_gfx12<0x020, "FLAT_LOAD_SHORT_D16", "flat_load_d16_b16">;
+defm FLAT_LOAD_D16_HI_U8 : VFLAT_Real_Base_gfx12<0x021, "FLAT_LOAD_UBYTE_D16_HI", "flat_load_d16_hi_u8">;
+defm FLAT_LOAD_D16_HI_I8 : VFLAT_Real_Base_gfx12<0x022, "FLAT_LOAD_SBYTE_D16_HI", "flat_load_d16_hi_i8">;
+defm FLAT_LOAD_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x023, "FLAT_LOAD_SHORT_D16_HI", "flat_load_d16_hi_b16">;
+defm FLAT_STORE_D16_HI_B8 : VFLAT_Real_Base_gfx12<0x024, "FLAT_STORE_BYTE_D16_HI", "flat_store_d16_hi_b8">;
+defm FLAT_STORE_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x025, "FLAT_STORE_SHORT_D16_HI", "flat_store_d16_hi_b16">;
+defm FLAT_ATOMIC_SWAP_B32 : VFLAT_Real_Atomics_gfx12<0x033, "FLAT_ATOMIC_SWAP", "flat_atomic_swap_b32", true>;
+defm FLAT_ATOMIC_CMPSWAP_B32 : VFLAT_Real_Atomics_gfx12<0x034, "FLAT_ATOMIC_CMPSWAP", "flat_atomic_cmpswap_b32", true>;
+defm FLAT_ATOMIC_ADD_U32 : VFLAT_Real_Atomics_gfx12<0x035, "FLAT_ATOMIC_ADD", "flat_atomic_add_u32", true>;
+defm FLAT_ATOMIC_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x036, "FLAT_ATOMIC_SUB", "flat_atomic_sub_u32", true>;
+defm FLAT_ATOMIC_SUB_CLAMP_U32 : VFLAT_Real_Atomics_gfx12<0x037, "FLAT_ATOMIC_CSUB_U32", "flat_atomic_sub_clamp_u32", true>;
+defm FLAT_ATOMIC_MIN_I32 : VFLAT_Real_Atomics_gfx12<0x038, "FLAT_ATOMIC_SMIN", "flat_atomic_min_i32", true>;
+defm FLAT_ATOMIC_MIN_U32 : VFLAT_Real_Atomics_gfx12<0x039, "FLAT_ATOMIC_UMIN", "flat_atomic_min_u32", true>;
+defm FLAT_ATOMIC_MAX_I32 : VFLAT_Real_Atomics_gfx12<0x03a, "FLAT_ATOMIC_SMAX", "flat_atomic_max_i32", true>;
+defm FLAT_ATOMIC_MAX_U32 : VFLAT_Real_Atomics_gfx12<0x03b, "FLAT_ATOMIC_UMAX", "flat_atomic_max_u32", true>;
+defm FLAT_ATOMIC_AND_B32 : VFLAT_Real_Atomics_gfx12<0x03c, "FLAT_ATOMIC_AND", "flat_atomic_and_b32", true>;
+defm FLAT_ATOMIC_OR_B32 : VFLAT_Real_Atomics_gfx12<0x03d, "FLAT_ATOMIC_OR", "flat_atomic_or_b32", true>;
+defm FLAT_ATOMIC_XOR_B32 : VFLAT_Real_Atomics_gfx12<0x03e, "FLAT_ATOMIC_XOR", "flat_atomic_xor_b32", true>;
+defm FLAT_ATOMIC_INC_U32 : VFLAT_Real_Atomics_gfx12<0x03f, "FLAT_ATOMIC_INC", "flat_atomic_inc_u32", true>;
+defm FLAT_ATOMIC_DEC_U32 : VFLAT_Real_Atomics_gfx12<0x040, "FLAT_ATOMIC_DEC", "flat_atomic_dec_u32", true>;
+defm FLAT_ATOMIC_SWAP_B64 : VFLAT_Real_Atomics_gfx12<0x041, "FLAT_ATOMIC_SWAP_X2", "flat_atomic_swap_b64", true>;
+defm FLAT_ATOMIC_CMPSWAP_B64 : VFLAT_Real_Atomics_gfx12<0x042, "FLAT_ATOMIC_CMPSWAP_X2", "flat_atomic_cmpswap_b64", true>;
+defm FLAT_ATOMIC_ADD_U64 : VFLAT_Real_Atomics_gfx12<0x043, "FLAT_ATOMIC_ADD_X2", "flat_atomic_add_u64", true>;
+defm FLAT_ATOMIC_SUB_U64 : VFLAT_Real_Atomics_gfx12<0x044, "FLAT_ATOMIC_SUB_X2", "flat_atomic_sub_u64", true>;
+defm FLAT_ATOMIC_MIN_I64 : VFLAT_Real_Atomics_gfx12<0x045, "FLAT_ATOMIC_SMIN_X2", "flat_atomic_min_i64", true>;
+defm FLAT_ATOMIC_MIN_U64 : VFLAT_Real_Atomics_gfx12<0x046, "FLAT_ATOMIC_UMIN_X2", "flat_atomic_min_u64", true>;
+defm FLAT_ATOMIC_MAX_I64 : VFLAT_Real_Atomics_gfx12<0x047, "FLAT_ATOMIC_SMAX_X2", "flat_atomic_max_i64", true>;
+defm FLAT_ATOMIC_MAX_U64 : VFLAT_Real_Atomics_gfx12<0x048, "FLAT_ATOMIC_UMAX_X2", "flat_atomic_max_u64", true>;
+defm FLAT_ATOMIC_AND_B64 : VFLAT_Real_Atomics_gfx12<0x049, "FLAT_ATOMIC_AND_X2", "flat_atomic_and_b64", true>;
+defm FLAT_ATOMIC_OR_B64 : VFLAT_Real_Atomics_gfx12<0x04a, "FLAT_ATOMIC_OR_X2", "flat_atomic_or_b64", true>;
+defm FLAT_ATOMIC_XOR_B64 : VFLAT_Real_Atomics_gfx12<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>;
+defm FLAT_ATOMIC_INC_U64 : VFLAT_Real_Atomics_gfx12<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>;
+defm FLAT_ATOMIC_DEC_U64 : VFLAT_Real_Atomics_gfx12<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>;
+defm FLAT_ATOMIC_MIN_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_num_f32", true, "flat_atomic_min_f32">;
+defm FLAT_ATOMIC_MAX_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_num_f32", true, "flat_atomic_max_f32">;
+defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">;
+
+// ENC_VGLOBAL.
+defm GLOBAL_LOAD_U8 : VGLOBAL_Real_AllAddr_gfx12<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>;
+defm GLOBAL_LOAD_I8 : VGLOBAL_Real_AllAddr_gfx12<0x011, "GLOBAL_LOAD_SBYTE", "global_load_i8", true>;
+defm GLOBAL_LOAD_U16 : VGLOBAL_Real_AllAddr_gfx12<0x012, "GLOBAL_LOAD_USHORT", "global_load_u16", true>;
+defm GLOBAL_LOAD_I16 : VGLOBAL_Real_AllAddr_gfx12<0x013, "GLOBAL_LOAD_SSHORT", "global_load_i16", true>;
+defm GLOBAL_LOAD_B32 : VGLOBAL_Real_AllAddr_gfx12<0x014, "GLOBAL_LOAD_DWORD", "global_load_b32", true>;
+defm GLOBAL_LOAD_B64 : VGLOBAL_Real_AllAddr_gfx12<0x015, "GLOBAL_LOAD_DWORDX2", "global_load_b64", true>;
+defm GLOBAL_LOAD_B96 : VGLOBAL_Real_AllAddr_gfx12<0x016, "GLOBAL_LOAD_DWORDX3", "global_load_b96", true>;
+defm GLOBAL_LOAD_B128 : VGLOBAL_Real_AllAddr_gfx12<0x017, "GLOBAL_LOAD_DWORDX4", "global_load_b128", true>;
+defm GLOBAL_STORE_B8 : VGLOBAL_Real_AllAddr_gfx12<0x018, "GLOBAL_STORE_BYTE", "global_store_b8", true>;
+defm GLOBAL_STORE_B16 : VGLOBAL_Real_AllAddr_gfx12<0x019, "GLOBAL_STORE_SHORT", "global_store_b16", true>;
+defm GLOBAL_STORE_B32 : VGLOBAL_Real_AllAddr_gfx12<0x01a, "GLOBAL_STORE_DWORD", "global_store_b32", true>;
+defm GLOBAL_STORE_B64 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "GLOBAL_STORE_DWORDX2", "global_store_b64", true>;
+defm GLOBAL_STORE_B96 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "GLOBAL_STORE_DWORDX3", "global_store_b96", true>;
+defm GLOBAL_STORE_B128 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "GLOBAL_STORE_DWORDX4", "global_store_b128", true>;
+defm GLOBAL_LOAD_D16_U8 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "GLOBAL_LOAD_UBYTE_D16", "global_load_d16_u8">;
+defm GLOBAL_LOAD_D16_I8 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "GLOBAL_LOAD_SBYTE_D16", "global_load_d16_i8">;
+defm GLOBAL_LOAD_D16_B16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "GLOBAL_LOAD_SHORT_D16", "global_load_d16_b16">;
+defm GLOBAL_LOAD_D16_HI_U8 : VGLOBAL_Real_AllAddr_gfx12<0x021, "GLOBAL_LOAD_UBYTE_D16_HI", "global_load_d16_hi_u8">;
+defm GLOBAL_LOAD_D16_HI_I8 : VGLOBAL_Real_AllAddr_gfx12<0x022, "GLOBAL_LOAD_SBYTE_D16_HI", "global_load_d16_hi_i8">;
+defm GLOBAL_LOAD_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x023, "GLOBAL_LOAD_SHORT_D16_HI", "global_load_d16_hi_b16">;
+defm GLOBAL_STORE_D16_HI_B8 : VGLOBAL_Real_AllAddr_gfx12<0x024, "GLOBAL_STORE_BYTE_D16_HI", "global_store_d16_hi_b8">;
+defm GLOBAL_STORE_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x025, "GLOBAL_STORE_SHORT_D16_HI", "global_store_d16_hi_b16">;
+defm GLOBAL_LOAD_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x028, "GLOBAL_LOAD_DWORD_ADDTID", "global_load_addtid_b32">;
+defm GLOBAL_STORE_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x029, "GLOBAL_STORE_DWORD_ADDTID", "global_store_addtid_b32">;
+
+defm GLOBAL_ATOMIC_SWAP_B32 : VGLOBAL_Real_Atomics_gfx12<0x033, "GLOBAL_ATOMIC_SWAP", "global_atomic_swap_b32", true>;
+defm GLOBAL_ATOMIC_CMPSWAP_B32 : VGLOBAL_Real_Atomics_gfx12<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>;
+defm GLOBAL_ATOMIC_ADD_U32 : VGLOBAL_Real_Atomics_gfx12<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>;
+defm GLOBAL_ATOMIC_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>;
+defm GLOBAL_ATOMIC_SUB_CLAMP_U32 : VGLOBAL_Real_Atomics_gfx12<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_sub_clamp_u32", true, "global_atomic_csub_u32">;
+defm GLOBAL_ATOMIC_MIN_I32 : VGLOBAL_Real_Atomics_gfx12<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>;
+defm GLOBAL_ATOMIC_MIN_U32 : VGLOBAL_Real_Atomics_gfx12<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>;
+defm GLOBAL_ATOMIC_MAX_I32 : VGLOBAL_Real_Atomics_gfx12<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", true>;
+defm GLOBAL_ATOMIC_MAX_U32 : VGLOBAL_Real_Atomics_gfx12<0x03b, "GLOBAL_ATOMIC_UMAX", "global_atomic_max_u32", true>;
+defm GLOBAL_ATOMIC_AND_B32 : VGLOBAL_Real_Atomics_gfx12<0x03c, "GLOBAL_ATOMIC_AND", "global_atomic_and_b32", true>;
+defm GLOBAL_ATOMIC_OR_B32 : VGLOBAL_Real_Atomics_gfx12<0x03d, "GLOBAL_ATOMIC_OR", "global_atomic_or_b32", true>;
+defm GLOBAL_ATOMIC_XOR_B32 : VGLOBAL_Real_Atomics_gfx12<0x03e, "GLOBAL_ATOMIC_XOR", "global_atomic_xor_b32", true>;
+defm GLOBAL_ATOMIC_INC_U32 : VGLOBAL_Real_Atomics_gfx12<0x03f, "GLOBAL_ATOMIC_INC", "global_atomic_inc_u32", true>;
+defm GLOBAL_ATOMIC_DEC_U32 : VGLOBAL_Real_Atomics_gfx12<0x040, "GLOBAL_ATOMIC_DEC", "global_atomic_dec_u32", true>;
+defm GLOBAL_ATOMIC_SWAP_B64 : VGLOBAL_Real_Atomics_gfx12<0x041, "GLOBAL_ATOMIC_SWAP_X2", "global_atomic_swap_b64", true>;
+defm GLOBAL_ATOMIC_CMPSWAP_B64 : VGLOBAL_Real_Atomics_gfx12<0x042, "GLOBAL_ATOMIC_CMPSWAP_X2", "global_atomic_cmpswap_b64", true>;
+defm GLOBAL_ATOMIC_ADD_U64 : VGLOBAL_Real_Atomics_gfx12<0x043, "GLOBAL_ATOMIC_ADD_X2", "global_atomic_add_u64", true>;
+defm GLOBAL_ATOMIC_SUB_U64 : VGLOBAL_Real_Atomics_gfx12<0x044, "GLOBAL_ATOMIC_SUB_X2", "global_atomic_sub_u64", true>;
+defm GLOBAL_ATOMIC_MIN_I64 : VGLOBAL_Real_Atomics_gfx12<0x045, "GLOBAL_ATOMIC_SMIN_X2", "global_atomic_min_i64", true>;
+defm GLOBAL_ATOMIC_MIN_U64 : VGLOBAL_Real_Atomics_gfx12<0x046, "GLOBAL_ATOMIC_UMIN_X2", "global_atomic_min_u64", true>;
+defm GLOBAL_ATOMIC_MAX_I64 : VGLOBAL_Real_Atomics_gfx12<0x047, "GLOBAL_ATOMIC_SMAX_X2", "global_atomic_max_i64", true>;
+defm GLOBAL_ATOMIC_MAX_U64 : VGLOBAL_Real_Atomics_gfx12<0x048, "GLOBAL_ATOMIC_UMAX_X2", "global_atomic_max_u64", true>;
+defm GLOBAL_ATOMIC_AND_B64 : VGLOBAL_Real_Atomics_gfx12<0x049, "GLOBAL_ATOMIC_AND_X2", "global_atomic_and_b64", true>;
+defm GLOBAL_ATOMIC_OR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04a, "GLOBAL_ATOMIC_OR_X2", "global_atomic_or_b64", true>;
+defm GLOBAL_ATOMIC_XOR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>;
+defm GLOBAL_ATOMIC_INC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>;
+defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>;
+defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">;
+defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">;
+defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
+
+// ENC_VSCRATCH.
+defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
+defm SCRATCH_LOAD_I8 : VSCRATCH_Real_AllAddr_gfx12<0x11, "SCRATCH_LOAD_SBYTE", "scratch_load_i8", true>;
+defm SCRATCH_LOAD_U16 : VSCRATCH_Real_AllAddr_gfx12<0x12, "SCRATCH_LOAD_USHORT", "scratch_load_u16", true>;
+defm SCRATCH_LOAD_I16 : VSCRATCH_Real_AllAddr_gfx12<0x13, "SCRATCH_LOAD_SSHORT", "scratch_load_i16", true>;
+defm SCRATCH_LOAD_B32 : VSCRATCH_Real_AllAddr_gfx12<0x14, "SCRATCH_LOAD_DWORD", "scratch_load_b32", true>;
+defm SCRATCH_LOAD_B64 : VSCRATCH_Real_AllAddr_gfx12<0x15, "SCRATCH_LOAD_DWORDX2", "scratch_load_b64", true>;
+defm SCRATCH_LOAD_B96 : VSCRATCH_Real_AllAddr_gfx12<0x16, "SCRATCH_LOAD_DWORDX3", "scratch_load_b96", true>;
+defm SCRATCH_LOAD_B128 : VSCRATCH_Real_AllAddr_gfx12<0x17, "SCRATCH_LOAD_DWORDX4", "scratch_load_b128", true>;
+defm SCRATCH_STORE_B8 : VSCRATCH_Real_AllAddr_gfx12<0x18, "SCRATCH_STORE_BYTE", "scratch_store_b8", true>;
+defm SCRATCH_STORE_B16 : VSCRATCH_Real_AllAddr_gfx12<0x19, "SCRATCH_STORE_SHORT", "scratch_store_b16", true>;
+defm SCRATCH_STORE_B32 : VSCRATCH_Real_AllAddr_gfx12<0x1a, "SCRATCH_STORE_DWORD", "scratch_store_b32", true>;
+defm SCRATCH_STORE_B64 : VSCRATCH_Real_AllAddr_gfx12<0x1b, "SCRATCH_STORE_DWORDX2", "scratch_store_b64", true>;
+defm SCRATCH_STORE_B96 : VSCRATCH_Real_AllAddr_gfx12<0x1c, "SCRATCH_STORE_DWORDX3", "scratch_store_b96", true>;
+defm SCRATCH_STORE_B128 : VSCRATCH_Real_AllAddr_gfx12<0x1d, "SCRATCH_STORE_DWORDX4", "scratch_store_b128", true>;
+defm SCRATCH_LOAD_D16_U8 : VSCRATCH_Real_AllAddr_gfx12<0x1e, "SCRATCH_LOAD_UBYTE_D16", "scratch_load_d16_u8">;
+defm SCRATCH_LOAD_D16_I8 : VSCRATCH_Real_AllAddr_gfx12<0x1f, "SCRATCH_LOAD_SBYTE_D16", "scratch_load_d16_i8">;
+defm SCRATCH_LOAD_D16_B16 : VSCRATCH_Real_AllAddr_gfx12<0x20, "SCRATCH_LOAD_SHORT_D16", "scratch_load_d16_b16">;
+defm SCRATCH_LOAD_D16_HI_U8 : VSCRATCH_Real_AllAddr_gfx12<0x21, "SCRATCH_LOAD_UBYTE_D16_HI", "scratch_load_d16_hi_u8">;
+defm SCRATCH_LOAD_D16_HI_I8 : VSCRATCH_Real_AllAddr_gfx12<0x22, "SCRATCH_LOAD_SBYTE_D16_HI", "scratch_load_d16_hi_i8">;
+defm SCRATCH_LOAD_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x23, "SCRATCH_LOAD_SHORT_D16_HI", "scratch_load_d16_hi_b16">;
+defm SCRATCH_STORE_D16_HI_B8 : VSCRATCH_Real_AllAddr_gfx12<0x24, "SCRATCH_STORE_BYTE_D16_HI", "scratch_store_d16_hi_b8">;
+defm SCRATCH_STORE_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x25, "SCRATCH_STORE_SHORT_D16_HI", "scratch_store_d16_hi_b16">;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
index c9e0c6849568..05e10a95b157 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -25,7 +25,6 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringMap.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -72,8 +71,11 @@ public:
auto *SecondMI = CI.SecondMI;
unsigned Opc1 = FirstMI->getOpcode();
unsigned Opc2 = SecondMI->getOpcode();
- int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1),
- AMDGPU::getVOPDOpcode(Opc2));
+ unsigned EncodingFamily =
+ AMDGPU::getVOPDEncodingFamily(SII->getSubtarget());
+ int NewOpcode =
+ AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1),
+ AMDGPU::getVOPDOpcode(Opc2), EncodingFamily);
assert(NewOpcode != -1 &&
"Should have previously determined this as a possible VOPD\n");
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 2592584b89c6..a75082268c77 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -191,6 +191,16 @@ MachineOperand *GCNDPPCombine::getOldOpndValue(MachineOperand &OldOpnd) const {
return &OldOpnd;
}
+[[maybe_unused]] static unsigned getOperandSize(MachineInstr &MI, unsigned Idx,
+ MachineRegisterInfo &MRI) {
+ int16_t RegClass = MI.getDesc().operands()[Idx].RegClass;
+ if (RegClass == -1)
+ return 0;
+
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ return TRI->getRegSizeInBits(*TRI->getRegClass(RegClass));
+}
+
MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
MachineInstr &MovMI,
RegSubRegPair CombOldVGPR,
@@ -278,6 +288,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
assert(Src0);
+ int Src0Idx = NumOperands;
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src0)) {
LLVM_DEBUG(dbgs() << " failed: src0 is illegal\n");
Fail = true;
@@ -301,7 +312,17 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
if (Src1) {
- if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
+ int OpNum = NumOperands;
+ // If subtarget does not support SGPRs for src1 operand then the
+ // requirements are the same as for src0. We check src0 instead because
+ // pseudos are shared between subtargets and allow SGPR for src1 on all.
+ if (!ST->hasDPPSrc1SGPR()) {
+ assert(getOperandSize(*DPPInst, Src0Idx, *MRI) ==
+ getOperandSize(*DPPInst, NumOperands, *MRI) &&
+ "Src0 and Src1 operands should have the same size");
+ OpNum = Src0Idx;
+ }
+ if (!TII->isOperandLegal(*DPPInst.getInstr(), OpNum, Src1)) {
LLVM_DEBUG(dbgs() << " failed: src1 is illegal\n");
Fail = true;
break;
@@ -505,7 +526,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
MovMI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
auto *DppCtrl = TII->getNamedOperand(MovMI, AMDGPU::OpName::dpp_ctrl);
assert(DppCtrl && DppCtrl->isImm());
- if (!AMDGPU::isLegal64BitDPPControl(DppCtrl->getImm())) {
+ if (!AMDGPU::isLegalDPALU_DPPControl(DppCtrl->getImm())) {
LLVM_DEBUG(dbgs() << " failed: 64 bit dpp move uses unsupported"
" control value\n");
// Let it split, then control may become legal.
@@ -728,7 +749,7 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
++NumDPPMovsCombined;
} else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO ||
MI.getOpcode() == AMDGPU::V_MOV_B64_dpp) {
- if (ST->has64BitDPP() && combineDPPMov(MI)) {
+ if (ST->hasDPALU_DPP() && combineDPPMov(MI)) {
Changed = true;
++NumDPPMovsCombined;
} else {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 2d53b2a70dbe..a7d8ff0242b8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -163,7 +163,9 @@ static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
static bool isPermlane(const MachineInstr &MI) {
unsigned Opcode = MI.getOpcode();
return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
- Opcode == AMDGPU::V_PERMLANEX16_B32_e64;
+ Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
+ Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
+ Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
}
static bool isLdsDma(const MachineInstr &MI) {
@@ -271,7 +273,7 @@ GCNHazardRecognizer::getMFMAPipelineWaitStates(const MachineInstr &MI) const {
const MCSchedClassDesc *SC = TSchedModel.resolveSchedClass(&MI);
assert(TSchedModel.getWriteProcResBegin(SC) !=
TSchedModel.getWriteProcResEnd(SC));
- return TSchedModel.getWriteProcResBegin(SC)->Cycles;
+ return TSchedModel.getWriteProcResBegin(SC)->ReleaseAtCycle;
}
void GCNHazardRecognizer::processBundle() {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index d89c9b1febde..cdc9de7f65e3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -251,7 +251,7 @@ GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin,
assert(UPTracker.isValid() ||
(dbgs() << "Tracked region ",
printRegion(dbgs(), Begin, End, LIS), false));
- return UPTracker.moveMaxPressure();
+ return UPTracker.getMaxPressureAndReset();
}
// returns max pressure for a tentative schedule
@@ -272,7 +272,7 @@ GCNIterativeScheduler::getSchedulePressure(const Region &R,
for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) {
RPTracker.recede(*getMachineInstr(*--I));
}
- return RPTracker.moveMaxPressure();
+ return RPTracker.getMaxPressureAndReset();
}
void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overridden
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td
index b9c9358f88b9..96af1a6aab3d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -9,11 +9,11 @@
// The code produced for "generic" is only useful for tests and cannot
// reasonably be expected to execute on any particular target.
def : ProcessorModel<"generic", NoSchedModel,
- [FeatureWavefrontSize64]
+ [FeatureWavefrontSize64, FeatureGDS, FeatureGWS]
>;
def : ProcessorModel<"generic-hsa", NoSchedModel,
- [FeatureWavefrontSize64, FeatureFlatAddressSpace]
+ [FeatureWavefrontSize64, FeatureGDS, FeatureGWS, FeatureFlatAddressSpace]
>;
//===------------------------------------------------------------===//
@@ -279,3 +279,15 @@ def : ProcessorModel<"gfx1150", GFX11SpeedModel,
def : ProcessorModel<"gfx1151", GFX11SpeedModel,
FeatureISAVersion11_5_1.Features
>;
+
+//===----------------------------------------------------------------------===//
+// GCN GFX12.
+//===----------------------------------------------------------------------===//
+
+def : ProcessorModel<"gfx1200", GFX12SpeedModel,
+ FeatureISAVersion12.Features
+>;
+
+def : ProcessorModel<"gfx1201", GFX12SpeedModel,
+ FeatureISAVersion12.Features
+>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index 68cf97170369..fd8f0bebd3be 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "GCNRegPressure.h"
+#include "AMDGPU.h"
#include "llvm/CodeGen/RegisterPressure.h"
using namespace llvm;
@@ -31,7 +32,6 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
return true;
}
-
///////////////////////////////////////////////////////////////////////////////
// GCNRegPressure
@@ -78,7 +78,9 @@ void GCNRegPressure::inc(unsigned Reg,
if (PrevMask.none()) {
assert(NewMask.any());
- Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ Value[Kind] +=
+ Sign * TRI->getRegClassWeight(MRI.getRegClass(Reg)).RegWeight;
}
break;
@@ -133,8 +135,6 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
O.getVGPRNum(ST.hasGFX90AInsts()));
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD
Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
return Printable([&RP, ST](raw_ostream &OS) {
OS << "VGPRs: " << RP.Value[GCNRegPressure::VGPR32] << ' '
@@ -153,7 +153,6 @@ Printable llvm::print(const GCNRegPressure &RP, const GCNSubtarget *ST) {
OS << '\n';
});
}
-#endif
static LaneBitmask getDefRegMask(const MachineOperand &MO,
const MachineRegisterInfo &MRI) {
@@ -167,66 +166,60 @@ static LaneBitmask getDefRegMask(const MachineOperand &MO,
MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg());
}
-static LaneBitmask getUsedRegMask(const MachineOperand &MO,
- const MachineRegisterInfo &MRI,
- const LiveIntervals &LIS) {
- assert(MO.isUse() && MO.isReg() && MO.getReg().isVirtual());
-
- if (auto SubReg = MO.getSubReg())
- return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
-
- auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg());
- if (SIRegisterInfo::getNumCoveredRegs(MaxMask) > 1) // cannot have subregs
- return MaxMask;
-
- // For a tentative schedule LIS isn't updated yet but livemask should remain
- // the same on any schedule. Subreg defs can be reordered but they all must
- // dominate uses anyway.
- auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
- return getLiveLaneMask(MO.getReg(), SI, LIS, MRI);
-}
-
-static SmallVector<RegisterMaskPair, 8>
-collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS,
+static void
+collectVirtualRegUses(SmallVectorImpl<RegisterMaskPair> &RegMaskPairs,
+ const MachineInstr &MI, const LiveIntervals &LIS,
const MachineRegisterInfo &MRI) {
- SmallVector<RegisterMaskPair, 8> Res;
+ SlotIndex InstrSI;
for (const auto &MO : MI.operands()) {
if (!MO.isReg() || !MO.getReg().isVirtual())
continue;
if (!MO.isUse() || !MO.readsReg())
continue;
- auto const UsedMask = getUsedRegMask(MO, MRI, LIS);
+ Register Reg = MO.getReg();
+ if (llvm::any_of(RegMaskPairs, [Reg](const RegisterMaskPair &RM) {
+ return RM.RegUnit == Reg;
+ }))
+ continue;
- auto Reg = MO.getReg();
- auto I = llvm::find_if(
- Res, [Reg](const RegisterMaskPair &RM) { return RM.RegUnit == Reg; });
- if (I != Res.end())
- I->LaneMask |= UsedMask;
- else
- Res.push_back(RegisterMaskPair(Reg, UsedMask));
+ LaneBitmask UseMask;
+ auto &LI = LIS.getInterval(Reg);
+ if (!LI.hasSubRanges())
+ UseMask = MRI.getMaxLaneMaskForVReg(Reg);
+ else {
+ // For a tentative schedule LIS isn't updated yet but livemask should
+ // remain the same on any schedule. Subreg defs can be reordered but they
+ // all must dominate uses anyway.
+ if (!InstrSI)
+ InstrSI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
+ UseMask = getLiveLaneMask(LI, InstrSI, MRI);
+ }
+
+ RegMaskPairs.emplace_back(Reg, UseMask);
}
- return Res;
}
///////////////////////////////////////////////////////////////////////////////
// GCNRPTracker
-LaneBitmask llvm::getLiveLaneMask(unsigned Reg,
- SlotIndex SI,
+LaneBitmask llvm::getLiveLaneMask(unsigned Reg, SlotIndex SI,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI) {
+ return getLiveLaneMask(LIS.getInterval(Reg), SI, MRI);
+}
+
+LaneBitmask llvm::getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
+ const MachineRegisterInfo &MRI) {
LaneBitmask LiveMask;
- const auto &LI = LIS.getInterval(Reg);
if (LI.hasSubRanges()) {
for (const auto &S : LI.subranges())
if (S.liveAt(SI)) {
LiveMask |= S.LaneMask;
- assert(LiveMask < MRI.getMaxLaneMaskForVReg(Reg) ||
- LiveMask == MRI.getMaxLaneMaskForVReg(Reg));
+ assert(LiveMask == (LiveMask & MRI.getMaxLaneMaskForVReg(LI.reg())));
}
} else if (LI.liveAt(SI)) {
- LiveMask = MRI.getMaxLaneMaskForVReg(Reg);
+ LiveMask = MRI.getMaxLaneMaskForVReg(LI.reg());
}
return LiveMask;
}
@@ -262,9 +255,15 @@ void GCNRPTracker::reset(const MachineInstr &MI,
MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
}
-void GCNUpwardRPTracker::reset(const MachineInstr &MI,
- const LiveRegSet *LiveRegsCopy) {
- GCNRPTracker::reset(MI, LiveRegsCopy, true);
+////////////////////////////////////////////////////////////////////////////////
+// GCNUpwardRPTracker
+
+void GCNUpwardRPTracker::reset(const MachineRegisterInfo &MRI_,
+ const LiveRegSet &LiveRegs_) {
+ MRI = &MRI_;
+ LiveRegs = LiveRegs_;
+ LastTrackedMI = nullptr;
+ MaxPressure = CurPressure = getRegPressure(MRI_, LiveRegs_);
}
void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
@@ -275,41 +274,61 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
if (MI.isDebugInstr())
return;
- auto const RegUses = collectVirtualRegUses(MI, LIS, *MRI);
+ // Kill all defs.
+ GCNRegPressure DefPressure, ECDefPressure;
+ bool HasECDefs = false;
+ for (const MachineOperand &MO : MI.all_defs()) {
+ if (!MO.getReg().isVirtual())
+ continue;
- // calc pressure at the MI (defs + uses)
- auto AtMIPressure = CurPressure;
- for (const auto &U : RegUses) {
- auto LiveMask = LiveRegs[U.RegUnit];
- AtMIPressure.inc(U.RegUnit, LiveMask, LiveMask | U.LaneMask, *MRI);
- }
- // update max pressure
- MaxPressure = max(AtMIPressure, MaxPressure);
+ Register Reg = MO.getReg();
+ LaneBitmask DefMask = getDefRegMask(MO, *MRI);
- for (const auto &MO : MI.all_defs()) {
- if (!MO.getReg().isVirtual() || MO.isDead())
- continue;
+ // Treat a def as fully live at the moment of definition: keep a record.
+ if (MO.isEarlyClobber()) {
+ ECDefPressure.inc(Reg, LaneBitmask::getNone(), DefMask, *MRI);
+ HasECDefs = true;
+ } else
+ DefPressure.inc(Reg, LaneBitmask::getNone(), DefMask, *MRI);
- auto Reg = MO.getReg();
auto I = LiveRegs.find(Reg);
if (I == LiveRegs.end())
continue;
- auto &LiveMask = I->second;
- auto PrevMask = LiveMask;
- LiveMask &= ~getDefRegMask(MO, *MRI);
+
+ LaneBitmask &LiveMask = I->second;
+ LaneBitmask PrevMask = LiveMask;
+ LiveMask &= ~DefMask;
CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
if (LiveMask.none())
LiveRegs.erase(I);
}
- for (const auto &U : RegUses) {
- auto &LiveMask = LiveRegs[U.RegUnit];
- auto PrevMask = LiveMask;
+
+ // Update MaxPressure with defs pressure.
+ DefPressure += CurPressure;
+ if (HasECDefs)
+ DefPressure += ECDefPressure;
+ MaxPressure = max(DefPressure, MaxPressure);
+
+ // Make uses alive.
+ SmallVector<RegisterMaskPair, 8> RegUses;
+ collectVirtualRegUses(RegUses, MI, LIS, *MRI);
+ for (const RegisterMaskPair &U : RegUses) {
+ LaneBitmask &LiveMask = LiveRegs[U.RegUnit];
+ LaneBitmask PrevMask = LiveMask;
LiveMask |= U.LaneMask;
CurPressure.inc(U.RegUnit, PrevMask, LiveMask, *MRI);
}
+
+ // Update MaxPressure with uses plus early-clobber defs pressure.
+ MaxPressure = HasECDefs ? max(CurPressure + ECDefPressure, MaxPressure)
+ : max(CurPressure, MaxPressure);
+
assert(CurPressure == getRegPressure(*MRI, LiveRegs));
}
+////////////////////////////////////////////////////////////////////////////////
+// GCNDownwardRPTracker
+
bool GCNDownwardRPTracker::reset(const MachineInstr &MI,
const LiveRegSet *LiveRegsCopy) {
MRI = &MI.getParent()->getParent()->getRegInfo();
@@ -416,19 +435,17 @@ bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin,
return advance(End);
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD
Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
const GCNRPTracker::LiveRegSet &TrackedLR,
- const TargetRegisterInfo *TRI) {
- return Printable([&LISLR, &TrackedLR, TRI](raw_ostream &OS) {
+ const TargetRegisterInfo *TRI, StringRef Pfx) {
+ return Printable([&LISLR, &TrackedLR, TRI, Pfx](raw_ostream &OS) {
for (auto const &P : TrackedLR) {
auto I = LISLR.find(P.first);
if (I == LISLR.end()) {
- OS << " " << printReg(P.first, TRI) << ":L" << PrintLaneMask(P.second)
+ OS << Pfx << printReg(P.first, TRI) << ":L" << PrintLaneMask(P.second)
<< " isn't found in LIS reported set\n";
} else if (I->second != P.second) {
- OS << " " << printReg(P.first, TRI)
+ OS << Pfx << printReg(P.first, TRI)
<< " masks doesn't match: LIS reported " << PrintLaneMask(I->second)
<< ", tracked " << PrintLaneMask(P.second) << '\n';
}
@@ -436,7 +453,7 @@ Printable llvm::reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
for (auto const &P : LISLR) {
auto I = TrackedLR.find(P.first);
if (I == TrackedLR.end()) {
- OS << " " << printReg(P.first, TRI) << ":L" << PrintLaneMask(P.second)
+ OS << Pfx << printReg(P.first, TRI) << ":L" << PrintLaneMask(P.second)
<< " isn't found in tracked set\n";
}
}
@@ -465,7 +482,6 @@ bool GCNUpwardRPTracker::isValid() const {
return true;
}
-LLVM_DUMP_METHOD
Printable llvm::print(const GCNRPTracker::LiveRegSet &LiveRegs,
const MachineRegisterInfo &MRI) {
return Printable([&LiveRegs, &MRI](raw_ostream &OS) {
@@ -481,7 +497,163 @@ Printable llvm::print(const GCNRPTracker::LiveRegSet &LiveRegs,
});
}
-LLVM_DUMP_METHOD
void GCNRegPressure::dump() const { dbgs() << print(*this); }
-#endif
+static cl::opt<bool> UseDownwardTracker(
+ "amdgpu-print-rp-downward",
+ cl::desc("Use GCNDownwardRPTracker for GCNRegPressurePrinter pass"),
+ cl::init(false), cl::Hidden);
+
+char llvm::GCNRegPressurePrinter::ID = 0;
+char &llvm::GCNRegPressurePrinterID = GCNRegPressurePrinter::ID;
+
+INITIALIZE_PASS(GCNRegPressurePrinter, "amdgpu-print-rp", "", true, true)
+
+// Return lanemask of Reg's subregs that are live-through at [Begin, End] and
+// are fully covered by Mask.
+static LaneBitmask
+getRegLiveThroughMask(const MachineRegisterInfo &MRI, const LiveIntervals &LIS,
+ Register Reg, SlotIndex Begin, SlotIndex End,
+ LaneBitmask Mask = LaneBitmask::getAll()) {
+
+ auto IsInOneSegment = [Begin, End](const LiveRange &LR) -> bool {
+ auto *Segment = LR.getSegmentContaining(Begin);
+ return Segment && Segment->contains(End);
+ };
+
+ LaneBitmask LiveThroughMask;
+ const LiveInterval &LI = LIS.getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ for (auto &SR : LI.subranges()) {
+ if ((SR.LaneMask & Mask) == SR.LaneMask && IsInOneSegment(SR))
+ LiveThroughMask |= SR.LaneMask;
+ }
+ } else {
+ LaneBitmask RegMask = MRI.getMaxLaneMaskForVReg(Reg);
+ if ((RegMask & Mask) == RegMask && IsInOneSegment(LI))
+ LiveThroughMask = RegMask;
+ }
+
+ return LiveThroughMask;
+}
+
+bool GCNRegPressurePrinter::runOnMachineFunction(MachineFunction &MF) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ const LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+
+ auto &OS = dbgs();
+
+// Leading spaces are important for YAML syntax.
+#define PFX " "
+
+ OS << "---\nname: " << MF.getName() << "\nbody: |\n";
+
+ auto printRP = [](const GCNRegPressure &RP) {
+ return Printable([&RP](raw_ostream &OS) {
+ OS << format(PFX " %-5d", RP.getSGPRNum())
+ << format(" %-5d", RP.getVGPRNum(false));
+ });
+ };
+
+ auto ReportLISMismatchIfAny = [&](const GCNRPTracker::LiveRegSet &TrackedLR,
+ const GCNRPTracker::LiveRegSet &LISLR) {
+ if (LISLR != TrackedLR) {
+ OS << PFX " mis LIS: " << llvm::print(LISLR, MRI)
+ << reportMismatch(LISLR, TrackedLR, TRI, PFX " ");
+ }
+ };
+
+ // Register pressure before and at an instruction (in program order).
+ SmallVector<std::pair<GCNRegPressure, GCNRegPressure>, 16> RP;
+
+ for (auto &MBB : MF) {
+ RP.clear();
+ RP.reserve(MBB.size());
+
+ OS << PFX;
+ MBB.printName(OS);
+ OS << ":\n";
+
+ SlotIndex MBBStartSlot = LIS.getSlotIndexes()->getMBBStartIdx(&MBB);
+ SlotIndex MBBEndSlot = LIS.getSlotIndexes()->getMBBEndIdx(&MBB);
+
+ GCNRPTracker::LiveRegSet LiveIn, LiveOut;
+ GCNRegPressure RPAtMBBEnd;
+
+ if (UseDownwardTracker) {
+ if (MBB.empty()) {
+ LiveIn = LiveOut = getLiveRegs(MBBStartSlot, LIS, MRI);
+ RPAtMBBEnd = getRegPressure(MRI, LiveIn);
+ } else {
+ GCNDownwardRPTracker RPT(LIS);
+ RPT.reset(MBB.front());
+
+ LiveIn = RPT.getLiveRegs();
+
+ while (!RPT.advanceBeforeNext()) {
+ GCNRegPressure RPBeforeMI = RPT.getPressure();
+ RPT.advanceToNext();
+ RP.emplace_back(RPBeforeMI, RPT.getPressure());
+ }
+
+ LiveOut = RPT.getLiveRegs();
+ RPAtMBBEnd = RPT.getPressure();
+ }
+ } else {
+ GCNUpwardRPTracker RPT(LIS);
+ RPT.reset(MRI, MBBEndSlot);
+
+ LiveOut = RPT.getLiveRegs();
+ RPAtMBBEnd = RPT.getPressure();
+
+ for (auto &MI : reverse(MBB)) {
+ RPT.resetMaxPressure();
+ RPT.recede(MI);
+ if (!MI.isDebugInstr())
+ RP.emplace_back(RPT.getPressure(), RPT.getMaxPressure());
+ }
+
+ LiveIn = RPT.getLiveRegs();
+ }
+
+ OS << PFX " Live-in: " << llvm::print(LiveIn, MRI);
+ if (!UseDownwardTracker)
+ ReportLISMismatchIfAny(LiveIn, getLiveRegs(MBBStartSlot, LIS, MRI));
+
+ OS << PFX " SGPR VGPR\n";
+ int I = 0;
+ for (auto &MI : MBB) {
+ if (!MI.isDebugInstr()) {
+ auto &[RPBeforeInstr, RPAtInstr] =
+ RP[UseDownwardTracker ? I : (RP.size() - 1 - I)];
+ ++I;
+ OS << printRP(RPBeforeInstr) << '\n' << printRP(RPAtInstr) << " ";
+ } else
+ OS << PFX " ";
+ MI.print(OS);
+ }
+ OS << printRP(RPAtMBBEnd) << '\n';
+
+ OS << PFX " Live-out:" << llvm::print(LiveOut, MRI);
+ if (UseDownwardTracker)
+ ReportLISMismatchIfAny(LiveOut, getLiveRegs(MBBEndSlot, LIS, MRI));
+
+ GCNRPTracker::LiveRegSet LiveThrough;
+ for (auto [Reg, Mask] : LiveIn) {
+ LaneBitmask MaskIntersection = Mask & LiveOut.lookup(Reg);
+ if (MaskIntersection.any()) {
+ LaneBitmask LTMask = getRegLiveThroughMask(
+ MRI, LIS, Reg, MBBStartSlot, MBBEndSlot, MaskIntersection);
+ if (LTMask.any())
+ LiveThrough[Reg] = LTMask;
+ }
+ }
+ OS << PFX " Live-thr:" << llvm::print(LiveThrough, MRI);
+ OS << printRP(getRegPressure(MRI, LiveThrough)) << '\n';
+ }
+ OS << "...\n";
+ return false;
+
+#undef PFX
+}
\ No newline at end of file
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 72e18acc1b8e..4100970fe1a9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -85,6 +85,18 @@ struct GCNRegPressure {
return !(*this == O);
}
+ GCNRegPressure &operator+=(const GCNRegPressure &RHS) {
+ for (unsigned I = 0; I < TOTAL_KINDS; ++I)
+ Value[I] += RHS.Value[I];
+ return *this;
+ }
+
+ GCNRegPressure &operator-=(const GCNRegPressure &RHS) {
+ for (unsigned I = 0; I < TOTAL_KINDS; ++I)
+ Value[I] -= RHS.Value[I];
+ return *this;
+ }
+
void dump() const;
private:
@@ -105,6 +117,20 @@ inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
return Res;
}
+inline GCNRegPressure operator+(const GCNRegPressure &P1,
+ const GCNRegPressure &P2) {
+ GCNRegPressure Sum = P1;
+ Sum += P2;
+ return Sum;
+}
+
+inline GCNRegPressure operator-(const GCNRegPressure &P1,
+ const GCNRegPressure &P2) {
+ GCNRegPressure Diff = P1;
+ Diff -= P2;
+ return Diff;
+}
+
class GCNRPTracker {
public:
using LiveRegSet = DenseMap<unsigned, LaneBitmask>;
@@ -128,32 +154,55 @@ public:
void clearMaxPressure() { MaxPressure.clear(); }
- // returns MaxPressure, resetting it
- decltype(MaxPressure) moveMaxPressure() {
- auto Res = MaxPressure;
- MaxPressure.clear();
- return Res;
- }
+ GCNRegPressure getPressure() const { return CurPressure; }
decltype(LiveRegs) moveLiveRegs() {
return std::move(LiveRegs);
}
};
+GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI);
+
class GCNUpwardRPTracker : public GCNRPTracker {
public:
GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {}
- // reset tracker to the point just below MI
- // filling live regs upon this point using LIS
- void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr);
+ // reset tracker and set live register set to the specified value.
+ void reset(const MachineRegisterInfo &MRI_, const LiveRegSet &LiveRegs_);
+
+ // reset tracker at the specified slot index.
+ void reset(const MachineRegisterInfo &MRI, SlotIndex SI) {
+ reset(MRI, llvm::getLiveRegs(SI, LIS, MRI));
+ }
+
+ // reset tracker to the end of the MBB.
+ void reset(const MachineBasicBlock &MBB) {
+ reset(MBB.getParent()->getRegInfo(),
+ LIS.getSlotIndexes()->getMBBEndIdx(&MBB));
+ }
+
+ // reset tracker to the point just after MI (in program order).
+ void reset(const MachineInstr &MI) {
+ reset(MI.getMF()->getRegInfo(), LIS.getInstructionIndex(MI).getDeadSlot());
+ }
- // move to the state just above the MI
+ // move to the state just before the MI (in program order).
void recede(const MachineInstr &MI);
// checks whether the tracker's state after receding MI corresponds
- // to reported by LIS
+ // to reported by LIS.
bool isValid() const;
+
+ const GCNRegPressure &getMaxPressure() const { return MaxPressure; }
+
+ void resetMaxPressure() { MaxPressure = CurPressure; }
+
+ GCNRegPressure getMaxPressureAndReset() {
+ GCNRegPressure RP = MaxPressure;
+ resetMaxPressure();
+ return RP;
+ }
};
class GCNDownwardRPTracker : public GCNRPTracker {
@@ -167,6 +216,13 @@ public:
MachineBasicBlock::const_iterator getNext() const { return NextMI; }
+ // Return MaxPressure and clear it.
+ GCNRegPressure moveMaxPressure() {
+ auto Res = MaxPressure;
+ MaxPressure.clear();
+ return Res;
+ }
+
// Reset tracker to the point before the MI
// filling live regs upon this point using LIS.
// Returns false if block is empty except debug values.
@@ -196,8 +252,10 @@ LaneBitmask getLiveLaneMask(unsigned Reg,
const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
-GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
- const LiveIntervals &LIS,
+LaneBitmask getLiveLaneMask(const LiveInterval &LI, SlotIndex SI,
+ const MachineRegisterInfo &MRI);
+
+GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI, const LiveIntervals &LIS,
const MachineRegisterInfo &MRI);
/// creates a map MachineInstr -> LiveRegSet
@@ -275,7 +333,22 @@ Printable print(const GCNRPTracker::LiveRegSet &LiveRegs,
Printable reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
const GCNRPTracker::LiveRegSet &TrackedL,
- const TargetRegisterInfo *TRI);
+ const TargetRegisterInfo *TRI, StringRef Pfx = " ");
+
+struct GCNRegPressurePrinter : public MachineFunctionPass {
+ static char ID;
+
+public:
+ GCNRegPressurePrinter() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
} // end namespace llvm
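
The header changes above replace the single moveMaxPressure() accessor on the upward tracker with reset() overloads, getMaxPressure(), resetMaxPressure() and getMaxPressureAndReset(). A minimal sketch of how the reworked tracker is driven, mirroring the way GCNRegPressurePrinter uses it in GCNRegPressure.cpp; the helper function itself is illustrative, not part of the patch.

    #include "GCNRegPressure.h"
    #include "llvm/ADT/STLExtras.h"
    #include "llvm/CodeGen/LiveIntervals.h"
    #include "llvm/CodeGen/MachineBasicBlock.h"
    #include "llvm/CodeGen/MachineInstr.h"

    using namespace llvm;

    // Compute the maximum register pressure reached anywhere in a basic block
    // by walking it bottom-up, the same way the new printer pass drives it.
    static GCNRegPressure maxPressureForBlock(const MachineBasicBlock &MBB,
                                              const LiveIntervals &LIS) {
      GCNUpwardRPTracker RPT(LIS);
      RPT.reset(MBB);                      // seed LiveRegs with the block live-out
      for (const MachineInstr &MI : reverse(MBB))
        RPT.recede(MI);                    // move to the state just before MI
      return RPT.getMaxPressureAndReset(); // read MaxPressure, then reset it
    }
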
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp
index 99db7e4af9fd..019b64dd871e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp
@@ -101,17 +101,16 @@ private:
/// find new regclass such that:
/// 1. It has subregs obtained by shifting each OldSubReg by RShift number
/// of bits to the right. Every "shifted" subreg should have the same
- /// SubRegRC. SubRegRC can be null, in this case it initialized using
- /// getSubRegisterClass. If CoverSubregIdx is not zero it's a subreg that
- /// "covers" all other subregs in pairs. Basically such subreg becomes a
- /// whole register.
+ /// SubRegRC. If CoverSubregIdx is not zero it's a subreg that "covers"
+ /// all other subregs in pairs. Basically such subreg becomes a whole
+ /// register.
/// 2. Resulting register class contains registers of minimal size but not
/// less than RegNumBits.
///
/// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out
/// parameter:
/// OldSubReg - input parameter,
- /// SubRegRC - in/out, should be changed for unknown regclass,
+ /// SubRegRC - input parameter (cannot be null),
/// NewSubReg - output, contains shifted subregs on return.
const TargetRegisterClass *
getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift,
@@ -228,19 +227,7 @@ GCNRewritePartialRegUses::getRegClassWithShiftedSubregs(
BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign));
for (auto &[OldSubReg, SRI] : SubRegs) {
auto &[SubRegRC, NewSubReg] = SRI;
-
- // Register class may be unknown, for example:
- // undef %0.sub4:sgpr_1024 = S_MOV_B32 01
- // %0.sub5:sgpr_1024 = S_MOV_B32 02
- // %1:vreg_64 = COPY %0.sub4_sub5
- // Register classes for subregs 'sub4' and 'sub5' are known from the
- // description of destination operand of S_MOV_B32 instruction but the
- // class for the subreg 'sub4_sub5' isn't specified by the COPY instruction.
- if (!SubRegRC)
- SubRegRC = TRI->getSubRegisterClass(RC, OldSubReg);
-
- if (!SubRegRC)
- return nullptr;
+ assert(SubRegRC);
LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(OldSubReg) << ':'
<< TRI->getRegClassName(SubRegRC)
@@ -248,6 +235,8 @@ GCNRewritePartialRegUses::getRegClassWithShiftedSubregs(
<< " -> ");
if (OldSubReg == CoverSubregIdx) {
+ // Covering subreg will become a full register, RC should be allocatable.
+ assert(SubRegRC->isAllocatable());
NewSubReg = AMDGPU::NoSubRegister;
LLVM_DEBUG(dbgs() << "whole reg");
} else {
@@ -421,33 +410,42 @@ GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const {
bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const {
auto Range = MRI->reg_nodbg_operands(Reg);
- if (Range.begin() == Range.end())
+ if (Range.empty() || any_of(Range, [](MachineOperand &MO) {
+ return MO.getSubReg() == AMDGPU::NoSubRegister; // Whole reg used. [1]
+ }))
return false;
- for (MachineOperand &MO : Range) {
- if (MO.getSubReg() == AMDGPU::NoSubRegister) // Whole reg used, quit.
- return false;
- }
-
auto *RC = MRI->getRegClass(Reg);
LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI)
<< ':' << TRI->getRegClassName(RC) << '\n');
- // Collect used subregs and constrained reg classes infered from instruction
+  // Collect used subregs and their reg classes inferred from instruction
// operands.
SubRegMap SubRegs;
- for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
- assert(MO.getSubReg() != AMDGPU::NoSubRegister);
- auto *OpDescRC = getOperandRegClass(MO);
- const auto [I, Inserted] = SubRegs.try_emplace(MO.getSubReg(), OpDescRC);
- if (!Inserted && OpDescRC) {
- SubRegInfo &SRI = I->second;
- SRI.RC = SRI.RC ? TRI->getCommonSubClass(SRI.RC, OpDescRC) : OpDescRC;
- if (!SRI.RC) {
- LLVM_DEBUG(dbgs() << " Couldn't find common target regclass\n");
- return false;
+ for (MachineOperand &MO : Range) {
+ const unsigned SubReg = MO.getSubReg();
+ assert(SubReg != AMDGPU::NoSubRegister); // Due to [1].
+ LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(SubReg) << ':');
+
+ const auto [I, Inserted] = SubRegs.try_emplace(SubReg);
+ const TargetRegisterClass *&SubRegRC = I->second.RC;
+
+ if (Inserted)
+ SubRegRC = TRI->getSubRegisterClass(RC, SubReg);
+
+ if (SubRegRC) {
+ if (const TargetRegisterClass *OpDescRC = getOperandRegClass(MO)) {
+ LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << " & "
+ << TRI->getRegClassName(OpDescRC) << " = ");
+ SubRegRC = TRI->getCommonSubClass(SubRegRC, OpDescRC);
}
}
+
+ if (!SubRegRC) {
+ LLVM_DEBUG(dbgs() << "couldn't find target regclass\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << TRI->getRegClassName(SubRegRC) << '\n');
}
auto *NewRC = getMinSizeReg(RC, SubRegs);
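The rewritten loop above narrows each used subregister's class by intersecting the class implied by the whole register with whatever class the instruction operand demands, and bails out if the intersection is empty. A minimal standalone sketch of that narrowing step, with register classes modeled as plain bitsets (all types and names here are illustrative, not LLVM API):

    #include <bitset>
    #include <map>
    #include <optional>

    using RegClassMask = std::bitset<32>; // one bit per candidate register

    // Intersect the class derived from the containing register with the class
    // an operand imposes; an empty result means no register satisfies both.
    std::optional<RegClassMask> narrow(RegClassMask FromWholeReg,
                                       RegClassMask FromOperand) {
      RegClassMask Common = FromWholeReg & FromOperand;
      if (Common.none())
        return std::nullopt; // mirrors "couldn't find target regclass" above
      return Common;
    }

    bool collectSubRegClasses(const std::map<int, RegClassMask> &OperandUses,
                              RegClassMask WholeRegSubClass,
                              std::map<int, RegClassMask> &SubRegs) {
      for (const auto &[SubReg, OperandRC] : OperandUses) {
        auto It = SubRegs.try_emplace(SubReg, WholeRegSubClass).first;
        auto Common = narrow(It->second, OperandRC);
        if (!Common)
          return false; // keep the original register, as rewriteReg() does
        It->second = *Common;
      }
      return true;
    }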
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 994cfea1fd7d..342d518f38bf 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -32,12 +32,18 @@
using namespace llvm;
-static cl::opt<bool>
- DisableUnclusterHighRP("amdgpu-disable-unclustred-high-rp-reschedule",
- cl::Hidden,
- cl::desc("Disable unclustred high register pressure "
- "reduction scheduling stage."),
- cl::init(false));
+static cl::opt<bool> DisableUnclusterHighRP(
+ "amdgpu-disable-unclustered-high-rp-reschedule", cl::Hidden,
+ cl::desc("Disable unclustered high register pressure "
+ "reduction scheduling stage."),
+ cl::init(false));
+
+static cl::opt<bool> DisableClusteredLowOccupancy(
+ "amdgpu-disable-clustered-low-occupancy-reschedule", cl::Hidden,
+ cl::desc("Disable clustered low occupancy "
+ "rescheduling for ILP scheduling stage."),
+ cl::init(false));
+
static cl::opt<unsigned> ScheduleMetricBias(
"amdgpu-schedule-metric-bias", cl::Hidden,
cl::desc(
@@ -707,7 +713,7 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
return false;
SavedMutations.swap(DAG.Mutations);
- DAG.addMutation(createIGroupLPDAGMutation());
+ DAG.addMutation(createIGroupLPDAGMutation(/*IsPostRA=*/false));
InitialOccupancy = DAG.MinOccupancy;
   // Aggressively try to reduce register pressure in the unclustered high RP
@@ -727,6 +733,9 @@ bool UnclusteredHighRPStage::initGCNSchedStage() {
}
bool ClusteredLowOccStage::initGCNSchedStage() {
+ if (DisableClusteredLowOccupancy)
+ return false;
+
if (!GCNSchedStage::initGCNSchedStage())
return false;
@@ -844,7 +853,9 @@ bool GCNSchedStage::initGCNRegion() {
StageID != GCNSchedStageID::UnclusteredHighRPReschedule) {
SavedMutations.clear();
SavedMutations.swap(DAG.Mutations);
- DAG.addMutation(createIGroupLPDAGMutation());
+ bool IsInitialStage = StageID == GCNSchedStageID::OccInitialSchedule ||
+ StageID == GCNSchedStageID::ILPInitialSchedule;
+ DAG.addMutation(createIGroupLPDAGMutation(/*IsReentry=*/!IsInitialStage));
}
return true;
@@ -1116,7 +1127,7 @@ bool OccInitialScheduleStage::shouldRevertScheduling(unsigned WavesAfter) {
}
bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
- // If RP is not reduced in the unclustred reschedule stage, revert to the
+ // If RP is not reduced in the unclustered reschedule stage, revert to the
// old schedule.
if ((WavesAfter <= PressureBefore.getOccupancy(ST) &&
mayCauseSpilling(WavesAfter)) ||
@@ -1558,7 +1569,7 @@ void GCNPostScheduleDAGMILive::schedule() {
if (HasIGLPInstrs) {
SavedMutations.clear();
SavedMutations.swap(Mutations);
- addMutation(createIGroupLPDAGMutation());
+ addMutation(createIGroupLPDAGMutation(/*IsReentry=*/true));
}
ScheduleDAGMI::schedule();
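Both new options above are ordinary hidden cl::opt booleans, so they can be flipped from the tool command line (for example llc ... -amdgpu-disable-clustered-low-occupancy-reschedule). A hedged sketch of the same pattern with a hypothetical flag and stage, not the actual GCNSchedStrategy code:

    #include "llvm/Support/CommandLine.h"

    // Hypothetical flag following the pattern of the options added above.
    static llvm::cl::opt<bool> DisableExampleStage(
        "amdgpu-disable-example-reschedule", llvm::cl::Hidden,
        llvm::cl::desc("Disable the (hypothetical) example rescheduling stage."),
        llvm::cl::init(false));

    // The stage's init hook bails out early when the flag is set, just as
    // ClusteredLowOccStage::initGCNSchedStage() now does.
    static bool initExampleStage() {
      if (DisableExampleStage)
        return false;
      return true; // real stage setup would continue here
    }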
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index ef5470df876d..91a709303269 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -22,6 +22,7 @@
#include "SIInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/Support/ErrorHandling.h"
#define GET_SUBTARGETINFO_HEADER
#include "AMDGPUGenSubtargetInfo.inc"
@@ -77,6 +78,7 @@ protected:
bool UnalignedAccessMode = false;
bool HasApertureRegs = false;
bool SupportsXNACK = false;
+ bool KernargPreload = false;
// This should not be used directly. 'TargetID' tracks the dynamic settings
// for XNACK.
@@ -105,6 +107,7 @@ protected:
bool GFX940Insts = false;
bool GFX10Insts = false;
bool GFX11Insts = false;
+ bool GFX12Insts = false;
bool GFX10_3Insts = false;
bool GFX7GFX8GFX9Insts = false;
bool SGPRInitBug = false;
@@ -116,6 +119,7 @@ protected:
bool HasFmaMixInsts = false;
bool HasMovrel = false;
bool HasVGPRIndexMode = false;
+ bool HasScalarDwordx3Loads = false;
bool HasScalarStores = false;
bool HasScalarAtomics = false;
bool HasSDWAOmod = false;
@@ -125,7 +129,8 @@ protected:
bool HasSDWAOutModsVOPC = false;
bool HasDPP = false;
bool HasDPP8 = false;
- bool Has64BitDPP = false;
+ bool HasDPALU_DPP = false;
+ bool HasDPPSrc1SGPR = false;
bool HasPackedFP32Ops = false;
bool HasImageInsts = false;
bool HasExtendedImageInsts = false;
@@ -157,6 +162,7 @@ protected:
bool HasAtomicFaddNoRtnInsts = false;
bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
bool HasAtomicBufferGlobalPkAddF16Insts = false;
+ bool HasAtomicCSubNoRtnInsts = false;
bool HasAtomicGlobalPkAddBF16Inst = false;
bool HasFlatAtomicFaddF32Inst = false;
bool SupportsSRAMECC = false;
@@ -180,6 +186,8 @@ protected:
bool HasArchitectedFlatScratch = false;
bool EnableFlatScratch = false;
bool HasArchitectedSGPRs = false;
+ bool HasGDS = false;
+ bool HasGWS = false;
bool AddNoCarryInsts = false;
bool HasUnpackedD16VMem = false;
bool LDSMisalignedBug = false;
@@ -188,6 +196,10 @@ protected:
bool UnalignedDSAccess = false;
bool HasPackedTID = false;
bool ScalarizeGlobal = false;
+ bool HasSALUFloatInsts = false;
+ bool HasVGPRSingleUseHintInsts = false;
+ bool HasPseudoScalarTrans = false;
+ bool HasRestrictedSOffset = false;
bool HasVcmpxPermlaneHazard = false;
bool HasVMEMtoScalarWriteHazard = false;
@@ -201,6 +213,7 @@ protected:
bool HasFlatSegmentOffsetBug = false;
bool HasImageStoreD16Bug = false;
bool HasImageGather4D16Bug = false;
+ bool HasMSAALoadDstSelBug = false;
bool HasGFX11FullVGPRs = false;
bool HasMADIntraFwdBug = false;
bool HasVOPDInsts = false;
@@ -667,6 +680,8 @@ public:
return AddNoCarryInsts;
}
+ bool hasScalarAddSub64() const { return getGeneration() >= GFX12; }
+
bool hasUnpackedD16VMem() const {
return HasUnpackedD16VMem;
}
@@ -818,6 +833,11 @@ public:
bool hasInstPrefetch() const { return getGeneration() >= GFX10; }
+ bool hasPrefetch() const { return GFX12Insts; }
+
+ // Has s_cmpk_* instructions.
+ bool hasSCmpK() const { return getGeneration() < GFX12; }
+
// Scratch is allocated in 256 dword per wave blocks for the entire
// wavefront. When viewed from the perspective of an arbitrary workitem, this
// is 4-byte aligned.
@@ -853,7 +873,7 @@ public:
unsigned NumRegionInstrs) const override;
unsigned getMaxNumUserSGPRs() const {
- return 16;
+ return AMDGPU::getMaxNumUserSGPRs(*this);
}
bool hasSMemRealTime() const {
@@ -874,6 +894,8 @@ public:
return getGeneration() >= VOLCANIC_ISLANDS;
}
+ bool hasScalarDwordx3Loads() const { return HasScalarDwordx3Loads; }
+
bool hasScalarStores() const {
return HasScalarStores;
}
@@ -906,14 +928,21 @@ public:
return HasDPP8;
}
- bool has64BitDPP() const {
- return Has64BitDPP;
+ bool hasDPALU_DPP() const {
+ return HasDPALU_DPP;
}
+ bool hasDPPSrc1SGPR() const { return HasDPPSrc1SGPR; }
+
bool hasPackedFP32Ops() const {
return HasPackedFP32Ops;
}
+ // Has V_PK_MOV_B32 opcode
+ bool hasPkMovB32() const {
+ return GFX90AInsts;
+ }
+
bool hasFmaakFmamkF32Insts() const {
return getGeneration() >= GFX10 || hasGFX940Insts();
}
@@ -944,11 +973,15 @@ public:
bool hasMADIntraFwdBug() const { return HasMADIntraFwdBug; }
+ bool hasMSAALoadDstSelBug() const { return HasMSAALoadDstSelBug; }
+
bool hasNSAEncoding() const { return HasNSAEncoding; }
bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
- unsigned getNSAMaxSize() const { return AMDGPU::getNSAMaxSize(*this); }
+ unsigned getNSAMaxSize(bool HasSampler = false) const {
+ return AMDGPU::getNSAMaxSize(*this, HasSampler);
+ }
bool hasGFX10_AEncoding() const {
return GFX10_AEncoding;
@@ -1127,6 +1160,14 @@ public:
// hasGFX90AInsts is also true.
bool hasGFX940Insts() const { return GFX940Insts; }
+ bool hasSALUFloatInsts() const { return HasSALUFloatInsts; }
+
+ bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; }
+
+ bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; }
+
+ bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -1155,6 +1196,12 @@ public:
/// \returns true if the architected SGPRs are enabled.
bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
+ /// \returns true if Global Data Share is supported.
+ bool hasGDS() const { return HasGDS; }
+
+ /// \returns true if Global Wave Sync is supported.
+ bool hasGWS() const { return HasGWS; }
+
/// \returns true if the machine has merged shaders in which s0-s7 are
/// reserved by the hardware and user SGPRs start at s8
bool hasMergedShaders() const {
@@ -1164,6 +1211,37 @@ public:
// \returns true if the target supports the pre-NGG legacy geometry path.
bool hasLegacyGeometry() const { return getGeneration() < GFX11; }
+ // \returns true if preloading kernel arguments is supported.
+ bool hasKernargPreload() const { return KernargPreload; }
+
+ // \returns true if we need to generate backwards compatible code when
+ // preloading kernel arguments.
+ bool needsKernargPreloadBackwardsCompatibility() const {
+ return hasKernargPreload() && !hasGFX940Insts();
+ }
+
+ // \returns true if the target has split barriers feature
+ bool hasSplitBarriers() const { return getGeneration() >= GFX12; }
+
+ // \returns true if FP8/BF8 VOP1 form of conversion to F32 is unreliable.
+ bool hasCvtFP8VOP1Bug() const { return true; }
+
+ // \returns true if CSUB (a.k.a. SUB_CLAMP on GFX12) atomics support a
+ // no-return form.
+ bool hasAtomicCSubNoRtnInsts() const { return HasAtomicCSubNoRtnInsts; }
+
+ // \returns true if the target has DX10_CLAMP kernel descriptor mode bit
+ bool hasDX10ClampMode() const { return getGeneration() < GFX12; }
+
+ // \returns true if the target has IEEE kernel descriptor mode bit
+ bool hasIEEEMode() const { return getGeneration() < GFX12; }
+
+ // \returns true if the target has IEEE fminimum/fmaximum instructions
+ bool hasIEEEMinMax() const { return getGeneration() >= GFX12; }
+
+ // \returns true if the target has WG_RR_MODE kernel descriptor mode bit
+ bool hasRrWGMode() const { return getGeneration() >= GFX12; }
+
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
@@ -1362,6 +1440,91 @@ public:
}
};
+class GCNUserSGPRUsageInfo {
+public:
+ bool hasImplicitBufferPtr() const { return ImplicitBufferPtr; }
+
+ bool hasPrivateSegmentBuffer() const { return PrivateSegmentBuffer; }
+
+ bool hasDispatchPtr() const { return DispatchPtr; }
+
+ bool hasQueuePtr() const { return QueuePtr; }
+
+ bool hasKernargSegmentPtr() const { return KernargSegmentPtr; }
+
+ bool hasDispatchID() const { return DispatchID; }
+
+ bool hasFlatScratchInit() const { return FlatScratchInit; }
+
+ unsigned getNumKernargPreloadSGPRs() const { return NumKernargPreloadSGPRs; }
+
+ unsigned getNumUsedUserSGPRs() const { return NumUsedUserSGPRs; }
+
+ unsigned getNumFreeUserSGPRs();
+
+ void allocKernargPreloadSGPRs(unsigned NumSGPRs);
+
+ enum UserSGPRID : unsigned {
+ ImplicitBufferPtrID = 0,
+ PrivateSegmentBufferID = 1,
+ DispatchPtrID = 2,
+ QueuePtrID = 3,
+ KernargSegmentPtrID = 4,
+ DispatchIdID = 5,
+ FlatScratchInitID = 6,
+ PrivateSegmentSizeID = 7
+ };
+
+ // Returns the size in number of SGPRs for preload user SGPR field.
+ static unsigned getNumUserSGPRForField(UserSGPRID ID) {
+ switch (ID) {
+ case ImplicitBufferPtrID:
+ return 2;
+ case PrivateSegmentBufferID:
+ return 4;
+ case DispatchPtrID:
+ return 2;
+ case QueuePtrID:
+ return 2;
+ case KernargSegmentPtrID:
+ return 2;
+ case DispatchIdID:
+ return 2;
+ case FlatScratchInitID:
+ return 2;
+ case PrivateSegmentSizeID:
+ return 1;
+ }
+ llvm_unreachable("Unknown UserSGPRID.");
+ }
+
+ GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST);
+
+private:
+ const GCNSubtarget &ST;
+
+ // Private memory buffer
+ // Compute directly in sgpr[0:1]
+ // Other shaders indirect 64-bits at sgpr[0:1]
+ bool ImplicitBufferPtr = false;
+
+ bool PrivateSegmentBuffer = false;
+
+ bool DispatchPtr = false;
+
+ bool QueuePtr = false;
+
+ bool KernargSegmentPtr = false;
+
+ bool DispatchID = false;
+
+ bool FlatScratchInit = false;
+
+ unsigned NumKernargPreloadSGPRs = 0;
+
+ unsigned NumUsedUserSGPRs = 0;
+};
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
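A hedged usage sketch for the new GCNUserSGPRUsageInfo helper declared above; only methods from the declaration are called, while the call site, the 4-SGPR budget, and the headers being reachable are assumptions:

    #include "GCNSubtarget.h" // in-tree AMDGPU header, assumed reachable
    #include "llvm/IR/Function.h"
    #include <algorithm>

    static void planKernargPreload(const llvm::Function &F,
                                   const llvm::GCNSubtarget &ST) {
      llvm::GCNUserSGPRUsageInfo UserSGPRInfo(F, ST);

      // SGPRs already committed to user-SGPR fields (queue ptr, kernarg ptr, ...).
      unsigned Used = UserSGPRInfo.getNumUsedUserSGPRs();
      // Whatever is left can be handed to kernel-argument preloading.
      unsigned Free = UserSGPRInfo.getNumFreeUserSGPRs();

      // Assume we would like to preload two 64-bit kernel arguments (4 SGPRs).
      UserSGPRInfo.allocKernargPreloadSGPRs(std::min(4u, Free));
      (void)Used;
    }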
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 29c9b9ccf276..33c208495c50 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -103,7 +103,13 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
return false;
if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
return false;
- if (InstInfo.hasInvalidOperand(getVRegIdx))
+
+  // On GFX12 if both OpX and OpY are V_MOV_B32 then OpY uses SRC2 source-cache.
+ bool SkipSrc = ST.getGeneration() >= AMDGPUSubtarget::GFX12 &&
+ FirstMI.getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ SecondMI.getOpcode() == AMDGPU::V_MOV_B32_e32;
+
+ if (InstInfo.hasInvalidOperand(getVRegIdx, SkipSrc))
return false;
LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI
@@ -142,10 +148,10 @@ namespace {
/// be turned into VOPD instructions
/// Greedily pairs instruction candidates. O(n^2) algorithm.
struct VOPDPairingMutation : ScheduleDAGMutation {
- ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer
+ MacroFusionPredTy shouldScheduleAdjacent; // NOLINT: function pointer
VOPDPairingMutation(
- ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer
+ MacroFusionPredTy shouldScheduleAdjacent) // NOLINT: function pointer
: shouldScheduleAdjacent(shouldScheduleAdjacent) {}
void apply(ScheduleDAGInstrs *DAG) override {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index a1f8be403c44..c8ce1903d315 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -13,7 +13,7 @@
#include "AMDGPUCustomBehaviour.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIInstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/WithColor.h"
@@ -25,10 +25,12 @@ void AMDGPUInstrPostProcess::postProcessInstruction(
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
switch (MCI.getOpcode()) {
case AMDGPU::S_WAITCNT:
+ case AMDGPU::S_WAITCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT:
case AMDGPU::S_WAITCNT_LGKMCNT:
case AMDGPU::S_WAITCNT_VMCNT:
case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VSCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
@@ -77,10 +79,12 @@ unsigned AMDGPUCustomBehaviour::checkCustomHazard(ArrayRef<InstRef> IssuedInst,
default:
return 0;
case AMDGPU::S_WAITCNT: // This instruction
+ case AMDGPU::S_WAITCNT_soft:
case AMDGPU::S_WAITCNT_EXPCNT:
case AMDGPU::S_WAITCNT_LGKMCNT:
case AMDGPU::S_WAITCNT_VMCNT:
- case AMDGPU::S_WAITCNT_VSCNT: // to this instruction are all pseudo.
+ case AMDGPU::S_WAITCNT_VSCNT:
+ case AMDGPU::S_WAITCNT_VSCNT_soft: // to this instruction are all pseudo.
case AMDGPU::S_WAITCNT_EXPCNT_gfx10:
case AMDGPU::S_WAITCNT_LGKMCNT_gfx10:
case AMDGPU::S_WAITCNT_VMCNT_gfx10:
@@ -317,13 +321,15 @@ bool AMDGPUCustomBehaviour::hasModifiersSet(
return true;
}
+// taken from SIInstrInfo::isGWS()
+bool AMDGPUCustomBehaviour::isGWS(uint16_t Opcode) const {
+ const MCInstrDesc &MCID = MCII.get(Opcode);
+ return MCID.TSFlags & SIInstrFlags::GWS;
+}
+
// taken from SIInstrInfo::isAlwaysGDS()
bool AMDGPUCustomBehaviour::isAlwaysGDS(uint16_t Opcode) const {
- return Opcode == AMDGPU::DS_ORDERED_COUNT || Opcode == AMDGPU::DS_GWS_INIT ||
- Opcode == AMDGPU::DS_GWS_SEMA_V || Opcode == AMDGPU::DS_GWS_SEMA_BR ||
- Opcode == AMDGPU::DS_GWS_SEMA_P ||
- Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
- Opcode == AMDGPU::DS_GWS_BARRIER;
+ return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}
} // namespace mca
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
index cb1436d319c9..3a231758887b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
@@ -68,6 +68,8 @@ class AMDGPUCustomBehaviour : public CustomBehaviour {
bool hasModifiersSet(const std::unique_ptr<Instruction> &Inst,
unsigned OpName) const;
/// Helper function used in generateWaitCntInfo()
+ bool isGWS(uint16_t Opcode) const;
+ /// Helper function used in generateWaitCntInfo()
bool isAlwaysGDS(uint16_t Opcode) const;
/// Helper function used in generateWaitCntInfo()
bool isVMEM(const MCInstrDesc &MCID);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 44109b9d2919..f91f36ed851b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -28,7 +28,7 @@ namespace {
class AMDGPUAsmBackend : public MCAsmBackend {
public:
- AMDGPUAsmBackend(const Target &T) : MCAsmBackend(support::little) {}
+ AMDGPUAsmBackend(const Target &T) : MCAsmBackend(llvm::endianness::little) {}
unsigned getNumFixupKinds() const override { return AMDGPU::NumTargetFixupKinds; };
@@ -53,7 +53,8 @@ public:
std::optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
- const MCValue &Target) override;
+ const MCValue &Target,
+ const MCSubtargetInfo *STI) override;
};
} //End anonymous namespace
@@ -185,12 +186,15 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
return Infos[Kind - FirstTargetFixupKind];
}
bool AMDGPUAsmBackend::shouldForceRelocation(const MCAssembler &,
const MCFixup &Fixup,
- const MCValue &) {
+ const MCValue &,
+ const MCSubtargetInfo *STI) {
return Fixup.getKind() >= FirstLiteralRelocationKind;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 3f188478ca8b..58eed81e0755 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -63,6 +63,10 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AMDGPU_REL32_HI;
case MCSymbolRefExpr::VK_AMDGPU_REL64:
return ELF::R_AMDGPU_REL64;
+ case MCSymbolRefExpr::VK_AMDGPU_ABS32_LO:
+ return ELF::R_AMDGPU_ABS32_LO;
+ case MCSymbolRefExpr::VK_AMDGPU_ABS32_HI:
+ return ELF::R_AMDGPU_ABS32_HI;
}
MCFixupKind Kind = Fixup.getKind();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index ad55c73b22ea..edc244db613d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -10,13 +10,13 @@
#include "AMDGPUInstPrinter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
-#include "SIRegisterInfo.h"
#include "Utils/AMDGPUAsmUtils.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/TargetParser/TargetParser.h"
@@ -24,12 +24,6 @@
using namespace llvm;
using namespace llvm::AMDGPU;
-static cl::opt<bool> Keep16BitSuffixes(
- "amdgpu-keep-16-bit-reg-suffixes",
- cl::desc("Keep .l and .h suffixes in asm for debugging purposes"),
- cl::init(false),
- cl::ReallyHidden);
-
void AMDGPUInstPrinter::printRegName(raw_ostream &OS, MCRegister Reg) const {
// FIXME: The current implementation of
// AsmParser::parseRegisterOrRegisterNumber in MC implies we either emit this
@@ -103,28 +97,36 @@ void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- uint16_t Imm = MI->getOperand(OpNo).getImm();
+ uint32_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
O << " offset:";
- printU16ImmDecOperand(MI, OpNo, O);
+
+ // GFX12 uses a 24-bit signed offset for VBUFFER.
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
+ bool IsVBuffer = Desc.TSFlags & (SIInstrFlags::MUBUF | SIInstrFlags::MTBUF);
+ if (AMDGPU::isGFX12(STI) && IsVBuffer)
+ O << formatDec(SignExtend32<24>(Imm));
+ else
+ printU16ImmDecOperand(MI, OpNo, O);
}
}
void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- uint16_t Imm = MI->getOperand(OpNo).getImm();
+ uint32_t Imm = MI->getOperand(OpNo).getImm();
if (Imm != 0) {
O << " offset:";
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- bool IsFlatSeg = !(Desc.TSFlags &
- (SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch));
+ bool AllowNegative = (Desc.TSFlags & (SIInstrFlags::FlatGlobal |
+ SIInstrFlags::FlatScratch)) ||
+ AMDGPU::isGFX12(STI);
- if (IsFlatSeg) // Unsigned offset
- printU16ImmDecOperand(MI, OpNo, O);
- else // Signed offset
+ if (AllowNegative) // Signed offset
O << formatDec(SignExtend32(Imm, AMDGPU::getNumFlatOffsetBits(STI)));
+ else // Unsigned offset
+ printU16ImmDecOperand(MI, OpNo, O);
}
}
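The signed paths above print the offset by replicating the top bit of the field; for the new 24-bit GFX12 VBUFFER case that is SignExtend32<24>. A small worked example (SignExtend32 is from llvm/Support/MathExtras.h; the concrete offsets are chosen for illustration):

    #include "llvm/Support/MathExtras.h"
    #include <cassert>

    int main() {
      // Bit 23 clear: the 24-bit field is printed as-is.
      assert(llvm::SignExtend32<24>(0x000FFF) == 4095);
      // Bit 23 set: 0xFFF000 sign-extends to -4096, printed as "offset:-4096".
      assert(llvm::SignExtend32<24>(0xFFF000) == -4096);
      return 0;
    }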
@@ -174,6 +176,17 @@ void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
auto Imm = MI->getOperand(OpNo).getImm();
+
+ if (AMDGPU::isGFX12Plus(STI)) {
+ const int64_t TH = Imm & CPol::TH;
+ const int64_t Scope = Imm & CPol::SCOPE;
+
+ printTH(MI, TH, Scope, O);
+ printScope(Scope, O);
+
+ return;
+ }
+
if (Imm & CPol::GLC)
O << ((AMDGPU::isGFX940(STI) &&
!(MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::SMRD)) ? " sc0"
@@ -188,6 +201,89 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
O << " /* unexpected cache policy bit */";
}
+void AMDGPUInstPrinter::printTH(const MCInst *MI, int64_t TH, int64_t Scope,
+ raw_ostream &O) {
+ // For th = 0 do not print this field
+ if (TH == 0)
+ return;
+
+ const unsigned Opcode = MI->getOpcode();
+ const MCInstrDesc &TID = MII.get(Opcode);
+ bool IsStore = TID.mayStore();
+ bool IsAtomic =
+ TID.TSFlags & (SIInstrFlags::IsAtomicNoRet | SIInstrFlags::IsAtomicRet);
+
+ O << " th:";
+
+ if (IsAtomic) {
+ O << "TH_ATOMIC_";
+ if (TH & AMDGPU::CPol::TH_ATOMIC_CASCADE) {
+ if (Scope >= AMDGPU::CPol::SCOPE_DEV)
+ O << "CASCADE" << (TH & AMDGPU::CPol::TH_ATOMIC_NT ? "_NT" : "_RT");
+ else
+ O << formatHex(TH);
+ } else if (TH & AMDGPU::CPol::TH_ATOMIC_NT)
+ O << "NT" << (TH & AMDGPU::CPol::TH_ATOMIC_RETURN ? "_RETURN" : "");
+ else if (TH & AMDGPU::CPol::TH_ATOMIC_RETURN)
+ O << "RETURN";
+ else
+ O << formatHex(TH);
+ } else {
+ if (!IsStore && TH == AMDGPU::CPol::TH_RESERVED)
+ O << formatHex(TH);
+ else {
+ // This will default to printing load variants when neither MayStore nor
+ // MayLoad flag is present which is the case with instructions like
+ // image_get_resinfo.
+ O << (IsStore ? "TH_STORE_" : "TH_LOAD_");
+ switch (TH) {
+ case AMDGPU::CPol::TH_NT:
+ O << "NT";
+ break;
+ case AMDGPU::CPol::TH_HT:
+ O << "HT";
+ break;
+ case AMDGPU::CPol::TH_BYPASS: // or LU or RT_WB
+ O << (Scope == AMDGPU::CPol::SCOPE_SYS ? "BYPASS"
+ : (IsStore ? "RT_WB" : "LU"));
+ break;
+ case AMDGPU::CPol::TH_NT_RT:
+ O << "NT_RT";
+ break;
+ case AMDGPU::CPol::TH_RT_NT:
+ O << "RT_NT";
+ break;
+ case AMDGPU::CPol::TH_NT_HT:
+ O << "NT_HT";
+ break;
+ case AMDGPU::CPol::TH_NT_WB:
+ O << "NT_WB";
+ break;
+ default:
+ llvm_unreachable("unexpected th value");
+ }
+ }
+ }
+}
+
+void AMDGPUInstPrinter::printScope(int64_t Scope, raw_ostream &O) {
+ if (Scope == CPol::SCOPE_CU)
+ return;
+
+ O << " scope:";
+
+ if (Scope == CPol::SCOPE_SE)
+ O << "SCOPE_SE";
+ else if (Scope == CPol::SCOPE_DEV)
+ O << "SCOPE_DEV";
+ else if (Scope == CPol::SCOPE_SYS)
+ O << "SCOPE_SYS";
+ else
+ llvm_unreachable("unexpected scope policy value");
+
+ return;
+}
+
void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
@@ -278,12 +374,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
}
#endif
- StringRef RegName(getRegisterName(RegNo));
- if (!Keep16BitSuffixes)
- if (!RegName.consume_back(".l"))
- RegName.consume_back(".h");
-
- O << RegName;
+ O << getRegisterName(RegNo);
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
@@ -333,6 +424,15 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx11:
case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx11:
case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx11:
+ case AMDGPU::V_ADD_CO_CI_U32_e32_gfx12:
+ case AMDGPU::V_SUB_CO_CI_U32_e32_gfx12:
+ case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx12:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx12:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx12:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx12:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx12:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx12:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx12:
printDefaultVccOperand(false, STI, O);
break;
}
@@ -437,7 +537,7 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
const MCSubtargetInfo &STI,
- raw_ostream &O) {
+ raw_ostream &O, bool IsFP) {
int64_t SImm = static_cast<int64_t>(Imm);
if (SImm >= -16 && SImm <= 64) {
O << SImm;
@@ -465,7 +565,10 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
else if (Imm == 0x3fc45f306dc9c882 &&
STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
O << "0.15915494309189532";
- else {
+ else if (IsFP) {
+ assert(AMDGPU::isValid32BitLiteral(Imm, true));
+ O << formatHex(static_cast<uint64_t>(Hi_32(Imm)));
+ } else {
assert(isUInt<32>(Imm) || isInt<32>(Imm));
// In rare situations, we will have a 32-bit literal in a 64-bit
@@ -532,21 +635,15 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
void AMDGPUInstPrinter::printWaitVDST(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- uint8_t Imm = MI->getOperand(OpNo).getImm();
- if (Imm != 0) {
- O << " wait_vdst:";
- printU4ImmDecOperand(MI, OpNo, O);
- }
+ O << " wait_vdst:";
+ printU4ImmDecOperand(MI, OpNo, O);
}
void AMDGPUInstPrinter::printWaitEXP(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- uint8_t Imm = MI->getOperand(OpNo).getImm();
- if (Imm != 0) {
- O << " wait_exp:";
- printU4ImmDecOperand(MI, OpNo, O);
- }
+ O << " wait_exp:";
+ printU4ImmDecOperand(MI, OpNo, O);
}
bool AMDGPUInstPrinter::needsImpliedVcc(const MCInstrDesc &Desc,
@@ -619,14 +716,17 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case MCOI::OPERAND_IMMEDIATE:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
printImmediate32(Op.getImm(), STI, O);
break;
case AMDGPU::OPERAND_REG_IMM_INT64:
- case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ printImmediate64(Op.getImm(), STI, O, false);
+ break;
+ case AMDGPU::OPERAND_REG_IMM_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
- printImmediate64(Op.getImm(), STI, O);
+ printImmediate64(Op.getImm(), STI, O, true);
break;
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
@@ -688,7 +788,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
if (RCBits == 32)
printImmediate32(llvm::bit_cast<uint32_t>((float)Value), STI, O);
else if (RCBits == 64)
- printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O);
+ printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O, true);
else
llvm_unreachable("Invalid register class size");
}
@@ -725,6 +825,18 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx11:
case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx11:
case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx11:
+ case AMDGPU::V_CNDMASK_B32_e32_gfx12:
+ case AMDGPU::V_ADD_CO_CI_U32_e32_gfx12:
+ case AMDGPU::V_SUB_CO_CI_U32_e32_gfx12:
+ case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx12:
+ case AMDGPU::V_CNDMASK_B32_dpp_gfx12:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx12:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx12:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx12:
+ case AMDGPU::V_CNDMASK_B32_dpp8_gfx12:
+ case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx12:
+ case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx12:
+ case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx12:
case AMDGPU::V_CNDMASK_B32_e32_gfx6_gfx7:
case AMDGPU::V_CNDMASK_B32_e32_vi:
@@ -846,13 +958,9 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
unsigned Imm = MI->getOperand(OpNo).getImm();
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
- int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
- AMDGPU::OpName::src0);
- if (Src0Idx >= 0 &&
- Desc.operands()[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID &&
- !AMDGPU::isLegal64BitDPPControl(Imm)) {
- O << " /* 64 bit dpp only supports row_newbcast */";
+ if (!AMDGPU::isLegalDPALU_DPPControl(Imm) && AMDGPU::isDPALU_DPP(Desc)) {
+ O << " /* DP ALU dpp only supports row_newbcast */";
return;
} else if (Imm <= DppCtrl::QUAD_PERM_LAST) {
O << "quad_perm:[";
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 3b14faab136b..95c26de6299e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -66,6 +66,8 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printCPol(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printTH(const MCInst *MI, int64_t TH, int64_t Scope, raw_ostream &O);
+ void printScope(int64_t Scope, raw_ostream &O);
void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -91,7 +93,7 @@ private:
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
- raw_ostream &O);
+ raw_ostream &O, bool IsFP);
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printRegularOperand(const MCInst *MI, unsigned OpNo,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 5e77a8caa04e..b403d69d9ff1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -49,6 +49,14 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ void getMachineOpValueT16(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ void getMachineOpValueT16Lo128(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
/// Use a fixup to encode the simm16 field for SOPP branch
/// instructions.
void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
@@ -254,6 +262,7 @@ AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_INT64:
@@ -345,7 +354,8 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
// However, dst is encoded as EXEC for compatibility with SP3.
if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) {
assert((Encoding & 0xFF) == 0);
- Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO);
+ Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO) &
+ AMDGPU::HWEncoding::REG_IDX_MASK;
}
for (unsigned i = 0; i < bytes; i++) {
@@ -403,7 +413,10 @@ void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
} else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
llvm_unreachable("Must be immediate or expr");
- support::endian::write<uint32_t>(CB, Imm, support::endianness::little);
+ if (Desc.operands()[i].OperandType == AMDGPU::OPERAND_REG_IMM_FP64)
+ Imm = Hi_32(Imm);
+
+ support::endian::write<uint32_t>(CB, Imm, llvm::endianness::little);
// Only one literal value allowed
break;
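The IsFP printer change and the Hi_32 store above both rest on the rule that a 64-bit FP literal is only encodable when its low 32 bits are zero, so only the high dword is printed and emitted. A worked example with the llvm::bit_cast/Hi_32/Lo_32 helpers (the value 2.5 is just a convenient non-inline immediate):

    #include "llvm/ADT/bit.h"
    #include "llvm/Support/MathExtras.h"
    #include <cassert>
    #include <cstdint>

    int main() {
      // 2.5 has no inline-immediate encoding, so it becomes a literal.
      uint64_t Bits = llvm::bit_cast<uint64_t>(2.5); // 0x4004000000000000
      assert(llvm::Lo_32(Bits) == 0);                // required for a valid literal
      assert(llvm::Hi_32(Bits) == 0x40040000u);      // the dword actually emitted
      return 0;
    }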
@@ -488,11 +501,14 @@ void AMDGPUMCCodeEmitter::getAVOperandEncoding(
const MCInst &MI, unsigned OpNo, APInt &Op,
SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
unsigned Reg = MI.getOperand(OpNo).getReg();
- uint64_t Enc = MRI.getEncodingValue(Reg);
+ unsigned Enc = MRI.getEncodingValue(Reg);
+ unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ bool IsVGPROrAGPR = Enc & AMDGPU::HWEncoding::IS_VGPR_OR_AGPR;
// VGPR and AGPR have the same encoding, but SrcA and SrcB operands of mfma
// instructions use acc[0:1] modifier bits to distinguish. These bits are
// encoded as a virtual 9th bit of the register for these operands.
+ bool IsAGPR = false;
if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_96RegClassID).contains(Reg) ||
@@ -507,9 +523,9 @@ void AMDGPUMCCodeEmitter::getAVOperandEncoding(
MRI.getRegClass(AMDGPU::AReg_384RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) ||
MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
- Enc |= 512;
+ IsAGPR = true;
- Op = Enc;
+ Op = Idx | (IsVGPROrAGPR << 8) | (IsAGPR << 9);
}
static bool needsPCRel(const MCExpr *Expr) {
@@ -540,13 +556,38 @@ void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
if (MO.isReg()){
- Op = MRI.getEncodingValue(MO.getReg());
+ unsigned Enc = MRI.getEncodingValue(MO.getReg());
+ unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ bool IsVGPR = Enc & AMDGPU::HWEncoding::IS_VGPR_OR_AGPR;
+ Op = Idx | (IsVGPR << 8);
return;
}
unsigned OpNo = &MO - MI.begin();
getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI);
}
+void AMDGPUMCCodeEmitter::getMachineOpValueT16(
+ const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
+ llvm_unreachable("TODO: Implement getMachineOpValueT16().");
+}
+
+void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128(
+ const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg()) {
+ uint16_t Encoding = MRI.getEncodingValue(MO.getReg());
+ unsigned RegIdx = Encoding & AMDGPU::HWEncoding::REG_IDX_MASK;
+ bool IsHi = Encoding & AMDGPU::HWEncoding::IS_HI;
+ bool IsVGPR = Encoding & AMDGPU::HWEncoding::IS_VGPR_OR_AGPR;
+ assert((!IsVGPR || isUInt<7>(RegIdx)) && "VGPR0-VGPR127 expected!");
+ Op = (IsVGPR ? 0x100 : 0) | (IsHi ? 0x80 : 0) | RegIdx;
+ return;
+ }
+ getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI);
+}
+
void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op,
SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
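The new getMachineOpValueT16Lo128() packs a 9-bit operand: bit 8 selects VGPR, bit 7 selects the high 16-bit half, and bits 6:0 hold the register index (hence the VGPR0-VGPR127 assert). A standalone restatement of just that packing step; the helper below is illustrative, not an LLVM API:

    #include <cassert>
    #include <cstdint>

    // Pack the fields the encoder above extracts from the HW encoding value.
    static uint16_t encodeT16Lo128(unsigned RegIdx, bool IsHi, bool IsVGPR) {
      assert(!IsVGPR || RegIdx < 128); // only v0..v127 low/high halves fit
      return (IsVGPR ? 0x100 : 0) | (IsHi ? 0x80 : 0) | RegIdx;
    }

    int main() {
      // v5.h (high half of VGPR 5): 0x100 | 0x80 | 5 == 0x185.
      assert(encodeT16Lo128(/*RegIdx=*/5, /*IsHi=*/true, /*IsVGPR=*/true) == 0x185);
      return 0;
    }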
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 1bd3cdc67800..a855cf585205 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -66,8 +66,8 @@ bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) {
StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
AMDGPU::GPUKind AK;
+ // clang-format off
switch (ElfMach) {
- default: llvm_unreachable("Unhandled ELF::EF_AMDGPU type");
case ELF::EF_AMDGPU_MACH_R600_R600: AK = GK_R600; break;
case ELF::EF_AMDGPU_MACH_R600_R630: AK = GK_R630; break;
case ELF::EF_AMDGPU_MACH_R600_RS880: AK = GK_RS880; break;
@@ -126,8 +126,12 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: AK = GK_GFX1150; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = GK_GFX1151; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200: AK = GK_GFX1200; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201: AK = GK_GFX1201; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
+ default: AK = GK_NONE; break;
}
+ // clang-format on
StringRef GPUName = getArchNameAMDGCN(AK);
if (GPUName != "")
@@ -140,6 +144,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
if (AK == AMDGPU::GPUKind::GK_NONE)
AK = parseArchR600(GPU);
+ // clang-format off
switch (AK) {
case GK_R600: return ELF::EF_AMDGPU_MACH_R600_R600;
case GK_R630: return ELF::EF_AMDGPU_MACH_R600_R630;
@@ -199,8 +204,11 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103;
case GK_GFX1150: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150;
case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151;
+ case GK_GFX1200: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1200;
+ case GK_GFX1201: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1201;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
+ // clang-format on
llvm_unreachable("unknown GPU");
}
@@ -368,6 +376,12 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_user_sgpr_flat_scratch_init", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT);
+ if (hasKernargPreload(STI)) {
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_length ", KD,
+ kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_LENGTH);
+ PRINT_FIELD(OS, ".amdhsa_user_sgpr_kernarg_preload_offset ", KD,
+ kernarg_preload, amdhsa::KERNARG_PRELOAD_SPEC_OFFSET);
+ }
PRINT_FIELD(OS, ".amdhsa_user_sgpr_private_segment_size", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE);
@@ -418,9 +432,6 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
switch (CodeObjectVersion) {
default:
break;
- case AMDGPU::AMDHSA_COV2:
- break;
- case AMDGPU::AMDHSA_COV3:
case AMDGPU::AMDHSA_COV4:
case AMDGPU::AMDHSA_COV5:
if (getTargetID()->isXnackSupported())
@@ -440,16 +451,16 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_float_denorm_mode_16_64", KD,
compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64);
- PRINT_FIELD(OS, ".amdhsa_dx10_clamp", KD,
- compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP);
- PRINT_FIELD(OS, ".amdhsa_ieee_mode", KD,
- compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE);
+ if (IVersion.Major < 12) {
+ PRINT_FIELD(OS, ".amdhsa_dx10_clamp", KD, compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP);
+ PRINT_FIELD(OS, ".amdhsa_ieee_mode", KD, compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE);
+ }
if (IVersion.Major >= 9)
PRINT_FIELD(OS, ".amdhsa_fp16_overflow", KD,
compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_FP16_OVFL);
+ amdhsa::COMPUTE_PGM_RSRC1_GFX9_PLUS_FP16_OVFL);
if (AMDGPU::isGFX90A(STI))
PRINT_FIELD(OS, ".amdhsa_tg_split", KD,
compute_pgm_rsrc3,
@@ -457,16 +468,19 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
if (IVersion.Major >= 10) {
PRINT_FIELD(OS, ".amdhsa_workgroup_processor_mode", KD,
compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE);
+ amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE);
PRINT_FIELD(OS, ".amdhsa_memory_ordered", KD,
compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED);
+ amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED);
PRINT_FIELD(OS, ".amdhsa_forward_progress", KD,
compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_FWD_PROGRESS);
+ amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_FWD_PROGRESS);
PRINT_FIELD(OS, ".amdhsa_shared_vgpr_count", KD, compute_pgm_rsrc3,
amdhsa::COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
}
+ if (IVersion.Major >= 12)
+ PRINT_FIELD(OS, ".amdhsa_round_robin_scheduling", KD, compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN);
PRINT_FIELD(
OS, ".amdhsa_exception_fp_ieee_invalid_op", KD,
compute_pgm_rsrc2,
@@ -539,7 +553,7 @@ void AMDGPUTargetELFStreamer::EmitNote(
unsigned NoteFlags = 0;
// TODO Apparently, this is currently needed for OpenCL as mentioned in
// https://reviews.llvm.org/D74995
- if (STI.getTargetTriple().getOS() == Triple::AMDHSA)
+ if (isHsaAbi(STI))
NoteFlags = ELF::SHF_ALLOC;
S.pushSection();
@@ -598,11 +612,10 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsUnknownOS() {
}
unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() {
- assert(STI.getTargetTriple().getOS() == Triple::AMDHSA);
+ assert(isHsaAbi(STI));
if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {
switch (*HsaAbiVer) {
- case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
return getEFlagsV3();
case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
@@ -827,6 +840,24 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
return true;
}
+bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader(
+ const MCSubtargetInfo &STI) {
+ for (int i = 0; i < 64; ++i) {
+ OS << "\ts_nop 0\n";
+ }
+ return true;
+}
+
+bool AMDGPUTargetELFStreamer::EmitKernargPreloadHeader(
+ const MCSubtargetInfo &STI) {
+ const uint32_t Encoded_s_nop = 0xbf800000;
+ MCStreamer &OS = getStreamer();
+ for (int i = 0; i < 64; ++i) {
+ OS.emitInt32(Encoded_s_nop);
+ }
+ return true;
+}
+
bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
const uint32_t Encoded_s_code_end = 0xbf9f0000;
const uint32_t Encoded_s_nop = 0xbf800000;
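Both streamers above prepend the same preload header: 64 s_nop instructions, i.e. 256 bytes of the little-endian dword 0xbf800000, ahead of the kernel entry. A minimal sketch producing that byte pattern outside the MC layer (the buffer type and helper name are assumptions):

    #include <cstdint>
    #include <vector>

    // Illustrative only: build the 64 encoded s_nop dwords (little-endian) that
    // AMDGPUTargetELFStreamer::EmitKernargPreloadHeader() emits.
    static std::vector<uint8_t> makeKernargPreloadHeader() {
      const uint32_t EncodedSNop = 0xbf800000;
      std::vector<uint8_t> Bytes;
      Bytes.reserve(64 * sizeof(uint32_t));
      for (int I = 0; I < 64; ++I)
        for (int B = 0; B < 4; ++B)
          Bytes.push_back(uint8_t((EncodedSNop >> (8 * B)) & 0xff));
      return Bytes;
    }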
@@ -906,6 +937,7 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc1);
Streamer.emitInt32(KernelDescriptor.compute_pgm_rsrc2);
Streamer.emitInt16(KernelDescriptor.kernel_code_properties);
- for (uint8_t Res : KernelDescriptor.reserved2)
+ Streamer.emitInt16(KernelDescriptor.kernarg_preload);
+ for (uint8_t Res : KernelDescriptor.reserved3)
Streamer.emitInt8(Res);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index db43de8fcc5f..55b5246c9210 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -90,6 +90,11 @@ public:
/// \returns True on success, false on failure.
virtual bool EmitCodeEnd(const MCSubtargetInfo &STI) { return true; }
+ /// \returns True on success, false on failure.
+ virtual bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) {
+ return true;
+ }
+
virtual void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
@@ -154,6 +159,9 @@ public:
/// \returns True on success, false on failure.
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
+ /// \returns True on success, false on failure.
+ bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
+
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
@@ -215,6 +223,9 @@ public:
/// \returns True on success, false on failure.
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
+ /// \returns True on success, false on failure.
+ bool EmitKernargPreloadHeader(const MCSubtargetInfo &STI) override;
+
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index bbbfbe4faa0f..6c539df7677e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -142,11 +142,11 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI,
}
void R600MCCodeEmitter::emit(uint32_t Value, SmallVectorImpl<char> &CB) const {
- support::endian::write(CB, Value, support::little);
+ support::endian::write(CB, Value, llvm::endianness::little);
}
void R600MCCodeEmitter::emit(uint64_t Value, SmallVectorImpl<char> &CB) const {
- support::endian::write(CB, Value, support::little);
+ support::endian::write(CB, Value, llvm::endianness::little);
}
unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index d924f733624a..240366c8e7da 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -25,6 +25,7 @@ def MIMGEncGfx10Default : MIMGEncoding;
def MIMGEncGfx10NSA : MIMGEncoding;
def MIMGEncGfx11Default : MIMGEncoding;
def MIMGEncGfx11NSA : MIMGEncoding;
+def MIMGEncGfx12 : MIMGEncoding;
def MIMGEncoding : GenericEnum {
let FilterClass = "MIMGEncoding";
@@ -95,11 +96,13 @@ def MIMG {
int NOP = -1;
}
-class mimgopc <int gfx11, int gfx10m, int vi = gfx10m, int si = gfx10m> {
+class mimgopc <int gfx12, int gfx11, int gfx10m, int vi = gfx10m, int si = gfx10m> {
+ field bits<8> GFX12 = gfx12;
field bits<8> GFX11 = gfx11;
field bits<8> GFX10M = gfx10m; // GFX10minus for all but atomics
field bits<8> VI = vi; // VI is only used for atomic/sampler/gather instructions
field bits<8> SI = si; // SI is only used for atomic instructions
+ bit HAS_GFX12 = !ne(gfx12, MIMG.NOP);
bit HAS_GFX11 = !ne(gfx11, MIMG.NOP);
bit HAS_GFX10M = !ne(gfx10m, MIMG.NOP);
bit HAS_VI = !ne(vi, MIMG.NOP);
@@ -218,6 +221,16 @@ class MIMG <dag outs, string dns = "">
bits<8> VAddrOperands;
}
+class VIMAGE <dag outs, string dns = ""> : MIMG<outs, dns> {
+ let MIMG = 0;
+ let VIMAGE = 1;
+}
+
+class VSAMPLE <dag outs, string dns = ""> : MIMG<outs, dns> {
+ let MIMG = 0;
+ let VSAMPLE = 1;
+}
+
def MIMGInfoTable : GenericTable {
let FilterClass = "MIMG";
let CppTypeName = "MIMGInfo";
@@ -327,8 +340,8 @@ class MIMG_nsa_gfx10<int op, dag outs, int num_addrs, string dns="">
// Base class of all non-NSA gfx11 MIMG instructions.
class MIMG_gfx11<int op, dag outs, string dns = "">
: MIMG<outs, dns>, MIMGe_gfx11<op> {
- let SubtargetPredicate = isGFX11Plus;
- let AssemblerPredicate = isGFX11Plus;
+ let SubtargetPredicate = isGFX11Only;
+ let AssemblerPredicate = isGFX11Only;
let MIMGEncoding = MIMGEncGfx11Default;
let VAddrOperands = 1;
@@ -343,8 +356,8 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
list<RegisterClass> addr_types=[],
RegisterClass LastAddrRC = VGPR_32>
: MIMG<outs, dns>, MIMGe_gfx11<op> {
- let SubtargetPredicate = isGFX11Plus;
- let AssemblerPredicate = isGFX11Plus;
+ let SubtargetPredicate = isGFX11Only;
+ let AssemblerPredicate = isGFX11Only;
let MIMGEncoding = MIMGEncGfx11NSA;
let VAddrOperands = num_addrs;
@@ -359,6 +372,48 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
let nsa = nsah.NSA;
}
+class VIMAGE_gfx12<int op, dag outs, int num_addrs, string dns="",
+ list<RegisterClass> addr_types=[]>
+ : VIMAGE<outs, dns>, VIMAGEe<op> {
+ let SubtargetPredicate = isGFX12Plus;
+ let AssemblerPredicate = isGFX12Plus;
+
+ let MIMGEncoding = MIMGEncGfx12;
+ let VAddrOperands = num_addrs;
+
+ MIMGNSAHelper nsah = !if(!empty(addr_types),
+ MIMGNSAHelper<num_addrs>,
+ MIMGNSAHelper<num_addrs, addr_types>);
+ dag AddrIns = nsah.AddrIns;
+ string AddrAsm = !if(!eq(num_addrs, 1), "$vaddr0", nsah.AddrAsm);
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+ let vaddr1 = !if(!lt(num_addrs, 2), 0, ?);
+ let vaddr2 = !if(!lt(num_addrs, 3), 0, ?);
+ let vaddr3 = !if(!lt(num_addrs, 4), 0, ?);
+ let vaddr4 = !if(!lt(num_addrs, 5), 0, ?);
+}
+
+class VSAMPLE_gfx12<int op, dag outs, int num_addrs, string dns="",
+ RegisterClass Addr3RC>
+ : VSAMPLE<outs, dns>, VSAMPLEe<op> {
+ let SubtargetPredicate = isGFX12Plus;
+ let AssemblerPredicate = isGFX12Plus;
+
+ let MIMGEncoding = MIMGEncGfx12;
+ let VAddrOperands = num_addrs;
+
+ PartialNSAHelper nsah = PartialNSAHelper<num_addrs, 4, Addr3RC>;
+
+ dag AddrIns = nsah.AddrIns;
+ string AddrAsm = !if(!eq(num_addrs, 1), "$vaddr0", nsah.AddrAsm);
+
+ let d16 = !if(BaseOpcode.HasD16, ?, 0);
+ let vaddr1 = !if(!lt(num_addrs, 2), 0, ?);
+ let vaddr2 = !if(!lt(num_addrs, 3), 0, ?);
+ let vaddr3 = !if(!lt(num_addrs, 4), 0, ?);
+}
+
class MIMG_NoSampler_Helper <mimgopc op, string asm,
RegisterClass dst_rc,
RegisterClass addr_rc,
@@ -435,12 +490,41 @@ class MIMG_NoSampler_nsa_gfx11<mimgopc op, string opcode,
#!if(BaseOpcode.HasD16, "$d16", "");
}
+class VIMAGE_NoSampler_gfx12<mimgopc op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : VIMAGE_gfx12<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> {
+ let InOperandList = !con(AddrIns,
+ (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim,
+ CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
+class VSAMPLE_Sampler_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
+ int num_addrs, RegisterClass Addr3RC = VGPR_32,
+ string dns="">
+ : VSAMPLE_gfx12<op.GFX12, (outs DataRC:$vdata), num_addrs, dns, Addr3RC> {
+ let InOperandList = !con(AddrIns,
+ (ins SReg_256:$rsrc),
+ !if(BaseOpcode.Sampler, (ins SReg_128:$samp), (ins)),
+ (ins DMask:$dmask, Dim:$dim, UNorm:$unorm,
+ CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe,
+ LWE:$lwe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc"
+ #!if(BaseOpcode.Sampler, ", $samp", "")
+ #"$dmask$dim$unorm$cpol$r128$a16$tfe$lwe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
- RegisterClass dst_rc,
- bit enableDisasm,
- bit ExtendedImageInst = 1> {
- let ssamp = 0 in {
- let VAddrDwords = 1 in {
+ RegisterClass dst_rc, bit enableDisasm,
+ bit ExtendedImageInst = 1,
+ bit isVSample = 0> {
+ let VAddrDwords = 1 in {
+ let ssamp = 0 in {
if op.HAS_GFX10M then {
def _V1 : MIMG_NoSampler_Helper <op, asm, dst_rc, VGPR_32,
!if(enableDisasm, "AMDGPU", "")>;
@@ -455,8 +539,19 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
!if(enableDisasm, "AMDGPU", "")>;
}
}
-
- let VAddrDwords = 2 in {
+ if op.HAS_GFX12 then {
+ if isVSample then {
+ let samp = 0 in
+ def _V1_gfx12 : VSAMPLE_Sampler_gfx12<op, asm, dst_rc, 1>;
+ }
+ else {
+ def _V1_gfx12 : VIMAGE_NoSampler_gfx12<op, asm, dst_rc, 1,
+ !if(enableDisasm, "GFX12", "")>;
+ }
+ }
+ }
+ let VAddrDwords = 2 in {
+ let ssamp = 0 in {
if op.HAS_GFX10M then {
def _V2 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_64>;
if !not(ExtendedImageInst) then
@@ -469,8 +564,18 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
def _V2_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11<op, asm, dst_rc, 2>;
}
}
-
- let VAddrDwords = 3 in {
+ if op.HAS_GFX12 then {
+ if isVSample then {
+ let samp = 0 in
+ def _V2_gfx12 : VSAMPLE_Sampler_gfx12<op, asm, dst_rc, 2>;
+ }
+ else {
+ def _V2_gfx12 : VIMAGE_NoSampler_gfx12<op, asm, dst_rc, 2>;
+ }
+ }
+ }
+ let VAddrDwords = 3 in {
+ let ssamp = 0 in {
if op.HAS_GFX10M then {
def _V3 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_96>;
if !not(ExtendedImageInst) then
@@ -483,8 +588,18 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
def _V3_nsa_gfx11 : MIMG_NoSampler_nsa_gfx11<op, asm, dst_rc, 3>;
}
}
-
- let VAddrDwords = 4 in {
+ if op.HAS_GFX12 then {
+ if isVSample then {
+ let samp = 0 in
+ def _V3_gfx12 : VSAMPLE_Sampler_gfx12<op, asm, dst_rc, 3>;
+ }
+ else {
+ def _V3_gfx12 : VIMAGE_NoSampler_gfx12<op, asm, dst_rc, 3>;
+ }
+ }
+ }
+ let VAddrDwords = 4 in {
+ let ssamp = 0 in {
if op.HAS_GFX10M then {
def _V4 : MIMG_NoSampler_Helper <op, asm, dst_rc, VReg_128>;
if !not(ExtendedImageInst) then
@@ -499,6 +614,17 @@ multiclass MIMG_NoSampler_Src_Helper <mimgopc op, string asm,
!if(enableDisasm, "AMDGPU", "")>;
}
}
+ if op.HAS_GFX12 then {
+ if isVSample then {
+ let samp = 0 in
+ def _V4_gfx12 : VSAMPLE_Sampler_gfx12<op, asm, dst_rc, 4, VGPR_32,
+ !if(enableDisasm, "GFX12", "")>;
+ }
+ else {
+ def _V4_gfx12 : VIMAGE_NoSampler_gfx12<op, asm, dst_rc, 4,
+ !if(enableDisasm, "GFX12", "")>;
+ }
+ }
}
}
@@ -606,62 +732,97 @@ class MIMG_Store_nsa_gfx11<mimgopc op, string opcode,
#!if(BaseOpcode.HasD16, "$d16", "");
}
+class VIMAGE_Store_gfx12<mimgopc op, string opcode,
+ RegisterClass DataRC, int num_addrs,
+ string dns="">
+ : VIMAGE_gfx12<op.GFX12, (outs), num_addrs, dns> {
+ let InOperandList = !con((ins DataRC:$vdata),
+ AddrIns,
+ (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim,
+ CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe),
+ !if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe"
+ #!if(BaseOpcode.HasD16, "$d16", "");
+}
+
multiclass MIMG_Store_Addr_Helper <mimgopc op, string asm,
RegisterClass data_rc,
bit enableDisasm> {
let mayLoad = 0, mayStore = 1, hasSideEffects = 0, hasPostISelHook = 0,
- DisableWQM = 1, ssamp = 0 in {
+ DisableWQM = 1 in {
let VAddrDwords = 1 in {
- if op.HAS_GFX10M then {
- def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
- let hasPostISelHook = 1 in
- def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "GFX90A", "")>;
- def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
+ let ssamp = 0 in {
+ if op.HAS_GFX10M then {
+ def _V1 : MIMG_Store_Helper <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ let hasPostISelHook = 1 in
+ def _V1_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "GFX90A", "")>;
+ def _V1_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
+ if op.HAS_GFX11 then {
+ def _V1_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VGPR_32,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
- if op.HAS_GFX11 then {
- def _V1_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VGPR_32,
- !if(enableDisasm, "AMDGPU", "")>;
+ if op.HAS_GFX12 then {
+ def _V1_gfx12 : VIMAGE_Store_gfx12 <op, asm, data_rc, 1,
+ !if(enableDisasm, "GFX12", "")>;
}
}
let VAddrDwords = 2 in {
- if op.HAS_GFX10M then {
- def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
- def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>;
- def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
- def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
+ let ssamp = 0 in {
+ if op.HAS_GFX10M then {
+ def _V2 : MIMG_Store_Helper <op, asm, data_rc, VReg_64>;
+ def _V2_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_64>;
+ def _V2_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_64>;
+ def _V2_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 2>;
+ }
+ if op.HAS_GFX11 then {
+ def _V2_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_64>;
+ def _V2_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 2>;
+ }
}
- if op.HAS_GFX11 then {
- def _V2_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_64>;
- def _V2_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 2>;
+ if op.HAS_GFX12 then {
+ def _V2_gfx12 : VIMAGE_Store_gfx12 <op, asm, data_rc, 2>;
}
}
let VAddrDwords = 3 in {
- if op.HAS_GFX10M then {
- def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
- def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>;
- def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
- def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
+ let ssamp = 0 in {
+ if op.HAS_GFX10M then {
+ def _V3 : MIMG_Store_Helper <op, asm, data_rc, VReg_96>;
+ def _V3_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_96>;
+ def _V3_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_96>;
+ def _V3_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 3>;
+ }
+ if op.HAS_GFX11 then {
+ def _V3_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_96>;
+ def _V3_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 3>;
+ }
}
- if op.HAS_GFX11 then {
- def _V3_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_96>;
- def _V3_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 3>;
+ if op.HAS_GFX12 then {
+ def _V3_gfx12 : VIMAGE_Store_gfx12 <op, asm, data_rc, 3>;
}
}
let VAddrDwords = 4 in {
- if op.HAS_GFX10M then {
- def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
- def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>;
- def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
- def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
- !if(enableDisasm, "AMDGPU", "")>;
+ let ssamp = 0 in {
+ if op.HAS_GFX10M then {
+ def _V4 : MIMG_Store_Helper <op, asm, data_rc, VReg_128>;
+ def _V4_gfx90a : MIMG_Store_Helper_gfx90a <op, asm, data_rc, VReg_128>;
+ def _V4_gfx10 : MIMG_Store_gfx10 <op, asm, data_rc, VReg_128>;
+ def _V4_nsa_gfx10 : MIMG_Store_nsa_gfx10 <op, asm, data_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
+ if op.HAS_GFX11 then {
+ def _V4_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_128>;
+ def _V4_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 4,
+ !if(enableDisasm, "AMDGPU", "")>;
+ }
}
- if op.HAS_GFX11 then {
- def _V4_gfx11 : MIMG_Store_gfx11 <op, asm, data_rc, VReg_128>;
- def _V4_nsa_gfx11 : MIMG_Store_nsa_gfx11 <op, asm, data_rc, 4,
- !if(enableDisasm, "AMDGPU", "")>;
+ if op.HAS_GFX12 then {
+ def _V4_gfx12 : VIMAGE_Store_gfx12 <op, asm, data_rc, 4,
+ !if(enableDisasm, "GFX12", "")>;
}
}
}
@@ -788,84 +949,137 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$cpol$r128$a16$tfe$lwe";
}
+class VIMAGE_Atomic_gfx12<mimgopc op, string opcode, RegisterClass DataRC,
+ int num_addrs, bit enableDisasm = 0>
+ : VIMAGE_gfx12<!cast<int>(op.GFX12), (outs DataRC:$vdst), num_addrs,
+ !if(enableDisasm, "GFX12", "")> {
+ let Constraints = "$vdst = $vdata";
+
+ let InOperandList = !con((ins DataRC:$vdata),
+ AddrIns,
+ (ins SReg_256:$rsrc, DMask:$dmask, Dim:$dim,
+ CPol:$cpol, R128A16:$r128, A16:$a16, TFE:$tfe));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $rsrc$dmask$dim$cpol$r128$a16$tfe";
+}
+
+class VIMAGE_Atomic_gfx12_Renamed<mimgopc op, string opcode, string renamed,
+ RegisterClass DataRC, int num_addrs,
+ bit enableDisasm = 0>
+ : VIMAGE_Atomic_gfx12<op, renamed, DataRC, num_addrs, enableDisasm>,
+ MnemonicAlias<opcode, renamed>, Requires<[isGFX12Plus]>;
+
multiclass MIMG_Atomic_Addr_Helper_m <mimgopc op, string asm,
RegisterClass data_rc,
bit enableDasm = 0,
- bit isFP = 0> {
+ bit isFP = 0,
+ string renamed = ""> {
let hasSideEffects = 1, // FIXME: remove this
mayLoad = 1, mayStore = 1, hasPostISelHook = 0, DisableWQM = 1,
- ssamp = 0, FPAtomic = isFP in {
+ FPAtomic = isFP in {
let VAddrDwords = 1 in {
- if op.HAS_SI then {
- def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
- }
- if op.HAS_VI then {
- def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
- let hasPostISelHook = 1 in
- def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>;
+ let ssamp = 0 in {
+ if op.HAS_SI then {
+ def _V1_si : MIMG_Atomic_si <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
+ if op.HAS_VI then {
+ def _V1_vi : MIMG_Atomic_vi <op, asm, data_rc, VGPR_32, enableDasm>;
+ let hasPostISelHook = 1 in
+ def _V1_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
+ if op.HAS_GFX10M then {
+ def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
+ if op.HAS_GFX11 then {
+ def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>;
+ }
}
- if op.HAS_GFX10M then {
- def _V1_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VGPR_32, enableDasm>;
- }
- if op.HAS_GFX11 then {
- def _V1_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VGPR_32, enableDasm>;
+ if op.HAS_GFX12 then {
+ if !empty(renamed) then
+ def _V1_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 1, enableDasm>;
+ else
+ def _V1_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 1, enableDasm>;
}
}
let VAddrDwords = 2 in {
- if op.HAS_SI then {
- def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
+ let ssamp = 0 in {
+ if op.HAS_SI then {
+ def _V2_si : MIMG_Atomic_si <op, asm, data_rc, VReg_64, 0>;
+ }
+ if op.HAS_VI then {
+ def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
+ def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>;
+ }
+ if op.HAS_GFX10M then {
+ def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
+ def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
+ }
+ if op.HAS_GFX11 then {
+ def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>;
+ def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>;
+ }
}
- if op.HAS_VI then {
- def _V2_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_64, 0>;
- def _V2_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_64, 0>;
- }
- if op.HAS_GFX10M then {
- def _V2_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_64, 0>;
- def _V2_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 2, 0>;
- }
- if op.HAS_GFX11 then {
- def _V2_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_64, 0>;
- def _V2_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 2, 0>;
+ if op.HAS_GFX12 then {
+ if !empty(renamed) then
+ def _V2_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 2, 0>;
+ else
+ def _V2_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 2, 0>;
}
}
let VAddrDwords = 3 in {
- if op.HAS_SI then {
- def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
- }
- if op.HAS_VI then {
- def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
- def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>;
- }
- if op.HAS_GFX10M then {
- def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
- def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+ let ssamp = 0 in {
+ if op.HAS_SI then {
+ def _V3_si : MIMG_Atomic_si <op, asm, data_rc, VReg_96, 0>;
+ }
+ if op.HAS_VI then {
+ def _V3_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_96, 0>;
+ def _V3_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_96, 0>;
+ }
+ if op.HAS_GFX10M then {
+ def _V3_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_96, 0>;
+ def _V3_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 3, 0>;
+ }
+ if op.HAS_GFX11 then {
+ def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>;
+ def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>;
+ }
}
- if op.HAS_GFX11 then {
- def _V3_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_96, 0>;
- def _V3_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 3, 0>;
+ if op.HAS_GFX12 then {
+ if !empty(renamed) then
+ def _V3_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 3, 0>;
+ else
+ def _V3_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 3, 0>;
}
}
let VAddrDwords = 4 in {
- if op.HAS_SI then {
- def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
- }
- if op.HAS_VI then {
- def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
- def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>;
+ let ssamp = 0 in {
+ if op.HAS_SI then {
+ def _V4_si : MIMG_Atomic_si <op, asm, data_rc, VReg_128, 0>;
+ }
+ if op.HAS_VI then {
+ def _V4_vi : MIMG_Atomic_vi <op, asm, data_rc, VReg_128, 0>;
+ def _V4_gfx90a : MIMG_Atomic_gfx90a <op, asm, data_rc, VReg_128, 0>;
+ }
+ if op.HAS_GFX10M then {
+ def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
+ def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
+ }
+ if op.HAS_GFX11 then {
+ def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>;
+ def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>;
+ }
}
- if op.HAS_GFX10M then {
- def _V4_gfx10 : MIMG_Atomic_gfx10 <op, asm, data_rc, VReg_128, 0>;
- def _V4_nsa_gfx10 : MIMG_Atomic_nsa_gfx10 <op, asm, data_rc, 4, enableDasm>;
- }
- if op.HAS_GFX11 then {
- def _V4_gfx11 : MIMG_Atomic_gfx11 <op, asm, data_rc, VReg_128, 0>;
- def _V4_nsa_gfx11 : MIMG_Atomic_nsa_gfx11 <op, asm, data_rc, 4, enableDasm>;
+ if op.HAS_GFX12 then {
+ if !empty(renamed) then
+ def _V4_gfx12 : VIMAGE_Atomic_gfx12 <op, asm, data_rc, 4, enableDasm>;
+ else
+ def _V4_gfx12 : VIMAGE_Atomic_gfx12_Renamed <op, asm, renamed, data_rc, 4, enableDasm>;
}
}
}
}
-multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0> { // 64-bit atomics
+multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0,
+ string renamed = ""> { // 64-bit atomics
let IsAtomicRet = 1 in {
def "" : MIMGBaseOpcode {
let Atomic = 1;
@@ -877,13 +1091,17 @@ multiclass MIMG_Atomic <mimgopc op, string asm, bit isCmpSwap = 0, bit isFP = 0>
// using dmask and tfe. Only 32-bit variant is registered with disassembler.
// Other variants are reconstructed by disassembler using dmask and tfe.
let VDataDwords = !if(isCmpSwap, 2, 1) in
- defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1, isFP>;
+ defm _V1 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_64, VGPR_32), 1, isFP, renamed>;
let VDataDwords = !if(isCmpSwap, 4, 2) in
- defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64), 0, isFP>;
+ defm _V2 : MIMG_Atomic_Addr_Helper_m <op, asm, !if(isCmpSwap, VReg_128, VReg_64), 0, isFP, renamed>;
}
} // End IsAtomicRet = 1
}
+multiclass MIMG_Atomic_Renamed <mimgopc op, string asm, string renamed,
+ bit isCmpSwap = 0, bit isFP = 0>
+ : MIMG_Atomic <op, asm, isCmpSwap, isFP, renamed>;
+
class MIMG_Sampler_Helper <mimgopc op, string asm, RegisterClass dst_rc,
RegisterClass src_rc, string dns="">
: MIMG_gfx6789 <op.VI, (outs dst_rc:$vdata), dns> {
@@ -1006,7 +1224,7 @@ class MIMGAddrSizes_dw_range<list<int> range> {
}
class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16,
- int nsa_max_addr = 5> {
+ int nsa_max_addr = 5, bit includeNSA1 = 0> {
// List of all possible numbers of address words, taking all combinations of
// A16 and image dimension into account (note: no MSAA, since this is for
// sample/gather ops).
@@ -1061,8 +1279,10 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16,
// it is the only one that could have a register other than VGPR32.
int EnableDisasmNum = !foldl(!head(AllNumAddrWords), !tail(AllNumAddrWords),
acc, var, !if(!le(var, nsa_max_addr), var, acc));
+ list<int> PossibleVariants =
+ !listconcat([12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2], !if(includeNSA1, [1], []));
list<LastVAddrSize> PartialNSAInstrs =
- !foldl([]<LastVAddrSize>, [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2], lhs, dw,
+ !foldl([]<LastVAddrSize>, PossibleVariants, lhs, dw,
!if(isIntInList<dw, AllNumAddrWords>.ret,
!listconcat(lhs, [LastVAddrSize<dw, !sub(nsa_max_addr, 1),
!eq(dw, EnableDisasmNum)>]),
@@ -1114,6 +1334,16 @@ multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
}
}
}
+
+ foreach addr = MIMG_Sampler_AddrSizes<sample, isG16, 4/*MaxNSASize*/, 1>.PartialNSAInstrs in {
+ let VAddrDwords = addr.NumWords in {
+ if op.HAS_GFX12 then {
+ def _V # addr.NumWords # _gfx12
+ : VSAMPLE_Sampler_gfx12<op, asm, dst_rc, addr.NumWords, addr.RegClass,
+ !if(!and(enableDisasm, addr.Disassemble), "GFX12", "")>;
+ }
+ }
+ }
}
class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
@@ -1177,12 +1407,12 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16> {
RegisterClass RegClass = MIMGAddrSize<num_addrs, 0>.RegClass;
int VAddrDwords = !srl(RegClass.Size, 5);
- int gfx11_nsa_addrs = !if(IsA16, 4, 5);
+ int GFX11PlusNSAAddrs = !if(IsA16, 4, 5);
RegisterClass node_ptr_type = !if(Is64, VReg_64, VGPR_32);
- list<RegisterClass> gfx11_addr_types =
- !if(IsA16,
- [node_ptr_type, VGPR_32, VReg_96, VReg_96],
- [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]);
+ list<RegisterClass> GFX11PlusAddrTypes =
+ !if(IsA16,
+ [node_ptr_type, VGPR_32, VReg_96, VReg_96],
+ [node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]);
}
class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC>
@@ -1215,6 +1445,14 @@ class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16";
}
+class VIMAGE_IntersectRay_gfx12<mimgopc op, string opcode, int num_addrs,
+ list<RegisterClass> addr_types>
+ : VIMAGE_gfx12<op.GFX12, (outs VReg_128:$vdata),
+ num_addrs, "GFX12", addr_types> {
+ let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$rsrc, A16:$a16));
+ let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $rsrc$a16";
+}
+
multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16> {
defvar info = MIMG_IntersectRay_Helper<Is64, IsA16>;
def "" : MIMGBaseOpcode {
@@ -1222,30 +1460,39 @@ multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16> {
let A16 = IsA16;
}
let dmask = 0xf,
- unorm = 1,
d16 = 0,
cpol = 0,
tfe = 0,
- lwe = 0,
r128 = 1,
- ssamp = 0,
dim = {0, 0, 0},
a16 = IsA16,
d16 = 0,
BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
VDataDwords = 4 in {
- def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass> {
- let VAddrDwords = info.VAddrDwords;
- }
- def _sa_gfx11 : MIMG_IntersectRay_gfx11<op, opcode, info.RegClass> {
- let VAddrDwords = info.VAddrDwords;
- }
- def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs> {
- let VAddrDwords = info.num_addrs;
+ let unorm = 1,
+ lwe = 0,
+ ssamp = 0 in {
+ if op.HAS_GFX10M then
+ def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass> {
+ let VAddrDwords = info.VAddrDwords;
+ }
+ if op.HAS_GFX11 then
+ def _sa_gfx11 : MIMG_IntersectRay_gfx11<op, opcode, info.RegClass> {
+ let VAddrDwords = info.VAddrDwords;
+ }
+ if op.HAS_GFX10M then
+ def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs> {
+ let VAddrDwords = info.num_addrs;
+ }
+ if op.HAS_GFX11 then
+ def _nsa_gfx11 : MIMG_IntersectRay_nsa_gfx11<op, opcode,
+ info.GFX11PlusNSAAddrs,
+ info.GFX11PlusAddrTypes> {
+ let VAddrDwords = info.num_addrs;
+ }
}
- def _nsa_gfx11 : MIMG_IntersectRay_nsa_gfx11<op, opcode,
- info.gfx11_nsa_addrs,
- info.gfx11_addr_types> {
+ def _gfx12 : VIMAGE_IntersectRay_gfx12<op, opcode, info.GFX11PlusNSAAddrs,
+ info.GFX11PlusAddrTypes> {
let VAddrDwords = info.num_addrs;
}
}
@@ -1261,13 +1508,13 @@ multiclass MIMG_MSAA_Load <mimgopc op, string asm> {
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
Gather4 = 1, hasPostISelHook = 0, mayLoad = 1 in {
let VDataDwords = 2 in
- defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, VReg_64, 0>; /* packed D16 */
+ defm _V2 : MIMG_NoSampler_Src_Helper<op, asm, VReg_64, 0, 0, 1>; /* packed D16 */
let VDataDwords = 3 in
- defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, VReg_96, 0>; /* packed D16 + tfe */
+ defm _V3 : MIMG_NoSampler_Src_Helper<op, asm, VReg_96, 0, 0, 1>; /* packed D16 + tfe */
let VDataDwords = 4 in
- defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, VReg_128, 1>;
+ defm _V4 : MIMG_NoSampler_Src_Helper<op, asm, VReg_128, 1, 0, 1>;
let VDataDwords = 5 in
- defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, VReg_160, 0>;
+ defm _V5 : MIMG_NoSampler_Src_Helper<op, asm, VReg_160, 0, 0, 1>;
}
}
@@ -1276,143 +1523,143 @@ multiclass MIMG_MSAA_Load <mimgopc op, string asm> {
//===----------------------------------------------------------------------===//
let OtherPredicates = [HasImageInsts] in {
-defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00, 0x00>, "image_load", 1>;
-defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01, 0x01>, "image_load_mip", 1, 1>;
-defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02, 0x02>, "image_load_pck", 0>;
-defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03, 0x03>, "image_load_pck_sgn", 0>;
-defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04, 0x04>, "image_load_mip_pck", 0, 1>;
-defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05, 0x05>, "image_load_mip_pck_sgn", 0, 1>;
-defm IMAGE_STORE : MIMG_Store <mimgopc<0x06, 0x08>, "image_store", 1>;
-defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x07, 0x09>, "image_store_mip", 1, 1>;
-defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x08, 0x0a>, "image_store_pck", 0>;
-defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x09, 0x0b>, "image_store_mip_pck", 0, 1>;
-
-defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x17, 0x0e>, "image_get_resinfo", 0, 1, 1>;
-
-defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimgopc<0x0a, 0x0f, 0x10, 0x0f>, "image_atomic_swap">;
-defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x0b, 0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>;
-defm IMAGE_ATOMIC_ADD : MIMG_Atomic <mimgopc<0x0c, 0x11, 0x12, 0x11>, "image_atomic_add">;
-defm IMAGE_ATOMIC_SUB : MIMG_Atomic <mimgopc<0x0d, 0x12, 0x13, 0x12>, "image_atomic_sub">;
-defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">;
-defm IMAGE_ATOMIC_SMIN : MIMG_Atomic <mimgopc<0x0e, 0x14>, "image_atomic_smin">;
-defm IMAGE_ATOMIC_UMIN : MIMG_Atomic <mimgopc<0x0f, 0x15>, "image_atomic_umin">;
-defm IMAGE_ATOMIC_SMAX : MIMG_Atomic <mimgopc<0x10, 0x16>, "image_atomic_smax">;
-defm IMAGE_ATOMIC_UMAX : MIMG_Atomic <mimgopc<0x11, 0x17>, "image_atomic_umax">;
-defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x12, 0x18>, "image_atomic_and">;
-defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x13, 0x19>, "image_atomic_or">;
-defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x14, 0x1a>, "image_atomic_xor">;
-defm IMAGE_ATOMIC_INC : MIMG_Atomic <mimgopc<0x15, 0x1b>, "image_atomic_inc">;
-defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimgopc<0x16, 0x1c>, "image_atomic_dec">;
-defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>;
-defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>;
-defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<MIMG.NOP, 0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>;
-
-defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x20>, AMDGPUSample>;
+defm IMAGE_LOAD : MIMG_NoSampler <mimgopc<0x00, 0x00, 0x00>, "image_load", 1>;
+defm IMAGE_LOAD_MIP : MIMG_NoSampler <mimgopc<0x01, 0x01, 0x01>, "image_load_mip", 1, 1>;
+defm IMAGE_LOAD_PCK : MIMG_NoSampler <mimgopc<0x02, 0x02, 0x02>, "image_load_pck", 0>;
+defm IMAGE_LOAD_PCK_SGN : MIMG_NoSampler <mimgopc<0x03, 0x03, 0x03>, "image_load_pck_sgn", 0>;
+defm IMAGE_LOAD_MIP_PCK : MIMG_NoSampler <mimgopc<0x04, 0x04, 0x04>, "image_load_mip_pck", 0, 1>;
+defm IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoSampler <mimgopc<0x05, 0x05, 0x05>, "image_load_mip_pck_sgn", 0, 1>;
+defm IMAGE_STORE : MIMG_Store <mimgopc<0x06, 0x06, 0x08>, "image_store", 1>;
+defm IMAGE_STORE_MIP : MIMG_Store <mimgopc<0x07, 0x07, 0x09>, "image_store_mip", 1, 1>;
+defm IMAGE_STORE_PCK : MIMG_Store <mimgopc<0x08, 0x08, 0x0a>, "image_store_pck", 0>;
+defm IMAGE_STORE_MIP_PCK : MIMG_Store <mimgopc<0x09, 0x09, 0x0b>, "image_store_mip_pck", 0, 1>;
+
+defm IMAGE_GET_RESINFO : MIMG_NoSampler <mimgopc<0x17, 0x17, 0x0e, 0x0e, 0x0e>, "image_get_resinfo", 0, 1, 1>;
+
+defm IMAGE_ATOMIC_SWAP : MIMG_Atomic <mimgopc<0x0a, 0x0a, 0x0f, 0x10, 0x0f>, "image_atomic_swap">;
+defm IMAGE_ATOMIC_CMPSWAP : MIMG_Atomic <mimgopc<0x0b, 0x0b, 0x10, 0x11, 0x10>, "image_atomic_cmpswap", 1>;
+defm IMAGE_ATOMIC_ADD : MIMG_Atomic_Renamed <mimgopc<0x0c, 0x0c, 0x11, 0x12, 0x11>, "image_atomic_add", "image_atomic_add_uint">;
+defm IMAGE_ATOMIC_SUB : MIMG_Atomic_Renamed <mimgopc<0x0d, 0x0d, 0x12, 0x13, 0x12>, "image_atomic_sub", "image_atomic_sub_uint">;
+defm IMAGE_ATOMIC_RSUB : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, MIMG.NOP, MIMG.NOP, 0x13>, "image_atomic_rsub">;
+defm IMAGE_ATOMIC_SMIN : MIMG_Atomic_Renamed <mimgopc<0x0e, 0x0e, 0x14>, "image_atomic_smin", "image_atomic_min_int">;
+defm IMAGE_ATOMIC_UMIN : MIMG_Atomic_Renamed <mimgopc<0x0f, 0x0f, 0x15>, "image_atomic_umin", "image_atomic_min_uint">;
+defm IMAGE_ATOMIC_SMAX : MIMG_Atomic_Renamed <mimgopc<0x10, 0x10, 0x16>, "image_atomic_smax", "image_atomic_max_int">;
+defm IMAGE_ATOMIC_UMAX : MIMG_Atomic_Renamed <mimgopc<0x11, 0x11, 0x17>, "image_atomic_umax", "image_atomic_max_uint">;
+defm IMAGE_ATOMIC_AND : MIMG_Atomic <mimgopc<0x12, 0x12, 0x18>, "image_atomic_and">;
+defm IMAGE_ATOMIC_OR : MIMG_Atomic <mimgopc<0x13, 0x13, 0x19>, "image_atomic_or">;
+defm IMAGE_ATOMIC_XOR : MIMG_Atomic <mimgopc<0x14, 0x14, 0x1a>, "image_atomic_xor">;
+defm IMAGE_ATOMIC_INC : MIMG_Atomic_Renamed <mimgopc<0x15, 0x15, 0x1b>, "image_atomic_inc", "image_atomic_inc_uint">;
+defm IMAGE_ATOMIC_DEC : MIMG_Atomic_Renamed <mimgopc<0x16, 0x16, 0x1c>, "image_atomic_dec", "image_atomic_dec_uint">;
+defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>;
+defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>;
+defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>;
+
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample>;
let OtherPredicates = [HasExtendedImageInsts] in {
-defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x40, 0x21>, AMDGPUSample_cl>;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x1c, 0x22>, AMDGPUSample_d>;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x41, 0x23>, AMDGPUSample_d_cl>;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x1d, 0x24>, AMDGPUSample_l>;
-defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x1e, 0x25>, AMDGPUSample_b>;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x42, 0x26>, AMDGPUSample_b_cl>;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x1f, 0x27>, AMDGPUSample_lz>;
-defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x20, 0x28>, AMDGPUSample_c>;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x43, 0x29>, AMDGPUSample_c_cl>;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x21, 0x2a>, AMDGPUSample_c_d>;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x44, 0x2b>, AMDGPUSample_c_d_cl>;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x22, 0x2c>, AMDGPUSample_c_l>;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x23, 0x2d>, AMDGPUSample_c_b>;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x45, 0x2e>, AMDGPUSample_c_b_cl>;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x24, 0x2f>, AMDGPUSample_c_lz>;
-defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x25, 0x30>, AMDGPUSample_o>;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x46, 0x31>, AMDGPUSample_cl_o>;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x26, 0x32>, AMDGPUSample_d_o>;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x47, 0x33>, AMDGPUSample_d_cl_o>;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x27, 0x34>, AMDGPUSample_l_o>;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x28, 0x35>, AMDGPUSample_b_o>;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x48, 0x36>, AMDGPUSample_b_cl_o>;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x29, 0x37>, AMDGPUSample_lz_o>;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x2a, 0x38>, AMDGPUSample_c_o>;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x49, 0x39>, AMDGPUSample_c_cl_o>;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x2b, 0x3a>, AMDGPUSample_c_d_o>;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x4a, 0x3b>, AMDGPUSample_c_d_cl_o>;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x2c, 0x3c>, AMDGPUSample_c_l_o>;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x4b, 0x3e>, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x2d, 0x3d>, AMDGPUSample_c_b_o>;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x2e, 0x3f>, AMDGPUSample_c_lz_o>;
-defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x2f, 0x40>, AMDGPUSample>;
-defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x60, 0x41>, AMDGPUSample_cl>;
-defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x30, 0x44>, AMDGPUSample_l>;
-defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x31, 0x45>, AMDGPUSample_b>;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x61, 0x46>, AMDGPUSample_b_cl>;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x32, 0x47>, AMDGPUSample_lz>;
-defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x33, 0x48>, AMDGPUSample_c>;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x62, 0x49>, AMDGPUSample_c_cl>;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x63, 0x4c>, AMDGPUSample_c_l>;
-defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x64, 0x4d>, AMDGPUSample_c_b>;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x65, 0x4e>, AMDGPUSample_c_b_cl>;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x34, 0x4f>, AMDGPUSample_c_lz>;
-defm IMAGE_GATHER4_O : MIMG_Gather_WQM <mimgopc<0x35, 0x50>, AMDGPUSample_o>;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x51>, AMDGPUSample_cl_o>;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x54>, AMDGPUSample_l_o>;
-defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x55>, AMDGPUSample_b_o>;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x56>, AMDGPUSample_b_cl_o>;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x36, 0x57>, AMDGPUSample_lz_o>;
-defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x58>, AMDGPUSample_c_o>;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x59>, AMDGPUSample_c_cl_o>;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<MIMG.NOP, 0x5c>, AMDGPUSample_c_l_o>;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x5d>, AMDGPUSample_c_b_o>;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, 0x5e>, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x37, 0x5f>, AMDGPUSample_c_lz_o>;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <mimgopc<0x40, 0x40, 0x21>, AMDGPUSample_cl>;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <mimgopc<0x1c, 0x1c, 0x22>, AMDGPUSample_d>;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <mimgopc<0x41, 0x41, 0x23>, AMDGPUSample_d_cl>;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <mimgopc<0x1d, 0x1d, 0x24>, AMDGPUSample_l>;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <mimgopc<0x1e, 0x1e, 0x25>, AMDGPUSample_b>;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <mimgopc<0x42, 0x42, 0x26>, AMDGPUSample_b_cl>;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <mimgopc<0x1f, 0x1f, 0x27>, AMDGPUSample_lz>;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <mimgopc<0x20, 0x20, 0x28>, AMDGPUSample_c>;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <mimgopc<0x43, 0x43, 0x29>, AMDGPUSample_c_cl>;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <mimgopc<0x21, 0x21, 0x2a>, AMDGPUSample_c_d>;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <mimgopc<0x44, 0x44, 0x2b>, AMDGPUSample_c_d_cl>;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <mimgopc<0x22, 0x22, 0x2c>, AMDGPUSample_c_l>;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <mimgopc<0x23, 0x23, 0x2d>, AMDGPUSample_c_b>;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <mimgopc<0x45, 0x45, 0x2e>, AMDGPUSample_c_b_cl>;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <mimgopc<0x24, 0x24, 0x2f>, AMDGPUSample_c_lz>;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <mimgopc<0x25, 0x25, 0x30>, AMDGPUSample_o>;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <mimgopc<0x46, 0x46, 0x31>, AMDGPUSample_cl_o>;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <mimgopc<0x26, 0x26, 0x32>, AMDGPUSample_d_o>;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <mimgopc<0x47, 0x47, 0x33>, AMDGPUSample_d_cl_o>;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <mimgopc<0x27, 0x27, 0x34>, AMDGPUSample_l_o>;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <mimgopc<0x28, 0x28, 0x35>, AMDGPUSample_b_o>;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x48, 0x48, 0x36>, AMDGPUSample_b_cl_o>;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <mimgopc<0x29, 0x29, 0x37>, AMDGPUSample_lz_o>;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <mimgopc<0x2a, 0x2a, 0x38>, AMDGPUSample_c_o>;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <mimgopc<0x49, 0x49, 0x39>, AMDGPUSample_c_cl_o>;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <mimgopc<0x2b, 0x2b, 0x3a>, AMDGPUSample_c_d_o>;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <mimgopc<0x4a, 0x4a, 0x3b>, AMDGPUSample_c_d_cl_o>;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <mimgopc<0x2c, 0x2c, 0x3c>, AMDGPUSample_c_l_o>;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <mimgopc<0x4b, 0x4b, 0x3e>, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <mimgopc<0x2d, 0x2d, 0x3d>, AMDGPUSample_c_b_o>;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <mimgopc<0x2e, 0x2e, 0x3f>, AMDGPUSample_c_lz_o>;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <mimgopc<0x2f, 0x2f, 0x40>, AMDGPUSample>;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <mimgopc<0x60, 0x60, 0x41>, AMDGPUSample_cl>;
+defm IMAGE_GATHER4_L : MIMG_Gather <mimgopc<0x30, 0x30, 0x44>, AMDGPUSample_l>;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <mimgopc<0x31, 0x31, 0x45>, AMDGPUSample_b>;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <mimgopc<0x61, 0x61, 0x46>, AMDGPUSample_b_cl>;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <mimgopc<0x32, 0x32, 0x47>, AMDGPUSample_lz>;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <mimgopc<0x33, 0x33, 0x48>, AMDGPUSample_c>;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <mimgopc<0x62, 0x62, 0x49>, AMDGPUSample_c_cl>;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <mimgopc<0x63, 0x63, 0x4c>, AMDGPUSample_c_l>;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <mimgopc<0x64, 0x64, 0x4d>, AMDGPUSample_c_b>;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <mimgopc<0x65, 0x65, 0x4e>, AMDGPUSample_c_b_cl>;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <mimgopc<0x34, 0x34, 0x4f>, AMDGPUSample_c_lz>;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <mimgopc<0x35, 0x35, 0x50>, AMDGPUSample_o>;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x51>, AMDGPUSample_cl_o>;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <mimgopc<MIMG.NOP, MIMG.NOP, 0x54>, AMDGPUSample_l_o>;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x55>, AMDGPUSample_b_o>;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <mimgopc<MIMG.NOP, MIMG.NOP, 0x56>, AMDGPUSample_b_cl_o>;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <mimgopc<0x36, 0x36, 0x57>, AMDGPUSample_lz_o>;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x58>, AMDGPUSample_c_o>;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x59>, AMDGPUSample_c_cl_o>;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <mimgopc<MIMG.NOP, MIMG.NOP, 0x5c>, AMDGPUSample_c_l_o>;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x5d>, AMDGPUSample_c_b_o>;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <mimgopc<MIMG.NOP, MIMG.NOP, 0x5e>, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <mimgopc<0x37, 0x37, 0x5f>, AMDGPUSample_c_lz_o>;
let SubtargetPredicate = isGFX9Plus in
-defm IMAGE_GATHER4H : MIMG_Gather <mimgopc<0x90, 0x61, 0x42>, AMDGPUSample, 1, "image_gather4h">;
-
-defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x38, 0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">;
-
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<MIMG.NOP, 0x68>, AMDGPUSample_cd>;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, 0x69>, AMDGPUSample_cd_cl>;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6a>, AMDGPUSample_c_cd>;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6b>, AMDGPUSample_c_cd_cl>;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6c>, AMDGPUSample_cd_o>;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6d>, AMDGPUSample_cd_cl_o>;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6e>, AMDGPUSample_c_cd_o>;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, 0x6f>, AMDGPUSample_c_cd_cl_o>;
+defm IMAGE_GATHER4H : MIMG_Gather <mimgopc<0x90, 0x90, 0x61, 0x42>, AMDGPUSample, 1, "image_gather4h">;
+
+defm IMAGE_GET_LOD : MIMG_Sampler <mimgopc<0x38, 0x38, 0x60>, AMDGPUSample, 1, 0, 1, "image_get_lod">;
+
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x68>, AMDGPUSample_cd>;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x69>, AMDGPUSample_cd_cl>;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6a>, AMDGPUSample_c_cd>;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6b>, AMDGPUSample_c_cd_cl>;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6c>, AMDGPUSample_cd_o>;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6d>, AMDGPUSample_cd_cl_o>;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6e>, AMDGPUSample_c_cd_o>;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x6f>, AMDGPUSample_c_cd_cl_o>;
} // End OtherPredicates = [HasExtendedImageInsts]
let OtherPredicates = [HasExtendedImageInsts,HasG16] in {
-defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0x39, 0xa2>, AMDGPUSample_d, 0, 1>;
-defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 1>;
-defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0x3a, 0xaa>, AMDGPUSample_c_d, 0, 1>;
-defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 1>;
-defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0x3b, 0xb2>, AMDGPUSample_d_o, 0, 1>;
-defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 1>;
-defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 1>;
-defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 1>;
-defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 1>;
-defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 1>;
-defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <mimgopc<0x39, 0x39, 0xa2>, AMDGPUSample_d, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <mimgopc<0x5f, 0x5f, 0xa3>, AMDGPUSample_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <mimgopc<0x3a, 0x3a, 0xaa>, AMDGPUSample_c_d, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <mimgopc<0x54, 0x54, 0xab>, AMDGPUSample_c_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <mimgopc<0x3b, 0x3b, 0xb2>, AMDGPUSample_d_o, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x55, 0x55, 0xb3>, AMDGPUSample_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <mimgopc<0x3c, 0x3c, 0xba>, AMDGPUSample_c_d_o, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <mimgopc<0x56, 0x56, 0xbb>, AMDGPUSample_c_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe8>, AMDGPUSample_cd, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xe9>, AMDGPUSample_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xea>, AMDGPUSample_c_cd, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xeb>, AMDGPUSample_c_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xec>, AMDGPUSample_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xed>, AMDGPUSample_cd_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xee>, AMDGPUSample_c_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <mimgopc<MIMG.NOP, MIMG.NOP, 0xef>, AMDGPUSample_c_cd_cl_o, 0, 1>;
} // End OtherPredicates = [HasExtendedImageInsts,HasG16]
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", mimgopc<0x7e>>;
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", mimgopc<0x7f>>;
let SubtargetPredicate = isGFX10Only, OtherPredicates = [HasGFX10_AEncoding] in
-defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<MIMG.NOP, 0x80>, "image_msaa_load", 1, 0, 0, 1>;
+defm IMAGE_MSAA_LOAD_X : MIMG_NoSampler <mimgopc<MIMG.NOP, MIMG.NOP, 0x80>, "image_msaa_load", 1, 0, 0, 1>;
let OtherPredicates = [HasGFX10_AEncoding] in
-defm IMAGE_MSAA_LOAD : MIMG_MSAA_Load <mimgopc<0x18, MIMG.NOP>, "image_msaa_load">;
+defm IMAGE_MSAA_LOAD : MIMG_MSAA_Load <mimgopc<0x18, 0x18, MIMG.NOP>, "image_msaa_load">;
let OtherPredicates = [HasGFX10_AEncoding] in {
-defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x19, 0xe6>, "image_bvh_intersect_ray", 0, 0>;
-defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x19, 0xe6>, "image_bvh_intersect_ray", 0, 1>;
-defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 0>;
-defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 1>;
+defm IMAGE_BVH_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x19, 0x19, 0xe6>, "image_bvh_intersect_ray", 0, 0>;
+defm IMAGE_BVH_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x19, 0x19, 0xe6>, "image_bvh_intersect_ray", 0, 1>;
+defm IMAGE_BVH64_INTERSECT_RAY : MIMG_IntersectRay<mimgopc<0x1a, 0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 0>;
+defm IMAGE_BVH64_INTERSECT_RAY_a16 : MIMG_IntersectRay<mimgopc<0x1a, 0x1a, 0xe7>, "image_bvh64_intersect_ray", 1, 1>;
} // End OtherPredicates = [HasGFX10_AEncoding]
} // End let OtherPredicates = [HasImageInsts]
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600.h
index 2b0a887c61fa..6c40c2813e20 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600.h
@@ -27,7 +27,7 @@ FunctionPass *createR600ClauseMergePass();
FunctionPass *createR600Packetizer();
FunctionPass *createR600ControlFlowFinalizer();
FunctionPass *createR600MachineCFGStructurizerPass();
-FunctionPass *createR600ISelDag(TargetMachine &TM, CodeGenOpt::Level OptLevel);
+FunctionPass *createR600ISelDag(TargetMachine &TM, CodeGenOptLevel OptLevel);
ModulePass *createR600OpenCLImageTypeLoweringPass();
void initializeR600ClauseMergePassPass(PassRegistry &);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp
index 20c2ff8a4fd7..293db13f34f6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp
@@ -30,7 +30,7 @@ class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
public:
R600DAGToDAGISel() = delete;
- explicit R600DAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel)
+ explicit R600DAGToDAGISel(TargetMachine &TM, CodeGenOptLevel OptLevel)
: AMDGPUDAGToDAGISel(TM, OptLevel) {}
void Select(SDNode *N) override;
@@ -183,6 +183,6 @@ bool R600DAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
/// This pass converts a legalized DAG into an R600-specific
/// DAG, ready for instruction scheduling.
FunctionPass *llvm::createR600ISelDag(TargetMachine &TM,
- CodeGenOpt::Level OptLevel) {
+ CodeGenOptLevel OptLevel) {
return new R600DAGToDAGISel(TM, OptLevel);
}
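
The R600 hunks above (and the TargetMachine ones below) track the upstream rename of CodeGenOpt::Level to the scoped enum CodeGenOptLevel. A minimal sketch of the call-site change, assuming the enum still lives in llvm/Support/CodeGen.h; the helper isOptimized is hypothetical, not part of the patch:

#include "llvm/Support/CodeGen.h"

// Old signature:  void foo(llvm::CodeGenOpt::Level OL);
// New signature uses the scoped enumeration, so values must be qualified and
// no longer convert implicitly to integers.
static bool isOptimized(llvm::CodeGenOptLevel OL) {
  return OL != llvm::CodeGenOptLevel::None;
}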
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index ad072cfe23b1..c1ba9c514874 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -101,7 +101,7 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f32, Expand);
- setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR},
+ setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
MVT::f64, Custom);
setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32}, Custom);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td
index f4dfbe8adc75..f82bd55beccc 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -782,7 +782,7 @@ def SETNE_DX10 : R600_2OP <
def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
def TRUNC : R600_1OP_Helper <0x11, "TRUNC", ftrunc>;
def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
-def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
+def RNDNE : R600_1OP_Helper <0x13, "RNDNE", froundeven>;
def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
def MOV : R600_1OP <0x19, "MOV", []>;
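
The RNDNE hunk swaps the selection pattern from frint to froundeven, i.e. round to nearest with ties going to the even neighbour, which is what a round-to-nearest-even op computes. A small standalone sketch of that tie-breaking rule (plain C++, not AMDGPU code; roundEven is a hypothetical helper):

#include <cmath>
#include <cstdio>

// Round to nearest, ties to even -- the behaviour froundeven models.
static double roundEven(double X) {
  double F = std::floor(X);
  double Frac = X - F;
  if (Frac < 0.5) return F;
  if (Frac > 0.5) return F + 1.0;
  return std::fmod(F, 2.0) == 0.0 ? F : F + 1.0; // exactly halfway: pick even
}

int main() {
  std::printf("%g %g %g\n", roundEven(2.5), roundEven(3.5), roundEven(4.5));
  // Prints: 2 4 4
  return 0;
}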
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
index 2a15c0123b74..195dc4f9a0f4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600OpenCLImageTypeLoweringPass.cpp
@@ -163,11 +163,11 @@ class R600OpenCLImageTypeLoweringPass : public ModulePass {
Value *Replacement = nullptr;
StringRef Name = F->getName();
- if (Name.startswith(GetImageResourceIDFunc)) {
+ if (Name.starts_with(GetImageResourceIDFunc)) {
Replacement = ConstantInt::get(Int32Type, ResourceID);
- } else if (Name.startswith(GetImageSizeFunc)) {
+ } else if (Name.starts_with(GetImageSizeFunc)) {
Replacement = &ImageSizeArg;
- } else if (Name.startswith(GetImageFormatFunc)) {
+ } else if (Name.starts_with(GetImageFormatFunc)) {
Replacement = &ImageFormatArg;
} else {
continue;
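
This hunk is purely mechanical: StringRef::startswith was deprecated in favour of starts_with, matching the std::string_view spelling, with no change in behaviour. A one-line sketch; the prefix literal here is illustrative only:

#include "llvm/ADT/StringRef.h"

// Previously: Name.startswith("llvm.OpenCL.")
static bool isOpenCLBuiltin(llvm::StringRef Name) {
  return Name.starts_with("llvm.OpenCL."); // same semantics, new spelling
}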
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
index 36840587d219..6cd4fd42444d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.cpp
@@ -53,7 +53,7 @@ R600TargetMachine::R600TargetMachine(const Target &T, const Triple &TT,
TargetOptions Options,
std::optional<Reloc::Model> RM,
std::optional<CodeModel::Model> CM,
- CodeGenOpt::Level OL, bool JIT)
+ CodeGenOptLevel OL, bool JIT)
: AMDGPUTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL) {
setRequiresStructuredCFG(true);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.h
index f0e3cd352642..3fe54c778fe1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600TargetMachine.h
@@ -33,7 +33,7 @@ public:
R600TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
std::optional<Reloc::Model> RM,
- std::optional<CodeModel::Model> CM, CodeGenOpt::Level OL,
+ std::optional<CodeModel::Model> CM, CodeGenOptLevel OL,
bool JIT);
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index b87cd8c66cc8..932c0d6216ce 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -19,6 +19,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
#include "llvm/Target/TargetMachine.h"
@@ -206,9 +207,12 @@ bool SIAnnotateControlFlow::openIf(BranchInst *Term) {
if (isUniform(Term))
return false;
- Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
- Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
- push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+ IRBuilder<> IRB(Term);
+ Value *IfCall = IRB.CreateCall(If, {Term->getCondition()});
+ Value *Cond = IRB.CreateExtractValue(IfCall, {0});
+ Value *Mask = IRB.CreateExtractValue(IfCall, {1});
+ Term->setCondition(Cond);
+ push(Term->getSuccessor(1), Mask);
return true;
}
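
The SIAnnotateControlFlow hunks above and below replace the raw CallInst::Create / ExtractValueInst::Create constructors with an IRBuilder anchored at the insertion point. A minimal sketch of that shape, assuming a hypothetical callee that returns a two-element aggregate (the helper name emitTwoResultCall is not from the patch):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include <utility>

using namespace llvm;

// Instead of:
//   Value *Ret = CallInst::Create(Callee, Args, "", InsertBefore);
//   Value *V0  = ExtractValueInst::Create(Ret, 0, "", InsertBefore);
// build the same IR through a builder positioned before InsertBefore.
static std::pair<Value *, Value *>
emitTwoResultCall(FunctionCallee Callee, Value *Arg, Instruction *InsertBefore) {
  IRBuilder<> B(InsertBefore);
  Value *Call = B.CreateCall(Callee, {Arg});
  Value *First = B.CreateExtractValue(Call, {0});
  Value *Second = B.CreateExtractValue(Call, {1});
  return {First, Second};
}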
@@ -217,15 +221,24 @@ bool SIAnnotateControlFlow::insertElse(BranchInst *Term) {
if (isUniform(Term)) {
return false;
}
- Value *Ret = CallInst::Create(Else, popSaved(), "", Term);
- Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
- push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
+
+ IRBuilder<> IRB(Term);
+ Value *ElseCall = IRB.CreateCall(Else, {popSaved()});
+ Value *Cond = IRB.CreateExtractValue(ElseCall, {0});
+ Value *Mask = IRB.CreateExtractValue(ElseCall, {1});
+ Term->setCondition(Cond);
+ push(Term->getSuccessor(1), Mask);
return true;
}
/// Recursively handle the condition leading to a loop
Value *SIAnnotateControlFlow::handleLoopCondition(
Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term) {
+
+ auto CreateBreak = [this, Cond, Broken](Instruction *I) -> CallInst * {
+ return IRBuilder<>(I).CreateCall(IfBreak, {Cond, Broken});
+ };
+
if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
BasicBlock *Parent = Inst->getParent();
Instruction *Insert;
@@ -235,8 +248,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
}
- Value *Args[] = { Cond, Broken };
- return CallInst::Create(IfBreak, Args, "", Insert);
+ return CreateBreak(Insert);
}
// Insert IfBreak in the loop header TERM for constant COND other than true.
@@ -244,14 +256,12 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
Instruction *Insert = Cond == BoolTrue ?
Term : L->getHeader()->getTerminator();
- Value *Args[] = { Cond, Broken };
- return CallInst::Create(IfBreak, Args, "", Insert);
+ return CreateBreak(Insert);
}
if (isa<Argument>(Cond)) {
Instruction *Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
- Value *Args[] = { Cond, Broken };
- return CallInst::Create(IfBreak, Args, "", Insert);
+ return CreateBreak(Insert);
}
llvm_unreachable("Unhandled loop condition!");
@@ -268,7 +278,8 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
return false;
BasicBlock *Target = Term->getSuccessor(1);
- PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken", &Target->front());
+ PHINode *Broken = PHINode::Create(IntMask, 0, "phi.broken");
+ Broken->insertBefore(Target->begin());
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
@@ -286,7 +297,8 @@ bool SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
Broken->addIncoming(PHIValue, Pred);
}
- Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
+ CallInst *LoopCall = IRBuilder<>(Term).CreateCall(Loop, {Arg});
+ Term->setCondition(LoopCall);
push(Term->getSuccessor(0), Arg);
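
The handleLoop hunk also switches PHI creation to a detached-then-insert style: PHINode::Create no longer receives an insert-before instruction here; the node is placed with the iterator-based insertBefore overload instead. A small sketch of that pattern (createHeaderPhi is an illustrative name, not from the patch):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Create the PHI detached, then insert it at the start of the block via the
// iterator overload, as the hunk above does for %phi.broken.
static PHINode *createHeaderPhi(Type *Ty, BasicBlock *Target) {
  PHINode *Phi = PHINode::Create(Ty, /*NumReservedValues=*/0, "phi.broken");
  Phi->insertBefore(Target->begin());
  return Phi;
}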
@@ -325,7 +337,7 @@ bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
// Split edge to make Def dominate Use
FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
}
- CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
+ IRBuilder<>(FirstInsertionPt).CreateCall(EndCf, {Exec});
}
return true;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
index cd1818285e3e..b291400a947c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -44,6 +44,7 @@ enum {
GFX90A = 8,
GFX940 = 9,
GFX11 = 10,
+ GFX12 = 11,
};
}
@@ -80,19 +81,21 @@ enum : uint64_t {
MTBUF = 1 << 18,
SMRD = 1 << 19,
MIMG = 1 << 20,
- EXP = 1 << 21,
- FLAT = 1 << 22,
- DS = 1 << 23,
+ VIMAGE = 1 << 21,
+ VSAMPLE = 1 << 22,
+ EXP = 1 << 23,
+ FLAT = 1 << 24,
+ DS = 1 << 25,
// Pseudo instruction formats.
- VGPRSpill = 1 << 24,
- SGPRSpill = 1 << 25,
+ VGPRSpill = 1 << 26,
+ SGPRSpill = 1 << 27,
// LDSDIR instruction format.
- LDSDIR = 1 << 26,
+ LDSDIR = 1 << 28,
// VINTERP instruction format.
- VINTERP = 1 << 27,
+ VINTERP = 1 << 29,
// High bits - other information.
VM_CNT = UINT64_C(1) << 32,
@@ -161,6 +164,9 @@ enum : uint64_t {
// Is never uniform.
IsNeverUniform = UINT64_C(1) << 61,
+
+ // ds_gws_* instructions.
+ GWS = UINT64_C(1) << 62,
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
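
The SIDefines.h hunk above renumbers the instruction-format bits to make room for the new VIMAGE and VSAMPLE encodings and adds a GWS flag in the high half. A standalone sketch of how such TSFlags masks are typically tested; the constants are copied from the hunk, but the helper functions are illustrative, not the in-tree SIInstrInfo queries:

#include <cstdint>

// Bit values from the hunk above (GWS sits in the high 32 bits).
constexpr uint64_t VIMAGE  = 1ull << 21;
constexpr uint64_t VSAMPLE = 1ull << 22;
constexpr uint64_t GWS     = 1ull << 62;

// "Flags" stands in for an instruction's MCInstrDesc::TSFlags value.
static bool isVImageOrVSample(uint64_t Flags) {
  return (Flags & (VIMAGE | VSAMPLE)) != 0;
}
static bool isGWS(uint64_t Flags) { return (Flags & GWS) != 0; }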
@@ -207,6 +213,9 @@ enum OperandType : unsigned {
OPERAND_REG_INLINE_C_V2INT32,
OPERAND_REG_INLINE_C_V2FP32,
+ // Operand for split barrier inline constant
+ OPERAND_INLINE_SPLIT_BARRIER_INT32,
+
/// Operand with 32-bit immediate that uses the constant bus.
OPERAND_KIMM32,
OPERAND_KIMM16,
@@ -326,13 +335,20 @@ enum : unsigned {
LITERAL_CONST = 255,
VGPR_MIN = 256,
VGPR_MAX = 511,
- IS_VGPR = 256 // Indicates VGPR or AGPR
+ IS_VGPR = 256, // Indicates VGPR or AGPR
};
} // namespace EncValues
-} // namespace AMDGPU
-namespace AMDGPU {
+// Register codes as defined in the TableGen's HWEncoding field.
+namespace HWEncoding {
+enum : unsigned {
+ REG_IDX_MASK = 0xff,
+ IS_VGPR_OR_AGPR = 1 << 8,
+ IS_HI = 1 << 9, // High 16-bit register.
+};
+} // namespace HWEncoding
+
namespace CPol {
enum CPol {
@@ -343,7 +359,47 @@ enum CPol {
SC0 = GLC,
SC1 = SCC,
NT = SLC,
- ALL = GLC | SLC | DLC | SCC
+ ALL_pregfx12 = GLC | SLC | DLC | SCC,
+ SWZ_pregfx12 = 8,
+
+ // Below are GFX12+ cache policy bits
+
+ // Temporal hint
+ TH = 0x7, // All TH bits
+ TH_RT = 0, // regular
+ TH_NT = 1, // non-temporal
+ TH_HT = 2, // high-temporal
+ TH_LU = 3, // last use
+ TH_RT_WB = 3, // regular (CU, SE), high-temporal with write-back (MALL)
+ TH_NT_RT = 4, // non-temporal (CU, SE), regular (MALL)
+ TH_RT_NT = 5, // regular (CU, SE), non-temporal (MALL)
+ TH_NT_HT = 6, // non-temporal (CU, SE), high-temporal (MALL)
+ TH_NT_WB = 7, // non-temporal (CU, SE), high-temporal with write-back (MALL)
+ TH_BYPASS = 3, // only to be used with scope = 3
+
+ TH_RESERVED = 7, // unused value for load insts
+
+ // Bits of TH for atomics
+ TH_ATOMIC_RETURN = GLC, // Returning vs non-returning
+ TH_ATOMIC_NT = SLC, // Non-temporal vs regular
+ TH_ATOMIC_CASCADE = 4, // Cascading vs regular
+
+ // Scope
+ SCOPE = 0x3 << 3, // All Scope bits
+ SCOPE_CU = 0 << 3,
+ SCOPE_SE = 1 << 3,
+ SCOPE_DEV = 2 << 3,
+ SCOPE_SYS = 3 << 3,
+
+ SWZ = 1 << 6, // Swizzle bit
+
+ ALL = TH | SCOPE,
+
+ // Helper bits
+ TH_TYPE_LOAD = 1 << 7, // TH_LOAD policy
+ TH_TYPE_STORE = 1 << 8, // TH_STORE policy
+ TH_TYPE_ATOMIC = 1 << 9, // TH_ATOMIC policy
+ TH_REAL_BYPASS = 1 << 10, // is TH=3 bypass policy or not
};
} // namespace CPol
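
On GFX12 the cache-policy operand is no longer a set of GLC/SLC/DLC bits: per the enum above it packs a 3-bit temporal hint in bits [2:0], a 2-bit scope in bits [4:3], and a swizzle bit at bit 6. A standalone sketch that composes and decodes such a value; the constants mirror the hunk, while main and CPolVal are illustrative:

#include <cstdint>
#include <cstdio>

// Field layout copied from the GFX12 half of the CPol enum above.
constexpr unsigned TH        = 0x7;      // temporal hint, bits [2:0]
constexpr unsigned TH_NT     = 1;        // non-temporal
constexpr unsigned SCOPE     = 0x3 << 3; // scope, bits [4:3]
constexpr unsigned SCOPE_SYS = 3 << 3;   // system scope
constexpr unsigned SWZ       = 1 << 6;   // swizzle bit

int main() {
  unsigned CPolVal = TH_NT | SCOPE_SYS; // non-temporal access, system scope
  std::printf("th=%u scope=%u swz=%u\n", CPolVal & TH, (CPolVal & SCOPE) >> 3,
              (unsigned)((CPolVal & SWZ) != 0));
  // Prints: th=1 scope=3 swz=0
  return 0;
}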
@@ -360,8 +416,8 @@ enum Id { // Message ID, width(4) [3:0].
ID_DEALLOC_VGPRS_GFX11Plus = 3, // reused in GFX11
ID_SAVEWAVE = 4, // added in GFX8, removed in GFX11
- ID_STALL_WAVE_GEN = 5, // added in GFX9
- ID_HALT_WAVES = 6, // added in GFX9
+ ID_STALL_WAVE_GEN = 5, // added in GFX9, removed in GFX12
+ ID_HALT_WAVES = 6, // added in GFX9, removed in GFX12
ID_ORDERED_PS_DONE = 7, // added in GFX9, removed in GFX11
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10
ID_GS_ALLOC_REQ = 9, // added in GFX9
@@ -375,6 +431,7 @@ enum Id { // Message ID, width(4) [3:0].
ID_RTN_GET_REALTIME = 131,
ID_RTN_SAVE_WAVE = 132,
ID_RTN_GET_TBA = 133,
+ ID_RTN_GET_SE_AID_ID = 134,
ID_MASK_PreGFX11_ = 0xF,
ID_MASK_GFX11Plus_ = 0xFF
@@ -425,6 +482,9 @@ enum Id { // HwRegCode, (6) [5:0]
ID_GPR_ALLOC = 5,
ID_LDS_ALLOC = 6,
ID_IB_STS = 7,
+ ID_PERF_SNAPSHOT_DATA_gfx12 = 10,
+ ID_PERF_SNAPSHOT_PC_LO_gfx12 = 11,
+ ID_PERF_SNAPSHOT_PC_HI_gfx12 = 12,
ID_MEM_BASES = 15,
ID_TBA_LO = 16,
ID_TBA_HI = 17,
@@ -436,12 +496,23 @@ enum Id { // HwRegCode, (6) [5:0]
ID_HW_ID1 = 23,
ID_HW_ID2 = 24,
ID_POPS_PACKER = 25,
- ID_PERF_SNAPSHOT_DATA = 27,
+ ID_PERF_SNAPSHOT_DATA_gfx11 = 27,
ID_SHADER_CYCLES = 29,
-
- // Register numbers reused in GFX11+
- ID_PERF_SNAPSHOT_PC_LO = 18,
- ID_PERF_SNAPSHOT_PC_HI = 19,
+ ID_SHADER_CYCLES_HI = 30,
+ ID_DVGPR_ALLOC_LO = 31,
+ ID_DVGPR_ALLOC_HI = 32,
+
+ // Register numbers reused in GFX11
+ ID_PERF_SNAPSHOT_PC_LO_gfx11 = 18,
+ ID_PERF_SNAPSHOT_PC_HI_gfx11 = 19,
+
+ // Register numbers reused in GFX12+
+ ID_STATE_PRIV = 4,
+ ID_PERF_SNAPSHOT_DATA1 = 15,
+ ID_PERF_SNAPSHOT_DATA2 = 16,
+ ID_EXCP_FLAG_PRIV = 17,
+ ID_EXCP_FLAG_USER = 18,
+ ID_TRAP_CTRL = 19,
// GFX940 specific registers
ID_XCC_ID = 20,
@@ -958,6 +1029,14 @@ enum Register_Flag : uint8_t {
} // namespace AMDGPU
+namespace AMDGPU {
+namespace Barrier {
+enum Type { TRAP = -2, WORKGROUP = -1 };
+} // namespace Barrier
+} // namespace AMDGPU
+
+// clang-format off
+
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
@@ -1050,6 +1129,9 @@ enum Register_Flag : uint8_t {
#define S_00B848_DX10_CLAMP(x) (((x) & 0x1) << 21)
#define G_00B848_DX10_CLAMP(x) (((x) >> 21) & 0x1)
#define C_00B848_DX10_CLAMP 0xFFDFFFFF
+#define S_00B848_RR_WG_MODE(x) (((x) & 0x1) << 21)
+#define G_00B848_RR_WG_MODE(x) (((x) >> 21) & 0x1)
+#define C_00B848_RR_WG_MODE 0xFFDFFFFF
#define S_00B848_DEBUG_MODE(x) (((x) & 0x1) << 22)
#define G_00B848_DEBUG_MODE(x) (((x) >> 22) & 0x1)
#define C_00B848_DEBUG_MODE 0xFFBFFFFF
@@ -1066,7 +1148,6 @@ enum Register_Flag : uint8_t {
#define G_00B848_FWD_PROGRESS(x) (((x) >> 31) & 0x1)
#define C_00B848_FWD_PROGRESS 0x7FFFFFFF
-
// Helpers for setting FLOAT_MODE
#define FP_ROUND_ROUND_TO_NEAREST 0
#define FP_ROUND_ROUND_TO_INF 1
@@ -1108,6 +1189,9 @@ enum Register_Flag : uint8_t {
#define R_SPILLED_SGPRS 0x4
#define R_SPILLED_VGPRS 0x8
+
+// clang-format on
+
} // End namespace llvm
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index db323465c153..86980ee851bb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -88,7 +88,7 @@ public:
// VGPR to SGPR copy being processed
MachineInstr *Copy;
// All SALU instructions reachable from this copy in SSA graph
- DenseSet<MachineInstr *> SChain;
+ SetVector<MachineInstr *> SChain;
// Number of SGPR to VGPR copies that are used to put the SALU computation
// results back to VALU.
unsigned NumSVCopies;
@@ -125,7 +125,7 @@ class SIFixSGPRCopies : public MachineFunctionPass {
SmallVector<MachineInstr*, 4> PHINodes;
SmallVector<MachineInstr*, 4> S2VCopies;
unsigned NextVGPRToSGPRCopyID;
- DenseMap<unsigned, V2SCopyInfo> V2SCopies;
+ MapVector<unsigned, V2SCopyInfo> V2SCopies;
DenseMap<MachineInstr *, SetVector<unsigned>> SiblingPenalty;
public:
@@ -152,6 +152,13 @@ public:
void processPHINode(MachineInstr &MI);
+ // Check if MO is an immediate materialized into a VGPR, and if so replace it
+ // with an SGPR immediate. The VGPR immediate is also deleted if it does not
+ // have any other uses.
+ bool tryMoveVGPRConstToSGPR(MachineOperand &MO, Register NewDst,
+ MachineBasicBlock *BlockToInsertTo,
+ MachineBasicBlock::iterator PointToInsertTo);
+
StringRef getPassName() const override { return "SI Fix SGPR copies"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -350,7 +357,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
return false;
// FIXME: Handle copies with sub-regs.
- if (Copy->getOperand(0).getSubReg())
+ if (Copy->getOperand(1).getSubReg())
return false;
switch (MoveImm->getOpcode()) {
@@ -360,7 +367,7 @@ static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
SMovOp = AMDGPU::S_MOV_B32;
break;
case AMDGPU::V_MOV_B64_PSEUDO:
- SMovOp = AMDGPU::S_MOV_B64;
+ SMovOp = AMDGPU::S_MOV_B64_IMM_PSEUDO;
break;
}
Imm = ImmOp->getImm();
@@ -662,13 +669,17 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
: MBB;
MachineBasicBlock::iterator PointToInsertCopy =
MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I;
- MachineInstr *NewCopy =
- BuildMI(*BlockToInsertCopy, PointToInsertCopy,
- PointToInsertCopy->getDebugLoc(),
- TII->get(AMDGPU::COPY), NewDst)
- .addReg(MO.getReg());
- MO.setReg(NewDst);
- analyzeVGPRToSGPRCopy(NewCopy);
+
+ if (!tryMoveVGPRConstToSGPR(MO, NewDst, BlockToInsertCopy,
+ PointToInsertCopy)) {
+ MachineInstr *NewCopy =
+ BuildMI(*BlockToInsertCopy, PointToInsertCopy,
+ PointToInsertCopy->getDebugLoc(),
+ TII->get(AMDGPU::COPY), NewDst)
+ .addReg(MO.getReg());
+ MO.setReg(NewDst);
+ analyzeVGPRToSGPRCopy(NewCopy);
+ }
}
}
}
@@ -765,7 +776,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
for (auto MI : PHINodes) {
processPHINode(*MI);
}
- if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
+ if (MF.getTarget().getOptLevel() > CodeGenOptLevel::None && EnableM0Merge)
hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
SiblingPenalty.clear();
@@ -829,6 +840,32 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}
}
+bool SIFixSGPRCopies::tryMoveVGPRConstToSGPR(
+ MachineOperand &MaybeVGPRConstMO, Register DstReg,
+ MachineBasicBlock *BlockToInsertTo,
+ MachineBasicBlock::iterator PointToInsertTo) {
+
+ MachineInstr *DefMI = MRI->getVRegDef(MaybeVGPRConstMO.getReg());
+ if (!DefMI || !DefMI->isMoveImmediate())
+ return false;
+
+ MachineOperand *SrcConst = TII->getNamedOperand(*DefMI, AMDGPU::OpName::src0);
+ if (SrcConst->isReg())
+ return false;
+
+ const TargetRegisterClass *SrcRC =
+ MRI->getRegClass(MaybeVGPRConstMO.getReg());
+ unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC);
+ unsigned MoveOp = MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+ BuildMI(*BlockToInsertTo, PointToInsertTo, PointToInsertTo->getDebugLoc(),
+ TII->get(MoveOp), DstReg)
+ .add(*SrcConst);
+ if (MRI->hasOneUse(MaybeVGPRConstMO.getReg()))
+ DefMI->eraseFromParent();
+ MaybeVGPRConstMO.setReg(DstReg);
+ return true;
+}
+
bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
MachineBasicBlock::iterator &I) {
Register DstReg = MI.getOperand(0).getReg();
@@ -846,25 +883,10 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
.add(MI.getOperand(1));
MI.getOperand(1).setReg(TmpReg);
- } else {
- MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
- if (DefMI && DefMI->isMoveImmediate()) {
- MachineOperand SrcConst = DefMI->getOperand(AMDGPU::getNamedOperandIdx(
- DefMI->getOpcode(), AMDGPU::OpName::src0));
- if (!SrcConst.isReg()) {
- const TargetRegisterClass *SrcRC = MRI->getRegClass(SrcReg);
- unsigned MoveSize = TRI->getRegSizeInBits(*SrcRC);
- unsigned MoveOp =
- MoveSize == 64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(MoveOp),
- DstReg)
- .add(SrcConst);
- I = std::next(I);
- if (MRI->hasOneUse(SrcReg))
- DefMI->eraseFromParent();
- MI.eraseFromParent();
- }
- }
+ } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
+ MI)) {
+ I = std::next(I);
+ MI.eraseFromParent();
}
return true;
}
@@ -966,7 +988,7 @@ bool SIFixSGPRCopies::needToBeConvertedToVALU(V2SCopyInfo *Info) {
for (auto J : Info->Siblings) {
auto InfoIt = V2SCopies.find(J);
if (InfoIt != V2SCopies.end()) {
- MachineInstr *SiblingCopy = InfoIt->getSecond().Copy;
+ MachineInstr *SiblingCopy = InfoIt->second.Copy;
if (SiblingCopy->isImplicitDef())
// the COPY has already been MoveToVALUed
continue;
@@ -1001,15 +1023,15 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
unsigned CurID = LoweringWorklist.pop_back_val();
auto CurInfoIt = V2SCopies.find(CurID);
if (CurInfoIt != V2SCopies.end()) {
- V2SCopyInfo C = CurInfoIt->getSecond();
+ V2SCopyInfo C = CurInfoIt->second;
LLVM_DEBUG(dbgs() << "Processing ...\n"; C.dump());
for (auto S : C.Siblings) {
auto SibInfoIt = V2SCopies.find(S);
if (SibInfoIt != V2SCopies.end()) {
- V2SCopyInfo &SI = SibInfoIt->getSecond();
+ V2SCopyInfo &SI = SibInfoIt->second;
LLVM_DEBUG(dbgs() << "Sibling:\n"; SI.dump());
if (!SI.NeedToBeConvertedToVALU) {
- set_subtract(SI.SChain, C.SChain);
+ SI.SChain.set_subtract(C.SChain);
if (needToBeConvertedToVALU(&SI))
LoweringWorklist.push_back(SI.ID);
}
@@ -1018,6 +1040,8 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
}
LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
<< " is being turned to VALU\n");
+ // TODO: MapVector::erase is inefficient. Do bulk removal with remove_if
+ // instead.
V2SCopies.erase(C.ID);
Copies.insert(C.Copy);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 9f1d6038f1b6..709de612d81d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -80,6 +80,10 @@ public:
bool updateOperand(FoldCandidate &Fold) const;
+ bool canUseImmWithOpSel(FoldCandidate &Fold) const;
+
+ bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
+
bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold) const;
@@ -196,61 +200,86 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
-bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
MachineInstr *MI = Fold.UseMI;
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
- assert(Old.isReg());
+ const uint64_t TSFlags = MI->getDesc().TSFlags;
+ assert(Old.isReg() && Fold.isImm());
- const uint64_t TSFlags = MI->getDesc().TSFlags;
- if (Fold.isImm()) {
- if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
- (!ST->hasDOTOpSelHazard() || !(TSFlags & SIInstrFlags::IsDOT)) &&
- AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
- ST->hasInv2PiInlineImm())) {
- // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
- // already set.
- unsigned Opcode = MI->getOpcode();
- int OpNo = MI->getOperandNo(&Old);
- int ModIdx = -1;
- if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
- ModIdx = AMDGPU::OpName::src0_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
- ModIdx = AMDGPU::OpName::src1_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
- ModIdx = AMDGPU::OpName::src2_modifiers;
- assert(ModIdx != -1);
- ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
- MachineOperand &Mod = MI->getOperand(ModIdx);
- unsigned Val = Mod.getImm();
- if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
- // Only apply the following transformation if that operand requires
- // a packed immediate.
- switch (TII->get(Opcode).operands()[OpNo].OperandType) {
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_IMM_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- // If upper part is all zero we do not need op_sel_hi.
- if (!isUInt<16>(Fold.ImmToFold)) {
- if (!(Fold.ImmToFold & 0xffff)) {
- Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
- return true;
- }
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
- return true;
- }
- break;
- default:
- break;
- }
- }
- }
+ if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
+ (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
+ isUInt<16>(Fold.ImmToFold) ||
+ !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
+ return false;
+
+ unsigned Opcode = MI->getOpcode();
+ int OpNo = MI->getOperandNo(&Old);
+ uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
+ switch (OpType) {
+ default:
+ return false;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ break;
}
+ return true;
+}
+
+bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
+ MachineInstr *MI = Fold.UseMI;
+ MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+ unsigned Opcode = MI->getOpcode();
+ int OpNo = MI->getOperandNo(&Old);
+
+ // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
+ // already set.
+ int ModIdx = -1;
+ if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+ ModIdx = AMDGPU::OpName::src0_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+ ModIdx = AMDGPU::OpName::src1_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+ ModIdx = AMDGPU::OpName::src2_modifiers;
+ assert(ModIdx != -1);
+ ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
+ MachineOperand &Mod = MI->getOperand(ModIdx);
+ unsigned Val = Mod.getImm();
+ if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+ return false;
+
+ // Only apply the following transformation if that operand requires
+ // a packed immediate.
+  // If the upper part is all zero we do not need op_sel_hi.
+ if (!(Fold.ImmToFold & 0xffff)) {
+ MachineOperand New =
+ MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff);
+ if (!TII->isOperandLegal(*MI, OpNo, &New))
+ return false;
+ Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ return true;
+ }
+ MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff);
+ if (!TII->isOperandLegal(*MI, OpNo, &New))
+ return false;
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
+ return true;
+}
+
+bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+ MachineInstr *MI = Fold.UseMI;
+ MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+ assert(Old.isReg());
+
+ if (Fold.isImm() && canUseImmWithOpSel(Fold))
+ return tryFoldImmWithOpSel(Fold);
+
if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();
auto Liveness = MBB->computeRegisterLiveness(TRI, AMDGPU::VCC, MI, 16);
@@ -345,9 +374,50 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold) const {
- if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+ const unsigned Opc = MI->getOpcode();
+
+ auto tryToFoldAsFMAAKorMK = [&]() {
+ if (!OpToFold->isImm())
+ return false;
+
+ const bool TryAK = OpNo == 3;
+ const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
+ MI->setDesc(TII->get(NewOpc));
+
+ // We have to fold into operand which would be Imm not into OpNo.
+ bool FoldAsFMAAKorMK =
+ tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
+ if (FoldAsFMAAKorMK) {
+ // Untie Src2 of fmac.
+ MI->untieRegOperand(3);
+ // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
+ if (OpNo == 1) {
+ MachineOperand &Op1 = MI->getOperand(1);
+ MachineOperand &Op2 = MI->getOperand(2);
+ Register OldReg = Op1.getReg();
+ // Operand 2 might be an inlinable constant
+ if (Op2.isImm()) {
+ Op1.ChangeToImmediate(Op2.getImm());
+ Op2.ChangeToRegister(OldReg, false);
+ } else {
+ Op1.setReg(Op2.getReg());
+ Op2.setReg(OldReg);
+ }
+ }
+ return true;
+ }
+ MI->setDesc(TII->get(Opc));
+ return false;
+ };
+
+ bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
+ if (!IsLegal && OpToFold->isImm()) {
+ FoldCandidate Fold(MI, OpNo, OpToFold);
+ IsLegal = canUseImmWithOpSel(Fold);
+ }
+
+ if (!IsLegal) {
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
- unsigned Opc = MI->getOpcode();
unsigned NewOpc = macToMad(Opc);
if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
@@ -367,6 +437,13 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MI->setDesc(TII->get(Opc));
}
+ // Special case for s_fmac_f32 if we are trying to fold into Src2.
+ // By transforming into fmaak we can untie Src2 and make folding legal.
+ if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
+ if (tryToFoldAsFMAAKorMK())
+ return true;
+ }
+
// Special case for s_setreg_b32
if (OpToFold->isImm()) {
unsigned ImmOpc = 0;
@@ -387,66 +464,72 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
if (isUseMIInFoldList(FoldList, MI))
return false;
- unsigned CommuteOpNo = OpNo;
-
// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.
- unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
- unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
- bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
-
- if (CanCommute) {
- if (CommuteIdx0 == OpNo)
- CommuteOpNo = CommuteIdx1;
- else if (CommuteIdx1 == OpNo)
- CommuteOpNo = CommuteIdx0;
- }
-
+ unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
+ bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
+ if (!CanCommute)
+ return false;
// One of the operands might be an Imm operand, and OpNo may refer to it after
// the call of commuteInstruction() below. Such situations are avoided
// here explicitly as OpNo must be a register operand to be a candidate
// for memory folding.
- if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
- !MI->getOperand(CommuteIdx1).isReg()))
+ if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
return false;
- if (!CanCommute ||
- !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
+ if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
return false;
+ int Op32 = -1;
if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
- if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
- Opc == AMDGPU::V_SUB_CO_U32_e64 ||
- Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME
- (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
-
- // Verify the other operand is a VGPR, otherwise we would violate the
- // constant bus restriction.
- unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
- MachineOperand &OtherOp = MI->getOperand(OtherIdx);
- if (!OtherOp.isReg() ||
- !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
- return false;
-
- assert(MI->getOperand(1).isDef());
+ if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
+ Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
+ (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
+ TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
+ return false;
+ }
- // Make sure to get the 32-bit version of the commuted opcode.
- unsigned MaybeCommutedOpc = MI->getOpcode();
- int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
+ // Verify the other operand is a VGPR, otherwise we would violate the
+ // constant bus restriction.
+ MachineOperand &OtherOp = MI->getOperand(OpNo);
+ if (!OtherOp.isReg() ||
+ !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
+ return false;
- appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
- return true;
- }
+ assert(MI->getOperand(1).isDef());
- TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
- return false;
+ // Make sure to get the 32-bit version of the commuted opcode.
+ unsigned MaybeCommutedOpc = MI->getOpcode();
+ Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
}
- appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
+ appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
return true;
}
+  // An inlinable constant might have been folded into the Imm operand of fmaak
+  // or fmamk while we are trying to fold a non-inlinable constant.
+ if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
+ !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
+ unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
+ MachineOperand &OpImm = MI->getOperand(ImmIdx);
+ if (!OpImm.isReg() &&
+ TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
+ return tryToFoldAsFMAAKorMK();
+ }
+
+ // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
+ // By changing into fmamk we can untie Src2.
+  // If folding for Src0 happens first and it is identical to Src1, we should
+  // avoid transforming into fmamk, which requires commuting, as that would
+  // cause folding into Src1 to fail later on due to the wrong OpNo being used.
+ if (Opc == AMDGPU::S_FMAC_F32 &&
+ (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
+ if (tryToFoldAsFMAAKorMK())
+ return true;
+ }
+
// Check the case where we might introduce a second constant operand to a
// scalar instruction
if (TII->isSALU(MI->getOpcode())) {
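
The tryToFoldAsFMAAKorMK path rewrites S_FMAC_F32 into S_FMAAK_F32 or S_FMAMK_F32 so the tied Src2 can accept a literal; the two forms place the literal differently, which is why operands 1 and 2 may be swapped when the fold was aimed at operand 1. A plain-function sketch of the arithmetic each form implements, assuming the scalar variants follow the usual fmaak/fmamk operand convention:

#include <cstdio>

// d = s0 * s1 + K : the literal K is the addend.
static float s_fmaak_f32(float S0, float S1, float K) { return S0 * S1 + K; }

// d = s0 * K + s1 : the literal K replaces one multiplicand.
static float s_fmamk_f32(float S0, float K, float S1) { return S0 * K + S1; }

int main() {
  // Same inputs, different literal position: 2*3+10 = 16 vs 2*10+3 = 23.
  std::printf("fmaak=%g fmamk=%g\n", s_fmaak_f32(2.0f, 3.0f, 10.0f),
              s_fmamk_f32(2.0f, 10.0f, 3.0f));
  return 0;
}
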
@@ -458,7 +541,8 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Otherwise check for another constant
for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
auto &Op = MI->getOperand(i);
- if (OpNo != i && !Op.isReg() && !TII->isInlineConstant(Op, OpInfo))
+ if (OpNo != i && !Op.isReg() &&
+ !TII->isInlineConstant(Op, InstDesc.operands()[i]))
return false;
}
}
@@ -516,13 +600,10 @@ bool SIFoldOperands::tryToFoldACImm(
if (UseOpIdx >= Desc.getNumOperands())
return false;
- uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
- if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
- OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
- (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
- OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
+ if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
return false;
+ uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
@@ -671,24 +752,6 @@ void SIFoldOperands::foldOperand(
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
if (!DestReg.isPhysical()) {
- if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
- SmallVector<FoldCandidate, 4> CopyUses;
- for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
- // There's no point trying to fold into an implicit operand.
- if (Use.isImplicit())
- continue;
-
- CopyUses.emplace_back(Use.getParent(),
- Use.getParent()->getOperandNo(&Use),
- &UseMI->getOperand(1));
- }
-
- for (auto &F : CopyUses) {
- foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList,
- CopiesToReplace);
- }
- }
-
if (DestRC == &AMDGPU::AGPR_32RegClass &&
TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
@@ -1035,6 +1098,9 @@ SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
// selection.
// TODO: See if a frame index with a fixed offset can fold.
bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
+ if (!MI->allImplicitDefsAreDead())
+ return false;
+
unsigned Opc = MI->getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
@@ -1340,6 +1406,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
case AMDGPU::V_MAX_F32_e64:
case AMDGPU::V_MAX_F16_e64:
case AMDGPU::V_MAX_F16_t16_e64:
+ case AMDGPU::V_MAX_F16_fake16_e64:
case AMDGPU::V_MAX_F64_e64:
case AMDGPU::V_PK_MAX_F16: {
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
@@ -1435,7 +1502,8 @@ static int getOModValue(unsigned Opc, int64_t Val) {
}
}
case AMDGPU::V_MUL_F16_e64:
- case AMDGPU::V_MUL_F16_t16_e64: {
+ case AMDGPU::V_MUL_F16_t16_e64:
+ case AMDGPU::V_MUL_F16_fake16_e64: {
switch (static_cast<uint16_t>(Val)) {
case 0x3800: // 0.5
return SIOutMods::DIV2;
@@ -1462,12 +1530,14 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_MUL_F64_e64:
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_t16_e64:
+ case AMDGPU::V_MUL_F16_fake16_e64:
case AMDGPU::V_MUL_F16_e64: {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_MUL_F32_e64 &&
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
- Op == AMDGPU::V_MUL_F16_t16_e64) &&
+ Op == AMDGPU::V_MUL_F16_t16_e64 ||
+ Op == AMDGPU::V_MUL_F16_fake16_e64) &&
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
return std::pair(nullptr, SIOutMods::NONE);
@@ -1497,12 +1567,14 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_ADD_F64_e64:
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64:
- case AMDGPU::V_ADD_F16_t16_e64: {
+ case AMDGPU::V_ADD_F16_t16_e64:
+ case AMDGPU::V_ADD_F16_fake16_e64: {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_ADD_F32_e64 &&
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
- Op == AMDGPU::V_ADD_F16_t16_e64) &&
+ Op == AMDGPU::V_ADD_F16_t16_e64 ||
+ Op == AMDGPU::V_ADD_F16_fake16_e64) &&
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
return std::pair(nullptr, SIOutMods::NONE);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 903e726c667d..0f89df144486 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -11,7 +11,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/Target/TargetMachine.h"
@@ -26,13 +26,17 @@ static cl::opt<bool> EnableSpillVGPRToAGPR(
cl::ReallyHidden,
cl::init(true));
-// Find a register matching \p RC from \p LiveRegs which is unused and available
-// throughout the function. On failure, returns AMDGPU::NoRegister.
+// Find a register matching \p RC from \p LiveUnits which is unused and
+// available throughout the function. On failure, returns AMDGPU::NoRegister.
+// TODO: Rewrite the loop here to iterate over MCRegUnits instead of
+// MCRegisters. This should reduce the number of iterations and avoid redundant
+// checking.
static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
- const LivePhysRegs &LiveRegs,
+ const LiveRegUnits &LiveUnits,
const TargetRegisterClass &RC) {
for (MCRegister Reg : RC) {
- if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
+ if (!MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
+ !MRI.isReserved(Reg))
return Reg;
}
return MCRegister();
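
The switch from LivePhysRegs to LiveRegUnits changes the availability query: LiveRegUnits::available(Reg) only consults the live register units and, unlike LivePhysRegs::available(MRI, Reg), does not reject reserved registers, which is why the hunks in this file add explicit MRI.isReserved checks. A short sketch of the combined predicate, assuming the LiveRegUnits was populated for the relevant program point:

#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"

// A register is a usable scratch candidate only if it is unused in the
// function, none of its register units are live, and it is not reserved.
static bool isUsableScratchReg(const llvm::MachineRegisterInfo &MRI,
                               const llvm::LiveRegUnits &LiveUnits,
                               llvm::MCRegister Reg) {
  return !MRI.isPhysRegUsed(Reg) && LiveUnits.available(Reg) &&
         !MRI.isReserved(Reg);
}
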
@@ -42,22 +46,21 @@ static MCRegister findUnusedRegister(MachineRegisterInfo &MRI,
// callee-save registers since they may appear to be free when this is called
// from canUseAsPrologue (during shrink wrapping), but then no longer be free
// when this is called from emitPrologue.
-static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
- LivePhysRegs &LiveRegs,
- const TargetRegisterClass &RC,
- bool Unused = false) {
+static MCRegister findScratchNonCalleeSaveRegister(
+ MachineRegisterInfo &MRI, LiveRegUnits &LiveUnits,
+ const TargetRegisterClass &RC, bool Unused = false) {
// Mark callee saved registers as used so we will not choose them.
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
for (unsigned i = 0; CSRegs[i]; ++i)
- LiveRegs.addReg(CSRegs[i]);
+ LiveUnits.addReg(CSRegs[i]);
// We are looking for a register that can be used throughout the entire
// function, so any use is unacceptable.
if (Unused)
- return findUnusedRegister(MRI, LiveRegs, RC);
+ return findUnusedRegister(MRI, LiveUnits, RC);
for (MCRegister Reg : RC) {
- if (LiveRegs.available(MRI, Reg))
+ if (LiveUnits.available(Reg) && !MRI.isReserved(Reg))
return Reg;
}
@@ -65,9 +68,9 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
}
/// Query target location for spilling SGPRs
-/// \p IncludeScratchCopy : Also look for free scratch SGPRs
+/// \p IncludeScratchCopy : Also look for free scratch SGPRs
static void getVGPRSpillLaneOrTempRegister(
- MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR,
+ MachineFunction &MF, LiveRegUnits &LiveUnits, Register SGPR,
const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
bool IncludeScratchCopy = true) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -81,11 +84,11 @@ static void getVGPRSpillLaneOrTempRegister(
// We need to save and restore the given SGPR.
Register ScratchSGPR;
- // 1: Try to save the given register into an unused scratch SGPR. The LiveRegs
- // should have all the callee saved registers marked as used. For certain
- // cases we skip copy to scratch SGPR.
+ // 1: Try to save the given register into an unused scratch SGPR. The
+ // LiveUnits should have all the callee saved registers marked as used. For
+  // certain cases we skip the copy to a scratch SGPR.
if (IncludeScratchCopy)
- ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC);
+ ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveUnits, RC);
if (!ScratchSGPR) {
int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
@@ -99,10 +102,10 @@ static void getVGPRSpillLaneOrTempRegister(
SGPR, PrologEpilogSGPRSaveRestoreInfo(
SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
- LLVM_DEBUG(
- auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
- dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
- << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
+ LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
+ dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
+ << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+ << '\n';);
} else {
// Remove dead <FI> index
MF.getFrameInfo().RemoveStackObject(FI);
@@ -118,7 +121,7 @@ static void getVGPRSpillLaneOrTempRegister(
MFI->addToPrologEpilogSGPRSpills(
SGPR, PrologEpilogSGPRSaveRestoreInfo(
SGPRSaveKind::COPY_TO_SCRATCH_SGPR, ScratchSGPR));
- LiveRegs.addReg(ScratchSGPR);
+ LiveUnits.addReg(ScratchSGPR);
LLVM_DEBUG(dbgs() << "Saving " << printReg(SGPR, TRI) << " with copy to "
<< printReg(ScratchSGPR, TRI) << '\n');
}
@@ -129,7 +132,7 @@ static void getVGPRSpillLaneOrTempRegister(
// use.
static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &FuncInfo,
- LivePhysRegs &LiveRegs, MachineFunction &MF,
+ LiveRegUnits &LiveUnits, MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
Register SpillReg, int FI, Register FrameReg,
@@ -142,18 +145,18 @@ static void buildPrologSpill(const GCNSubtarget &ST, const SIRegisterInfo &TRI,
MachineMemOperand *MMO = MF.getMachineMemOperand(
PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FI),
FrameInfo.getObjectAlign(FI));
- LiveRegs.addReg(SpillReg);
+ LiveUnits.addReg(SpillReg);
bool IsKill = !MBB.isLiveIn(SpillReg);
TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, IsKill, FrameReg,
- DwordOff, MMO, nullptr, &LiveRegs);
+ DwordOff, MMO, nullptr, &LiveUnits);
if (IsKill)
- LiveRegs.removeReg(SpillReg);
+ LiveUnits.removeReg(SpillReg);
}
static void buildEpilogRestore(const GCNSubtarget &ST,
const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &FuncInfo,
- LivePhysRegs &LiveRegs, MachineFunction &MF,
+ LiveRegUnits &LiveUnits, MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, Register SpillReg, int FI,
@@ -167,7 +170,7 @@ static void buildEpilogRestore(const GCNSubtarget &ST,
PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FI),
FrameInfo.getObjectAlign(FI));
TRI.buildSpillLoadStore(MBB, I, DL, Opc, FI, SpillReg, false, FrameReg,
- DwordOff, MMO, nullptr, &LiveRegs);
+ DwordOff, MMO, nullptr, &LiveUnits);
}
static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -195,18 +198,18 @@ static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addReg(GitPtrLo);
}
-static void initLiveRegs(LivePhysRegs &LiveRegs, const SIRegisterInfo &TRI,
- const SIMachineFunctionInfo *FuncInfo,
- MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, bool IsProlog) {
- if (LiveRegs.empty()) {
- LiveRegs.init(TRI);
+static void initLiveUnits(LiveRegUnits &LiveUnits, const SIRegisterInfo &TRI,
+ const SIMachineFunctionInfo *FuncInfo,
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, bool IsProlog) {
+ if (LiveUnits.empty()) {
+ LiveUnits.init(TRI);
if (IsProlog) {
- LiveRegs.addLiveIns(MBB);
+ LiveUnits.addLiveIns(MBB);
} else {
// In epilog.
- LiveRegs.addLiveOuts(MBB);
- LiveRegs.stepBackward(*MBBI);
+ LiveUnits.addLiveOuts(MBB);
+ LiveUnits.stepBackward(*MBBI);
}
}
}
@@ -228,7 +231,7 @@ class PrologEpilogSGPRSpillBuilder {
const SIRegisterInfo &TRI;
Register SuperReg;
const PrologEpilogSGPRSaveRestoreInfo SI;
- LivePhysRegs &LiveRegs;
+ LiveRegUnits &LiveUnits;
const DebugLoc &DL;
Register FrameReg;
ArrayRef<int16_t> SplitParts;
@@ -239,10 +242,10 @@ class PrologEpilogSGPRSpillBuilder {
MachineRegisterInfo &MRI = MF.getRegInfo();
assert(!MFI.isDeadObjectIndex(FI));
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
+ initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ true);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
if (!TmpVGPR)
report_fatal_error("failed to find free scratch register");
@@ -253,7 +256,7 @@ class PrologEpilogSGPRSpillBuilder {
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(SubReg);
- buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL, TmpVGPR,
FI, FrameReg, DwordOff);
DwordOff += 4;
}
@@ -264,14 +267,15 @@ class PrologEpilogSGPRSpillBuilder {
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
- FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
+ FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
assert(Spill.size() == NumSubRegs);
for (unsigned I = 0; I < NumSubRegs; ++I) {
Register SubReg = NumSubRegs == 1
? SuperReg
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_WRITELANE_B32), Spill[I].VGPR)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_S32_TO_VGPR),
+ Spill[I].VGPR)
.addReg(SubReg)
.addImm(Spill[I].Lane)
.addReg(Spill[I].VGPR, RegState::Undef);
@@ -287,9 +291,9 @@ class PrologEpilogSGPRSpillBuilder {
void restoreFromMemory(const int FI) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
+ initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MI, /*IsProlog*/ false);
MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ MRI, LiveUnits, AMDGPU::VGPR_32RegClass);
if (!TmpVGPR)
report_fatal_error("failed to find free scratch register");
@@ -298,8 +302,8 @@ class PrologEpilogSGPRSpillBuilder {
? SuperReg
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MI, DL, TmpVGPR,
- FI, FrameReg, DwordOff);
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MI, DL,
+ TmpVGPR, FI, FrameReg, DwordOff);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
.addReg(TmpVGPR, RegState::Kill);
DwordOff += 4;
@@ -309,14 +313,14 @@ class PrologEpilogSGPRSpillBuilder {
void restoreFromVGPRLane(const int FI) {
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
- FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
+ FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
assert(Spill.size() == NumSubRegs);
for (unsigned I = 0; I < NumSubRegs; ++I) {
Register SubReg = NumSubRegs == 1
? SuperReg
: Register(TRI.getSubReg(SuperReg, SplitParts[I]));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32), SubReg)
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
.addReg(Spill[I].VGPR)
.addImm(Spill[I].Lane);
}
@@ -335,11 +339,12 @@ public:
MachineBasicBlock::iterator MI,
const DebugLoc &DL, const SIInstrInfo *TII,
const SIRegisterInfo &TRI,
- LivePhysRegs &LiveRegs, Register FrameReg)
+ LiveRegUnits &LiveUnits, Register FrameReg)
: MI(MI), MBB(MBB), MF(*MBB.getParent()),
ST(MF.getSubtarget<GCNSubtarget>()), MFI(MF.getFrameInfo()),
FuncInfo(MF.getInfo<SIMachineFunctionInfo>()), TII(TII), TRI(TRI),
- SuperReg(Reg), SI(SI), LiveRegs(LiveRegs), DL(DL), FrameReg(FrameReg) {
+ SuperReg(Reg), SI(SI), LiveUnits(LiveUnits), DL(DL),
+ FrameReg(FrameReg) {
const TargetRegisterClass *RC = TRI.getPhysRegBaseClass(SuperReg);
SplitParts = TRI.getRegSplitParts(RC, EltSize);
NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
@@ -396,9 +401,9 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
if (ST.isAmdPalOS()) {
// Extract the scratch offset from the descriptor in the GIT
- LivePhysRegs LiveRegs;
- LiveRegs.init(*TRI);
- LiveRegs.addLiveIns(MBB);
+ LiveRegUnits LiveUnits;
+ LiveUnits.init(*TRI);
+ LiveUnits.addLiveIns(MBB);
// Find unused reg to load flat scratch init into
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -409,8 +414,8 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
std::min(static_cast<unsigned>(AllSGPR64s.size()), NumPreloaded));
Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
for (MCPhysReg Reg : AllSGPR64s) {
- if (LiveRegs.available(MRI, Reg) && MRI.isAllocatable(Reg) &&
- !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
+ if (LiveUnits.available(Reg) && !MRI.isReserved(Reg) &&
+ MRI.isAllocatable(Reg) && !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
FlatScrInit = Reg;
break;
}
@@ -692,7 +697,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
}
bool NeedsFlatScratchInit =
- MFI->hasFlatScratchInit() &&
+ MFI->getUserSGPRInfo().hasFlatScratchInit() &&
(MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
(!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
@@ -775,7 +780,7 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
// Use relocations to get the pointer, and setup the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
- if (MFI->hasImplicitBufferPtr()) {
+ if (MFI->getUserSGPRInfo().hasImplicitBufferPtr()) {
Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
@@ -814,7 +819,6 @@ void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
BuildMI(MBB, I, DL, SMovB32, Rsrc1)
.addExternalSymbol("SCRATCH_RSRC_DWORD1")
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
-
}
BuildMI(MBB, I, DL, SMovB32, Rsrc2)
@@ -873,7 +877,7 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
// Activate only the inactive lanes when \p EnableInactiveLanes is true.
// Otherwise, activate all lanes. It returns the saved exec.
-static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
+static Register buildScratchExecCopy(LiveRegUnits &LiveUnits,
MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -886,14 +890,14 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
const SIRegisterInfo &TRI = TII->getRegisterInfo();
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
+ initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, IsProlog);
ScratchExecCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+ MRI, LiveUnits, *TRI.getWaveMaskRegClass());
if (!ScratchExecCopy)
report_fatal_error("failed to find free scratch register");
- LiveRegs.addReg(ScratchExecCopy);
+ LiveUnits.addReg(ScratchExecCopy);
const unsigned SaveExecOpc =
ST.isWave32() ? (EnableInactiveLanes ? AMDGPU::S_XOR_SAVEEXEC_B32
@@ -909,7 +913,7 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
void SIFrameLowering::emitCSRSpillStores(
MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
Register FrameReg, Register FramePtrRegScratchCopy) const {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -924,7 +928,7 @@ void SIFrameLowering::emitCSRSpillStores(
FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
if (!WWMScratchRegs.empty())
ScratchExecCopy =
- buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
+ buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ true, /*EnableInactiveLanes*/ true);
auto StoreWWMRegisters =
@@ -932,7 +936,7 @@ void SIFrameLowering::emitCSRSpillStores(
for (const auto &Reg : WWMRegs) {
Register VGPR = Reg.first;
int FI = Reg.second;
- buildPrologSpill(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
+ buildPrologSpill(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
VGPR, FI, FrameReg);
}
};
@@ -943,7 +947,7 @@ void SIFrameLowering::emitCSRSpillStores(
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
} else {
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
+ ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ true,
/*EnableInactiveLanes*/ false);
}
@@ -955,7 +959,7 @@ void SIFrameLowering::emitCSRSpillStores(
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
.addReg(ScratchExecCopy, RegState::Kill);
- LiveRegs.addReg(ScratchExecCopy);
+ LiveUnits.addReg(ScratchExecCopy);
}
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
@@ -971,7 +975,7 @@ void SIFrameLowering::emitCSRSpillStores(
continue;
PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
- LiveRegs, FrameReg);
+ LiveUnits, FrameReg);
SB.save();
}
@@ -986,16 +990,16 @@ void SIFrameLowering::emitCSRSpillStores(
MBB.sortUniqueLiveIns();
}
- if (!LiveRegs.empty()) {
+ if (!LiveUnits.empty()) {
for (MCPhysReg Reg : ScratchSGPRs)
- LiveRegs.addReg(Reg);
+ LiveUnits.addReg(Reg);
}
}
}
void SIFrameLowering::emitCSRSpillRestores(
MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, DebugLoc &DL, LivePhysRegs &LiveRegs,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL, LiveRegUnits &LiveUnits,
Register FrameReg, Register FramePtrRegScratchCopy) const {
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -1015,7 +1019,7 @@ void SIFrameLowering::emitCSRSpillRestores(
continue;
PrologEpilogSGPRSpillBuilder SB(Reg, Spill.second, MBB, MBBI, DL, TII, TRI,
- LiveRegs, FrameReg);
+ LiveUnits, FrameReg);
SB.restore();
}
@@ -1027,7 +1031,7 @@ void SIFrameLowering::emitCSRSpillRestores(
FuncInfo->splitWWMSpillRegisters(MF, WWMCalleeSavedRegs, WWMScratchRegs);
if (!WWMScratchRegs.empty())
ScratchExecCopy =
- buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
+ buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ false, /*EnableInactiveLanes*/ true);
auto RestoreWWMRegisters =
@@ -1035,7 +1039,7 @@ void SIFrameLowering::emitCSRSpillRestores(
for (const auto &Reg : WWMRegs) {
Register VGPR = Reg.first;
int FI = Reg.second;
- buildEpilogRestore(ST, TRI, *FuncInfo, LiveRegs, MF, MBB, MBBI, DL,
+ buildEpilogRestore(ST, TRI, *FuncInfo, LiveUnits, MF, MBB, MBBI, DL,
VGPR, FI, FrameReg);
}
};
@@ -1046,7 +1050,7 @@ void SIFrameLowering::emitCSRSpillRestores(
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
} else {
- ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
+ ScratchExecCopy = buildScratchExecCopy(LiveUnits, MF, MBB, MBBI, DL,
/*IsProlog*/ false,
/*EnableInactiveLanes*/ false);
}
@@ -1079,13 +1083,25 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
Register FramePtrReg = FuncInfo->getFrameOffsetReg();
Register BasePtrReg =
TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
- LivePhysRegs LiveRegs;
+ LiveRegUnits LiveUnits;
MachineBasicBlock::iterator MBBI = MBB.begin();
// DebugLoc must be unknown since the first instruction with DebugLoc is used
// to determine the end of the prologue.
DebugLoc DL;
+ if (FuncInfo->isChainFunction()) {
+ // Functions with the amdgpu_cs_chain[_preserve] CC don't receive a SP, but
+ // are free to set one up if they need it.
+ bool UseSP = requiresStackPointerReference(MF);
+ if (UseSP) {
+ assert(StackPtrReg != AMDGPU::SP_REG);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_MOV_B32), StackPtrReg)
+ .addImm(MFI.getStackSize() * getScratchScaleFactor(ST));
+ }
+ }
+
bool HasFP = false;
bool HasBP = false;
uint32_t NumBytes = MFI.getStackSize();
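
For amdgpu_cs_chain[_preserve] functions the prologue above materializes the stack pointer directly from the frame size scaled by getScratchScaleFactor. A standalone sketch of that computation, under the assumption that the scale factor is the wavefront size for MUBUF scratch (where the SP counts per-wave bytes) and 1 when flat scratch is enabled:

#include <cstdint>

// Assumption: scale-factor semantics as described above; the real helper
// lives elsewhere in SIFrameLowering.cpp and is not shown in this hunk.
static uint64_t chainFunctionInitialSP(uint64_t StackSizePerLane,
                                       unsigned WavefrontSize,
                                       bool EnableFlatScratch) {
  const unsigned ScaleFactor = EnableFlatScratch ? 1 : WavefrontSize;
  return StackSizePerLane * ScaleFactor;
}
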
@@ -1097,14 +1113,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
Register FramePtrRegScratchCopy;
if (!HasFP && !hasFP(MF)) {
// Emit the CSR spill stores with SP base register.
- emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
+ emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits,
+ FuncInfo->isChainFunction() ? Register() : StackPtrReg,
FramePtrRegScratchCopy);
} else {
// CSR spill stores will use FP as base register.
Register SGPRForFPSaveRestoreCopy =
FuncInfo->getScratchSGPRCopyDstReg(FramePtrReg);
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
+ initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ true);
if (SGPRForFPSaveRestoreCopy) {
// Copy FP to the scratch register now and emit the CFI entry. It avoids
// the extra FP copy needed in the other two cases when FP is spilled to
@@ -1112,18 +1129,18 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
PrologEpilogSGPRSpillBuilder SB(
FramePtrReg,
FuncInfo->getPrologEpilogSGPRSaveRestoreInfo(FramePtrReg), MBB, MBBI,
- DL, TII, TRI, LiveRegs, FramePtrReg);
+ DL, TII, TRI, LiveUnits, FramePtrReg);
SB.save();
- LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
+ LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
} else {
// Copy FP into a new scratch register so that its previous value can be
// spilled after setting up the new frame.
FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
+ MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
if (!FramePtrRegScratchCopy)
report_fatal_error("failed to find free scratch register");
- LiveRegs.addReg(FramePtrRegScratchCopy);
+ LiveUnits.addReg(FramePtrRegScratchCopy);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrRegScratchCopy)
.addReg(FramePtrReg);
}
@@ -1133,9 +1150,9 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const unsigned Alignment = MFI.getMaxAlign().value();
RoundedSize += Alignment;
- if (LiveRegs.empty()) {
- LiveRegs.init(TRI);
- LiveRegs.addLiveIns(MBB);
+ if (LiveUnits.empty()) {
+ LiveUnits.init(TRI);
+ LiveUnits.addLiveIns(MBB);
}
// s_add_i32 s33, s32, NumBytes
@@ -1158,10 +1175,10 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
// If FP is used, emit the CSR spills with FP base register.
if (HasFP) {
- emitCSRSpillStores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
+ emitCSRSpillStores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
FramePtrRegScratchCopy);
if (FramePtrRegScratchCopy)
- LiveRegs.removeReg(FramePtrRegScratchCopy);
+ LiveUnits.removeReg(FramePtrRegScratchCopy);
}
// If we need a base pointer, set it up here. It's whatever the value of
@@ -1210,7 +1227,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
- LivePhysRegs LiveRegs;
+ LiveRegUnits LiveUnits;
// Get the insert location for the epilogue. If there were no terminators in
// the block, get the last instruction.
MachineBasicBlock::iterator MBBI = MBB.end();
@@ -1240,19 +1257,19 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
// SGPRForFPSaveRestoreCopy is not true, restore the previous value of FP
// into a new scratch register and copy to FP later when other registers are
// restored from the current stack frame.
- initLiveRegs(LiveRegs, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
+ initLiveUnits(LiveUnits, TRI, FuncInfo, MF, MBB, MBBI, /*IsProlog*/ false);
if (SGPRForFPSaveRestoreCopy) {
- LiveRegs.addReg(SGPRForFPSaveRestoreCopy);
+ LiveUnits.addReg(SGPRForFPSaveRestoreCopy);
} else {
FramePtrRegScratchCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass);
+ MRI, LiveUnits, AMDGPU::SReg_32_XM0_XEXECRegClass);
if (!FramePtrRegScratchCopy)
report_fatal_error("failed to find free scratch register");
- LiveRegs.addReg(FramePtrRegScratchCopy);
+ LiveUnits.addReg(FramePtrRegScratchCopy);
}
- emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, FramePtrReg,
+ emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, FramePtrReg,
FramePtrRegScratchCopy);
}
@@ -1275,7 +1292,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
MIB.setMIFlag(MachineInstr::FrameDestroy);
} else {
// Insert the CSR spill restores with SP as the base register.
- emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveRegs, StackPtrReg,
+ emitCSRSpillRestores(MF, MBB, MBBI, DL, LiveUnits, StackPtrReg,
FramePtrRegScratchCopy);
}
}
@@ -1318,7 +1335,11 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
// Allocate spill slots for WWM reserved VGPRs.
- if (!FuncInfo->isEntryFunction()) {
+ // For chain functions, we only need to do this if we have calls to
+ // llvm.amdgcn.cs.chain.
+ bool IsChainWithoutCalls =
+ FuncInfo->isChainFunction() && !MF.getFrameInfo().hasTailCall();
+ if (!FuncInfo->isEntryFunction() && !IsChainWithoutCalls) {
for (Register Reg : FuncInfo->getWWMReservedRegs()) {
const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Reg);
FuncInfo->allocateWWMSpill(MF, Reg, TRI->getSpillSize(*RC),
@@ -1353,8 +1374,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
TRI->isAGPR(MRI, VReg))) {
assert(RS != nullptr);
- // FIXME: change to enterBasicBlockEnd()
- RS->enterBasicBlock(MBB);
+ RS->enterBasicBlockEnd(MBB);
+ RS->backward(std::next(MI.getIterator()));
TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
SpillFIs.set(FI);
continue;
@@ -1472,30 +1493,30 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- LivePhysRegs LiveRegs;
- LiveRegs.init(*TRI);
+ LiveRegUnits LiveUnits;
+ LiveUnits.init(*TRI);
// Initially mark callee saved registers as used so we will not choose them
// while looking for scratch SGPRs.
const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
for (unsigned I = 0; CSRegs[I]; ++I)
- LiveRegs.addReg(CSRegs[I]);
+ LiveUnits.addReg(CSRegs[I]);
const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
if (NeedExecCopyReservedReg) {
Register ReservedReg = MFI->getSGPRForEXECCopy();
assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
- Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC);
+ Register UnusedScratchReg = findUnusedRegister(MRI, LiveUnits, RC);
if (UnusedScratchReg) {
// If found any unused scratch SGPR, reserve the register itself for Exec
// copy and there is no need for any spills in that case.
MFI->setSGPRForEXECCopy(UnusedScratchReg);
- LiveRegs.addReg(UnusedScratchReg);
+ LiveUnits.addReg(UnusedScratchReg);
} else {
// Needs spill.
assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
"Re-reserving spill slot for EXEC copy register");
- getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC,
+ getVGPRSpillLaneOrTempRegister(MF, LiveUnits, ReservedReg, RC,
/*IncludeScratchCopy=*/false);
}
}
@@ -1516,14 +1537,14 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
Register FramePtrReg = MFI->getFrameOffsetReg();
assert(!MFI->hasPrologEpilogSGPRSpillEntry(FramePtrReg) &&
"Re-reserving spill slot for FP");
- getVGPRSpillLaneOrTempRegister(MF, LiveRegs, FramePtrReg);
+ getVGPRSpillLaneOrTempRegister(MF, LiveUnits, FramePtrReg);
}
if (TRI->hasBasePointer(MF)) {
Register BasePtrReg = TRI->getBaseRegister();
assert(!MFI->hasPrologEpilogSGPRSpillEntry(BasePtrReg) &&
"Re-reserving spill slot for BP");
- getVGPRSpillLaneOrTempRegister(MF, LiveRegs, BasePtrReg);
+ getVGPRSpillLaneOrTempRegister(MF, LiveUnits, BasePtrReg);
}
}
@@ -1531,8 +1552,15 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedVGPRs,
RegScavenger *RS) const {
- TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // If this is a function with the amdgpu_cs_chain[_preserve] calling
+ // convention and it doesn't contain any calls to llvm.amdgcn.cs.chain, then
+ // we don't need to save and restore anything.
+ if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
+ return;
+
+ TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
if (MFI->isEntryFunction())
return;
@@ -1551,17 +1579,17 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
// TODO: Handle this elsewhere at an early point. Walking through all MBBs
// here would be a bad heuristic. A better way should be by calling
// allocateWWMSpill during the regalloc pipeline whenever a physical
- // register is allocated for the intended virtual registers. That will
- // also help excluding the general use of WRITELANE/READLANE intrinsics
- // that won't really need any such special handling.
- if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32)
+ // register is allocated for the intended virtual registers.
+ if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR)
MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
- else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
+ else if (MI.getOpcode() == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
NeedExecCopyReservedReg = true;
else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
- MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
+ MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG ||
+ (MFI->isChainFunction() &&
+ TII->isChainCallOpcode(MI.getOpcode()))) {
// We expect all return to be the same size.
assert(!ReturnMI ||
(count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
@@ -1695,6 +1723,7 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
uint64_t EstStackSize = MFI.estimateStackSize(MF);
uint64_t MaxOffset = EstStackSize - 1;
@@ -1706,12 +1735,11 @@ bool SIFrameLowering::allocateScavengingFrameIndexesNearIncomingSP(
// rather than allocating as close to possible. This could save a lot of space
// on frames with alignment requirements.
if (ST.enableFlatScratch()) {
- const SIInstrInfo *TII = ST.getInstrInfo();
if (TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch))
return false;
} else {
- if (SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset))
+ if (TII->isLegalMUBUFImmOffset(MaxOffset))
return false;
}
@@ -1770,10 +1798,11 @@ static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) {
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- // For entry functions we can use an immediate offset in most cases, so the
- // presence of calls doesn't imply we need a distinct frame pointer.
+ // For entry & chain functions we can use an immediate offset in most cases,
+ // so the presence of calls doesn't imply we need a distinct frame pointer.
if (MFI.hasCalls() &&
- !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
+ !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
+ !MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) {
// All offsets are unsigned, so need to be addressed in the same direction
// as stack growth.
@@ -1793,11 +1822,14 @@ bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
// register. We may need to initialize the stack pointer depending on the frame
// properties, which logically overlaps many of the cases where an ordinary
// function would require an FP.
+// Also used for chain functions. While not technically entry functions, chain
+// functions may need to set up a stack pointer in some situations.
bool SIFrameLowering::requiresStackPointerReference(
const MachineFunction &MF) const {
// Callable functions always require a stack pointer reference.
- assert(MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() &&
- "only expected to call this for entry points");
+ assert((MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction() ||
+ MF.getInfo<SIMachineFunctionInfo>()->isChainFunction()) &&
+ "only expected to call this for entry points and chain functions");
const MachineFrameInfo &MFI = MF.getFrameInfo();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 0060fc0be431..b3feb759ed81 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -38,11 +38,11 @@ public:
bool NeedExecCopyReservedReg) const;
void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, DebugLoc &DL,
- LivePhysRegs &LiveRegs, Register FrameReg,
+ LiveRegUnits &LiveUnits, Register FrameReg,
Register FramePtrRegScratchCopy) const;
void emitCSRSpillRestores(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, DebugLoc &DL,
- LivePhysRegs &LiveRegs, Register FrameReg,
+ LiveRegUnits &LiveUnits, Register FrameReg,
Register FramePtrRegScratchCopy) const;
bool
assignCalleeSavedSpillSlots(MachineFunction &MF,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b7b90e23e895..34826809c1a6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
@@ -28,6 +29,7 @@
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -146,8 +148,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
if (Subtarget->has16BitInsts()) {
- addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
- addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
+ if (Subtarget->useRealTrue16Insts()) {
+ addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
+ addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
+ } else {
+ addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
+ }
    // Unless there are also VOP3P operations, no operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
@@ -158,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
+ addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
+ addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -219,7 +228,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
- setOperationAction(ISD::FSQRT, MVT::f64, Custom);
+ setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
setOperationAction(ISD::SELECT_CC,
{MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
@@ -262,13 +271,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT :
- {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
- MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
- MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
- MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
- MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
- MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
- MVT::v32i32, MVT::v32f32}) {
+ {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
+ MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
+ MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
+ MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
+ MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
+ MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
+ MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -420,6 +429,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts()) {
setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
+ } else {
+ setOperationAction(ISD::FSQRT, MVT::f16, Custom);
}
if (Subtarget->hasMadMacF32Insts())
@@ -470,9 +481,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::f32, MVT::f64}, Legal);
if (Subtarget->haveRoundOpsF64())
- setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal);
+ setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
+ Legal);
else
- setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR},
+ setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
MVT::f64, Custom);
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
@@ -544,8 +556,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
- for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
- MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+ for (MVT VT :
+ {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
+ MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -631,6 +644,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
+ setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::v32i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
+ setOperationAction(ISD::STORE, MVT::v32f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
+
setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
@@ -653,12 +676,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
- {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Custom);
setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
- {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Expand);
- for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+ for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+ MVT::v32i16, MVT::v32f16}) {
setOperationAction(
{ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
Vec16, Custom);
@@ -681,10 +707,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE,
{MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
- MVT::v16f16, MVT::v16i16},
+ MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
Custom);
- for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
+ for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
// Split vector operations.
setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
@@ -692,7 +718,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::SSUBSAT},
VT, Custom);
- for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
+ for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
// Split vector operations.
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
VT, Custom);
@@ -728,7 +754,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT,
{MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
+ MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+ MVT::v32i16, MVT::v32f16},
Custom);
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
@@ -736,6 +763,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
+ if (Subtarget->hasPrefetch())
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+ if (Subtarget->hasIEEEMinMax())
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
+ {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::v2i16, MVT::v2f16, MVT::i128},
@@ -753,16 +787,28 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::i8, MVT::i128},
Custom);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
+ setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
+
+ // TODO: Could move this to custom lowering, could benefit from combines on
+ // extract of relevant bits.
+ setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
+
+ setOperationAction(ISD::MUL, MVT::i1, Promote);
+
setTargetDAGCombine({ISD::ADD,
ISD::UADDO_CARRY,
ISD::SUB,
ISD::USUBO_CARRY,
ISD::FADD,
ISD::FSUB,
+ ISD::FDIV,
ISD::FMINNUM,
ISD::FMAXNUM,
ISD::FMINNUM_IEEE,
ISD::FMAXNUM_IEEE,
+ ISD::FMINIMUM,
+ ISD::FMAXIMUM,
ISD::FMA,
ISD::SMIN,
ISD::SMAX,
@@ -772,6 +818,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::AND,
ISD::OR,
ISD::XOR,
+ ISD::FSHR,
ISD::SINT_TO_FP,
ISD::UINT_TO_FP,
ISD::FCANONICALIZE,
@@ -1002,12 +1049,20 @@ static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
return MVT::v5i32;
+ if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
+ DL.getPointerSizeInBits(AS) == 192)
+ return MVT::v6i32;
return AMDGPUTargetLowering::getPointerTy(DL, AS);
}
/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
/// v8i32 when padding is added.
+/// The in-memory representation of a p9 is {p8, i32, i32}, which is
+/// also v8i32 with padding.
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
- if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
+ if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
+ DL.getPointerSizeInBits(AS) == 160) ||
+ (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
+ DL.getPointerSizeInBits(AS) == 192))
return MVT::v8i32;
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
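
The hunk above treats both the 160-bit buffer fat pointer (p7) and the new 192-bit buffer strided pointer (p9) as v8i32 in memory, i.e. padded out to 256 bits by the 128-bit alignment of the embedded buffer resource. A minimal, self-contained sketch of that padding arithmetic; the struct names are illustrative stand-ins, not the LLVM types:

#include <cstdint>
#include <cstdio>

// Illustrative stand-ins: a 128-bit, 16-byte-aligned buffer resource ("p8"),
// a fat pointer {p8, i32} ("p7"), and a strided pointer {p8, i32, i32} ("p9").
struct alignas(16) BufferRsrc { uint32_t words[4]; };
struct FatPtr     { BufferRsrc rsrc; uint32_t offset; };         // 160 data bits
struct StridedPtr { BufferRsrc rsrc; uint32_t index, offset; };  // 192 data bits

int main() {
  // Tail padding from the 16-byte alignment rounds both up to 32 bytes,
  // i.e. eight 32-bit words -- matching the v8i32 in-memory type above.
  std::printf("sizeof(FatPtr)     = %zu bytes (%zu x i32)\n",
              sizeof(FatPtr), sizeof(FatPtr) / sizeof(uint32_t));
  std::printf("sizeof(StridedPtr) = %zu bytes (%zu x i32)\n",
              sizeof(StridedPtr), sizeof(StridedPtr) / sizeof(uint32_t));
}
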
@@ -1186,9 +1241,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1271,6 +1330,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_global_atomic_csub: {
@@ -1284,7 +1345,9 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
}
}
-bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
+ unsigned AddrSpace,
+ uint64_t FlatVariant) const {
if (!Subtarget->hasFlatInstOffsets()) {
// Flat instructions do not have offsets, and only have the register
// address.
@@ -1292,29 +1355,27 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
}
return AM.Scale == 0 &&
- (AM.BaseOffs == 0 ||
- Subtarget->getInstrInfo()->isLegalFLATOffset(
- AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT));
+ (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
+ AM.BaseOffs, AddrSpace, FlatVariant));
}
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
if (Subtarget->hasFlatGlobalInsts())
- return AM.Scale == 0 &&
- (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
- AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS,
- SIInstrFlags::FlatGlobal));
+ return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal);
if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
- // Assume the we will use FLAT for all global memory accesses
- // on VI.
- // FIXME: This assumption is currently wrong. On VI we still use
- // MUBUF instructions for the r + i addressing mode. As currently
- // implemented, the MUBUF instructions only work on buffer < 4GB.
- // It may be possible to support > 4GB buffers with MUBUF instructions,
- // by setting the stride value in the resource descriptor which would
- // increase the size limit to (stride * 4GB). However, this is risky,
- // because it has never been validated.
- return isLegalFlatAddressingMode(AM);
+    // Assume that we will use FLAT for all global memory accesses

+ // on VI.
+ // FIXME: This assumption is currently wrong. On VI we still use
+ // MUBUF instructions for the r + i addressing mode. As currently
+ // implemented, the MUBUF instructions only work on buffer < 4GB.
+ // It may be possible to support > 4GB buffers with MUBUF instructions,
+ // by setting the stride value in the resource descriptor which would
+ // increase the size limit to (stride * 4GB). However, this is risky,
+ // because it has never been validated.
+ return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
+ SIInstrFlags::FLAT);
}
return isLegalMUBUFAddressingMode(AM);
@@ -1330,7 +1391,8 @@ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// assume those use MUBUF instructions. Scratch loads / stores are currently
// implemented as mubuf instructions with offen bit set, so slightly
// different than the normal addr64.
- if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
return false;
// FIXME: Since we can split immediate into soffset and immediate offset,
@@ -1367,7 +1429,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
- AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE) {
+ AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
+ AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -1394,11 +1457,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
- } else {
+ } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
// On GFX9 the offset is signed 21-bit in bytes (but must not be negative
// for S_BUFFER_* instructions).
if (!isInt<21>(AM.BaseOffs))
return false;
+ } else {
+ // On GFX12, all offsets are signed 24-bit in bytes.
+ if (!isInt<24>(AM.BaseOffs))
+ return false;
}
if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
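
The new branch above widens the legal SMEM immediate offset by generation: 20-bit unsigned on VI, signed 21-bit on GFX9 through GFX11, and signed 24-bit on GFX12. A small standalone sketch of the same range predicates; the Gen enum and helper names are hypothetical, and the pre-VI dword-scaled cases handled earlier in the function are omitted:

#include <cstdint>
#include <cstdio>

// Run-time equivalents of llvm::isUIntN / llvm::isIntN.
static bool isUIntN(unsigned N, int64_t V) { return V >= 0 && V < (int64_t(1) << N); }
static bool isIntN(unsigned N, int64_t V) {
  return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
}

enum class Gen { VI, GFX9, GFX10, GFX11, GFX12 }; // illustrative only

// Mirrors the branch structure above: pick the legal SMEM immediate range
// for a given generation.
static bool isLegalSMEMImmOffset(Gen G, int64_t Offset) {
  if (G == Gen::VI)
    return isUIntN(20, Offset); // 20-bit unsigned, in bytes
  if (G < Gen::GFX12)
    return isIntN(21, Offset);  // signed 21-bit on GFX9..GFX11
  return isIntN(24, Offset);    // signed 24-bit on GFX12
}

int main() {
  // -4 needs a signed offset: rejected on VI, accepted from GFX9 on.
  std::printf("VI    -4: %d\n", isLegalSMEMImmOffset(Gen::VI, -4));
  std::printf("GFX9  -4: %d\n", isLegalSMEMImmOffset(Gen::GFX9, -4));
  // A 1 MiB offset only fits in the 24-bit GFX12 range.
  std::printf("GFX11 1M: %d\n", isLegalSMEMImmOffset(Gen::GFX11, 1 << 20));
  std::printf("GFX12 1M: %d\n", isLegalSMEMImmOffset(Gen::GFX12, 1 << 20));
}
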
@@ -1411,9 +1478,13 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return isLegalMUBUFAddressingMode(AM);
+ return Subtarget->enableFlatScratch()
+ ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)
+ : isLegalMUBUFAddressingMode(AM);
- if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
+ if (AS == AMDGPUAS::LOCAL_ADDRESS ||
+ (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
// field.
// XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
@@ -1436,7 +1507,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// computation. We don't have instructions that compute pointers with any
// addressing modes, so treat them as having no offset like flat
// instructions.
- return isLegalFlatAddressingMode(AM);
+ return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
+ SIInstrFlags::FLAT);
}
// Assume a user alias of global for unknown address spaces.
@@ -1748,13 +1820,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
// We may not have the kernarg segment argument if we have no kernel
// arguments.
if (!InputPtrReg)
- return DAG.getConstant(0, SL, PtrVT);
+ return DAG.getConstant(Offset, SL, PtrVT);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
- return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
+ return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
}
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
@@ -2133,13 +2205,14 @@ void SITargetLowering::allocateSpecialInputSGPRs(
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const {
auto &ArgInfo = Info.getArgInfo();
+ const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
// TODO: Unify handling with private memory pointers.
- if (Info.hasDispatchPtr())
+ if (UserSGPRInfo.hasDispatchPtr())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
const Module *M = MF.getFunction().getParent();
- if (Info.hasQueuePtr() &&
+ if (UserSGPRInfo.hasQueuePtr() &&
AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5)
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
@@ -2148,7 +2221,7 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasImplicitArgPtr())
allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
- if (Info.hasDispatchID())
+ if (UserSGPRInfo.hasDispatchID())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
// flat_scratch_init is not applicable for non-kernel functions.
@@ -2171,34 +2244,35 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const {
- if (Info.hasImplicitBufferPtr()) {
+ const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
+ if (UserSGPRInfo.hasImplicitBufferPtr()) {
Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
- if (Info.hasPrivateSegmentBuffer()) {
+ if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
- if (Info.hasDispatchPtr()) {
+ if (UserSGPRInfo.hasDispatchPtr()) {
Register DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
const Module *M = MF.getFunction().getParent();
- if (Info.hasQueuePtr() &&
+ if (UserSGPRInfo.hasQueuePtr() &&
AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
- if (Info.hasKernargSegmentPtr()) {
+ if (UserSGPRInfo.hasKernargSegmentPtr()) {
MachineRegisterInfo &MRI = MF.getRegInfo();
Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
CCInfo.AllocateReg(InputPtrReg);
@@ -2207,26 +2281,100 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
}
- if (Info.hasDispatchID()) {
+ if (UserSGPRInfo.hasDispatchID()) {
Register DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
- if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
+ if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+ // these from the dispatch pointer.
+}
+
+// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
+// sequential starting from the first argument.
+void SITargetLowering::allocatePreloadKernArgSGPRs(
+ CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
+ const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
+ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
+ Function &F = MF.getFunction();
+ unsigned LastExplicitArgOffset =
+ MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
+ GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
+ bool InPreloadSequence = true;
+ unsigned InIdx = 0;
+ for (auto &Arg : F.args()) {
+ if (!InPreloadSequence || !Arg.hasInRegAttr())
+ break;
+
+ int ArgIdx = Arg.getArgNo();
+ // Don't preload non-original args or parts not in the current preload
+ // sequence.
+ if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
+ (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
+ break;
+
+ for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
+ (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
+ InIdx++) {
+ assert(ArgLocs[ArgIdx].isMemLoc());
+ auto &ArgLoc = ArgLocs[InIdx];
+ const Align KernelArgBaseAlign = Align(16);
+ unsigned ArgOffset = ArgLoc.getLocMemOffset();
+ Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+ unsigned NumAllocSGPRs =
+ alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
+
+ // Arg is preloaded into the previous SGPR.
+ if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
+ Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
+ Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
+ continue;
+ }
+
+ unsigned Padding = ArgOffset - LastExplicitArgOffset;
+ unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+ // Check for free user SGPRs for preloading.
+ if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
+ SGPRInfo.getNumFreeUserSGPRs()) {
+ InPreloadSequence = false;
+ break;
+ }
+
+ // Preload this argument.
+ const TargetRegisterClass *RC =
+ TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+ SmallVectorImpl<MCRegister> *PreloadRegs =
+ Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
+
+ if (PreloadRegs->size() > 1)
+ RC = &AMDGPU::SGPR_32RegClass;
+ for (auto &Reg : *PreloadRegs) {
+ assert(Reg);
+ MF.addLiveIn(Reg, RC);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+ }
+ }
+}
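
allocatePreloadKernArgSGPRs above walks the leading inreg kernel arguments in order and budgets user SGPRs for each: padding SGPRs cover the gap from the previous argument, the value itself takes alignTo(size-in-bits, 32) / 32 registers, and the walk stops once the free-SGPR budget (plus one synthetic SGPR) would be exceeded. A simplified, self-contained sketch of just that bookkeeping; the argument list and free-SGPR count are made-up inputs, and sub-dword arguments that share the previous SGPR are ignored:

#include <cstdint>
#include <cstdio>
#include <vector>

static uint64_t alignTo(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

struct KernArg { unsigned ByteOffset; unsigned SizeInBits; };

// Count how many in-order arguments fit in the user-SGPR budget, mirroring
// the padding + value-register arithmetic in allocatePreloadKernArgSGPRs.
static unsigned countPreloadableArgs(const std::vector<KernArg> &Args,
                                     unsigned FreeUserSGPRs,
                                     unsigned FirstArgOffset) {
  unsigned LastExplicitArgOffset = FirstArgOffset;
  unsigned Preloaded = 0;
  for (const KernArg &A : Args) {
    unsigned NumAllocSGPRs = unsigned(alignTo(A.SizeInBits, 32) / 32);
    unsigned Padding = A.ByteOffset - LastExplicitArgOffset;
    unsigned PaddingSGPRs = unsigned(alignTo(Padding, 4) / 4);
    if (PaddingSGPRs + NumAllocSGPRs + 1 /*synthetic*/ > FreeUserSGPRs)
      break;
    FreeUserSGPRs -= PaddingSGPRs + NumAllocSGPRs;
    LastExplicitArgOffset = A.ByteOffset + NumAllocSGPRs * 4;
    ++Preloaded;
  }
  return Preloaded;
}

int main() {
  // e.g. an i32, an i64 and a <4 x i32> laid out back to back from offset 0.
  std::vector<KernArg> Args = {{0, 32}, {8, 64}, {16, 128}};
  std::printf("%u args preloadable\n", countPreloadableArgs(Args, 6, 0));
}
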
+
+void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const {
+ // Always allocate this last since it is a synthetic preload.
if (Info.hasLDSKernelId()) {
Register Reg = Info.addLDSKernelId();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
-
- // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
- // these from the dispatch pointer.
}
// Allocate special input registers that are initialized per-wave.
@@ -2331,7 +2479,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// Everything live out of a block is spilled with fast regalloc, so it's
// almost certain that spilling will be required.
- if (TM.getOptLevel() == CodeGenOpt::None)
+ if (TM.getOptLevel() == CodeGenOptLevel::None)
HasStackObjects = true;
// For now assume stack access is needed in any callee functions, so we need
@@ -2477,12 +2625,14 @@ SDValue SITargetLowering::LowerFormalArguments(
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
if (IsGraphics) {
- assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
- !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() &&
- !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
- !Info->hasWorkItemIDZ());
+ const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
+ assert(!UserSGPRInfo.hasDispatchPtr() &&
+ !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
+ !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
+ !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
+ (void)UserSGPRInfo;
if (!Subtarget->enableFlatScratch())
- assert(!Info->hasFlatScratchInit());
+ assert(!UserSGPRInfo.hasFlatScratchInit());
if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())
assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ());
@@ -2531,18 +2681,29 @@ SDValue SITargetLowering::LowerFormalArguments(
Splits.append(Ins.begin(), Ins.end());
}
+ if (IsKernel)
+ analyzeFormalArgumentsCompute(CCInfo, Ins);
+
if (IsEntryFunc) {
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
+ if (IsKernel && Subtarget->hasKernargPreload() &&
+ !Subtarget->needsKernargPreloadBackwardsCompatibility())
+ allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
+
+ allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
} else if (!IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}
- if (IsKernel) {
- analyzeFormalArgumentsCompute(CCInfo, Ins);
- } else {
+ if (!IsKernel) {
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+ if (!IsGraphics && !Subtarget->enableFlatScratch()) {
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
+ AMDGPU::SGPR2, AMDGPU::SGPR3},
+ 4);
+ }
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
}
@@ -2587,9 +2748,81 @@ SDValue SITargetLowering::LowerFormalArguments(
continue;
}
- SDValue Arg = lowerKernargMemParameter(
- DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
- Chains.push_back(Arg.getValue(1));
+ SDValue NewArg;
+ if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
+ if (MemVT.getStoreSize() < 4 && Alignment < 4) {
+ // In this case the argument is packed into the previous preload SGPR.
+ int64_t AlignDownOffset = alignDown(Offset, 4);
+ int64_t OffsetDiff = Offset - AlignDownOffset;
+ EVT IntVT = MemVT.changeTypeToInteger();
+
+ const SIMachineFunctionInfo *Info =
+ MF.getInfo<SIMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ Register Reg =
+ Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
+
+ assert(Reg);
+ Register VReg = MRI.getLiveInVirtReg(Reg);
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
+
+ SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
+ SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
+
+ SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
+ ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
+ NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
+ Ins[i].Flags.isSExt(), &Ins[i]);
+
+ NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
+ } else {
+ const SIMachineFunctionInfo *Info =
+ MF.getInfo<SIMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ const SmallVectorImpl<MCRegister> &PreloadRegs =
+ Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
+
+ SDValue Copy;
+ if (PreloadRegs.size() == 1) {
+ Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
+ const TargetRegisterClass *RC = MRI.getRegClass(VReg);
+ NewArg = DAG.getCopyFromReg(
+ Chain, DL, VReg,
+ EVT::getIntegerVT(*DAG.getContext(),
+ TRI->getRegSizeInBits(*RC)));
+
+ } else {
+ // If the kernarg alignment does not match the alignment of the SGPR
+ // tuple RC that can accommodate this argument, it will be built up
+          // via copies from the individual SGPRs that the argument was
+ // preloaded to.
+ SmallVector<SDValue, 4> Elts;
+ for (auto Reg : PreloadRegs) {
+ Register VReg = MRI.getLiveInVirtReg(Reg);
+ Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
+ Elts.push_back(Copy);
+ }
+ NewArg =
+ DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ PreloadRegs.size()),
+ DL, Elts);
+ }
+
+ SDValue CMemVT;
+ if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
+ CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
+ else
+ CMemVT = DAG.getBitcast(MemVT, NewArg);
+ NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
+ Ins[i].Flags.isSExt(), &Ins[i]);
+ NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
+ }
+ } else {
+ NewArg =
+ lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
+ Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
+ }
+ Chains.push_back(NewArg.getValue(1));
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
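
For a preloaded argument narrower than a dword, the block above recovers it from the 32-bit SGPR it is packed into: align the kernarg offset down to 4, shift right by eight times the remainder, then truncate to the argument width. The same arithmetic in plain C++ (a sketch of the extraction only, not the SelectionDAG node building):

#include <cassert>
#include <cstdint>
#include <cstdio>

// Extract a Bits-wide kernel argument that lives at ByteOffset in the kernarg
// segment, given the 32-bit word preloaded from the dword-aligned address
// alignDown(ByteOffset, 4).
static uint32_t extractPackedArg(uint32_t PreloadedWord, unsigned ByteOffset,
                                 unsigned Bits) {
  assert(Bits < 32 && "only sub-dword arguments are packed");
  unsigned AlignDownOffset = ByteOffset & ~3u;        // alignDown(Offset, 4)
  unsigned OffsetDiff = ByteOffset - AlignDownOffset; // 0..3 bytes
  uint32_t Shifted = PreloadedWord >> (OffsetDiff * 8);
  uint32_t Mask = (1u << Bits) - 1;                   // truncate to MemVT width
  return Shifted & Mask;
}

int main() {
  // A dword holding {i16 = 0xBEEF at byte offset 6, i8 = 0x7F at byte offset 4}.
  uint32_t Word = (0xBEEFu << 16) | 0x7Fu;
  std::printf("i16 @6 -> 0x%x\n", extractPackedArg(Word, 6, 16)); // 0xbeef
  std::printf("i8  @4 -> 0x%x\n", extractPackedArg(Word, 4, 8));  // 0x7f
}
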
@@ -2599,11 +2832,11 @@ SDValue SITargetLowering::LowerFormalArguments(
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
// real pointers, so we can't guarantee their size.
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
- DAG.getValueType(MVT::i16));
+ NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
+ DAG.getValueType(MVT::i16));
}
- InVals.push_back(Arg);
+ InVals.push_back(NewArg);
continue;
} else if (!IsEntryFunc && VA.isMemLoc()) {
SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
@@ -3084,6 +3317,9 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ if (AMDGPU::isChainCC(CalleeCC))
+ return true;
+
if (!mayTailCallThisCC(CalleeCC))
return false;
@@ -3168,7 +3404,36 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
+
SelectionDAG &DAG = CLI.DAG;
+
+ TargetLowering::ArgListEntry RequestedExec;
+ if (IsChainCallConv) {
+ // The last argument should be the value that we need to put in EXEC.
+ // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
+ // don't treat it like the rest of the arguments.
+ RequestedExec = CLI.Args.back();
+ assert(RequestedExec.Node && "No node for EXEC");
+
+ if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
+ return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
+
+ assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
+ CLI.Outs.pop_back();
+ CLI.OutVals.pop_back();
+
+ if (RequestedExec.Ty->isIntegerTy(64)) {
+ assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
+ CLI.Outs.pop_back();
+ CLI.OutVals.pop_back();
+ }
+
+ assert(CLI.Outs.back().OrigArgIndex != 2 &&
+ "Haven't popped all the pieces of the EXEC mask");
+ }
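
For llvm.amdgcn.cs.chain calls, the code above peels the trailing EXEC argument off the legalized output list before normal argument processing: one i32 piece on wave32, two pieces on wave64 because the i64 value was split. A simplified standalone sketch of that bookkeeping, with OutPiece standing in for ISD::OutputArg and the argument index supplied by the caller:

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

// Each legalized piece remembers which IR-level argument it came from.
struct OutPiece { unsigned OrigArgIndex; uint32_t Val; };

// Pop the trailing pieces of the EXEC argument (IR argument index ExecArgIdx)
// so the remaining pieces can be processed like a normal call. Returns the
// popped pieces, low part first: one on wave32, two on wave64.
static std::vector<OutPiece> popRequestedExec(std::vector<OutPiece> &Outs,
                                              unsigned ExecArgIdx,
                                              unsigned WavefrontSize) {
  std::vector<OutPiece> Exec;
  unsigned NumPieces = WavefrontSize == 64 ? 2 : 1; // i64 was split into 2 x i32
  for (unsigned I = 0; I != NumPieces; ++I) {
    assert(!Outs.empty() && Outs.back().OrigArgIndex == ExecArgIdx &&
           "EXEC pieces must be the trailing arguments");
    Exec.insert(Exec.begin(), Outs.back());
    Outs.pop_back();
  }
  assert((Outs.empty() || Outs.back().OrigArgIndex != ExecArgIdx) &&
         "haven't popped all the pieces of the EXEC mask");
  return Exec;
}

int main() {
  // Wave64: the i64 EXEC value was legalized into two i32 pieces of IR arg #2.
  std::vector<OutPiece> Outs = {{0, 10}, {1, 20}, {2, 0xffffffffu}, {2, 0xffffffffu}};
  std::vector<OutPiece> Exec = popRequestedExec(Outs, /*ExecArgIdx=*/2, /*WavefrontSize=*/64);
  std::printf("remaining args: %zu, exec pieces: %zu\n", Outs.size(), Exec.size());
}
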
+
const SDLoc &DL = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
@@ -3176,7 +3441,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
- CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
bool IsSibCall = false;
bool IsThisReturn = false;
@@ -3207,9 +3471,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
- if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
+ if (!IsTailCall &&
+ ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
report_fatal_error("failed to perform tail call elimination on a call "
- "site marked musttail");
+ "site marked musttail or on llvm.amdgcn.cs.chain");
}
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
@@ -3232,7 +3497,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- if (CallConv != CallingConv::AMDGPU_Gfx) {
+ if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
// With a fixed ABI, allocate fixed registers before user arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -3258,16 +3523,20 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- if (!IsSibCall) {
+ if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+ if (!IsSibCall || IsChainCallConv) {
if (!Subtarget->enableFlatScratch()) {
SmallVector<SDValue, 4> CopyFromChains;
// In the HSA case, this should be an identity copy.
SDValue ScratchRSrcReg
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
- RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+ RegsToPass.emplace_back(IsChainCallConv
+ ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
+ : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
+ ScratchRSrcReg);
CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
Chain = DAG.getTokenFactor(DL, CopyFromChains);
}
@@ -3412,6 +3681,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
}
+ if (IsChainCallConv)
+ Ops.push_back(RequestedExec.Node);
+
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass) {
@@ -3420,8 +3692,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// Add a register mask operand representing the call-preserved registers.
-
- auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
+ auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3435,8 +3706,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// actual call instruction.
if (IsTailCall) {
MFI.setHasTailCall();
- unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ?
- AMDGPUISD::TC_RETURN_GFX : AMDGPUISD::TC_RETURN;
+ unsigned OPC = AMDGPUISD::TC_RETURN;
+ switch (CallConv) {
+ case CallingConv::AMDGPU_Gfx:
+ OPC = AMDGPUISD::TC_RETURN_GFX;
+ break;
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
+ OPC = AMDGPUISD::TC_RETURN_CHAIN;
+ break;
+ }
+
return DAG.getNode(OPC, DL, NodeTys, Ops);
}
@@ -3481,22 +3761,21 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const TargetFrameLowering *TFL = ST.getFrameLowering();
+ const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
unsigned Opc =
TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
ISD::ADD : ISD::SUB;
SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
- DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
+ DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
Align StackAlign = TFL->getStackAlign();
Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
if (Alignment && *Alignment > StackAlign) {
Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
DAG.getConstant(-(uint64_t)Alignment->value()
- << ST.getWavefrontSizeLog2(),
+ << Subtarget->getWavefrontSizeLog2(),
dl, VT));
}
@@ -3520,6 +3799,111 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
}
+SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
+ if (Op.getValueType() != MVT::i32)
+ return Op; // Defer to cannot select error.
+
+ Register SP = getStackPointerRegisterToSaveRestore();
+ SDLoc SL(Op);
+
+ SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
+
+ // Convert from wave uniform to swizzled vector address. This should protect
+ // from any edge cases where the stacksave result isn't directly used with
+ // stackrestore.
+ SDValue VectorAddress =
+ DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
+ return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
+}
+
+SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ assert(Op.getValueType() == MVT::i32);
+
+ uint32_t BothRoundHwReg =
+ AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4);
+ SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
+
+ SDValue IntrinID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
+ SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
+ Op.getOperand(0), IntrinID, GetRoundBothImm);
+
+ // There are two rounding modes, one for f32 and one for f64/f16. We only
+ // report in the standard value range if both are the same.
+ //
+ // The raw values also differ from the expected FLT_ROUNDS values. Nearest
+ // ties away from zero is not supported, and the other values are rotated by
+ // 1.
+ //
+ // If the two rounding modes are not the same, report a target defined value.
+
+ // Mode register rounding mode fields:
+ //
+ // [1:0] Single-precision round mode.
+ // [3:2] Double/Half-precision round mode.
+ //
+ // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
+ //
+ // Hardware Spec
+ // Toward-0 3 0
+ // Nearest Even 0 1
+ // +Inf 1 2
+ // -Inf 2 3
+ // NearestAway0 N/A 4
+ //
+ // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
+ // table we can index by the raw hardware mode.
+ //
+ // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
+
+ SDValue BitTable =
+ DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
+
+ SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+ SDValue RoundModeTimesNumBits =
+ DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
+
+ // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
+ // knew only one mode was demanded.
+ SDValue TableValue =
+ DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
+ SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
+
+ SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
+ SDValue TableEntry =
+ DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
+
+ // There's a gap in the 4-bit encoded table and actual enum values, so offset
+ // if it's an extended value.
+ SDValue Four = DAG.getConstant(4, SL, MVT::i32);
+ SDValue IsStandardValue =
+ DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
+ SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
+ SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
+ TableEntry, EnumOffset);
+
+ return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
+}
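
lowerGET_ROUNDING above reads the 4-bit MODE.fp_round field (2 bits for f32, 2 bits for f64/f16), shifts a 64-bit constant table right by four times that value and masks off 4 bits; entries below 4 are standard FLT_ROUNDS values, larger entries are reported offset by 4 into a target-defined range. A self-contained sketch of the same table-lookup technique; the hardware-to-spec mapping comes from the comment above, but the extended entry encodings here are illustrative and not the real AMDGPU::FltRoundConversionTable:

#include <cstdint>
#include <cstdio>

// Hardware 2-bit round mode -> FLT_ROUNDS-style value, per the comment above:
// hw 0 = nearest even -> 1, hw 1 = +inf -> 2, hw 2 = -inf -> 3, hw 3 = toward zero -> 0.
static const uint32_t HwToSpec[4] = {1, 2, 3, 0};

// Build a 64-bit table with one 4-bit entry per raw MODE.fp_round value
// ([1:0] = f32 mode, [3:2] = f64/f16 mode). If both fields agree, store the
// standard value (0..3); otherwise an illustrative extended value >= 4, which
// the lookup reports offset by +4, matching the SELECT in lowerGET_ROUNDING.
static uint64_t buildTable() {
  uint64_t Table = 0;
  for (uint32_t Mode = 0; Mode < 16; ++Mode) {
    uint32_t F32 = HwToSpec[Mode & 3], F64 = HwToSpec[(Mode >> 2) & 3];
    uint32_t Entry = (F32 == F64) ? F32 : (4 + ((F32 << 1) | (F64 > F32)));
    Table |= uint64_t(Entry & 0xf) << (Mode * 4);
  }
  return Table;
}

static uint32_t getRounding(uint32_t ModeFpRound) {
  uint64_t Table = buildTable();
  uint32_t Entry = uint32_t(Table >> (ModeFpRound * 4)) & 0xf;
  return Entry < 4 ? Entry : Entry + 4; // extended values land above 7
}

int main() {
  std::printf("mode 0x0 -> %u\n", getRounding(0x0)); // both nearest even -> 1
  std::printf("mode 0xf -> %u\n", getRounding(0xF)); // both toward zero  -> 0
  std::printf("mode 0x1 -> %u\n", getRounding(0x1)); // mixed -> target-defined (>= 8)
}
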
+
+SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
+ if (Op->isDivergent())
+ return SDValue();
+
+ switch (cast<MemSDNode>(Op)->getAddressSpace()) {
+ case AMDGPUAS::FLAT_ADDRESS:
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ break;
+ default:
+ return SDValue();
+ }
+
+ return Op;
+}
+
Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
@@ -4217,40 +4601,51 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ // For targets older than GFX12, we emit a sequence of 32-bit operations.
+ // For GFX12, we emit s_add_u64 and s_sub_u64.
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
-
MachineOperand &Dest = MI.getOperand(0);
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
-
- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
-
- unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
- unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
+ if (Subtarget->hasScalarAddSub64()) {
+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
+ BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
+ .addReg(Src0.getReg())
+ .addReg(Src1.getReg());
+ } else {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+ .add(Src0Sub0)
+ .add(Src1Sub0);
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+ .add(Src0Sub1)
+ .add(Src1Sub1);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ }
MI.eraseFromParent();
return BB;
}
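
On subtargets without the scalar 64-bit add/sub instructions, the expansion above splits the operation into a low 32-bit half that sets the carry/borrow (S_ADD_U32 / S_SUB_U32) and a high half that consumes it (S_ADDC_U32 / S_SUBB_U32), then reassembles the halves with a REG_SEQUENCE. The same carry chain in portable C++ (a sketch of the arithmetic, not the MIR expansion):

#include <cstdint>
#include <cstdio>

// Emulate S_ADD_U32 / S_ADDC_U32 and S_SUB_U32 / S_SUBB_U32 on 32-bit halves.
static uint64_t addSub64ViaHalves(uint64_t A, uint64_t B, bool IsAdd) {
  uint32_t ALo = uint32_t(A), AHi = uint32_t(A >> 32);
  uint32_t BLo = uint32_t(B), BHi = uint32_t(B >> 32);

  uint32_t Lo, Hi, Carry;
  if (IsAdd) {
    Lo = ALo + BLo;
    Carry = Lo < ALo;       // SCC after S_ADD_U32
    Hi = AHi + BHi + Carry; // S_ADDC_U32
  } else {
    Lo = ALo - BLo;
    Carry = ALo < BLo;      // borrow in SCC after S_SUB_U32
    Hi = AHi - BHi - Carry; // S_SUBB_U32
  }
  return (uint64_t(Hi) << 32) | Lo; // REG_SEQUENCE of the two halves
}

int main() {
  uint64_t A = 0xFFFFFFFFULL, B = 1;
  std::printf("%llx\n", (unsigned long long)addSub64ViaHalves(A, B, true));  // 100000000
  std::printf("%llx\n", (unsigned long long)addSub64ViaHalves(B, A, false)); // ffffffff00000002
}
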
@@ -4463,8 +4858,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const SIRegisterInfo *TRI = ST.getRegisterInfo();
Register Dst = MI.getOperand(0).getReg();
- Register Src0 = MI.getOperand(1).getReg();
- Register Src1 = MI.getOperand(2).getReg();
+ const MachineOperand &Src0 = MI.getOperand(1);
+ const MachineOperand &Src1 = MI.getOperand(2);
const DebugLoc &DL = MI.getDebugLoc();
Register SrcCond = MI.getOperand(3).getReg();
@@ -4473,20 +4868,42 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
+ const TargetRegisterClass *Src0RC = Src0.isReg()
+ ? MRI.getRegClass(Src0.getReg())
+ : &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *Src1RC = Src1.isReg()
+ ? MRI.getRegClass(Src1.getReg())
+ : &AMDGPU::VReg_64RegClass;
+
+ const TargetRegisterClass *Src0SubRC =
+ TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *Src1SubRC =
+ TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
.addReg(SrcCond);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
- .addImm(0)
- .addReg(Src0, 0, AMDGPU::sub0)
- .addImm(0)
- .addReg(Src1, 0, AMDGPU::sub0)
- .addReg(SrcCondCopy);
+ .addImm(0)
+ .add(Src0Sub0)
+ .addImm(0)
+ .add(Src1Sub0)
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
- .addImm(0)
- .addReg(Src0, 0, AMDGPU::sub1)
- .addImm(0)
- .addReg(Src1, 0, AMDGPU::sub1)
- .addReg(SrcCondCopy);
+ .addImm(0)
+ .add(Src0Sub1)
+ .addImm(0)
+ .add(Src1Sub1)
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
.addReg(DstLo)
@@ -4843,7 +5260,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4866,7 +5283,7 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4926,10 +5343,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
"Load should return a value and a chain");
return Result;
}
- case ISD::FSQRT:
- if (Op.getValueType() == MVT::f64)
+ case ISD::FSQRT: {
+ EVT VT = Op.getValueType();
+ if (VT == MVT::f32)
+ return lowerFSQRTF32(Op, DAG);
+ if (VT == MVT::f64)
return lowerFSQRTF64(Op, DAG);
return SDValue();
+ }
case ISD::FSIN:
case ISD::FCOS:
return LowerTrig(Op, DAG);
@@ -5027,6 +5448,12 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerXMUL_LOHI(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::STACKSAVE:
+ return LowerSTACKSAVE(Op, DAG);
+ case ISD::GET_ROUNDING:
+ return lowerGET_ROUNDING(Op, DAG);
+ case ISD::PREFETCH:
+ return lowerPREFETCH(Op, DAG);
}
return SDValue();
}
@@ -5382,6 +5809,12 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
return;
}
+ case ISD::FSQRT: {
+ if (N->getValueType(0) != MVT::f16)
+ break;
+ Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
+ break;
+ }
default:
AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
break;
@@ -5433,6 +5866,9 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
+ if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
+ return false;
+
// FIXME: Either avoid relying on address space here or change the default
// address space for functions to avoid the explicit check.
return (GV->getValueType()->isFunctionTy() ||
@@ -5616,7 +6052,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
- if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16)
+ if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
+      VT == MVT::v32f16)
return splitBinaryVectorOp(Op, DAG);
return Op;
}
@@ -5711,11 +6148,6 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
return lowerTrapEndpgm(Op, DAG);
- const Module *M = DAG.getMachineFunction().getFunction().getParent();
- unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
- if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
- return lowerTrapHsaQueuePtr(Op, DAG);
-
return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
lowerTrapHsaQueuePtr(Op, DAG);
}
@@ -5873,7 +6305,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
SDValue Ptr =
- DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset));
+ DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
// TODO: Use custom target PseudoSourceValue.
// TODO: We should use the value from the IR intrinsic call, but it might not
@@ -6134,7 +6566,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
- if (VecSize == 128 || VecSize == 256) {
+ if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
SDValue Lo, Hi;
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
@@ -6147,9 +6579,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Hi = DAG.getBitcast(HiVT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
DAG.getConstant(1, SL, MVT::i32)));
- } else {
- assert(VecSize == 256);
-
+ } else if (VecSize == 256) {
SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
SDValue Parts[4];
for (unsigned P = 0; P < 4; ++P) {
@@ -6161,6 +6591,22 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Parts[0], Parts[1]));
Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
Parts[2], Parts[3]));
+ } else {
+ assert(VecSize == 512);
+
+ SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
+ SDValue Parts[8];
+ for (unsigned P = 0; P < 8; ++P) {
+ Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+ DAG.getConstant(P, SL, MVT::i32));
+ }
+
+ Lo = DAG.getBitcast(LoVT,
+ DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
+ Parts[0], Parts[1], Parts[2], Parts[3]));
+ Hi = DAG.getBitcast(HiVT,
+ DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
+ Parts[4], Parts[5],Parts[6], Parts[7]));
}
EVT IdxVT = Idx.getValueType();
@@ -6326,6 +6772,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
+ if (VT == MVT::v32i16 || VT == MVT::v32f16) {
+ EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
+ VT.getVectorNumElements() / 8);
+ MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
+
+ SmallVector<SDValue, 8> Parts[8];
+ for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
+ for (unsigned P = 0; P < 8; ++P)
+ Parts[P].push_back(Op.getOperand(I + P * E));
+ }
+ SDValue Casts[8];
+ for (unsigned P = 0; P < 8; ++P) {
+ SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
+ Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
+ }
+
+ SDValue Blend =
+ DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
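
The v32i16/v32f16 BUILD_VECTOR lowering above groups the 32 scalar operands into eight contiguous quarters of four elements, bitcasts each quarter to a 64-bit integer, and assembles those into a v8i64 before bitcasting back. A standalone sketch of the same element grouping, packing 16-bit values into 64-bit lanes (little-endian lane order assumed):

#include <array>
#include <cstdint>
#include <cstdio>

// Pack 32 16-bit elements into 8 64-bit lanes, 4 elements per lane -- the same
// grouping lowerBUILD_VECTOR uses for v32i16/v32f16 (quarter vectors bitcast
// to i64, then assembled into a v8i64).
static std::array<uint64_t, 8> packV32x16(const std::array<uint16_t, 32> &Elts) {
  std::array<uint64_t, 8> Lanes{};
  for (unsigned P = 0; P < 8; ++P)
    for (unsigned I = 0; I < 4; ++I)
      Lanes[P] |= uint64_t(Elts[P * 4 + I]) << (16 * I);
  return Lanes;
}

int main() {
  std::array<uint16_t, 32> Elts;
  for (unsigned I = 0; I < 32; ++I)
    Elts[I] = uint16_t(I);
  std::array<uint64_t, 8> Lanes = packV32x16(Elts);
  std::printf("lane0 = 0x%016llx\n", (unsigned long long)Lanes[0]); // 0x0003000200010000
}
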
+
assert(VT == MVT::v2f16 || VT == MVT::v2i16);
assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
@@ -6391,24 +6858,12 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
// which is a 64-bit pc-relative offset from the encoding of the $symbol
// operand to the global variable.
- //
- // What we want here is an offset from the value returned by s_getpc
- // (which is the address of the s_add_u32 instruction) to the global
- // variable, but since the encoding of $symbol starts 4 bytes after the start
- // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
- // small. This requires us to add 4 to the global variable offset in order to
- // compute the correct address. Similarly for the s_addc_u32 instruction, the
- // encoding of $symbol starts 12 bytes after the start of the s_add_u32
- // instruction.
- SDValue PtrLo =
- DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
+ SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
SDValue PtrHi;
- if (GAFlags == SIInstrInfo::MO_NONE) {
+ if (GAFlags == SIInstrInfo::MO_NONE)
PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
- } else {
- PtrHi =
- DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1);
- }
+ else
+ PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
@@ -6450,9 +6905,22 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
}
+ if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
+ SDValue AddrLo = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
+ AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
+
+ SDValue AddrHi = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
+ AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
+
+ return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
+ }
+
if (shouldEmitFixup(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
- else if (shouldEmitPCReloc(GV))
+
+ if (shouldEmitPCReloc(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
SIInstrInfo::MO_REL32);
@@ -6699,6 +7167,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
unsigned IntrOpcode = Intr->BaseOpcode;
bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
+ bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
SmallVector<EVT, 3> ResultTypes(Op->values());
SmallVector<EVT, 3> OrigResultTypes(Op->values());
@@ -6718,7 +7187,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (BaseOpcode->Atomic) {
VData = Op.getOperand(2);
- bool Is64Bit = VData.getValueType() == MVT::i64;
+ bool Is64Bit = VData.getValueSizeInBits() == 64;
if (BaseOpcode->AtomicX2) {
SDValue VData2 = Op.getOperand(3);
VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
@@ -6878,9 +7347,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
//
- // Partial NSA is allowed on GFX11 where the final register is a contiguous
+ // Partial NSA is allowed on GFX11+ where the final register is a contiguous
// set of the remaining addresses.
- const unsigned NSAMaxSize = ST->getNSAMaxSize();
+ const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
const bool UseNSA = ST->hasNSAEncoding() &&
VAddrs.size() >= ST->getNSAThreshold(MF) &&
@@ -6957,7 +7426,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
if (BaseOpcode->Atomic)
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
- if (CPol & ~AMDGPU::CPol::ALL)
+ if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
return Op;
SmallVector<SDValue, 26> Ops;
@@ -6977,7 +7446,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
if (IsGFX10Plus)
Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
- Ops.push_back(Unorm);
+ if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
+ Ops.push_back(Unorm);
Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
Ops.push_back(IsA16 && // r128, a16 for gfx9
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
@@ -6988,7 +7458,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
} else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
report_fatal_error("TFE is not supported on this GPU");
}
- Ops.push_back(LWE); // lwe
+ if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
+ Ops.push_back(LWE); // lwe
if (!IsGFX10Plus)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
@@ -7000,7 +7471,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
int Opcode = -1;
- if (IsGFX11Plus) {
+ if (IsGFX12Plus) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
+ NumVDataDwords, NumVAddrDwords);
+ } else if (IsGFX11Plus) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx11NSA
: AMDGPU::MIMGEncGfx11Default,
@@ -7071,7 +7545,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
};
// Widen vec3 load to vec4.
- if (VT.isVector() && VT.getVectorNumElements() == 3) {
+ if (VT.isVector() && VT.getVectorNumElements() == 3 &&
+ !Subtarget->hasScalarDwordx3Loads()) {
EVT WidenedVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
auto WidenedOp = DAG.getMemIntrinsicNode(
@@ -7317,7 +7792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- if (CPol & ~AMDGPU::CPol::ALL)
+ if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
+ ? AMDGPU::CPol::ALL
+ : AMDGPU::CPol::ALL_pregfx12))
return Op;
return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
DAG);
@@ -7341,9 +7818,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitRemovedIntrinsicError(DAG, DL, VT);
}
- case Intrinsic::amdgcn_ldexp:
- return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2));
-
case Intrinsic::amdgcn_fract:
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
@@ -7490,6 +7964,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
+// On targets not supporting constant in soffset field, turn zero to
+// SGPR_NULL to avoid generating an extra s_mov with zero.
+static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
+ const GCNSubtarget *Subtarget) {
+ if (Subtarget->hasRestrictedSOffset())
+ if (auto SOffsetConst = dyn_cast<ConstantSDNode>(SOffset)) {
+ if (SOffsetConst->isZero()) {
+ return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
+ }
+ }
+ return SOffset;
+}
+
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -7498,13 +7985,14 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -7531,13 +8019,14 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -7693,12 +8182,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(4), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -7717,12 +8207,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -7734,21 +8225,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Op.getOperand(4), // voffset
- Op.getOperand(5), // soffset
- Op.getOperand(6), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Op.getOperand(4), // voffset
+ SOffset, // soffset
+ Op.getOperand(6), // offset
+ DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -7764,13 +8256,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(4), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // format
Op.getOperand(6), // cachepolicy, swizzled buffer
@@ -7790,13 +8283,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -8009,6 +8503,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -8016,7 +8511,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8031,6 +8526,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -8038,7 +8534,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Rsrc, // rsrc
Op.getOperand(5), // vindex
Offsets.first, // voffset
- Op.getOperand(7), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(8), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8068,14 +8564,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return SDValue();
}
+ const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
+ const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
const bool Is64 = NodePtr.getValueType() == MVT::i64;
const unsigned NumVDataDwords = 4;
const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
- const bool UseNSA =
- Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize();
+ const bool UseNSA = (Subtarget->hasNSAEncoding() &&
+ NumVAddrs <= Subtarget->getNSAMaxSize()) ||
+ IsGFX12Plus;
const unsigned BaseOpcodes[2][2] = {
{AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
{AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
@@ -8083,15 +8582,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
int Opcode;
if (UseNSA) {
Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
- IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
+ IsGFX12Plus ? AMDGPU::MIMGEncGfx12
+ : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
: AMDGPU::MIMGEncGfx10NSA,
NumVDataDwords, NumVAddrDwords);
} else {
- Opcode =
- AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
- IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default
- : AMDGPU::MIMGEncGfx10Default,
- NumVDataDwords, NumVAddrDwords);
+ assert(!IsGFX12Plus);
+ Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
+ IsGFX11 ? AMDGPU::MIMGEncGfx11Default
+ : AMDGPU::MIMGEncGfx10Default,
+ NumVDataDwords, NumVAddrDwords);
}
assert(Opcode != -1);
@@ -8179,8 +8679,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
- case Intrinsic::amdgcn_flat_atomic_fmax: {
+ case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num: {
MemSDNode *M = cast<MemSDNode>(Op);
SDValue Ops[] = {
M->getOperand(0), // Chain
@@ -8190,12 +8694,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned Opcode = 0;
switch (IntrID) {
case Intrinsic::amdgcn_global_atomic_fmin:
- case Intrinsic::amdgcn_flat_atomic_fmin: {
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num: {
Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
break;
}
case Intrinsic::amdgcn_global_atomic_fmax:
- case Intrinsic::amdgcn_flat_atomic_fmax: {
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num: {
Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
break;
}
@@ -8206,6 +8714,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
+ case Intrinsic::amdgcn_s_get_barrier_state: {
+ SDValue Chain = Op->getOperand(0);
+ SmallVector<SDValue, 2> Ops;
+ unsigned Opc;
+ bool IsInlinableBarID = false;
+ int64_t BarID;
+
+ if (isa<ConstantSDNode>(Op->getOperand(2))) {
+ BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
+ IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
+ }
+
+ if (IsInlinableBarID) {
+ Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
+ SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
+ Ops.push_back(K);
+ } else {
+ Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
+ SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
+ Ops.push_back(M0Val.getValue(0));
+ }
+
+ auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ return SDValue(NewMI, 0);
+ }
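The choice between the _IMM and _M0 forms above depends on whether the barrier id is a constant that fits AMDGPU's inline integer literal range. A small standalone model, assuming the usual -16..64 range of AMDGPU::isInlinableIntLiteral; the names below are made up for illustration.

    #include <cstdint>

    // Assumed inline-literal range, mirroring AMDGPU::isInlinableIntLiteral.
    bool isInlinableIntLiteralModel(int64_t Literal) {
      return Literal >= -16 && Literal <= 64;
    }

    enum class BarrierStateForm { Imm, ViaM0 };

    // Constant barrier ids in the inline range use S_GET_BARRIER_STATE_IMM;
    // everything else (including non-constant ids) goes through M0.
    BarrierStateForm chooseGetBarrierStateForm(bool IsConstant, int64_t BarID) {
      return (IsConstant && isInlinableIntLiteralModel(BarID))
                 ? BarrierStateForm::Imm
                 : BarrierStateForm::ViaM0;
    }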
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -8383,13 +8916,29 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
case Intrinsic::amdgcn_s_barrier: {
- if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
Op.getOperand(0)), 0);
}
+
+ // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
+ if (ST.hasSplitBarriers()) {
+ SDValue K =
+ DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
+ SDValue BarSignal =
+ SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
+ MVT::Other, K, Op.getOperand(0)),
+ 0);
+ SDValue BarWait =
+ SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
+ BarSignal.getValue(0)),
+ 0);
+ return BarWait;
+ }
+
return SDValue();
};
case Intrinsic::amdgcn_tbuffer_store: {
@@ -8429,13 +8978,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // format
Op.getOperand(8), // cachepolicy, swizzled buffer
@@ -8456,13 +9006,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -8536,13 +9087,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8586,13 +9138,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8620,8 +9173,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
unsigned OpOffset = HasVIndex ? 1 : 0;
SDValue VOffset = Op.getOperand(5 + OpOffset);
- auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
- bool HasVOffset = !CVOffset || !CVOffset->isZero();
+ bool HasVOffset = !isNullConstant(VOffset);
unsigned Size = Op->getConstantOperandVal(4);
switch (Size) {
@@ -8684,12 +9236,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto F = LoadMMO->getFlags() &
~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
- LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
- Size, LoadMMO->getBaseAlign());
+ LoadMMO =
+ MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
+ LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
- MachineMemOperand *StoreMMO =
- MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
- sizeof(int32_t), LoadMMO->getBaseAlign());
+ MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
+ StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
+ LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
@@ -8760,11 +9313,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
- LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
- Size, LoadMMO->getBaseAlign());
- MachineMemOperand *StoreMMO =
- MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
- sizeof(int32_t), Align(4));
+ LoadMMO =
+ MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
+ LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
+ MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
+ StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
+ LoadMMO->getAAInfo());
auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
@@ -8774,7 +9328,76 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier: {
+ SDValue Chain = Op->getOperand(0);
+ SmallVector<SDValue, 2> Ops;
+ SDValue BarOp = Op->getOperand(2);
+ unsigned Opc;
+ bool IsInlinableBarID = false;
+ int64_t BarVal;
+
+ if (isa<ConstantSDNode>(BarOp)) {
+ BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
+ IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
+ }
+
+ if (IsInlinableBarID) {
+ switch (IntrinsicID) {
+ default:
+ return SDValue();
+ case Intrinsic::amdgcn_s_barrier_init:
+ Opc = AMDGPU::S_BARRIER_INIT_IMM;
+ break;
+ case Intrinsic::amdgcn_s_barrier_join:
+ Opc = AMDGPU::S_BARRIER_JOIN_IMM;
+ break;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
+ break;
+ }
+
+ SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
+ Ops.push_back(K);
+ } else {
+ switch (IntrinsicID) {
+ default:
+ return SDValue();
+ case Intrinsic::amdgcn_s_barrier_init:
+ Opc = AMDGPU::S_BARRIER_INIT_M0;
+ break;
+ case Intrinsic::amdgcn_s_barrier_join:
+ Opc = AMDGPU::S_BARRIER_JOIN_M0;
+ break;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
+ break;
+ }
+ }
+
+ if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
+ SDValue M0Val;
+ // Member count will be read from M0[16:22]
+ M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
+ DAG.getShiftAmountConstant(16, MVT::i32, DL));
+ if (!IsInlinableBarID) {
+          // If the barrier id is not an inline constant then it must be passed
+          // in M0[4:0]. OR it with the member count so that both fields end up
+          // in M0.
+ M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
+ Op.getOperand(2), M0Val),
+ 0);
+ }
+ Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+ } else if (!IsInlinableBarID) {
+ Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
+ }
+
+ auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ return SDValue(NewMI, 0);
+ }
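The M0 layout used by the barrier-init path above, reproduced as a standalone helper with a spot check; the field positions (member count in M0[22:16], barrier id in M0[4:0]) are taken from the comments in the lowering, and the helper name is hypothetical.

    #include <cassert>
    #include <cstdint>

    // Member count goes in M0[22:16]; when the barrier id is not an inline
    // constant it is OR'd into M0[4:0] (the id is expected to fit in 5 bits).
    uint32_t packBarrierInitM0(uint32_t MemberCount, uint32_t BarrierId,
                               bool BarrierIdIsInlineConst) {
      uint32_t M0 = MemberCount << 16;
      if (!BarrierIdIsInlineConst)
        M0 |= BarrierId;
      return M0;
    }

    int main() {
      // 32 members, barrier id 5 passed through M0.
      assert(packBarrierInitM0(32, 5, /*BarrierIdIsInlineConst=*/false) ==
             ((32u << 16) | 5u));
      return 0;
    }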
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -8794,7 +9417,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
SDValue Offset, SelectionDAG &DAG) const {
SDLoc DL(Offset);
- const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
+ const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
SDValue N0 = Offset;
ConstantSDNode *C1 = nullptr;
@@ -8870,8 +9493,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
return;
}
}
+
+ SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
+ ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
+ : DAG.getConstant(0, DL, MVT::i32);
+
Offsets[0] = CombinedOffset;
- Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
+ Offsets[1] = SOffsetZero;
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}
@@ -9051,7 +9679,7 @@ static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
const SIMachineFunctionInfo &Info) {
// TODO: Should check if the address can definitely not access stack.
if (Info.isEntryFunction())
- return Info.hasFlatScratchInit();
+ return Info.getUserSGPRInfo().hasFlatScratchInit();
return true;
}
@@ -9129,7 +9757,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
- if (MemVT.isPow2VectorType())
+ if (MemVT.isPow2VectorType() ||
+ (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
@@ -9145,7 +9774,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
Alignment >= Align(4) && NumElements < 32) {
- if (MemVT.isPow2VectorType())
+ if (MemVT.isPow2VectorType() ||
+ (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
@@ -9217,7 +9847,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
+ if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
+ VT.getSizeInBits() == 512)
return splitTernaryVectorOp(Op, DAG);
assert(VT.getSizeInBits() == 64);
@@ -9277,11 +9908,6 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
// XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
// error seems really high at 2^29 ULP.
-
- // XXX - do we need afn for this or is arcp sufficent?
- if (RHS.getOpcode() == ISD::FSQRT)
- return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
-
// 1.0 / x -> rcp(x)
return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
}
@@ -9294,8 +9920,8 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}
- // For f16 require arcp only.
- // For f32 require afn+arcp.
+ // For f16 require afn or arcp.
+ // For f32 require afn.
if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
return SDValue();
@@ -9480,28 +10106,44 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const DenormalMode DenormMode = Info->getMode().FP32Denormals;
- const bool HasFP32Denormals = DenormMode == DenormalMode::getIEEE();
+ const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
+ const bool HasDynamicDenormals =
+ (DenormMode.Input == DenormalMode::Dynamic) ||
+ (DenormMode.Output == DenormalMode::Dynamic);
+
+ SDValue SavedDenormMode;
- if (!HasFP32Denormals) {
+ if (!PreservesDenormals) {
// Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
// lowering. The chain dependence is insufficient, and we need glue. We do
// not need the glue variants in a strictfp function.
SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Glue = DAG.getEntryNode();
+ if (HasDynamicDenormals) {
+ SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
+ DAG.getVTList(MVT::i32, MVT::Glue),
+ {BitField, Glue});
+ SavedDenormMode = SDValue(GetReg, 0);
+
+ Glue = DAG.getMergeValues(
+ {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
+ }
+
SDNode *EnableDenorm;
if (Subtarget->hasDenormModeInst()) {
const SDValue EnableDenormValue =
getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
- EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
- DAG.getEntryNode(), EnableDenormValue).getNode();
+ EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
+ EnableDenormValue)
+ .getNode();
} else {
const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
SL, MVT::i32);
- EnableDenorm =
- DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
- {EnableDenormValue, BitField, DAG.getEntryNode()});
+ EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
+ {EnableDenormValue, BitField, Glue});
}
SDValue Ops[3] = {
@@ -9531,12 +10173,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
NumeratorScaled, Fma3, Flags);
- if (!HasFP32Denormals) {
- // FIXME: This mishandles dynamic denormal mode. We need to query the
- // current mode and restore the original.
-
+ if (!PreservesDenormals) {
SDNode *DisableDenorm;
- if (Subtarget->hasDenormModeInst()) {
+ if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
const SDValue DisableDenormValue = getSPDenormModeValue(
FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
@@ -9544,8 +10183,11 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
Fma4.getValue(1), DisableDenormValue,
Fma4.getValue(2)).getNode();
} else {
+ assert(HasDynamicDenormals == (bool)SavedDenormMode);
const SDValue DisableDenormValue =
- DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+ HasDynamicDenormals
+ ? SavedDenormMode
+ : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
DisableDenorm = DAG.getMachineNode(
AMDGPU::S_SETREG_B32, SL, MVT::Other,
@@ -9754,6 +10396,111 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+// Avoid the full correct expansion for f32 sqrt when promoting from f16.
+SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ assert(!Subtarget->has16BitInsts());
+ SDNodeFlags Flags = Op->getFlags();
+ SDValue Ext =
+ DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
+
+ SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
+ SDValue Sqrt =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
+
+ return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
+ DAG.getTargetConstant(0, SL, MVT::i32), Flags);
+}
+
+SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDNodeFlags Flags = Op->getFlags();
+ MVT VT = Op.getValueType().getSimpleVT();
+ const SDValue X = Op.getOperand(0);
+
+ if (allowApproxFunc(DAG, Flags)) {
+ // Instruction is 1ulp but ignores denormals.
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
+ }
+
+ SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
+ SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
+
+ SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
+
+ SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
+
+ SDValue SqrtX =
+ DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
+
+ SDValue SqrtS;
+ if (needsDenormHandlingF32(DAG, X, Flags)) {
+ SDValue SqrtID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
+ SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
+
+ SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
+ SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
+ DAG.getConstant(-1, DL, MVT::i32));
+ SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
+
+ SDValue NegSqrtSNextDown =
+ DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
+
+ SDValue SqrtVP =
+ DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
+
+ SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
+ DAG.getConstant(1, DL, MVT::i32));
+ SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
+
+ SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
+ SDValue SqrtVS =
+ DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
+
+ SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
+ SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
+
+ SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
+ Flags);
+
+ SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
+ SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
+ Flags);
+ } else {
+ SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
+
+ SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
+
+ SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
+ SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
+ SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
+
+ SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
+ SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
+ SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
+
+ SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
+ SDValue SqrtD =
+ DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
+ SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
+ }
+
+ SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
+
+ SDValue ScaledDown =
+ DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
+
+ SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
+ SDValue IsZeroOrInf =
+ DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
+ DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
+
+ return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
+}
+
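A quick numeric check of the scaling identity the f32 path above relies on: inputs below 2^-96 are multiplied by 2^32 before the square root, which moves them safely into the normal range, and the result is rescaled by 2^-16, since sqrt(x * 2^32) = sqrt(x) * 2^16. Sketch only.

    #include <cassert>
    #include <cmath>

    int main() {
      // Exact for powers of two; within rounding error in general.
      float X = 0x1.0p-100f; // below the 0x1.0p-96f threshold used above
      float Scaled = std::sqrt(X * 0x1.0p+32f) * 0x1.0p-16f;
      assert(Scaled == std::sqrt(X));
      return 0;
    }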
SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
// For double type, the SQRT and RSQ instructions don't have required
// precision, we apply Goldschmidt's algorithm to improve the result:
@@ -10111,9 +10858,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
return SDValue();
}
-// Returns true if argument is a boolean value which is not serialized into
-// memory or argument and does not require v_cndmask_b32 to be deserialized.
-static bool isBoolSGPR(SDValue V) {
+bool llvm::isBoolSGPR(SDValue V) {
if (V.getValueType() != MVT::i1)
return false;
switch (V.getOpcode()) {
@@ -10427,13 +11172,34 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
if (Depth >= 6)
return std::nullopt;
+ auto ValueSize = Op.getValueSizeInBits();
+ if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
+ return std::nullopt;
+
switch (Op->getOpcode()) {
case ISD::TRUNCATE: {
- if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ }
+
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND_INREG: {
+ SDValue NarrowOp = Op->getOperand(0);
+ auto NarrowVT = NarrowOp.getValueType();
+ if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
+ NarrowVT = VTSign->getVT();
+ }
+ if (!NarrowVT.isByteSized())
+ return std::nullopt;
+ uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
+
+ if (SrcIndex >= NarrowByteWidth)
return std::nullopt;
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}
+ case ISD::SRA:
case ISD::SRL: {
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)
@@ -10450,9 +11216,6 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
}
default: {
- if (Op.getScalarValueSizeInBits() != 32)
- return std::nullopt;
-
return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
}
}
@@ -10476,7 +11239,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
unsigned BitWidth = Op.getScalarValueSizeInBits();
if (BitWidth % 8 != 0)
return std::nullopt;
- assert(Index < BitWidth / 8 && "invalid index requested");
+ if (Index > BitWidth / 8 - 1)
+ return std::nullopt;
switch (Op.getOpcode()) {
case ISD::OR: {
@@ -10519,6 +11283,31 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
}
+ case ISD::FSHR: {
+ // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
+ if (!ShiftOp || Op.getValueType().isVector())
+ return std::nullopt;
+
+ uint64_t BitsProvided = Op.getValueSizeInBits();
+ if (BitsProvided % 8 != 0)
+ return std::nullopt;
+
+ uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
+ if (BitShift % 8)
+ return std::nullopt;
+
+ uint64_t ConcatSizeInBytes = BitsProvided / 4;
+ uint64_t ByteShift = BitShift / 8;
+
+ uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
+ uint64_t BytesProvided = BitsProvided / 8;
+ SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
+ NewIndex %= BytesProvided;
+ return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
+ }
+
+ case ISD::SRA:
case ISD::SRL: {
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)
@@ -10565,9 +11354,18 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
- case ISD::ZERO_EXTEND: {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::AssertZext:
+ case ISD::AssertSext: {
SDValue NarrowOp = Op->getOperand(0);
- unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+ unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
+ if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
+ Op->getOpcode() == ISD::AssertZext ||
+ Op->getOpcode() == ISD::AssertSext) {
+ auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
+ NarrowBitWidth = VTSign->getVT().getSizeInBits();
+ }
if (NarrowBitWidth % 8 != 0)
return std::nullopt;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;
@@ -10581,10 +11379,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::TRUNCATE: {
- unsigned NarrowBitWidth = Op.getScalarValueSizeInBits();
- if (NarrowBitWidth % 8 != 0)
- return std::nullopt;
- uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+ uint64_t NarrowByteWidth = BitWidth / 8;
if (NarrowByteWidth >= Index) {
return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
@@ -10594,8 +11389,16 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
return std::nullopt;
}
+ case ISD::CopyFromReg: {
+ if (BitWidth / 8 > Index)
+ return calculateSrcByte(Op, StartingIndex, Index);
+
+ return std::nullopt;
+ }
+
case ISD::LOAD: {
auto L = cast<LoadSDNode>(Op.getNode());
+
unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
if (NarrowBitWidth % 8 != 0)
return std::nullopt;
@@ -10621,6 +11424,41 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
case ISD::BSWAP:
return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
Depth + 1, StartingIndex);
+
+ case ISD::EXTRACT_VECTOR_ELT: {
+ auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!IdxOp)
+ return std::nullopt;
+ auto VecIdx = IdxOp->getZExtValue();
+ auto ScalarSize = Op.getScalarValueSizeInBits();
+ if (ScalarSize != 32) {
+ if ((VecIdx + 1) * ScalarSize > 32)
+ return std::nullopt;
+ Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
+ }
+
+ return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
+ StartingIndex, Index);
+ }
+
+ case AMDGPUISD::PERM: {
+ auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
+ if (!PermMask)
+ return std::nullopt;
+
+ auto IdxMask =
+ (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
+ if (IdxMask > 0x07 && IdxMask != 0x0c)
+ return std::nullopt;
+
+ auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
+ auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
+
+ return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
+ : ByteProvider<SDValue>(
+ ByteProvider<SDValue>::getConstantZero());
+ }
+
default: {
return std::nullopt;
}
@@ -10630,7 +11468,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
// Returns true if the Operand is a scalar and is 16 bits
-static bool is16BitScalarOp(SDValue &Operand) {
+static bool isExtendedFrom16Bits(SDValue &Operand) {
+
switch (Operand.getOpcode()) {
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
@@ -10646,7 +11485,7 @@ static bool is16BitScalarOp(SDValue &Operand) {
auto MemVT = L->getMemoryVT();
return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
}
- return false;
+ return L->getMemoryVT().getSizeInBits() == 16;
}
default:
return false;
@@ -10674,29 +11513,118 @@ static bool addresses16Bits(int Mask) {
// Do not lower into v_perm if the operands are actually 16 bit
// and the selected bits (based on PermMask) correspond with two
// easily addressable 16 bit operands.
-static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,
+static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
SDValue &OtherOp) {
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;
- // ByteProvider only accepts 32 bit operands
- assert(Op.getValueType().getSizeInBits() == 32);
- assert(OtherOp.getValueType().getSizeInBits() == 32);
+ assert(Op.getValueType().isByteSized());
+ assert(OtherOp.getValueType().isByteSized());
- auto OpIs16Bit = is16BitScalarOp(Op);
- auto OtherOpIs16Bit = is16BitScalarOp(Op);
+ auto TempOp = peekThroughBitcasts(Op);
+ auto TempOtherOp = peekThroughBitcasts(OtherOp);
- // If there is a size mismatch, then we must use masking on at least one
- // operand
- if (OpIs16Bit != OtherOpIs16Bit)
+ auto OpIs16Bit =
+ TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
+ if (!OpIs16Bit)
return true;
- // If both operands are 16 bit, return whether or not we cleanly address both
- if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))
- return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
+ auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
+ isExtendedFrom16Bits(TempOtherOp);
+ if (!OtherOpIs16Bit)
+ return true;
- // Both are 32 bit operands
- return true;
+  // Do we cleanly address both 16-bit halves?
+ return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
+}
+
+static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i32)
+ return SDValue();
+
+ // VT is known to be MVT::i32, so we need to provide 4 bytes.
+ SmallVector<ByteProvider<SDValue>, 8> PermNodes;
+ for (int i = 0; i < 4; i++) {
+ // Find the ByteProvider that provides the ith byte of the result of OR
+ std::optional<ByteProvider<SDValue>> P =
+ calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
+ // TODO support constantZero
+ if (!P || P->isConstantZero())
+ return SDValue();
+
+ PermNodes.push_back(*P);
+ }
+ if (PermNodes.size() != 4)
+ return SDValue();
+
+ int FirstSrc = 0;
+ std::optional<int> SecondSrc;
+ uint64_t PermMask = 0x00000000;
+ for (size_t i = 0; i < PermNodes.size(); i++) {
+ auto PermOp = PermNodes[i];
+ // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
+ // by sizeof(Src2) = 4
+ int SrcByteAdjust = 4;
+
+ if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
+ if (SecondSrc.has_value())
+ if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
+ return SDValue();
+
+ // Set the index of the second distinct Src node
+ SecondSrc = i;
+ assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
+ SrcByteAdjust = 0;
+ }
+ assert(PermOp.SrcOffset + SrcByteAdjust < 8);
+ assert(!DAG.getDataLayout().isBigEndian());
+ PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
+ }
+
+ SDValue Op = *PermNodes[FirstSrc].Src;
+ SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
+ : *PermNodes[FirstSrc].Src;
+
+ // Check that we haven't just recreated the same FSHR node.
+ if (N->getOpcode() == ISD::FSHR &&
+ (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
+ (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
+ return SDValue();
+
+ // Check that we are not just extracting the bytes in order from an op
+ if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
+ int Low16 = PermMask & 0xffff;
+ int Hi16 = (PermMask & 0xffff0000) >> 16;
+
+ bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
+ bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
+
+ // The perm op would really just produce Op. So combine into Op
+ if (WellFormedLow && WellFormedHi)
+ return DAG.getBitcast(MVT::getIntegerVT(32), Op);
+ }
+
+ if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
+ SDLoc DL(N);
+ assert(Op.getValueType().isByteSized() &&
+ OtherOp.getValueType().isByteSized());
+
+ // If the ultimate src is less than 32 bits, then we will only be
+ // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
+ // CalculateByteProvider would not have returned Op as source if we
+ // used a byte that is outside its ValueType. Thus, we are free to
+ // ANY_EXTEND as the extended bits are dont-cares.
+ Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
+ OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
+ DAG.getConstant(PermMask, DL, MVT::i32));
+ }
+
+ return SDValue();
}
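For reference, the selector convention matchPERM and the PERM case of calculateByteProvider rely on, modeled standalone and derived from the code rather than from the ISA manual: selector bytes 0-3 take bytes from the second operand, 4-7 from the first operand, and 0x0c yields a zero byte (other selector encodings of the real v_perm_b32 are not modeled).

    #include <cassert>
    #include <cstdint>

    uint32_t permModel(uint32_t Op0, uint32_t Op1, uint32_t Sel) {
      // Byte i of Op1 has selector index i; byte i of Op0 has index i + 4.
      uint64_t Concat = (uint64_t(Op0) << 32) | Op1;
      uint32_t Result = 0;
      for (int I = 0; I < 4; ++I) {
        uint32_t S = (Sel >> (8 * I)) & 0xff;
        assert(S <= 0x07 || S == 0x0c);
        uint8_t Byte = (S == 0x0c) ? 0 : uint8_t(Concat >> (8 * S));
        Result |= uint32_t(Byte) << (8 * I);
      }
      return Result;
    }

    int main() {
      // Selector 0x07060100 keeps the low half of Op1 and the high half of Op0.
      assert(permModel(0xAABBCCDDu, 0x11223344u, 0x07060100u) == 0xAABB3344u);
      return 0;
    }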
SDValue SITargetLowering::performOrCombine(SDNode *N,
@@ -10812,69 +11740,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
}
if (LHSMask == ~0u || RHSMask == ~0u) {
- SmallVector<ByteProvider<SDValue>, 8> PermNodes;
-
- // VT is known to be MVT::i32, so we need to provide 4 bytes.
- assert(VT == MVT::i32);
- for (int i = 0; i < 4; i++) {
- // Find the ByteProvider that provides the ith byte of the result of OR
- std::optional<ByteProvider<SDValue>> P =
- calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
- // TODO support constantZero
- if (!P || P->isConstantZero())
- return SDValue();
-
- PermNodes.push_back(*P);
- }
- if (PermNodes.size() != 4)
- return SDValue();
-
- int FirstSrc = 0;
- std::optional<int> SecondSrc;
- uint64_t permMask = 0x00000000;
- for (size_t i = 0; i < PermNodes.size(); i++) {
- auto PermOp = PermNodes[i];
- // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
- // by sizeof(Src2) = 4
- int SrcByteAdjust = 4;
-
- if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
- if (SecondSrc.has_value())
- if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
- return SDValue();
- // Set the index of the second distinct Src node
- SecondSrc = i;
- assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==
- 32);
- SrcByteAdjust = 0;
- }
- assert(PermOp.SrcOffset + SrcByteAdjust < 8);
- assert(!DAG.getDataLayout().isBigEndian());
- permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
- }
-
- SDValue Op = *PermNodes[FirstSrc].Src;
- SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
- : *PermNodes[FirstSrc].Src;
-
- // Check that we are not just extracting the bytes in order from an op
- if (Op == OtherOp) {
- int Low16 = permMask & 0xffff;
- int Hi16 = (permMask & 0xffff0000) >> 16;
-
- bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
- bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
-
- // The perm op would really just produce Op. So combine into Op
- if (WellFormedLow && WellFormedHi)
- return Op;
- }
-
- if (hasEightBitAccesses(permMask, Op, OtherOp)) {
- SDLoc DL(N);
- return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
- DAG.getConstant(permMask, DL, MVT::i32));
- }
+ if (SDValue Perm = matchPERM(N, DCI))
+ return Perm;
}
}
@@ -11021,10 +11888,8 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
SDValue Mask = N->getOperand(1);
// fp_class x, 0 -> false
- if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
- if (CMask->isZero())
- return DAG.getConstant(0, SDLoc(N), MVT::i1);
- }
+ if (isNullConstant(Mask))
+ return DAG.getConstant(0, SDLoc(N), MVT::i1);
if (N->getOperand(0).isUndef())
return DAG.getUNDEF(MVT::i1);
@@ -11049,7 +11914,9 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
N->getFlags());
}
- if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
+ // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
+ if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
+ N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
N0.getOperand(0), N->getFlags());
}
@@ -11131,10 +11998,14 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
case AMDGPUISD::CLAMP:
case AMDGPUISD::FMED3:
case AMDGPUISD::FMAX3:
- case AMDGPUISD::FMIN3: {
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAXIMUM3:
+ case AMDGPUISD::FMINIMUM3: {
// FIXME: Shouldn't treat the generic operations different based these.
// However, we aren't really required to flush the result from
// minnum/maxnum..
@@ -11288,7 +12159,9 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
- case AMDGPU::G_FMAXNUM_IEEE: {
+ case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM: {
if (Subtarget->supportsMinMaxDenormModes() ||
// FIXME: denormalsEnabledForType is broken for dynamic
denormalsEnabledForType(MRI.getType(Reg), MF))
@@ -11302,7 +12175,8 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
return false;
return true;
case AMDGPU::G_INTRINSIC:
- switch (MI->getIntrinsicID()) {
+ case AMDGPU::G_INTRINSIC_CONVERGENT:
+ switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
case Intrinsic::amdgcn_fmul_legacy:
case Intrinsic::amdgcn_fmad_ftz:
case Intrinsic::amdgcn_sqrt:
@@ -11321,7 +12195,6 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
case Intrinsic::amdgcn_div_fmas:
case Intrinsic::amdgcn_div_fixup:
case Intrinsic::amdgcn_fract:
- case Intrinsic::amdgcn_ldexp:
case Intrinsic::amdgcn_cvt_pkrtz:
case Intrinsic::amdgcn_cubeid:
case Intrinsic::amdgcn_cubema:
@@ -11476,6 +12349,8 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
case ISD::FMAXNUM:
case ISD::FMAXNUM_IEEE:
return AMDGPUISD::FMAX3;
+ case ISD::FMAXIMUM:
+ return AMDGPUISD::FMAXIMUM3;
case ISD::SMAX:
return AMDGPUISD::SMAX3;
case ISD::UMAX:
@@ -11483,6 +12358,8 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
case ISD::FMINNUM:
case ISD::FMINNUM_IEEE:
return AMDGPUISD::FMIN3;
+ case ISD::FMINIMUM:
+ return AMDGPUISD::FMINIMUM3;
case ISD::SMIN:
return AMDGPUISD::SMIN3;
case ISD::UMIN:
@@ -11842,7 +12719,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
- case ISD::FMINNUM_IEEE: {
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM: {
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
Vec.getOperand(0), Idx);
SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
@@ -12203,6 +13082,256 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
return Accum;
}
+// Collect the ultimate src of each of the mul node's operands, and confirm
+// each operand is 8 bytes.
+static std::optional<ByteProvider<SDValue>>
+handleMulOperand(const SDValue &MulOperand) {
+ auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
+ if (!Byte0 || Byte0->isConstantZero()) {
+ return std::nullopt;
+ }
+ auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
+ if (Byte1 && !Byte1->isConstantZero()) {
+ return std::nullopt;
+ }
+ return Byte0;
+}
+
+static unsigned addPermMasks(unsigned First, unsigned Second) {
+ unsigned FirstCs = First & 0x0c0c0c0c;
+ unsigned SecondCs = Second & 0x0c0c0c0c;
+ unsigned FirstNoCs = First & ~0x0c0c0c0c;
+ unsigned SecondNoCs = Second & ~0x0c0c0c0c;
+
+ assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
+ assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
+ assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
+ assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
+
+ return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
+}
+
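Worked example for addPermMasks: 0x0c selector bytes mean "constant zero", so the merge keeps every real selector byte and only stays 0x0c where both inputs were 0x0c. The bit logic is copied from the function above for a standalone check.

    #include <cassert>
    #include <cstdint>

    uint32_t addPermMasksModel(uint32_t First, uint32_t Second) {
      uint32_t FirstCs = First & 0x0c0c0c0cu;
      uint32_t SecondCs = Second & 0x0c0c0c0cu;
      uint32_t FirstNoCs = First & ~0x0c0c0c0cu;
      uint32_t SecondNoCs = Second & ~0x0c0c0c0cu;
      return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
    }

    int main() {
      // Byte 3 comes from the first mask, bytes 1 and 0 from the second, and
      // byte 2 stays the zero selector 0x0c.
      assert(addPermMasksModel(0x070c0c0cu, 0x0c0c0100u) == 0x070c0100u);
      return 0;
    }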
+static void placeSources(ByteProvider<SDValue> &Src0,
+ ByteProvider<SDValue> &Src1,
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s,
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s,
+ int Step) {
+
+ assert(Src0.Src.has_value() && Src1.Src.has_value());
+ // Src0s and Src1s are empty, just place arbitrarily.
+ if (Step == 0) {
+ Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c});
+ Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c});
+ return;
+ }
+
+ for (int BPI = 0; BPI < 2; BPI++) {
+ std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
+ if (BPI == 1) {
+ BPP = {Src1, Src0};
+ }
+ unsigned ZeroMask = 0x0c0c0c0c;
+ unsigned FMask = 0xFF << (8 * (3 - Step));
+
+ unsigned FirstMask =
+ BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+ unsigned SecondMask =
+ BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+ // Attempt to find Src vector which contains our SDValue, if so, add our
+ // perm mask to the existing one. If we are unable to find a match for the
+ // first SDValue, attempt to find match for the second.
+ int FirstGroup = -1;
+ for (int I = 0; I < 2; I++) {
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
+ I == 0 ? Src0s : Src1s;
+ auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) {
+ return IterElt.first == *BPP.first.Src;
+ };
+
+ auto Match = llvm::find_if(Srcs, MatchesFirst);
+ if (Match != Srcs.end()) {
+ Match->second = addPermMasks(FirstMask, Match->second);
+ FirstGroup = I;
+ break;
+ }
+ }
+ if (FirstGroup != -1) {
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
+ FirstGroup == 1 ? Src0s : Src1s;
+ auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) {
+ return IterElt.first == *BPP.second.Src;
+ };
+ auto Match = llvm::find_if(Srcs, MatchesSecond);
+ if (Match != Srcs.end()) {
+ Match->second = addPermMasks(SecondMask, Match->second);
+ } else
+ Srcs.push_back({*BPP.second.Src, SecondMask});
+ return;
+ }
+ }
+
+ // If we have made it here, then we could not find a match in Src0s or Src1s
+ // for either Src0 or Src1, so just place them arbitrarily.
+
+ unsigned ZeroMask = 0x0c0c0c0c;
+ unsigned FMask = 0xFF << (8 * (3 - Step));
+
+ Src0s.push_back(
+ {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+ Src1s.push_back(
+ {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+
+ return;
+}
+
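The per-step selector masks that placeSources builds, reproduced as a standalone helper with spot checks: the byte provided at chain step Step lands in selector byte (3 - Step), and every other position starts out as the 0x0c zero selector.

    #include <cassert>
    #include <cstdint>

    uint32_t stepMask(uint32_t SrcOffset, int Step) {
      uint32_t FMask = 0xFFu << (8 * (3 - Step));
      return (SrcOffset << (8 * (3 - Step))) | (0x0c0c0c0cu & ~FMask);
    }

    int main() {
      // Matches the Step == 0 shortcut (SrcOffset << 24) + 0x0c0c0c above.
      assert(stepMask(2, 0) == 0x020c0c0cu);
      assert(stepMask(1, 3) == 0x0c0c0c01u);
      return 0;
    }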
+static SDValue
+resolveSources(SelectionDAG &DAG, SDLoc SL,
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
+ bool IsSigned, bool IsAny) {
+
+ // If we just have one source, just permute it accordingly.
+ if (Srcs.size() == 1) {
+ auto Elt = Srcs.begin();
+ auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32);
+
+ // v_perm will produce the original value.
+ if (Elt->second == 0x3020100)
+ return EltVal;
+
+ return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
+ DAG.getConstant(Elt->second, SL, MVT::i32));
+ }
+
+ auto FirstElt = Srcs.begin();
+ auto SecondElt = std::next(FirstElt);
+
+ SmallVector<SDValue, 2> Perms;
+
+ // If we have multiple sources in the chain, combine them via perms (using
+ // calculated perm mask) and Ors.
+ while (true) {
+ auto FirstMask = FirstElt->second;
+ auto SecondMask = SecondElt->second;
+
+ unsigned FirstCs = FirstMask & 0x0c0c0c0c;
+ unsigned FirstPlusFour = FirstMask | 0x04040404;
+    // 0x0c + 0x04 = 0x10, so ANDing with 0x0F will produce 0x00 for any
+ // original 0x0C.
+ FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
+
+ auto PermMask = addPermMasks(FirstMask, SecondMask);
+ auto FirstVal =
+ DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+ auto SecondVal =
+ DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32);
+
+ Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
+ SecondVal,
+ DAG.getConstant(PermMask, SL, MVT::i32)));
+
+ FirstElt = std::next(SecondElt);
+ if (FirstElt == Srcs.end())
+ break;
+
+ SecondElt = std::next(FirstElt);
+ // If we only have a FirstElt, then just combine that into the cumulative
+ // source node.
+ if (SecondElt == Srcs.end()) {
+ auto EltVal =
+ DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+
+ Perms.push_back(
+ DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
+ DAG.getConstant(FirstElt->second, SL, MVT::i32)));
+ break;
+ }
+ }
+
+ assert(Perms.size() == 1 || Perms.size() == 2);
+ return Perms.size() == 2
+ ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
+ : Perms[0];
+}
+
+static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
+ unsigned ChainLength) {
+ for (auto &[EntryVal, EntryMask] : Srcs) {
+ EntryMask = EntryMask >> ((4 - ChainLength) * 8);
+ auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
+ EntryMask += ZeroMask;
+ }
+}
+
+static bool isMul(const SDValue Op) {
+ auto Opcode = Op.getOpcode();
+
+ return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
+ Opcode == AMDGPUISD::MUL_I24);
+}
+
+static std::optional<bool>
+checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
+ ByteProvider<SDValue> &Src1, const SDValue &S0Op,
+ const SDValue &S1Op, const SelectionDAG &DAG) {
+  // If both ops are i8s (pre legalize-dag), then the signedness semantics
+  // of the dot4 are irrelevant.
+ if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
+ return false;
+
+ auto Known0 = DAG.computeKnownBits(S0Op, 0);
+ bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
+ bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
+ auto Known1 = DAG.computeKnownBits(S1Op, 0);
+ bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
+ bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
+
+ assert(!(S0IsUnsigned && S0IsSigned));
+ assert(!(S1IsUnsigned && S1IsSigned));
+
+ // There are 9 possible permutations of
+ // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
+
+ // In two permutations, the sign bits are known to be the same for both Ops,
+ // so simply return Signed / Unsigned corresponding to the MSB
+
+ if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
+ return S0IsSigned;
+
+ // In another two permutations, the sign bits are known to be opposite. In
+ // this case return std::nullopt to indicate a bad match.
+
+ if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
+ return std::nullopt;
+
+ // In the remaining five permutations, we don't know the value of the sign
+ // bit for at least one Op. Since we have a valid ByteProvider, we know that
+  // the upper bits must be extension bits. Thus, the only ways for the sign
+  // bit to be unknown are if it was sign extended from an unknown value, or if
+  // it was any-extended. In either case, it is correct to use the signed
+  // version of the signedness semantics of dot4.
+
+  // In two such permutations, we know the sign bit is set for
+  // one op, and the other is unknown. It is okay to use the signed version of
+  // dot4.
+ if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
+ ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
+ return true;
+
+  // In one such permutation, we don't know either of the sign bits. It is okay
+  // to use the signed version of dot4.
+ if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
+ return true;
+
+  // In two such permutations, we know the sign bit is unset for
+  // one op, and the other is unknown. Return std::nullopt to indicate a
+  // bad match.
+ if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
+ ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
+ return std::nullopt;
+
+ llvm_unreachable("Fully covered condition");
+}
+
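For context, a sketch of the scalar semantics of the dot4 intrinsics this combine ultimately emits: four byte-wise products accumulated into the third operand. Saturation and the clamp operand of the real instructions are not modeled.

    #include <cstdint>

    int32_t sdot4Ref(uint32_t A, uint32_t B, int32_t C) {
      int32_t Acc = C;
      for (int I = 0; I < 4; ++I)
        Acc += int32_t(int8_t(A >> (8 * I))) * int32_t(int8_t(B >> (8 * I)));
      return Acc;
    }

    uint32_t udot4Ref(uint32_t A, uint32_t B, uint32_t C) {
      uint32_t Acc = C;
      for (int I = 0; I < 4; ++I)
        Acc += uint32_t(uint8_t(A >> (8 * I))) * uint32_t(uint8_t(B >> (8 * I)));
      return Acc;
    }

    int main() {
      // 1*4 + 2*3 + 3*2 + 4*1 + 10 = 30
      return sdot4Ref(0x04030201u, 0x01020304u, 10) == 30 ? 0 : 1;
    }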
SDValue SITargetLowering::performAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12216,14 +13345,146 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
if (SDValue Folded = tryFoldToMad64_32(N, DCI))
return Folded;
}
-
- return SDValue();
}
if (SDValue V = reassociateScalarOps(N, DAG)) {
return V;
}
+ if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
+ (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
+ SDValue TempNode(N, 0);
+ std::optional<bool> IsSigned;
+ SmallVector<std::pair<SDValue, unsigned>, 4> Src0s;
+ SmallVector<std::pair<SDValue, unsigned>, 4> Src1s;
+ SmallVector<SDValue, 4> Src2s;
+
+ // Match the v_dot4 tree, while collecting src nodes.
+ int ChainLength = 0;
+ for (int I = 0; I < 4; I++) {
+ auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
+ if (MulIdx == -1)
+ break;
+ auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
+ if (!Src0)
+ break;
+ auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
+ if (!Src1)
+ break;
+
+ auto IterIsSigned = checkDot4MulSignedness(
+ TempNode->getOperand(MulIdx), *Src0, *Src1,
+ TempNode->getOperand(MulIdx)->getOperand(0),
+ TempNode->getOperand(MulIdx)->getOperand(1), DAG);
+ if (!IterIsSigned)
+ break;
+ if (!IsSigned)
+ IsSigned = *IterIsSigned;
+ if (*IterIsSigned != *IsSigned)
+ break;
+ placeSources(*Src0, *Src1, Src0s, Src1s, I);
+ auto AddIdx = 1 - MulIdx;
+      // Allow the special case where add (add (mul24, 0), mul24) has become
+      // add (mul24, mul24).
+ if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
+ Src2s.push_back(TempNode->getOperand(AddIdx));
+ auto Src0 =
+ handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
+ if (!Src0)
+ break;
+ auto Src1 =
+ handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
+ if (!Src1)
+ break;
+ auto IterIsSigned = checkDot4MulSignedness(
+ TempNode->getOperand(AddIdx), *Src0, *Src1,
+ TempNode->getOperand(AddIdx)->getOperand(0),
+ TempNode->getOperand(AddIdx)->getOperand(1), DAG);
+ if (!IterIsSigned)
+ break;
+ assert(IsSigned);
+ if (*IterIsSigned != *IsSigned)
+ break;
+ placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
+ Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
+ ChainLength = I + 2;
+ break;
+ }
+
+ TempNode = TempNode->getOperand(AddIdx);
+ Src2s.push_back(TempNode);
+ ChainLength = I + 1;
+ if (TempNode->getNumOperands() < 2)
+ break;
+ LHS = TempNode->getOperand(0);
+ RHS = TempNode->getOperand(1);
+ }
+
+ if (ChainLength < 2)
+ return SDValue();
+
+ // Masks were constructed with the assumption that we would find a chain of
+ // length 4. If not, then we need to zero out the MSB bytes (via a perm mask
+ // byte of 0x0c) so they do not affect the dot calculation.
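+ // For example, with ChainLength == 2 only the selector bytes for the two
+ // populated entries keep their values; the selectors for the missing entries
+ // (the most-significant byte lanes) become 0x0c, which makes V_PERM_B32
+ // produce a constant zero byte for those lanes, so they add nothing to the
+ // dot product.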
+ if (ChainLength < 4) {
+ fixMasks(Src0s, ChainLength);
+ fixMasks(Src1s, ChainLength);
+ }
+
+ SDValue Src0, Src1;
+
+ // If we are just using a single source for both, and have permuted the
+ // bytes consistently, we can just use the sources without permuting
+ // (commutation).
+ bool UseOriginalSrc = false;
+ if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
+ Src0s.begin()->second == Src1s.begin()->second &&
+ Src0s.begin()->first.getValueSizeInBits() == 32 &&
+ Src1s.begin()->first.getValueSizeInBits() == 32) {
+ SmallVector<unsigned, 4> SrcBytes;
+ auto Src0Mask = Src0s.begin()->second;
+ SrcBytes.push_back(Src0Mask & 0xFF000000);
+ bool UniqueEntries = true;
+ for (auto I = 1; I < 4; I++) {
+ auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
+
+ if (is_contained(SrcBytes, NextByte)) {
+ UniqueEntries = false;
+ break;
+ }
+ SrcBytes.push_back(NextByte);
+ }
+
+ if (UniqueEntries) {
+ UseOriginalSrc = true;
+ // Must be 32 bits to enter above conditional.
+ assert(Src0s.begin()->first.getValueSizeInBits() == 32);
+ assert(Src1s.begin()->first.getValueSizeInBits() == 32);
+ Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first);
+ Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first);
+ }
+ }
+
+ if (!UseOriginalSrc) {
+ Src0 = resolveSources(DAG, SL, Src0s, false, true);
+ Src1 = resolveSources(DAG, SL, Src1s, false, true);
+ }
+
+ assert(IsSigned);
+ SDValue Src2 =
+ DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
+
+ SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
+ : Intrinsic::amdgcn_udot4,
+ SL, MVT::i64);
+
+ assert(!VT.isVector());
+ auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
+ Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
+
+ return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
+ }
+
if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
return SDValue();
@@ -12295,8 +13556,7 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
if (LHS.getOpcode() == ISD::USUBO_CARRY) {
// sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
- auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
- if (!C || !C->isZero())
+ if (!isNullConstant(LHS.getOperand(1)))
return SDValue();
SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
@@ -12417,6 +13677,41 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performFDivCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::f16 || !Subtarget->has16BitInsts())
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ SDNodeFlags Flags = N->getFlags();
+ SDNodeFlags RHSFlags = RHS->getFlags();
+ if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
+ !RHS->hasOneUse())
+ return SDValue();
+
+ if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
+ bool IsNegative = false;
+ if (CLHS->isExactlyValue(1.0) ||
+ (IsNegative = CLHS->isExactlyValue(-1.0))) {
+ // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
+ // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
+ if (RHS.getOpcode() == ISD::FSQRT) {
+ // TODO: Or in RHS flags, somehow missing from SDNodeFlags
+ SDValue Rsq =
+ DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
+ return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
+ }
+ }
+ }
+
+ return SDValue();
+}
+
SDValue SITargetLowering::performFMACombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12666,7 +13961,7 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
- if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+ if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
return SDValue();
switch (N->getOpcode()) {
case ISD::ADD:
@@ -12680,12 +13975,16 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFAddCombine(N, DCI);
case ISD::FSUB:
return performFSubCombine(N, DCI);
+ case ISD::FDIV:
+ return performFDivCombine(N, DCI);
case ISD::SETCC:
return performSetCCCombine(N, DCI);
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
@@ -12699,6 +13998,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performAndCombine(N, DCI);
case ISD::OR:
return performOrCombine(N, DCI);
+ case ISD::FSHR: {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
+ TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
+ return matchPERM(N, DCI);
+ }
+ break;
+ }
case ISD::XOR:
return performXorCombine(N, DCI);
case ISD::ZERO_EXTEND:
@@ -12793,7 +14100,7 @@ static unsigned SubIdx2Lane(unsigned Idx) {
}
}
-/// Adjust the writemask of MIMG instructions
+/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
unsigned Opcode = Node->getMachineOpcode();
@@ -12811,7 +14118,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
- Node->getConstantOperandVal(LWEIdx))
+ (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
? true
: false;
unsigned TFCLane = 0;
@@ -12943,7 +14250,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
continue;
} else {
SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
- DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+ SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+ if (NewUser != User) {
+ DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
+ DAG.RemoveDeadNode(User);
+ }
}
switch (Idx) {
@@ -13019,7 +14330,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
unsigned Opcode = Node->getMachineOpcode();
- if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
+ if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
!TII->isGather4(Opcode) &&
AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
return adjustWritemask(Node, DAG);
@@ -13106,7 +14417,7 @@ void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
return;
unsigned TFEVal = TFE ? TFE->getImm() : 0;
- unsigned LWEVal = LWE->getImm();
+ unsigned LWEVal = LWE ? LWE->getImm() : 0;
unsigned D16Val = D16 ? D16->getImm() : 0;
if (!TFEVal && !LWEVal)
@@ -13183,7 +14494,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
if (TII->isVOP3(MI.getOpcode())) {
// Make sure constant bus requirements are respected.
@@ -13194,11 +14507,16 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
// use between vgpr and agpr as agpr tuples tend to be big.
if (!MI.getDesc().operands().empty()) {
unsigned Opc = MI.getOpcode();
+ bool HasAGPRs = Info->mayNeedAGPRs();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
- for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
+ int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ for (auto I :
+ {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
if (I == -1)
break;
+ if ((I == Src2Idx) && (HasAGPRs))
+ break;
MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() || !Op.getReg().isVirtual())
continue;
@@ -13216,6 +14534,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MRI.setRegClass(Op.getReg(), NewRC);
}
+ if (!HasAGPRs)
+ return;
+
// Resolve the rest of AV operands to AGPRs.
if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
if (Src2->isReg() && Src2->getReg().isVirtual()) {
@@ -13233,7 +14554,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
return;
}
- if (TII->isMIMG(MI)) {
+ if (TII->isImage(MI)) {
if (!MI.mayStore())
AddIMGInit(MI);
TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
@@ -13377,7 +14698,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
return std::pair(0U, RC);
}
- if (Constraint.startswith("{") && Constraint.endswith("}")) {
+ if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
if (RegName.consume_front("v")) {
RC = &AMDGPU::VGPR_32RegClass;
@@ -13467,7 +14788,7 @@ static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
}
void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
- std::string &Constraint,
+ StringRef Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
if (isImmConstraint(Constraint)) {
@@ -13516,8 +14837,7 @@ bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
return false;
}
-bool SITargetLowering::checkAsmConstraintVal(SDValue Op,
- const std::string &Constraint,
+bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
uint64_t Val) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
@@ -13735,8 +15055,9 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
const MachineRegisterInfo &MRI, unsigned Depth) const {
const MachineInstr *MI = MRI.getVRegDef(R);
switch (MI->getOpcode()) {
- case AMDGPU::G_INTRINSIC: {
- switch (MI->getIntrinsicID()) {
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT: {
+ switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
case Intrinsic::amdgcn_workitem_id_x:
knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
break;
@@ -13801,21 +15122,16 @@ Align SITargetLowering::computeKnownAlignForTargetInstr(
GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
unsigned Depth) const {
const MachineInstr *MI = MRI.getVRegDef(R);
- switch (MI->getOpcode()) {
- case AMDGPU::G_INTRINSIC:
- case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
+ if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
// FIXME: Can this move to generic code? What about the case where the call
// site specifies a lower alignment?
- Intrinsic::ID IID = MI->getIntrinsicID();
+ Intrinsic::ID IID = GI->getIntrinsicID();
LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
if (MaybeAlign RetAlign = Attrs.getRetAlignment())
return *RetAlign;
- return Align(1);
- }
- default:
- return Align(1);
}
+ return Align(1);
}
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 1745c0b9e88e..5bc091d6e84d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -109,6 +109,8 @@ private:
SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -212,13 +214,15 @@ private:
SDValue performSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAddCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFSubCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFDivCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performSetCCCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtF32UByteNCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
- bool isLegalFlatAddressingMode(const AddrMode &AM) const;
+ bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace,
+ uint64_t FlatVariant) const;
bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
unsigned isCFIntrinsic(const SDNode *Intr) const;
@@ -409,6 +413,10 @@ public:
SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
@@ -463,13 +471,11 @@ public:
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
- void LowerAsmOperandForConstraint(SDValue Op,
- std::string &Constraint,
+ void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const override;
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const;
- bool checkAsmConstraintVal(SDValue Op,
- const std::string &Constraint,
+ bool checkAsmConstraintVal(SDValue Op, StringRef Constraint,
uint64_t Val) const;
bool checkAsmConstraintValA(SDValue Op,
uint64_t Val,
@@ -543,6 +549,17 @@ public:
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
+ void allocatePreloadKernArgSGPRs(CCState &CCInfo,
+ SmallVectorImpl<CCValAssign> &ArgLocs,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+
+ void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+
void allocateSystemSGPRs(CCState &CCInfo,
MachineFunction &MF,
SIMachineFunctionInfo &Info,
@@ -572,6 +589,10 @@ public:
getTargetMMOFlags(const Instruction &I) const override;
};
+// Returns true if the argument is a boolean value which is not serialized
+// into memory or an argument and does not require v_cndmask_b32 to be
+// deserialized.
+bool isBoolSGPR(SDValue V);
+
} // End namespace llvm
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
index 50f8ad4433c6..442ae4dd7b34 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -232,7 +232,10 @@ public:
// scheduler it limits the size of the cluster to avoid increasing
// register pressure too much, but this pass runs after register
// allocation so there is no need for that kind of limit.
- !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) {
+ // We also lie about the Offset and OffsetIsScalable parameters,
+ // as they aren't used in the SIInstrInfo implementation.
+ !SII->shouldClusterMemOps(CI.BaseOps, 0, false, BaseOps, 0, false,
+ 2, 2)))) {
// Finish the current clause.
Changed |= emitClause(CI, SII);
CI = ClauseInfo();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index a74b917f82bf..8415a3d77d3b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -292,6 +292,11 @@ public:
VgprVmemTypes[GprNo] = 0;
}
+ void setNonKernelFunctionInitialState() {
+ setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
+ PendingEvents |= WaitEventMaskForInst[VS_CNT];
+ }
+
void print(raw_ostream &);
void dump() { print(dbgs()); }
@@ -364,7 +369,6 @@ private:
const MachineRegisterInfo *MRI = nullptr;
AMDGPU::IsaVersion IV;
- DenseSet<MachineInstr *> TrackedWaitcntSet;
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
MachineLoopInfo *MLI;
@@ -452,7 +456,9 @@ public:
// FLAT instruction.
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
- if (!ST->hasVscnt())
+ // LDS DMA loads are also stores, but on the LDS side. On the VMEM side
+ // these should use VM_CNT.
+ if (!ST->hasVscnt() || SIInstrInfo::mayWriteLDSThroughDMA(Inst))
return VMEM_ACCESS;
if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
// FLAT and SCRATCH instructions may access scratch. Other VMEM
@@ -486,6 +492,9 @@ public:
MachineInstr &OldWaitcntInstr,
AMDGPU::Waitcnt &Wait,
MachineBasicBlock::instr_iterator It) const;
+
+ // Transform a soft waitcnt into a normal one.
+ bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
};
} // end anonymous namespace
@@ -505,7 +514,8 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
RegInterval Result;
- unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST));
+ unsigned Reg = TRI->getEncodingValue(AMDGPU::getMCReg(Op.getReg(), *ST)) &
+ AMDGPU::HWEncoding::REG_IDX_MASK;
if (TRI->isVectorRegister(*MRI, Op.getReg())) {
assert(Reg >= Encoding.VGPR0 && Reg <= Encoding.VGPRL);
@@ -543,14 +553,6 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
}
}
-// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS written
-// can be accessed. A load from LDS to VMEM does not need a wait.
-static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
- return SIInstrInfo::isVALU(MI) &&
- (SIInstrInfo::isMUBUF(MI) || SIInstrInfo::isFLAT(MI)) &&
- MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
-}
-
void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
@@ -590,12 +592,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
AMDGPU::OpName::data1),
CurrScore);
}
- } else if (SIInstrInfo::isAtomicRet(Inst) &&
- Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
- Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
- Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
- Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
- Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
+ } else if (SIInstrInfo::isAtomicRet(Inst) && !SIInstrInfo::isGWS(Inst) &&
Inst.getOpcode() != AMDGPU::DS_APPEND &&
Inst.getOpcode() != AMDGPU::DS_CONSUME &&
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
@@ -683,7 +680,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
}
#endif
- } else {
+ } else /* LGKM_CNT || EXP_CNT || VS_CNT || NUM_INST_CNTS */ {
// Match the score to the destination registers.
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
auto &Op = Inst.getOperand(I);
@@ -694,6 +691,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
if (Interval.first >= NUM_ALL_VGPRS)
continue;
if (updateVMCntOnly(Inst)) {
+ // updateVMCntOnly should only leave us with VGPRs:
+ // MUBUF, MTBUF, MIMG, FlatGlobal, and FlatScratch only have VGPR/AGPR
+ // defs. That's required for a sane index into `VgprVmemTypes` below.
+ assert(TRI->isVectorRegister(*MRI, Op.getReg()));
VmemType V = getVmemType(Inst);
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
VgprVmemTypes[RegNo] |= 1 << V;
@@ -703,7 +704,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
setRegScore(RegNo, T, CurrScore);
}
}
- if (Inst.mayStore() && (TII->isDS(Inst) || mayWriteLDSThroughDMA(Inst))) {
+ if (Inst.mayStore() &&
+ (TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
+ // MUBUF and FLAT LDS DMA operations need a wait on vmcnt before the LDS
+ // they have written can be accessed. A load from LDS to VMEM does not need
+ // a wait.
setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
}
}
@@ -870,6 +874,15 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
return true;
}
+bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
+ unsigned Opcode = Waitcnt->getOpcode();
+ if (!SIInstrInfo::isSoftWaitcnt(Opcode))
+ return false;
+
+ Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode)));
+ return true;
+}
+
/// Combine consecutive waitcnt instructions that precede \p It and follow
/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
/// by previous passes. Currently this pass conservatively assumes that these
@@ -886,86 +899,77 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
if (II.isMetaInstruction())
continue;
- if (II.getOpcode() == AMDGPU::S_WAITCNT) {
- // Conservatively update required wait if this waitcnt was added in an
- // earlier pass. In this case it will not exist in the tracked waitcnt
- // set.
- if (!TrackedWaitcntSet.count(&II)) {
- unsigned IEnc = II.getOperand(0).getImm();
- AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
- Wait = Wait.combined(OldWait);
- }
+ unsigned Opcode = II.getOpcode();
+ bool IsSoft = SIInstrInfo::isSoftWaitcnt(Opcode);
+
+ if (SIInstrInfo::isWaitcnt(Opcode)) {
+ // Update required wait count. If this is a soft waitcnt (= it was added
+ // by an earlier pass), it may be entirely removed.
+ unsigned IEnc = II.getOperand(0).getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
+ if (IsSoft)
+ ScoreBrackets.simplifyWaitcnt(OldWait);
+ Wait = Wait.combined(OldWait);
// Merge consecutive waitcnt of the same type by erasing multiples.
- if (!WaitcntInstr) {
- WaitcntInstr = &II;
- } else {
+ if (WaitcntInstr || (!Wait.hasWaitExceptVsCnt() && IsSoft)) {
II.eraseFromParent();
Modified = true;
- }
+ } else
+ WaitcntInstr = &II;
} else {
- assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+ assert(SIInstrInfo::isWaitcntVsCnt(Opcode));
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
- if (!TrackedWaitcntSet.count(&II)) {
- unsigned OldVSCnt =
- TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
- Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
- }
- if (!WaitcntVsCntInstr) {
- WaitcntVsCntInstr = &II;
- } else {
+ unsigned OldVSCnt =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ if (IsSoft)
+ ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt);
+ Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
+
+ if (WaitcntVsCntInstr || (!Wait.hasWaitVsCnt() && IsSoft)) {
II.eraseFromParent();
Modified = true;
- }
+ } else
+ WaitcntVsCntInstr = &II;
}
}
// Updated encoding of merged waitcnt with the required wait.
if (WaitcntInstr) {
- if (Wait.hasWaitExceptVsCnt()) {
- Modified |=
- updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
- AMDGPU::encodeWaitcnt(IV, Wait));
- ScoreBrackets.applyWaitcnt(Wait);
- Wait.VmCnt = ~0u;
- Wait.LgkmCnt = ~0u;
- Wait.ExpCnt = ~0u;
-
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: " << *WaitcntInstr
- << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It
- << "New Instr: " << *WaitcntInstr << '\n');
+ Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
+ AMDGPU::encodeWaitcnt(IV, Wait));
+ Modified |= promoteSoftWaitCnt(WaitcntInstr);
- } else {
- WaitcntInstr->eraseFromParent();
- Modified = true;
- }
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VmCnt = ~0u;
+ Wait.LgkmCnt = ~0u;
+ Wait.ExpCnt = ~0u;
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs()
+ << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitcntInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntInstr << '\n');
}
if (WaitcntVsCntInstr) {
- if (Wait.hasWaitVsCnt()) {
- assert(ST->hasVscnt());
- Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
- AMDGPU::OpName::simm16, Wait.VsCnt);
- ScoreBrackets.applyWaitcnt(Wait);
- Wait.VsCnt = ~0u;
-
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
- ? dbgs() << "applyPreexistingWaitcnt\n"
- << "New Instr at block end: "
- << *WaitcntVsCntInstr << '\n'
- : dbgs() << "applyPreexistingWaitcnt\n"
- << "Old Instr: " << *It
- << "New Instr: " << *WaitcntVsCntInstr << '\n');
- } else {
- WaitcntVsCntInstr->eraseFromParent();
- Modified = true;
- }
+ Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
+ AMDGPU::OpName::simm16, Wait.VsCnt);
+ Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
+ ScoreBrackets.applyWaitcnt(Wait);
+ Wait.VsCnt = ~0u;
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitcntVsCntInstr
+ << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitcntVsCntInstr << '\n');
}
return Modified;
@@ -1178,7 +1182,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::FLAT_ADDRESS)
continue;
// No need to wait before load from VMEM to LDS.
- if (mayWriteLDSThroughDMA(MI))
+ if (TII->mayWriteLDSThroughDMA(MI))
continue;
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
// VM_CNT is only relevant to vgpr or LDS.
@@ -1315,9 +1319,8 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
// instruction was modified to handle the required wait.
if (Wait.hasWaitExceptVsCnt()) {
unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- auto SWaitInst =
+ [[maybe_unused]] auto SWaitInst =
BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
- TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1328,10 +1331,9 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
if (Wait.hasWaitVsCnt()) {
assert(ST->hasVscnt());
- auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ [[maybe_unused]] auto SWaitInst =
+     BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
.addReg(AMDGPU::SGPR_NULL, RegState::Undef)
.addImm(Wait.VsCnt);
- TrackedWaitcntSet.insert(SWaitInst);
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
@@ -1504,6 +1506,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
break;
case AMDGPU::S_MEMTIME:
case AMDGPU::S_MEMREALTIME:
+ case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_M0:
+ case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM:
+ case AMDGPU::S_BARRIER_LEAVE:
+ case AMDGPU::S_GET_BARRIER_STATE_M0:
+ case AMDGPU::S_GET_BARRIER_STATE_IMM:
ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
break;
}
@@ -1574,9 +1581,9 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
}
static bool isWaitInstr(MachineInstr &Inst) {
- return Inst.getOpcode() == AMDGPU::S_WAITCNT ||
- (Inst.getOpcode() == AMDGPU::S_WAITCNT_VSCNT &&
- Inst.getOperand(0).isReg() &&
+ auto Opcode = Inst.getOpcode();
+ return SIInstrInfo::isWaitcnt(Opcode) ||
+ (SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() &&
Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
}
@@ -1721,26 +1728,25 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// which we want to flush the vmcnt counter, and false otherwise.
bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
WaitcntBrackets &ScoreBrackets) {
- if (PreheadersToFlush.count(&MBB))
- return PreheadersToFlush[&MBB];
-
- auto UpdateCache = [&](bool val) {
- PreheadersToFlush[&MBB] = val;
- return val;
- };
+ auto [Iterator, IsInserted] = PreheadersToFlush.try_emplace(&MBB, false);
+ if (!IsInserted)
+ return Iterator->second;
MachineBasicBlock *Succ = MBB.getSingleSuccessor();
if (!Succ)
- return UpdateCache(false);
+ return false;
MachineLoop *Loop = MLI->getLoopFor(Succ);
if (!Loop)
- return UpdateCache(false);
+ return false;
- if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets))
- return UpdateCache(true);
+ if (Loop->getLoopPreheader() == &MBB &&
+ shouldFlushVmCnt(Loop, ScoreBrackets)) {
+ Iterator->second = true;
+ return true;
+ }
- return UpdateCache(false);
+ return false;
}
bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
@@ -1825,7 +1831,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
ForceEmitWaitcnt[T] = false;
OptNone = MF.getFunction().hasOptNone() ||
- MF.getTarget().getOptLevel() == CodeGenOpt::None;
+ MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
HardwareLimits Limits = {};
Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
@@ -1839,12 +1845,13 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
RegisterEncoding Encoding = {};
- Encoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
+ Encoding.VGPR0 =
+ TRI->getEncodingValue(AMDGPU::VGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
Encoding.VGPRL = Encoding.VGPR0 + NumVGPRsMax - 1;
- Encoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
+ Encoding.SGPR0 =
+ TRI->getEncodingValue(AMDGPU::SGPR0) & AMDGPU::HWEncoding::REG_IDX_MASK;
Encoding.SGPRL = Encoding.SGPR0 + NumSGPRsMax - 1;
- TrackedWaitcntSet.clear();
BlockInfos.clear();
bool Modified = false;
@@ -1862,6 +1869,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
;
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+ auto NonKernelInitialState =
+ std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
+ NonKernelInitialState->setNonKernelFunctionInitialState();
+ BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
+
Modified = true;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index f674777724eb..585a3eb78618 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -40,6 +40,8 @@ class InstSI <dag outs, dag ins, string asm = "",
field bit MTBUF = 0;
field bit SMRD = 0;
field bit MIMG = 0;
+ field bit VIMAGE = 0;
+ field bit VSAMPLE = 0;
field bit EXP = 0;
field bit FLAT = 0;
field bit DS = 0;
@@ -156,6 +158,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that the instruction is never-uniform/divergent
field bit IsNeverUniform = 0;
+ // ds_gws_* instructions.
+ field bit GWS = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -181,15 +186,17 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{18} = MTBUF;
let TSFlags{19} = SMRD;
let TSFlags{20} = MIMG;
- let TSFlags{21} = EXP;
- let TSFlags{22} = FLAT;
- let TSFlags{23} = DS;
+ let TSFlags{21} = VIMAGE;
+ let TSFlags{22} = VSAMPLE;
+ let TSFlags{23} = EXP;
+ let TSFlags{24} = FLAT;
+ let TSFlags{25} = DS;
- let TSFlags{24} = VGPRSpill;
- let TSFlags{25} = SGPRSpill;
+ let TSFlags{26} = VGPRSpill;
+ let TSFlags{27} = SGPRSpill;
- let TSFlags{26} = LDSDIR;
- let TSFlags{27} = VINTERP;
+ let TSFlags{28} = LDSDIR;
+ let TSFlags{29} = VINTERP;
let TSFlags{32} = VM_CNT;
let TSFlags{33} = EXP_CNT;
@@ -239,6 +246,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{61} = IsNeverUniform;
+ let TSFlags{62} = GWS;
+
let SchedRW = [Write32Bit];
let AsmVariantName = AMDGPUAsmVariants.Default;
@@ -299,6 +308,16 @@ def CPolBit {
class VOPDstOperand <RegisterClass rc> : RegisterOperand <rc, "printVOPDst">;
+def VOPDstOperand_t16 : VOPDstOperand <VGPR_16> {
+ let EncoderMethod = "getMachineOpValueT16";
+ let DecoderMethod = "DecodeVGPR_16RegisterClass";
+}
+
+def VOPDstOperand_t16Lo128 : VOPDstOperand <VGPR_16_Lo128> {
+ let EncoderMethod = "getMachineOpValueT16Lo128";
+ let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass";
+}
+
class VINTRPe <bits<2> op> : Enc32 {
bits<8> vdst;
bits<8> vsrc;
@@ -414,6 +433,57 @@ class MIMGe_gfx11 <bits<8> op> : Enc64 {
let Inst{62-58} = ssamp{6-2};
}
+class VIMAGE_VSAMPLE_Common <bits<8> op> : Enc96 {
+ bits<3> dim;
+ bits<1> tfe;
+ bits<1> r128;
+ bit d16;
+ bits<1> a16;
+ bits<4> dmask;
+ bits<8> vdata;
+ bits<9> rsrc;
+ bits<6> cpol;
+ bits<8> vaddr0;
+ bits<8> vaddr1;
+ bits<8> vaddr2;
+ bits<8> vaddr3;
+
+ let Inst{2-0} = dim;
+ let Inst{4} = r128;
+ let Inst{5} = d16;
+ let Inst{6} = a16;
+ let Inst{21-14} = op;
+ let Inst{25-22} = dmask;
+ let Inst{39-32} = vdata;
+ let Inst{49-41} = rsrc;
+ let Inst{51-50} = cpol{4-3}; // scope
+ let Inst{54-52} = cpol{2-0}; // th
+ let Inst{71-64} = vaddr0;
+ let Inst{79-72} = vaddr1;
+ let Inst{87-80} = vaddr2;
+ let Inst{95-88} = vaddr3;
+}
+
+class VSAMPLEe <bits<8> op> : VIMAGE_VSAMPLE_Common<op> {
+ bits<1> unorm;
+ bits<1> lwe;
+ bits<9> samp;
+
+ let Inst{3} = tfe;
+ let Inst{13} = unorm;
+ let Inst{31-26} = 0x39;
+ let Inst{40} = lwe;
+ let Inst{63-55} = samp;
+}
+
+class VIMAGEe <bits<8> op> : VIMAGE_VSAMPLE_Common<op> {
+ bits<8> vaddr4;
+
+ let Inst{31-26} = 0x34;
+ let Inst{55} = tfe;
+ let Inst{63-56} = vaddr4;
+}
+
class EXPe : Enc64 {
bits<4> en;
bits<6> tgt;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 0f954732a5ee..70ef1fff274a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -17,7 +17,9 @@
#include "GCNHazardRecognizer.h"
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -105,9 +107,27 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
}
+static bool canRemat(const MachineInstr &MI) {
+
+ if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) ||
+ SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) ||
+ SIInstrInfo::isSALU(MI))
+ return true;
+
+ if (SIInstrInfo::isSMRD(MI)) {
+ return !MI.memoperands_empty() &&
+ llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) {
+ return MMO->isLoad() && MMO->isInvariant();
+ });
+ }
+
+ return false;
+}
+
bool SIInstrInfo::isReallyTriviallyReMaterializable(
const MachineInstr &MI) const {
- if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) {
+
+ if (canRemat(MI)) {
// Normally VALU use of exec would block the rematerialization, but that
// is OK in this case to have an implicit exec read as all VALU do.
// We really want all of the generic logic for this except for this.
@@ -119,12 +139,13 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(
// There is difference to generic method which does not allow
// rematerialization if there are virtual register uses. We allow this,
// therefore this method includes SOP instructions as well.
- return !MI.hasImplicitDef() &&
- MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
- !MI.mayRaiseFPException();
+ if (!MI.hasImplicitDef() &&
+ MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() &&
+ !MI.mayRaiseFPException())
+ return true;
}
- return false;
+ return TargetInstrInfo::isReallyTriviallyReMaterializable(MI);
}
// Returns true if the scalar result of a VALU instruction depends on exec.
@@ -169,6 +190,48 @@ bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
}
+bool SIInstrInfo::isSafeToSink(MachineInstr &MI,
+ MachineBasicBlock *SuccToSinkTo,
+ MachineCycleInfo *CI) const {
+ // Allow sinking if MI edits lane mask (divergent i1 in sgpr).
+ if (MI.getOpcode() == AMDGPU::SI_IF_BREAK)
+ return true;
+
+ MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
+ // Check if sinking of MI would create temporal divergent use.
+ for (auto Op : MI.uses()) {
+ if (Op.isReg() && Op.getReg().isVirtual() &&
+ RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) {
+ MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg());
+
+ // SgprDef defined inside cycle
+ MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent());
+ if (FromCycle == nullptr)
+ continue;
+
+ MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo);
+ // Check if there is a FromCycle that contains SgprDef's basic block but
+ // does not contain SuccToSinkTo and also has divergent exit condition.
+ while (FromCycle && !FromCycle->contains(ToCycle)) {
+ // After structurize-cfg, there should be exactly one cycle exit.
+ SmallVector<MachineBasicBlock *, 1> ExitBlocks;
+ FromCycle->getExitBlocks(ExitBlocks);
+ assert(ExitBlocks.size() == 1);
+ assert(ExitBlocks[0]->getSinglePredecessor());
+
+ // FromCycle has divergent exit condition.
+ if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) {
+ return false;
+ }
+
+ FromCycle = FromCycle->getParentCycle();
+ }
+ }
+ }
+
+ return true;
+}
+
bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
int64_t &Offset0,
int64_t &Offset1) const {
@@ -421,6 +484,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
Offset = OffsetOp ? OffsetOp->getImm() : 0;
// Get appropriate operand, and compute width accordingly.
DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
+ if (DataOpIdx == -1)
+ return false;
Width = getOpSize(LdSt, DataOpIdx);
return true;
}
@@ -479,8 +544,10 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
}
bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ int64_t Offset1, bool OffsetIsScalable1,
ArrayRef<const MachineOperand *> BaseOps2,
- unsigned NumLoads,
+ int64_t Offset2, bool OffsetIsScalable2,
+ unsigned ClusterSize,
unsigned NumBytes) const {
// If the mem ops (to be clustered) do not have the same base ptr, then they
// should not be clustered
@@ -506,8 +573,8 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
// (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops
// (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops
// (5) LoadSize >= 17: do not cluster
- const unsigned LoadSize = NumBytes / NumLoads;
- const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads;
+ const unsigned LoadSize = NumBytes / ClusterSize;
+ const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize;
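+  // Worked example (illustrative): clustering two 16-byte loads gives
+  // LoadSize = 16 and NumDWORDs = 4 * 2 = 8, which is allowed; a third such
+  // load would give 4 * 3 = 12 > 8 and is rejected, matching rule (4) above.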
return NumDWORDs <= 8;
}
@@ -619,7 +686,7 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
}
RS.enterBasicBlockEnd(MBB);
- RS.backward(MI);
+ RS.backward(std::next(MI));
// Ideally we want to have three registers for a long reg_sequence copy
// to hide 2 waitstates between v_mov_b32 and accvgpr_write.
@@ -680,23 +747,27 @@ static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB,
for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) {
int16_t SubIdx = BaseIndices[Idx];
- Register Reg = RI.getSubReg(DestReg, SubIdx);
+ Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
+ Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
+ assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
unsigned Opcode = AMDGPU::S_MOV_B32;
// Is SGPR aligned? If so try to combine with next.
- Register Src = RI.getSubReg(SrcReg, SubIdx);
- bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0;
- bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0;
+ bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0;
+ bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0;
if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) {
// Can use SGPR64 copy
unsigned Channel = RI.getChannelFromSubReg(SubIdx);
SubIdx = RI.getSubRegFromChannel(Channel, 2);
+ DestSubReg = RI.getSubReg(DestReg, SubIdx);
+ SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
+ assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
Opcode = AMDGPU::S_MOV_B64;
Idx++;
}
- LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx))
- .addReg(RI.getSubReg(SrcReg, SubIdx))
+ LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg)
+ .addReg(SrcSubReg)
.addReg(SrcReg, RegState::Implicit);
if (!FirstMI)
@@ -722,24 +793,32 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc) const {
const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg);
-
- // FIXME: This is hack to resolve copies between 16 bit and 32 bit
- // registers until all patterns are fixed.
- if (Fix16BitCopies &&
- ((RI.getRegSizeInBits(*RC) == 16) ^
- (RI.getRegSizeInBits(*RI.getPhysRegBaseClass(SrcReg)) == 16))) {
- MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
- MCRegister Super = RI.get32BitRegister(RegToFix);
- assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
- RegToFix = Super;
-
- if (DestReg == SrcReg) {
- // Insert empty bundle since ExpandPostRA expects an instruction here.
- BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
- return;
+ unsigned Size = RI.getRegSizeInBits(*RC);
+ const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
+ unsigned SrcSize = RI.getRegSizeInBits(*SrcRC);
+
+ // The rest of copyPhysReg assumes Src and Dst are the same size.
+ // TODO-GFX11_16BIT: Once all true 16-bit instruction patterns are completed,
+ // can we remove Fix16BitCopies and this code block?
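+ // For instance (illustrative): a copy from a 16-bit register into a 32-bit
+ // VGPR is narrowed by substituting the destination's lo16 subregister below,
+ // so both sides of the copy end up 16 bits wide before the rest of
+ // copyPhysReg runs.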
+ if (Fix16BitCopies) {
+ if (((Size == 16) != (SrcSize == 16))) {
+ // Non-VGPR Src and Dst will later be expanded back to 32 bits.
+ assert(ST.hasTrue16BitInsts());
+ MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg;
+ MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16);
+ RegToFix = SubReg;
+
+ if (DestReg == SrcReg) {
+ // Identity copy. Insert empty bundle since ExpandPostRA expects an
+ // instruction here.
+ BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
+ return;
+ }
+ RC = RI.getPhysRegBaseClass(DestReg);
+ Size = RI.getRegSizeInBits(*RC);
+ SrcRC = RI.getPhysRegBaseClass(SrcReg);
+ SrcSize = RI.getRegSizeInBits(*SrcRC);
}
-
- RC = RI.getPhysRegBaseClass(DestReg);
}
if (RC == &AMDGPU::VGPR_32RegClass) {
@@ -863,10 +942,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- const unsigned Size = RI.getRegSizeInBits(*RC);
if (Size == 16) {
- assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
- AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
+ assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) ||
AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
@@ -904,6 +981,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (ST.hasTrue16BitInsts()) {
+ if (IsSGPRSrc) {
+ assert(SrcLow);
+ SrcReg = NewSrcReg;
+ }
+ // Use the smaller instruction encoding if possible.
+ if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) &&
+ (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg)
+ .addReg(SrcReg);
+ } else {
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg)
+ .addImm(0) // src0_modifiers
+ .addReg(SrcReg)
+ .addImm(0); // op_sel
+ }
+ return;
+ }
+
if (IsSGPRSrc && !ST.hasSDWAScalar()) {
if (!DstLow || !SrcLow) {
reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
@@ -930,14 +1026,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg);
if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) {
if (ST.hasMovB64()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
return;
}
- if (ST.hasPackedFP32Ops()) {
+ if (ST.hasPkMovB32()) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg)
.addImm(SISrcMods::OP_SEL_1)
.addReg(SrcReg)
@@ -984,7 +1079,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (ST.hasMovB64()) {
Opcode = AMDGPU::V_MOV_B64_e32;
EltSize = 8;
- } else if (ST.hasPackedFP32Ops()) {
+ } else if (ST.hasPkMovB32()) {
Opcode = AMDGPU::V_PK_MOV_B32;
EltSize = 8;
}
@@ -1012,6 +1107,9 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
SubIdx = SubIndices[Idx];
else
SubIdx = SubIndices[SubIndices.size() - Idx - 1];
+ Register DestSubReg = RI.getSubReg(DestReg, SubIdx);
+ Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
+ assert(DestSubReg && SrcSubReg && "Failed to find subregs!");
bool IsFirstSubreg = Idx == 0;
bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1;
@@ -1019,30 +1117,26 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (Opcode == AMDGPU::INSTRUCTION_LIST_END) {
Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register();
Register ImpUseSuper = SrcReg;
- indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx),
- RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, Overlap,
- ImpDefSuper, ImpUseSuper);
+ indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill,
+ *RS, Overlap, ImpDefSuper, ImpUseSuper);
} else if (Opcode == AMDGPU::V_PK_MOV_B32) {
- Register DstSubReg = RI.getSubReg(DestReg, SubIdx);
- Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx);
MachineInstrBuilder MIB =
- BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg)
- .addImm(SISrcMods::OP_SEL_1)
- .addReg(SrcSubReg)
- .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
- .addReg(SrcSubReg)
- .addImm(0) // op_sel_lo
- .addImm(0) // op_sel_hi
- .addImm(0) // neg_lo
- .addImm(0) // neg_hi
- .addImm(0) // clamp
- .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg)
+ .addImm(SISrcMods::OP_SEL_1)
+ .addReg(SrcSubReg)
+ .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1)
+ .addReg(SrcSubReg)
+ .addImm(0) // op_sel_lo
+ .addImm(0) // op_sel_hi
+ .addImm(0) // neg_lo
+ .addImm(0) // neg_hi
+ .addImm(0) // clamp
+ .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit);
if (IsFirstSubreg)
MIB.addReg(DestReg, RegState::Define | RegState::Implicit);
} else {
MachineInstrBuilder Builder =
- BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx))
- .addReg(RI.getSubReg(SrcReg, SubIdx));
+ BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg);
if (IsFirstSubreg)
Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
@@ -1286,7 +1380,11 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
if (RI.isAGPRClass(DstRC))
return AMDGPU::COPY;
- if (RI.getRegSizeInBits(*DstRC) == 32) {
+ if (RI.getRegSizeInBits(*DstRC) == 16) {
+ // Assume hi bits are unneeded. Only _e64 true16 instructions are legal
+ // before RA.
+ return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64;
+ } else if (RI.getRegSizeInBits(*DstRC) == 32) {
return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32;
} else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) {
return AMDGPU::S_MOV_B64;
@@ -1587,11 +1685,15 @@ static unsigned getAVSpillSaveOpcode(unsigned Size) {
}
}
-static unsigned getWWMRegSpillSaveOpcode(unsigned Size) {
+static unsigned getWWMRegSpillSaveOpcode(unsigned Size,
+ bool IsVectorSuperClass) {
// Currently, there is only 32-bit WWM register spills needed.
if (Size != 4)
llvm_unreachable("unknown wwm register spill size");
+ if (IsVectorSuperClass)
+ return AMDGPU::SI_SPILL_WWM_AV32_SAVE;
+
return AMDGPU::SI_SPILL_WWM_V32_SAVE;
}
@@ -1600,11 +1702,13 @@ static unsigned getVectorRegSpillSaveOpcode(Register Reg,
unsigned Size,
const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &MFI) {
+ bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
+
// Choose the right opcode if spilling a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
- return getWWMRegSpillSaveOpcode(Size);
+ return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass);
- if (TRI.isVectorSuperClass(RC))
+ if (IsVectorSuperClass)
return getAVSpillSaveOpcode(Size);
return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
@@ -1807,11 +1911,15 @@ static unsigned getAVSpillRestoreOpcode(unsigned Size) {
}
}
-static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) {
+static unsigned getWWMRegSpillRestoreOpcode(unsigned Size,
+ bool IsVectorSuperClass) {
// Currently, there is only 32-bit WWM register spills needed.
if (Size != 4)
llvm_unreachable("unknown wwm register spill size");
+ if (IsVectorSuperClass)
+ return AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
+
return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
}
@@ -1819,11 +1927,13 @@ static unsigned
getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
unsigned Size, const SIRegisterInfo &TRI,
const SIMachineFunctionInfo &MFI) {
+ bool IsVectorSuperClass = TRI.isVectorSuperClass(RC);
+
// Choose the right opcode if restoring a WWM register.
if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
- return getWWMRegSpillRestoreOpcode(Size);
+ return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass);
- if (TRI.isVectorSuperClass(RC))
+ if (IsVectorSuperClass)
return getAVSpillRestoreOpcode(Size);
return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
@@ -2006,6 +2116,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
break;
+ case AMDGPU::SI_SPILL_S32_TO_VGPR:
+ MI.setDesc(get(AMDGPU::V_WRITELANE_B32));
+ break;
+
+ case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
+ MI.setDesc(get(AMDGPU::V_READLANE_B32));
+ break;
+
case AMDGPU::V_MOV_B64_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -2024,7 +2142,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
APInt Imm(64, SrcOp.getImm());
APInt Lo(32, Imm.getLoBits(32).getZExtValue());
APInt Hi(32, Imm.getHiBits(32).getZExtValue());
- if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) {
+ if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
.addImm(SISrcMods::OP_SEL_1)
.addImm(Lo.getSExtValue())
@@ -2045,7 +2163,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
} else {
assert(SrcOp.isReg());
- if (ST.hasPackedFP32Ops() &&
+ if (ST.hasPkMovB32() &&
!RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) {
BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst)
.addImm(SISrcMods::OP_SEL_1) // src0_mod
@@ -2275,23 +2393,34 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
Register Reg = MI.getOperand(0).getReg();
Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
+ MachineOperand OpLo = MI.getOperand(1);
+ MachineOperand OpHi = MI.getOperand(2);
// Create a bundle so these instructions won't be re-ordered by the
// post-RA scheduler.
MIBundleBuilder Bundler(MBB, MI);
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg));
- // Add 32-bit offset from this instruction to the start of the
- // constant data.
- Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
- .addReg(RegLo)
- .add(MI.getOperand(1)));
+ // What we want here is an offset from the value returned by s_getpc (which
+ // is the address of the s_add_u32 instruction) to the global variable, but
+ // since the encoding of $symbol starts 4 bytes after the start of the
+ // s_add_u32 instruction, we end up with an offset that is 4 bytes too
+ // small. This requires us to add 4 to the global variable offset in order
+ // to compute the correct address. Similarly for the s_addc_u32 instruction,
+ // the encoding of $symbol starts 12 bytes after the start of the s_add_u32
+ // instruction.
+
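+ // Illustrative layout (PC is the value returned by s_getpc_b64, i.e. the
+ // address of the s_add_u32):
+ //   s_add_u32  RegLo, RegLo, <lo half of $symbol>   ; literal at PC + 4
+ //   s_addc_u32 RegHi, RegHi, <hi half of $symbol>   ; literal at PC + 12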
+ if (OpLo.isGlobal())
+ OpLo.setOffset(OpLo.getOffset() + 4);
+ Bundler.append(
+ BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
+
+ if (OpHi.isGlobal())
+ OpHi.setOffset(OpHi.getOffset() + 12);
+ Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
+ .addReg(RegHi)
+ .add(OpHi));
- MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
- .addReg(RegHi);
- MIB.add(MI.getOperand(2));
-
- Bundler.append(MIB);
finalizeBundle(MBB, Bundler.begin());
MI.eraseFromParent();
@@ -2350,12 +2479,98 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
+void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, Register DestReg,
+ unsigned SubIdx, const MachineInstr &Orig,
+ const TargetRegisterInfo &RI) const {
+
+ // Try shrinking the instruction to remat only the part needed for the
+ // current context.
+ // TODO: Handle more cases.
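+ // For example (illustrative): if the only use at the insertion point reads a
+ // 128-bit subregister of an S_LOAD_DWORDX16_IMM result, the code below remats
+ // an S_LOAD_DWORDX4_IMM instead, with the immediate offset advanced by the
+ // byte offset of that subregister.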
+ unsigned Opcode = Orig.getOpcode();
+ switch (Opcode) {
+ case AMDGPU::S_LOAD_DWORDX16_IMM:
+ case AMDGPU::S_LOAD_DWORDX8_IMM: {
+ if (SubIdx != 0)
+ break;
+
+ if (I == MBB.end())
+ break;
+
+ if (I->isBundled())
+ break;
+
+ // Look for a single use of the register that is also a subreg.
+ Register RegToFind = Orig.getOperand(0).getReg();
+ MachineOperand *UseMO = nullptr;
+ for (auto &CandMO : I->operands()) {
+ if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef())
+ continue;
+ if (UseMO) {
+ UseMO = nullptr;
+ break;
+ }
+ UseMO = &CandMO;
+ }
+ if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister)
+ break;
+
+ unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg());
+ unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg());
+
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet.");
+
+ unsigned NewOpcode = -1;
+ if (SubregSize == 256)
+ NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM;
+ else if (SubregSize == 128)
+ NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM;
+ else
+ break;
+
+ const MCInstrDesc &TID = get(NewOpcode);
+ const TargetRegisterClass *NewRC =
+ RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF));
+ MRI.setRegClass(DestReg, NewRC);
+
+ UseMO->setReg(DestReg);
+ UseMO->setSubReg(AMDGPU::NoSubRegister);
+
+ // Use a smaller load with the desired size, possibly with updated offset.
+ MachineInstr *MI = MF->CloneMachineInstr(&Orig);
+ MI->setDesc(TID);
+ MI->getOperand(0).setReg(DestReg);
+ MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister);
+ if (Offset) {
+ MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset);
+ int64_t FinalOffset = OffsetMO->getImm() + Offset / 8;
+ OffsetMO->setImm(FinalOffset);
+ }
+ SmallVector<MachineMemOperand *> NewMMOs;
+ for (const MachineMemOperand *MemOp : Orig.memoperands())
+ NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(),
+ SubregSize / 8));
+ MI->setMemRefs(*MF, NewMMOs);
+
+ MBB.insert(I, MI);
+ return;
+ }
+
+ default:
+ break;
+ }
+
+ TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI);
+}
+
std::pair<MachineInstr*, MachineInstr*>
SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
if (ST.hasMovB64() &&
- AMDGPU::isLegal64BitDPPControl(
+ AMDGPU::isLegalDPALU_DPPControl(
getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) {
MI.setDesc(get(AMDGPU::V_MOV_B64_dpp));
return std::pair(&MI, nullptr);
@@ -2414,6 +2629,14 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
return std::pair(Split[0], Split[1]);
}
+std::optional<DestSourcePair>
+SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
+ if (MI.getOpcode() == AMDGPU::WWM_COPY)
+ return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
+
+ return std::nullopt;
+}
+
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0,
unsigned Src0OpName,
@@ -2474,6 +2697,9 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
if (CommutedOpcode == -1)
return nullptr;
+ if (Src0Idx > Src1Idx)
+ std::swap(Src0Idx, Src1Idx);
+
assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) ==
static_cast<int>(Src0Idx) &&
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) ==
@@ -2556,14 +2782,8 @@ bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp,
return isIntN(BranchOffsetBits, BrOffset);
}
-MachineBasicBlock *SIInstrInfo::getBranchDestBlock(
- const MachineInstr &MI) const {
- if (MI.getOpcode() == AMDGPU::S_SETPC_B64) {
- // This would be a difficult analysis to perform, but can always be legal so
- // there's no need to analyze it.
- return nullptr;
- }
-
+MachineBasicBlock *
+SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const {
return MI.getOperand(0).getMBB();
}
@@ -2874,7 +3094,6 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
= getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
if (!FBB) {
- Cond[1].isUndef();
MachineInstr *CondBr =
BuildMI(&MBB, DL, get(Opcode))
.addMBB(TBB);
@@ -3079,7 +3298,9 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
case AMDGPU::V_MOV_B64_e64:
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
+ case AMDGPU::S_MOV_B64_IMM_PSEUDO:
case AMDGPU::COPY:
+ case AMDGPU::WWM_COPY:
case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
case AMDGPU::V_ACCVGPR_READ_B32_e64:
case AMDGPU::V_ACCVGPR_MOV_B32:
@@ -3111,11 +3332,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
switch (DefMI.getOpcode()) {
default:
return false;
+ case AMDGPU::V_MOV_B64_e32:
case AMDGPU::S_MOV_B64:
- // TODO: We could fold 64-bit immediates, but this get complicated
- // when there are sub-registers.
- return false;
-
+ case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::S_MOV_B64_IMM_PSEUDO:
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::S_MOV_B32:
case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
@@ -3128,19 +3348,45 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (!ImmOp->isImm())
return false;
+ auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t {
+ int64_t Imm = ImmOp->getImm();
+ switch (UseOp.getSubReg()) {
+ default:
+ return Imm;
+ case AMDGPU::sub0:
+ return Lo_32(Imm);
+ case AMDGPU::sub1:
+ return Hi_32(Imm);
+ case AMDGPU::lo16:
+ return APInt(16, Imm).getSExtValue();
+ case AMDGPU::hi16:
+ return APInt(32, Imm).ashr(16).getSExtValue();
+ case AMDGPU::sub1_lo16:
+ return APInt(16, Hi_32(Imm)).getSExtValue();
+ case AMDGPU::sub1_hi16:
+ return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue();
+ }
+ };
+
+ assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form");
+
unsigned Opc = UseMI.getOpcode();
if (Opc == AMDGPU::COPY) {
+ assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form");
+
Register DstReg = UseMI.getOperand(0).getReg();
- bool Is16Bit = getOpSize(UseMI, 0) == 2;
+ unsigned OpSize = getOpSize(UseMI, 0);
+ bool Is16Bit = OpSize == 2;
+ bool Is64Bit = OpSize == 8;
bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
- unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
- APInt Imm(32, ImmOp->getImm());
-
- if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
- Imm = Imm.ashr(16);
+ unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO
+ : AMDGPU::V_MOV_B32_e32
+ : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO
+ : AMDGPU::S_MOV_B32;
+ APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)));
if (RI.isAGPR(*MRI, DstReg)) {
- if (!isInlineConstant(Imm))
+ if (Is64Bit || !isInlineConstant(Imm))
return false;
NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
}
@@ -3200,14 +3446,32 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
// Multiplied part is the constant: Use v_madmk_{f16, f32}.
- // We should only expect these to be on src0 due to canonicalization.
- if (Src0->isReg() && Src0->getReg() == Reg) {
- if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
+ if ((Src0->isReg() && Src0->getReg() == Reg) ||
+ (Src1->isReg() && Src1->getReg() == Reg)) {
+ MachineOperand *RegSrc =
+ Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1;
+ if (!RegSrc->isReg())
+ return false;
+ if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) &&
+ ST.getConstantBusLimit(Opc) < 2)
return false;
if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
return false;
+ // If src2 is also a literal constant then we have to choose which one to
+ // fold. In general it is better to choose madak so that the other literal
+ // can be materialized in an sgpr instead of a vgpr:
+ // s_mov_b32 s0, literal
+ // v_madak_f32 v0, s0, v0, literal
+ // Instead of:
+ // v_mov_b32 v1, literal
+ // v_madmk_f32 v0, v0, literal, v1
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ !isInlineConstant(Def->getOperand(1)))
+ return false;
+
unsigned NewOpc =
IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
: ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16
@@ -3216,18 +3480,22 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- // We need to swap operands 0 and 1 since madmk constant is at operand 1.
+ // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
+ // would also require restricting their register classes. For now
+ // just bail out.
+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16)
+ return false;
- const int64_t Imm = ImmOp->getImm();
+ const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1);
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
- Register Src1Reg = Src1->getReg();
- unsigned Src1SubReg = Src1->getSubReg();
- Src0->setReg(Src1Reg);
- Src0->setSubReg(Src1SubReg);
- Src0->setIsKill(Src1->isKill());
+ Register SrcReg = RegSrc->getReg();
+ unsigned SrcSubReg = RegSrc->getSubReg();
+ Src0->setReg(SrcReg);
+ Src0->setSubReg(SrcSubReg);
+ Src0->setIsKill(RegSrc->isKill());
if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
@@ -3249,43 +3517,38 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
// Added part is the constant: Use v_madak_{f16, f32}.
if (Src2->isReg() && Src2->getReg() == Reg) {
- // Not allowed to use constant bus for another operand.
- // We can however allow an inline immediate as src0.
- bool Src0Inlined = false;
- if (Src0->isReg()) {
- // Try to inline constant if possible.
- // If the Def moves immediate and the use is single
- // We are saving VGPR here.
- MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
- if (Def && Def->isMoveImmediate() &&
- isInlineConstant(Def->getOperand(1)) &&
- MRI->hasOneUse(Src0->getReg())) {
- Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- Src0Inlined = true;
- } else if ((Src0->getReg().isPhysical() &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) ||
- (Src0->getReg().isVirtual() &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
- return false;
+ if (ST.getConstantBusLimit(Opc) < 2) {
+ // Not allowed to use constant bus for another operand.
+ // We can however allow an inline immediate as src0.
+ bool Src0Inlined = false;
+ if (Src0->isReg()) {
+ // Try to inline the constant if possible.
+ // If the Def is a move-immediate and this is its only use,
+ // we save a VGPR here.
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ isInlineConstant(Def->getOperand(1)) &&
+ MRI->hasOneUse(Src0->getReg())) {
+ Src0->ChangeToImmediate(Def->getOperand(1).getImm());
+ Src0Inlined = true;
+ } else if (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRReg(*MRI, Src0->getReg())) {
+ return false;
+ }
// VGPR is okay as Src0 - fallthrough
- }
+ }
- if (Src1->isReg() && !Src0Inlined ) {
- // We have one slot for inlinable constant so far - try to fill it
- MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
- if (Def && Def->isMoveImmediate() &&
- isInlineConstant(Def->getOperand(1)) &&
- MRI->hasOneUse(Src1->getReg()) &&
- commuteInstruction(UseMI)) {
+ if (Src1->isReg() && !Src0Inlined) {
+ // We have one slot for inlinable constant so far - try to fill it
+ MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg());
+ if (Def && Def->isMoveImmediate() &&
+ isInlineConstant(Def->getOperand(1)) &&
+ MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI))
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- } else if ((Src1->getReg().isPhysical() &&
- RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) ||
- (Src1->getReg().isVirtual() &&
- RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
- return false;
+ else if (RI.isSGPRReg(*MRI, Src1->getReg()))
+ return false;
// VGPR is okay as Src1 - fallthrough
+ }
}
unsigned NewOpc =
@@ -3296,7 +3559,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (pseudoToMCOpcode(NewOpc) == -1)
return false;
- const int64_t Imm = ImmOp->getImm();
+ // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite
+ // would also require restricting their register classes. For now
+ // just bail out.
+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16)
+ return false;
// FIXME: This would be a lot easier if we could return a new instruction
// instead of having to modify in place.
@@ -3308,7 +3575,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
// ChangingToImmediate adds Src2 back to the instruction.
- Src2->ChangeToImmediate(Imm);
+ Src2->ChangeToImmediate(getImmFor(*Src2));
// These come before src2.
removeModOperands(UseMI);
@@ -3403,19 +3670,30 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (isMUBUF(MIb) || isMTBUF(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(MIb) && !isSMRD(MIb);
+ if (isFLAT(MIb))
+ return isFLATScratch(MIb);
+
+ return !isSMRD(MIb);
}
if (isSMRD(MIa)) {
if (isSMRD(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
+ if (isFLAT(MIb))
+ return isFLATScratch(MIb);
+
+ return !isMUBUF(MIb) && !isMTBUF(MIb);
}
if (isFLAT(MIa)) {
- if (isFLAT(MIb))
+ if (isFLAT(MIb)) {
+ if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) ||
+ (isFLATGlobal(MIa) && isFLATScratch(MIb)))
+ return true;
+
return checkInstOffsetsDoNotOverlap(MIa, MIb);
+ }
return false;
}
@@ -3722,13 +4000,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
}
bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
- return Opcode == AMDGPU::DS_ORDERED_COUNT ||
- Opcode == AMDGPU::DS_GWS_INIT ||
- Opcode == AMDGPU::DS_GWS_SEMA_V ||
- Opcode == AMDGPU::DS_GWS_SEMA_BR ||
- Opcode == AMDGPU::DS_GWS_SEMA_P ||
- Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL ||
- Opcode == AMDGPU::DS_GWS_BARRIER;
+ return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode);
}
bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
@@ -3773,7 +4045,9 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
// However, executing them with EXEC = 0 causes them to operate on undefined
// data, which we avoid by returning true here.
if (Opcode == AMDGPU::V_READFIRSTLANE_B32 ||
- Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32)
+ Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 ||
+ Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR ||
+ Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR)
return true;
return false;
@@ -3827,9 +4101,7 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
uint8_t OperandType) const {
assert(!MO.isReg() && "isInlineConstant called on register operand!");
- if (!MO.isImm() ||
- OperandType < AMDGPU::OPERAND_SRC_FIRST ||
- OperandType > AMDGPU::OPERAND_SRC_LAST)
+ if (!MO.isImm())
return false;
// MachineOperand provides no way to tell the true operand size, since it only
@@ -3849,7 +4121,8 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP32: {
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32: {
int32_t Trunc = static_cast<int32_t>(Imm);
return AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
}
@@ -3877,12 +4150,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
- // This suffers the same problem as the scalar 16-bit cases.
- return AMDGPU::isInlinableIntLiteralV216(Imm);
+ return (isInt<16>(Imm) || isUInt<16>(Imm)) &&
+ AMDGPU::isInlinableIntLiteral((int16_t)Imm);
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
// where 16-bit instructions are not legal.
@@ -3895,17 +4171,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return false;
}
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
- uint32_t Trunc = static_cast<uint32_t>(Imm);
- return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
- }
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16:
return false;
+ case AMDGPU::OPERAND_INPUT_MODS:
+ case MCOI::OPERAND_IMMEDIATE:
+ // Always embedded in the instruction for free.
+ return true;
+ case MCOI::OPERAND_UNKNOWN:
+ case MCOI::OPERAND_REGISTER:
+ case MCOI::OPERAND_PCREL:
+ case MCOI::OPERAND_GENERIC_0:
+ case MCOI::OPERAND_GENERIC_1:
+ case MCOI::OPERAND_GENERIC_2:
+ case MCOI::OPERAND_GENERIC_3:
+ case MCOI::OPERAND_GENERIC_4:
+ case MCOI::OPERAND_GENERIC_5:
+ // Just ignore anything else.
+ return true;
default:
- llvm_unreachable("invalid bitwidth");
+ llvm_unreachable("invalid operand type");
}
}
@@ -4154,7 +4439,9 @@ static bool shouldReadExec(const MachineInstr &MI) {
if (SIInstrInfo::isVALU(MI)) {
switch (MI.getOpcode()) {
case AMDGPU::V_READLANE_B32:
+ case AMDGPU::SI_RESTORE_S32_FROM_VGPR:
case AMDGPU::V_WRITELANE_B32:
+ case AMDGPU::SI_SPILL_S32_TO_VGPR:
return false;
}
@@ -4231,8 +4518,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return true;
}
- if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
- ErrInfo = "missing memory operand from MIMG instruction.";
+ if (isImage(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
+ ErrInfo = "missing memory operand from image instruction.";
return false;
}
@@ -4276,6 +4563,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
break;
}
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
+ if (!MI.getOperand(i).isImm() || !isInlineConstant(MI, i)) {
+ ErrInfo = "Expected inline constant for operand.";
+ return false;
+ }
+ break;
case MCOI::OPERAND_IMMEDIATE:
case AMDGPU::OPERAND_KIMM32:
// Check if this operand is an immediate.
@@ -4418,8 +4711,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- // Verify MIMG
- if (isMIMG(MI.getOpcode()) && !MI.mayStore()) {
+ // Verify MIMG / VIMAGE / VSAMPLE
+ if (isImage(MI.getOpcode()) && !MI.mayStore()) {
// Ensure that the return type used is large enough for all the options
// being used TFE/LWE require an extra result register.
const MachineOperand *DMask = getNamedOperand(MI, AMDGPU::OpName::dmask);
@@ -4683,12 +4976,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- if (isMIMG(MI)) {
+ if (isImage(MI)) {
const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
if (DimOp) {
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opcode,
AMDGPU::OpName::vaddr0);
- int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::srsrc);
+ int RSrcOpName =
+ isMIMG(MI) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
+ int RsrcIdx = AMDGPU::getNamedOperandIdx(Opcode, RSrcOpName);
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opcode);
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
@@ -4709,16 +5004,17 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
IsA16 = A16->getImm() != 0;
}
- bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
+ bool IsNSA = RsrcIdx - VAddr0Idx > 1;
unsigned AddrWords =
AMDGPU::getAddrSizeMIMGOp(BaseOpcode, Dim, IsA16, ST.hasG16());
unsigned VAddrWords;
if (IsNSA) {
- VAddrWords = SRsrcIdx - VAddr0Idx;
- if (ST.hasPartialNSAEncoding() && AddrWords > ST.getNSAMaxSize()) {
- unsigned LastVAddrIdx = SRsrcIdx - 1;
+ VAddrWords = RsrcIdx - VAddr0Idx;
+ if (ST.hasPartialNSAEncoding() &&
+ AddrWords > ST.getNSAMaxSize(isVSAMPLE(MI))) {
+ unsigned LastVAddrIdx = RsrcIdx - 1;
VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
}
} else {
@@ -4779,20 +5075,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
- int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst);
-
if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO &&
- ((DstIdx >= 0 &&
- (Desc.operands()[DstIdx].RegClass == AMDGPU::VReg_64RegClassID ||
- Desc.operands()[DstIdx].RegClass ==
- AMDGPU::VReg_64_Align2RegClassID)) ||
- ((Src0Idx >= 0 &&
- (Desc.operands()[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID ||
- Desc.operands()[Src0Idx].RegClass ==
- AMDGPU::VReg_64_Align2RegClassID)))) &&
- !AMDGPU::isLegal64BitDPPControl(DC)) {
+ !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) {
ErrInfo = "Invalid dpp_ctrl value: "
- "64 bit dpp only support row_newbcast";
+ "DP ALU dpp only support row_newbcast";
return false;
}
}
@@ -4884,6 +5170,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return true;
}
+// It is more readable to list mapped opcodes on the same line.
+// clang-format off
+
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
default: return AMDGPU::INSTRUCTION_LIST_END;
@@ -4960,16 +5249,91 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
+ case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64;
+ case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64;
+ case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64;
+ case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64;
+ case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
+ case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64;
+ case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64;
+ case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64;
+ case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64;
+ case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64;
+ case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64;
+ case AMDGPU::S_CEIL_F16: return AMDGPU::V_CEIL_F16_t16_e64;
+ case AMDGPU::S_FLOOR_F16: return AMDGPU::V_FLOOR_F16_t16_e64;
+ case AMDGPU::S_TRUNC_F16: return AMDGPU::V_TRUNC_F16_t16_e64;
+ case AMDGPU::S_RNDNE_F16: return AMDGPU::V_RNDNE_F16_t16_e64;
+ case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64;
+ case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64;
+ case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64;
+ case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64;
+ case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64;
+ case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64;
+ case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64;
+ case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64;
+ case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64;
+ case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64;
+ case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64;
+ case AMDGPU::S_MINIMUM_F16: return AMDGPU::V_MINIMUM_F16_e64;
+ case AMDGPU::S_MAXIMUM_F16: return AMDGPU::V_MAXIMUM_F16_e64;
+ case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
+ case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
+ case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
+ case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64;
+ case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
+ case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
+ case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
+ case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64;
+ case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64;
+ case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64;
+ case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64;
+ case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64;
+ case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64;
+ case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64;
+ case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64;
+ case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64;
+ case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64;
+ case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64;
+ case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64;
+ case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64;
+ case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64;
+ case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64;
+ case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64;
+ case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64;
+ case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64;
+ case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64;
+ case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64;
+ case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64;
+ case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64;
+ case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64;
+ case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64;
+ case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64;
+ case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64;
+ case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64;
+ case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64;
+ case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_t16_e64;
+ case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64;
+ case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_t16_e64;
+ case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64;
+ case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_t16_e64;
+ case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64;
+ case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_t16_e64;
+ case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64;
+ case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_t16_e64;
}
llvm_unreachable(
"Unexpected scalar opcode without corresponding vector one!");
}
+// clang-format on
+
void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, Register Reg,
- bool IsSCCLive) const {
+ bool IsSCCLive,
+ SlotIndexes *Indexes) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
bool IsWave32 = ST.isWave32();
@@ -4979,23 +5343,34 @@ void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
// the single instruction S_OR_SAVEEXEC that clobbers SCC.
unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill);
- BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+ auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
+ .addReg(Exec, RegState::Kill);
+ auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+ if (Indexes) {
+ Indexes->insertMachineInstrInMaps(*StoreExecMI);
+ Indexes->insertMachineInstrInMaps(*FlipExecMI);
+ }
} else {
const unsigned OrSaveExec =
IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
auto SaveExec =
BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+ if (Indexes)
+ Indexes->insertMachineInstrInMaps(*SaveExec);
}
}
void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, Register Reg) const {
+ const DebugLoc &DL, Register Reg,
+ SlotIndexes *Indexes) const {
unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+ auto ExecRestoreMI =
+ BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+ if (Indexes)
+ Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
}
static const TargetRegisterClass *
@@ -5102,13 +5477,10 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
MO.ChangeToRegister(Reg, false);
}
-unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
- MachineRegisterInfo &MRI,
- MachineOperand &SuperReg,
- const TargetRegisterClass *SuperRC,
- unsigned SubIdx,
- const TargetRegisterClass *SubRC)
- const {
+unsigned SIInstrInfo::buildExtractSubReg(
+ MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
+ const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
+ unsigned SubIdx, const TargetRegisterClass *SubRC) const {
MachineBasicBlock *MBB = MI->getParent();
DebugLoc DL = MI->getDebugLoc();
Register SubReg = MRI.createVirtualRegister(SubRC);
@@ -5135,12 +5507,9 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
}
MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
- MachineBasicBlock::iterator MII,
- MachineRegisterInfo &MRI,
- MachineOperand &Op,
- const TargetRegisterClass *SuperRC,
- unsigned SubIdx,
- const TargetRegisterClass *SubRC) const {
+ MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI,
+ const MachineOperand &Op, const TargetRegisterClass *SuperRC,
+ unsigned SubIdx, const TargetRegisterClass *SubRC) const {
if (Op.isImm()) {
if (SubIdx == AMDGPU::sub0)
return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm()));
@@ -5235,9 +5604,8 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return false;
SGPRsUsed.insert(SGPR);
}
- } else if (InstDesc.operands()[i].OperandType == AMDGPU::OPERAND_KIMM32 ||
- (AMDGPU::isSISrcOperand(InstDesc, i) &&
- !isInlineConstant(Op, InstDesc.operands()[i]))) {
+ } else if (AMDGPU::isSISrcOperand(InstDesc, i) &&
+ !isInlineConstant(Op, InstDesc.operands()[i])) {
if (!LiteralLimit--)
return false;
if (--ConstantBusLimit <= 0)
@@ -5285,6 +5653,27 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
return true;
}
+ if (MO->isImm()) {
+ uint64_t Imm = MO->getImm();
+ bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64;
+ bool Is64BitOp = Is64BitFPOp ||
+ OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 ||
+ OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 ||
+ OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32;
+ if (Is64BitOp &&
+ !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) {
+ if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp))
+ return false;
+
+ // FIXME: We can use sign extended 64-bit literals, but only for signed
+ // operands. At the moment we do not know if an operand is signed.
+ // Such an operand will be encoded as its low 32 bits and then either
+ // correctly sign extended or incorrectly zero extended by HW.
+ if (!Is64BitFPOp && (int32_t)Imm < 0)
+ return false;
+ }
+ }
+
// Handle non-register types that are treated like immediates.
assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal());
@@ -5342,6 +5731,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg()))
legalizeOpWithMove(MI, Src1Idx);
+ // Special case: V_FMAC_F32 and V_FMAC_F16 have src2.
+ if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) {
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg()))
+ legalizeOpWithMove(MI, Src2Idx);
+ }
+
// VOP2 src0 instructions support all operand types, so we don't need to check
// their legality. If src1 is already legal, we don't need to do anything.
if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1))
@@ -5491,6 +5887,11 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
// legalize it.
legalizeOpWithMove(MI, Idx);
}
+
+ // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst.
+ if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
+ !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg()))
+ legalizeOpWithMove(MI, VOP3Idx[2]);
}
Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
@@ -5862,6 +6263,17 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ // Save SCC. The waterfall loop may overwrite SCC.
+ Register SaveSCCReg;
+ bool SCCNotDead = (MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI, 30) !=
+ MachineBasicBlock::LQR_Dead);
+ if (SCCNotDead) {
+ SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg)
+ .addImm(1)
+ .addImm(0);
+ }
+
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
// Save the EXEC mask
@@ -5917,8 +6329,15 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
- // Restore the EXEC mask
MachineBasicBlock::iterator First = RemainderBB->begin();
+ // Restore SCC
+ if (SCCNotDead) {
+ BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32))
+ .addReg(SaveSCCReg, RegState::Kill)
+ .addImm(0);
+ }
+
+ // Restore the EXEC mask
BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec);
return BodyBB;
}
@@ -6103,18 +6522,33 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
return CreatedBB;
}
- // Legalize MIMG and MUBUF/MTBUF for shaders.
+ // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM
+ if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 ||
+ MI.getOpcode() == AMDGPU::S_QUADMASK_B32 ||
+ MI.getOpcode() == AMDGPU::S_QUADMASK_B64 ||
+ MI.getOpcode() == AMDGPU::S_WQM_B32 ||
+ MI.getOpcode() == AMDGPU::S_WQM_B64) {
+ MachineOperand &Src = MI.getOperand(1);
+ if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg())))
+ Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI));
+ return CreatedBB;
+ }
+
+ // Legalize MIMG/VIMAGE/VSAMPLE and MUBUF/MTBUF for shaders.
//
// Shaders only generate MUBUF/MTBUF instructions via intrinsics or via
// scratch memory access. In both cases, the legalization never involves
// conversion to the addr64 form.
- if (isMIMG(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
- (isMUBUF(MI) || isMTBUF(MI)))) {
- MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
+ if (isImage(MI) || (AMDGPU::isGraphics(MF.getFunction().getCallingConv()) &&
+ (isMUBUF(MI) || isMTBUF(MI)))) {
+ int RSrcOpName = (isVIMAGE(MI) || isVSAMPLE(MI)) ? AMDGPU::OpName::rsrc
+ : AMDGPU::OpName::srsrc;
+ MachineOperand *SRsrc = getNamedOperand(MI, RSrcOpName);
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
- MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
+ int SampOpName = isMIMG(MI) ? AMDGPU::OpName::ssamp : AMDGPU::OpName::samp;
+ MachineOperand *SSamp = getNamedOperand(MI, SampOpName);
if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
@@ -6149,13 +6583,26 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
}
}
+ // Legalize s_sleep_var.
+ if (MI.getOpcode() == AMDGPU::S_SLEEP_VAR) {
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ int Src0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
+ .add(Src0);
+ Src0.ChangeToRegister(Reg, false);
+ return nullptr;
+ }
+
// Legalize MUBUF instructions.
bool isSoffsetLegal = true;
int SoffsetIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
if (SoffsetIdx != -1) {
MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
- if (Soffset->isReg() &&
+ if (Soffset->isReg() && Soffset->getReg().isVirtual() &&
!RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
isSoffsetLegal = false;
}
@@ -6370,10 +6817,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
default:
break;
case AMDGPU::S_ADD_U64_PSEUDO:
+ NewOpcode = AMDGPU::V_ADD_U64_PSEUDO;
+ break;
case AMDGPU::S_SUB_U64_PSEUDO:
- splitScalar64BitAddSub(Worklist, Inst, MDT);
- Inst.eraseFromParent();
- return;
+ NewOpcode = AMDGPU::V_SUB_U64_PSEUDO;
+ break;
case AMDGPU::S_ADD_I32:
case AMDGPU::S_SUB_I32: {
// FIXME: The u32 versions currently selected use the carry.
@@ -6469,7 +6917,9 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
break;
case AMDGPU::S_LSHL_B64:
if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
+ NewOpcode = ST.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? AMDGPU::V_LSHLREV_B64_pseudo_e64
+ : AMDGPU::V_LSHLREV_B64_e64;
swapOperands(Inst);
}
break;
@@ -6623,21 +7073,98 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
case AMDGPU::S_CMP_LT_U32:
case AMDGPU::S_CMP_LE_U32:
case AMDGPU::S_CMP_EQ_U64:
- case AMDGPU::S_CMP_LG_U64: {
- const MCInstrDesc &NewDesc = get(NewOpcode);
+ case AMDGPU::S_CMP_LG_U64:
+ case AMDGPU::S_CMP_LT_F32:
+ case AMDGPU::S_CMP_EQ_F32:
+ case AMDGPU::S_CMP_LE_F32:
+ case AMDGPU::S_CMP_GT_F32:
+ case AMDGPU::S_CMP_LG_F32:
+ case AMDGPU::S_CMP_GE_F32:
+ case AMDGPU::S_CMP_O_F32:
+ case AMDGPU::S_CMP_U_F32:
+ case AMDGPU::S_CMP_NGE_F32:
+ case AMDGPU::S_CMP_NLG_F32:
+ case AMDGPU::S_CMP_NGT_F32:
+ case AMDGPU::S_CMP_NLE_F32:
+ case AMDGPU::S_CMP_NEQ_F32:
+ case AMDGPU::S_CMP_NLT_F32:
+ case AMDGPU::S_CMP_LT_F16:
+ case AMDGPU::S_CMP_EQ_F16:
+ case AMDGPU::S_CMP_LE_F16:
+ case AMDGPU::S_CMP_GT_F16:
+ case AMDGPU::S_CMP_LG_F16:
+ case AMDGPU::S_CMP_GE_F16:
+ case AMDGPU::S_CMP_O_F16:
+ case AMDGPU::S_CMP_U_F16:
+ case AMDGPU::S_CMP_NGE_F16:
+ case AMDGPU::S_CMP_NLG_F16:
+ case AMDGPU::S_CMP_NGT_F16:
+ case AMDGPU::S_CMP_NLE_F16:
+ case AMDGPU::S_CMP_NEQ_F16:
+ case AMDGPU::S_CMP_NLT_F16: {
Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
- MachineInstr *NewInstr =
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
- .add(Inst.getOperand(0))
- .add(Inst.getOperand(1));
+ auto NewInstr =
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg)
+ .setMIFlags(Inst.getFlags());
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src0_modifiers) >= 0) {
+ NewInstr
+ .addImm(0) // src0_modifiers
+ .add(Inst.getOperand(0)) // src0
+ .addImm(0) // src1_modifiers
+ .add(Inst.getOperand(1)) // src1
+ .addImm(0); // clamp
+ } else {
+ NewInstr
+ .add(Inst.getOperand(0))
+ .add(Inst.getOperand(1));
+ }
legalizeOperands(*NewInstr, MDT);
int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
MachineOperand SCCOp = Inst.getOperand(SCCIdx);
addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
Inst.eraseFromParent();
+ return;
+ }
+ case AMDGPU::S_CVT_HI_F32_F16: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+ .addImm(16)
+ .add(Inst.getOperand(1));
+ BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .addImm(0) // src0_modifiers
+ .addReg(TmpReg)
+ .addImm(0) // clamp
+ .addImm(0); // omod
+
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+ addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+ Inst.eraseFromParent();
+ return;
}
+ case AMDGPU::S_MINIMUM_F32:
+ case AMDGPU::S_MAXIMUM_F32:
+ case AMDGPU::S_MINIMUM_F16:
+ case AMDGPU::S_MAXIMUM_F16: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MachineInstr *NewInstr = BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+ .addImm(0) // src0_modifiers
+ .add(Inst.getOperand(1))
+ .addImm(0) // src1_modifiers
+ .add(Inst.getOperand(2))
+ .addImm(0) // clamp
+ .addImm(0); // omod
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
+
+ legalizeOperands(*NewInstr, MDT);
+ addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
+ Inst.eraseFromParent();
return;
}
+ }
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
// We cannot move this instruction to the VALU, so we should try to
@@ -6681,8 +7208,61 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
// Use the new VALU Opcode.
auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
.setMIFlags(Inst.getFlags());
- for (const MachineOperand &Op : Inst.explicit_operands())
- NewInstr->addOperand(Op);
+ if (isVOP3(NewOpcode) && !isVOP3(Opcode)) {
+ // Intersperse VOP3 modifiers among the SALU operands.
+ NewInstr->addOperand(Inst.getOperand(0));
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src0_modifiers) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0) >= 0)
+ NewInstr->addOperand(Inst.getOperand(1));
+
+ if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+ // We are converting these to a BFE, so we need to add the missing
+ // operands for the size and offset.
+ unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+ NewInstr.addImm(0);
+ NewInstr.addImm(Size);
+ } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+ // The VALU version adds the second operand to the result, so insert an
+ // extra 0 operand.
+ NewInstr.addImm(0);
+ } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+ const MachineOperand &OffsetWidthOp = Inst.getOperand(2);
+ // If we need to move this to VGPRs, we need to unpack the second
+ // operand back into the 2 separate ones for bit offset and width.
+ assert(OffsetWidthOp.isImm() &&
+ "Scalar BFE is only implemented for constant width and offset");
+ uint32_t Imm = OffsetWidthOp.getImm();
+
+ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+ uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+ NewInstr.addImm(Offset);
+ NewInstr.addImm(BitWidth);
+ } else {
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src1_modifiers) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0)
+ NewInstr->addOperand(Inst.getOperand(2));
+ if (AMDGPU::getNamedOperandIdx(NewOpcode,
+ AMDGPU::OpName::src2_modifiers) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0)
+ NewInstr->addOperand(Inst.getOperand(3));
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0)
+ NewInstr.addImm(0);
+ if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0)
+ NewInstr.addImm(0);
+ }
+ } else {
+ // Just copy the SALU operands.
+ for (const MachineOperand &Op : Inst.explicit_operands())
+ NewInstr->addOperand(Op);
+ }
+
// Remove any references to SCC. Vector instructions can't read from it, and
// We're just about to add the implicit use / defs of VCC, and we don't want
// both.
@@ -6706,30 +7286,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
NewDstReg = MRI.createVirtualRegister(NewDstRC);
MRI.replaceRegWith(DstReg, NewDstReg);
}
- if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
- // We are converting these to a BFE, so we need to add the missing
- // operands for the size and offset.
- unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
- NewInstr.addImm(0);
- NewInstr.addImm(Size);
- } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
- // The VALU version adds the second operand to the result, so insert an
- // extra 0 operand.
- NewInstr.addImm(0);
- }
- if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
- const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
- // If we need to move this to VGPRs, we need to unpack the second operand
- // back into the 2 separate ones for bit offset and width.
- assert(OffsetWidthOp.isImm() &&
- "Scalar BFE is only implemented for constant width and offset");
- uint32_t Imm = OffsetWidthOp.getImm();
- uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
- uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
- NewInstr->removeOperand(2);
- NewInstr.addImm(Offset);
- NewInstr.addImm(BitWidth);
- }
fixImplicitOperands(*NewInstr);
// Legalize the operands
legalizeOperands(*NewInstr, MDT);
@@ -6787,27 +7343,27 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineOperand &Src1 = Inst.getOperand(2);
MachineOperand &Cond = Inst.getOperand(3);
- Register SCCSource = Cond.getReg();
- bool IsSCC = (SCCSource == AMDGPU::SCC);
+ Register CondReg = Cond.getReg();
+ bool IsSCC = (CondReg == AMDGPU::SCC);
// If this is a trivial select where the condition is effectively not SCC
- // (SCCSource is a source of copy to SCC), then the select is semantically
- // equivalent to copying SCCSource. Hence, there is no need to create
+ // (CondReg is a source of copy to SCC), then the select is semantically
+ // equivalent to copying CondReg. Hence, there is no need to create
// V_CNDMASK, we can just use that and bail out.
if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() &&
(Src1.getImm() == 0)) {
- MRI.replaceRegWith(Dest.getReg(), SCCSource);
+ MRI.replaceRegWith(Dest.getReg(), CondReg);
return;
}
- const TargetRegisterClass *TC =
- RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-
- Register CopySCC = MRI.createVirtualRegister(TC);
-
+ Register NewCondReg = CondReg;
if (IsSCC) {
+ const TargetRegisterClass *TC =
+ RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ NewCondReg = MRI.createVirtualRegister(TC);
+
// Now look for the closest SCC def if it is a copy
- // replacing the SCCSource with the COPY source register
+ // replacing the CondReg with the COPY source register
bool CopyFound = false;
for (MachineInstr &CandI :
make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
@@ -6815,7 +7371,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
-1) {
if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
- BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC)
+ BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg)
.addReg(CandI.getOperand(1).getReg());
CopyFound = true;
}
@@ -6830,24 +7386,31 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
: AMDGPU::S_CSELECT_B32;
auto NewSelect =
- BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+ BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0);
NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
}
}
- Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- auto UpdatedInst =
- BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
- .addImm(0)
- .add(Src1) // False
- .addImm(0)
- .add(Src0) // True
- .addReg(IsSCC ? CopySCC : SCCSource);
-
- MRI.replaceRegWith(Dest.getReg(), ResultReg);
- legalizeOperands(*UpdatedInst, MDT);
- addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+ Register NewDestReg = MRI.createVirtualRegister(
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg())));
+ MachineInstr *NewInst;
+ if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) {
+ NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg)
+ .addImm(0)
+ .add(Src1) // False
+ .addImm(0)
+ .add(Src0) // True
+ .addReg(NewCondReg);
+ } else {
+ NewInst =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg)
+ .add(Src1) // False
+ .add(Src0) // True
+ .addReg(NewCondReg);
+ }
+ MRI.replaceRegWith(Dest.getReg(), NewDestReg);
+ legalizeOperands(*NewInst, MDT);
+ addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist);
}
void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
@@ -7059,80 +7622,6 @@ void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitAddSub(SIInstrWorklist &Worklist,
- MachineInstr &Inst,
- MachineDominatorTree *MDT) const {
- bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
-
- MachineBasicBlock &MBB = *Inst.getParent();
- MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
-
- Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- Register CarryReg = MRI.createVirtualRegister(CarryRC);
- Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
-
- MachineOperand &Dest = Inst.getOperand(0);
- MachineOperand &Src0 = Inst.getOperand(1);
- MachineOperand &Src1 = Inst.getOperand(2);
- const DebugLoc &DL = Inst.getDebugLoc();
- MachineBasicBlock::iterator MII = Inst;
-
- const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0.getReg());
- const TargetRegisterClass *Src1RC = MRI.getRegClass(Src1.getReg());
- const TargetRegisterClass *Src0SubRC =
- RI.getSubRegisterClass(Src0RC, AMDGPU::sub0);
- const TargetRegisterClass *Src1SubRC =
- RI.getSubRegisterClass(Src1RC, AMDGPU::sub0);
-
- MachineOperand SrcReg0Sub0 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
- AMDGPU::sub0, Src0SubRC);
- MachineOperand SrcReg1Sub0 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
- AMDGPU::sub0, Src1SubRC);
-
-
- MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
- AMDGPU::sub1, Src0SubRC);
- MachineOperand SrcReg1Sub1 = buildExtractSubRegOrImm(MII, MRI, Src1, Src1RC,
- AMDGPU::sub1, Src1SubRC);
-
- unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
- MachineInstr *LoHalf =
- BuildMI(MBB, MII, DL, get(LoOpc), DestSub0)
- .addReg(CarryReg, RegState::Define)
- .add(SrcReg0Sub0)
- .add(SrcReg1Sub0)
- .addImm(0); // clamp bit
-
- unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
- MachineInstr *HiHalf =
- BuildMI(MBB, MII, DL, get(HiOpc), DestSub1)
- .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
- .add(SrcReg0Sub1)
- .add(SrcReg1Sub1)
- .addReg(CarryReg, RegState::Kill)
- .addImm(0); // clamp bit
-
- BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
-
- MRI.replaceRegWith(Dest.getReg(), FullDestReg);
-
- // Try to legalize the operands in case we need to swap the order to keep it
- // valid.
- legalizeOperands(*LoHalf, MDT);
- legalizeOperands(*HiHalf, MDT);
-
- // Move all users of this moved value.
- addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
-}
-
void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
MachineInstr &Inst, unsigned Opcode,
MachineDominatorTree *MDT) const {
@@ -7980,9 +8469,36 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
return ArrayRef(TargetFlags);
}
-bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
- return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
- MI.modifiesRegister(AMDGPU::EXEC, &RI);
+unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
+ const MachineFunction &MF) const {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ assert(SrcReg.isVirtual());
+ if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
+ return AMDGPU::WWM_COPY;
+
+ return AMDGPU::COPY;
+}
+
+bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI,
+ Register Reg) const {
+ // We need to handle instructions which may be inserted during register
+ // allocation to handle the prolog. The initial prolog instruction may have
+ // been separated from the start of the block by spills and copies that were
+ // inserted for the prolog. However, the insertions for scalar registers can
+ // always be placed at the BB top as they are independent of the exec mask
+ // value.
+ bool IsNullOrVectorRegister = true;
+ if (Reg) {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg));
+ }
+
+ uint16_t Opc = MI.getOpcode();
+ // FIXME: Copies inserted in the block prolog for live-range split should also
+ // be included.
+ return IsNullOrVectorRegister &&
+ (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY &&
+ MI.modifiesRegister(AMDGPU::EXEC, &RI)));
}
MachineInstrBuilder
@@ -8045,7 +8561,16 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con
}
}
-unsigned SIInstrInfo::getMaxMUBUFImmOffset() { return (1 << 12) - 1; }
+bool SIInstrInfo::isLegalMUBUFImmOffset(unsigned Imm) const {
+ return Imm <= getMaxMUBUFImmOffset(ST);
+}
+
+unsigned SIInstrInfo::getMaxMUBUFImmOffset(const GCNSubtarget &ST) {
+ // GFX12 field is non-negative 24-bit signed byte offset.
+ const unsigned OffsetBits =
+ ST.getGeneration() >= AMDGPUSubtarget::GFX12 ? 23 : 12;
+ return (1 << OffsetBits) - 1;
+}
void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
if (!ST.isWave32())
@@ -8082,7 +8607,7 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
// offsets within the given alignment can be added to the resulting ImmOffset.
bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
uint32_t &ImmOffset, Align Alignment) const {
- const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset();
+ const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset(ST);
const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
uint32_t Overflow = 0;
@@ -8108,11 +8633,17 @@ bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
}
}
- // There is a hardware bug in SI and CI which prevents address clamping in
- // MUBUF instructions from working correctly with SOffsets. The immediate
- // offset is unaffected.
- if (Overflow > 0 && ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
- return false;
+ if (Overflow > 0) {
+ // There is a hardware bug in SI and CI which prevents address clamping in
+ // MUBUF instructions from working correctly with SOffsets. The immediate
+ // offset is unaffected.
+ if (ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+ return false;
+
+ // It is not possible to set an immediate in the SOffset field on some targets.
+ if (ST.hasRestrictedSOffset())
+ return false;
+ }
ImmOffset = Imm;
SOffset = Overflow;
@@ -8160,16 +8691,13 @@ bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
AddrSpace == AMDGPUAS::GLOBAL_ADDRESS))
return false;
- bool AllowNegative = FlatVariant != SIInstrFlags::FLAT;
- if (ST.hasNegativeScratchOffsetBug() &&
- FlatVariant == SIInstrFlags::FlatScratch)
- AllowNegative = false;
if (ST.hasNegativeUnalignedScratchOffsetBug() &&
FlatVariant == SIInstrFlags::FlatScratch && Offset < 0 &&
(Offset % 4) != 0) {
return false;
}
+ bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
unsigned N = AMDGPU::getNumFlatOffsetBits(ST);
return isIntN(N, Offset) && (AllowNegative || Offset >= 0);
}
@@ -8180,12 +8708,10 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
uint64_t FlatVariant) const {
int64_t RemainderOffset = COffsetVal;
int64_t ImmField = 0;
- bool AllowNegative = FlatVariant != SIInstrFlags::FLAT;
- if (ST.hasNegativeScratchOffsetBug() &&
- FlatVariant == SIInstrFlags::FlatScratch)
- AllowNegative = false;
+ bool AllowNegative = allowNegativeFlatOffset(FlatVariant);
const unsigned NumBits = AMDGPU::getNumFlatOffsetBits(ST) - 1;
+
if (AllowNegative) {
// Use signed division by a power of two to truncate towards 0.
int64_t D = 1LL << NumBits;
@@ -8209,6 +8735,14 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
return {ImmField, RemainderOffset};
}
+bool SIInstrInfo::allowNegativeFlatOffset(uint64_t FlatVariant) const {
+ if (ST.hasNegativeScratchOffsetBug() &&
+ FlatVariant == SIInstrFlags::FlatScratch)
+ return false;
+
+ return FlatVariant != SIInstrFlags::FLAT || AMDGPU::isGFX12Plus(ST);
+}
+
static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
switch (ST.getGeneration()) {
default:
@@ -8223,6 +8757,8 @@ static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
return SIEncodingFamily::GFX10;
case AMDGPUSubtarget::GFX11:
return SIEncodingFamily::GFX11;
+ case AMDGPUSubtarget::GFX12:
+ return SIEncodingFamily::GFX12;
}
llvm_unreachable("Unknown subtarget generation!");
}
@@ -8248,6 +8784,9 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
}
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
+ if (SIInstrInfo::isSoftWaitcnt(Opcode))
+ Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
+
unsigned Gen = subtargetEncodingFamily(ST);
if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
@@ -8282,6 +8821,12 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
+ // TODO-GFX12: Remove this.
+ // Hack to allow some GFX12 codegen tests to run before all the encodings are
+ // implemented.
+ if (MCOp == (uint16_t)-1 && Gen == SIEncodingFamily::GFX12)
+ MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX11);
+
// -1 means that Opcode is already a native instruction.
if (MCOp == -1)
return Opcode;
@@ -8531,7 +9076,7 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
// A similar issue also exists with spilling and reloading $exec registers.
//
// To prevent that, constrain the %0 register class here.
- if (MI.isFullCopy()) {
+ if (isFullCopyInstr(MI)) {
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
@@ -8572,9 +9117,8 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
InstructionUniformity
SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
unsigned opcode = MI.getOpcode();
- if (opcode == AMDGPU::G_INTRINSIC ||
- opcode == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS) {
- auto IID = static_cast<Intrinsic::ID>(MI.getIntrinsicID());
+ if (auto *GI = dyn_cast<GIntrinsic>(&MI)) {
+ auto IID = GI->getIntrinsicID();
if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
return InstructionUniformity::NeverUniform;
if (AMDGPU::isIntrinsicAlwaysUniform(IID))
@@ -8612,7 +9156,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) ||
opcode == AMDGPU::G_ATOMIC_CMPXCHG ||
- opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS) {
+ opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS ||
+ AMDGPU::isGenericAtomic(opcode)) {
return InstructionUniformity::NeverUniform;
}
return InstructionUniformity::Default;
@@ -8625,10 +9170,12 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::NeverUniform;
unsigned opcode = MI.getOpcode();
- if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32)
+ if (opcode == AMDGPU::V_READLANE_B32 ||
+ opcode == AMDGPU::V_READFIRSTLANE_B32 ||
+ opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR)
return InstructionUniformity::AlwaysUniform;
- if (MI.isCopy()) {
+ if (isCopyInstr(MI)) {
const MachineOperand &srcOp = MI.getOperand(1);
if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
const TargetRegisterClass *regClass =
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 66f93e5640d6..affe52046752 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -43,7 +43,7 @@ static const MachineMemOperand::Flags MONoClobber =
/// Utility to store machine instructions worklist.
struct SIInstrWorklist {
- SIInstrWorklist() : InstrList() {}
+ SIInstrWorklist() = default;
void insert(MachineInstr *MI);
@@ -102,16 +102,15 @@ private:
public:
unsigned buildExtractSubReg(MachineBasicBlock::iterator MI,
MachineRegisterInfo &MRI,
- MachineOperand &SuperReg,
+ const MachineOperand &SuperReg,
const TargetRegisterClass *SuperRC,
unsigned SubIdx,
const TargetRegisterClass *SubRC) const;
- MachineOperand buildExtractSubRegOrImm(MachineBasicBlock::iterator MI,
- MachineRegisterInfo &MRI,
- MachineOperand &SuperReg,
- const TargetRegisterClass *SuperRC,
- unsigned SubIdx,
- const TargetRegisterClass *SubRC) const;
+ MachineOperand buildExtractSubRegOrImm(
+ MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI,
+ const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC,
+ unsigned SubIdx, const TargetRegisterClass *SubRC) const;
+
private:
void swapOperands(MachineInstr &Inst) const;
@@ -135,9 +134,6 @@ private:
void splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode, bool Swap = false) const;
- void splitScalar64BitAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT = nullptr) const;
-
void splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode,
MachineDominatorTree *MDT = nullptr) const;
@@ -170,6 +166,12 @@ private:
Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
protected:
+ /// If the specific machine instruction is an instruction that moves/copies
+ /// a value from one register to another register, return the destination and
+ /// source registers as machine operands.
+ std::optional<DestSourcePair>
+ isCopyInstrImpl(const MachineInstr &MI) const override;
+
bool swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0, unsigned Src0OpName,
MachineOperand &Src1, unsigned Src1OpName) const;
@@ -216,6 +218,9 @@ public:
bool isIgnorableUse(const MachineOperand &MO) const override;
+ bool isSafeToSink(MachineInstr &MI, MachineBasicBlock *SuccToSinkTo,
+ MachineCycleInfo *CI) const override;
+
bool areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0,
int64_t &Offset1) const override;
@@ -226,8 +231,11 @@ public:
const TargetRegisterInfo *TRI) const final;
bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ int64_t Offset1, bool OffsetIsScalable1,
ArrayRef<const MachineOperand *> BaseOps2,
- unsigned NumLoads, unsigned NumBytes) const override;
+ int64_t Offset2, bool OffsetIsScalable2,
+ unsigned ClusterSize,
+ unsigned NumBytes) const override;
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
int64_t Offset1, unsigned NumLoads) const override;
@@ -266,6 +274,11 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
+ void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ Register DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const override;
+
// Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
// instructions. Returns a pair of generated instructions.
// Can split either post-RA with physical registers or pre-RA with
@@ -395,12 +408,20 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::VALU;
}
+ static bool isImage(const MachineInstr &MI) {
+ return isMIMG(MI) || isVSAMPLE(MI) || isVIMAGE(MI);
+ }
+
+ bool isImage(uint16_t Opcode) const {
+ return isMIMG(Opcode) || isVSAMPLE(Opcode) || isVIMAGE(Opcode);
+ }
+
static bool isVMEM(const MachineInstr &MI) {
- return isMUBUF(MI) || isMTBUF(MI) || isMIMG(MI);
+ return isMUBUF(MI) || isMTBUF(MI) || isImage(MI);
}
bool isVMEM(uint16_t Opcode) const {
- return isMUBUF(Opcode) || isMTBUF(Opcode) || isMIMG(Opcode);
+ return isMUBUF(Opcode) || isMTBUF(Opcode) || isImage(Opcode);
}
static bool isSOP1(const MachineInstr &MI) {
@@ -525,6 +546,22 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::DS;
}
+ static bool isLDSDMA(const MachineInstr &MI) {
+ return isVALU(MI) && (isMUBUF(MI) || isFLAT(MI));
+ }
+
+ bool isLDSDMA(uint16_t Opcode) {
+ return isVALU(Opcode) && (isMUBUF(Opcode) || isFLAT(Opcode));
+ }
+
+ static bool isGWS(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::GWS;
+ }
+
+ bool isGWS(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::GWS;
+ }
+
bool isAlwaysGDS(uint16_t Opcode) const;
static bool isMIMG(const MachineInstr &MI) {
@@ -535,6 +572,22 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::MIMG;
}
+ static bool isVIMAGE(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VIMAGE;
+ }
+
+ bool isVIMAGE(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VIMAGE;
+ }
+
+ static bool isVSAMPLE(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::VSAMPLE;
+ }
+
+ bool isVSAMPLE(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::VSAMPLE;
+ }
+
static bool isGather4(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::Gather4;
}
@@ -622,6 +675,10 @@ public:
SIInstrFlags::IsAtomicNoRet);
}
+ static bool mayWriteLDSThroughDMA(const MachineInstr &MI) {
+ return isLDSDMA(MI) && MI.getOpcode() != AMDGPU::BUFFER_STORE_LDS_DWORD;
+ }
+
static bool isWQM(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::WQM;
}
@@ -654,9 +711,21 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill;
}
+ bool isSpillOpcode(uint16_t Opcode) const {
+ return get(Opcode).TSFlags &
+ (SIInstrFlags::SGPRSpill | SIInstrFlags::VGPRSpill);
+ }
+
static bool isWWMRegSpillOpcode(uint16_t Opcode) {
return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE ||
- Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE;
+ Opcode == AMDGPU::SI_SPILL_WWM_AV32_SAVE ||
+ Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE ||
+ Opcode == AMDGPU::SI_SPILL_WWM_AV32_RESTORE;
+ }
+
+ static bool isChainCallOpcode(uint64_t Opcode) {
+ return Opcode == AMDGPU::SI_CS_CHAIN_TC_W32 ||
+ Opcode == AMDGPU::SI_CS_CHAIN_TC_W64;
}
static bool isDPP(const MachineInstr &MI) {
@@ -826,8 +895,34 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::TiedSourceNotRead;
}
+ static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
+ if (isWaitcnt(Opcode))
+ return AMDGPU::S_WAITCNT;
+
+ if (isWaitcntVsCnt(Opcode))
+ return AMDGPU::S_WAITCNT_VSCNT;
+
+ llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT");
+ }
+
+ static bool isWaitcnt(unsigned Opcode) {
+ return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft;
+ }
+
+ static bool isWaitcntVsCnt(unsigned Opcode) {
+ return Opcode == AMDGPU::S_WAITCNT_VSCNT ||
+ Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
+ }
+
+ // "Soft" waitcnt instructions can be relaxed/optimized out by
+ // SIInsertWaitcnts.
+ static bool isSoftWaitcnt(unsigned Opcode) {
+ return Opcode == AMDGPU::S_WAITCNT_soft ||
+ Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
+ }
+
bool isVGPRCopy(const MachineInstr &MI) const {
- assert(MI.isCopy());
+ assert(isCopyInstr(MI));
Register Dest = MI.getOperand(0).getReg();
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
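As a side illustration of the soft-waitcnt helpers added above, here is a minimal standalone C++ model of the idea, with illustrative enum values rather than the real AMDGPU opcode numbers. "Soft" waits are hints that SIInsertWaitcnts may relax or drop; any that survive are normalized back to the encodable opcode before MC lowering (as pseudoToMCOpcode now does).

    #include <cassert>

    // Placeholder stand-ins for the real opcodes; values are illustrative only.
    enum Opcode { S_WAITCNT, S_WAITCNT_soft, S_WAITCNT_VSCNT, S_WAITCNT_VSCNT_soft };

    static bool isSoftWaitcnt(Opcode Op) {
      return Op == S_WAITCNT_soft || Op == S_WAITCNT_VSCNT_soft;
    }

    static Opcode getNonSoftWaitcntOpcode(Opcode Op) {
      switch (Op) {
      case S_WAITCNT_soft:       return S_WAITCNT;       // relaxable -> hard wait
      case S_WAITCNT_VSCNT_soft: return S_WAITCNT_VSCNT;
      default:                   return Op;               // already encodable
      }
    }

    int main() {
      assert(getNonSoftWaitcntOpcode(S_WAITCNT_soft) == S_WAITCNT);
      assert(!isSoftWaitcnt(S_WAITCNT_VSCNT));
      return 0;
    }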
@@ -897,7 +992,7 @@ public:
if (OpIdx >= MI.getDesc().NumOperands)
return false;
- if (MI.isCopy()) {
+ if (isCopyInstr(MI)) {
unsigned Size = getOpSize(MI, OpIdx);
assert(Size == 8 || Size == 4);
@@ -946,12 +1041,12 @@ public:
void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, Register Reg,
- bool IsSCCLive) const;
+ const DebugLoc &DL, Register Reg, bool IsSCCLive,
+ SlotIndexes *Indexes = nullptr) const;
void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
- Register Reg) const;
+ Register Reg, SlotIndexes *Indexes = nullptr) const;
/// Return the correct register class for \p OpNo. For target-specific
/// instructions, this will return the register class that has been defined
@@ -1143,7 +1238,11 @@ public:
CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAGMI *DAG) const override;
- bool isBasicBlockPrologue(const MachineInstr &MI) const override;
+ unsigned getLiveRangeSplitOpcode(Register Reg,
+ const MachineFunction &MF) const override;
+
+ bool isBasicBlockPrologue(const MachineInstr &MI,
+ Register Reg = Register()) const override;
MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsPt,
@@ -1176,11 +1275,9 @@ public:
static bool isKillTerminator(unsigned Opcode);
const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
- static bool isLegalMUBUFImmOffset(unsigned Imm) {
- return isUInt<12>(Imm);
- }
+ bool isLegalMUBUFImmOffset(unsigned Imm) const;
- static unsigned getMaxMUBUFImmOffset();
+ static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST);
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
Align Alignment = Align(4)) const;
@@ -1197,6 +1294,9 @@ public:
unsigned AddrSpace,
uint64_t FlatVariant) const;
+ /// Returns true if negative offsets are allowed for the given \p FlatVariant.
+ bool allowNegativeFlatOffset(uint64_t FlatVariant) const;
+
/// \brief Return a target-specific opcode if Opcode is a pseudo instruction.
/// Return -1 if the target-specific opcode for the pseudo instruction does
/// not exist. If Opcode is not a pseudo instruction, this is identity.
@@ -1378,6 +1478,13 @@ namespace AMDGPU {
} // end namespace AMDGPU
+namespace AMDGPU {
+enum AsmComments {
+ // For sgpr to vgpr spill instructions
+ SGPR_SPILL = MachineInstr::TAsmComments
+};
+} // namespace AMDGPU
+
namespace SI {
namespace KernelInputOffsets {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 044bc4507d3a..173c877b8d29 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -32,6 +32,7 @@ def SIEncodingFamily {
int GFX90A = 8;
int GFX940 = 9;
int GFX11 = 10;
+ int GFX12 = 11;
}
//===----------------------------------------------------------------------===//
@@ -158,36 +159,18 @@ def SIbuffer_store_format_d16 : SDNode <"AMDGPUISD::BUFFER_STORE_FORMAT_D16",
SDTBufferStore,
[SDNPMayStore, SDNPMemOperand, SDNPHasChain]>;
-class SDBufferAtomic<string opcode> : SDNode <opcode,
- SDTypeProfile<1, 8,
- [SDTCisVT<2, v4i32>, // rsrc
- SDTCisVT<3, i32>, // vindex(VGPR)
- SDTCisVT<4, i32>, // voffset(VGPR)
- SDTCisVT<5, i32>, // soffset(SGPR)
- SDTCisVT<6, i32>, // offset(imm)
- SDTCisVT<7, i32>, // cachepolicy(imm)
- SDTCisVT<8, i1>]>, // idxen(imm)
- [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
->;
-
-def SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">;
-def SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">;
-def SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">;
-def SIbuffer_atomic_smin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMIN">;
-def SIbuffer_atomic_umin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMIN">;
-def SIbuffer_atomic_smax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMAX">;
-def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">;
-def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">;
-def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
-def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
-def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
-def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
-def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
-def SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
-def SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
-def SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
-
-multiclass SDBufferAtomicNoRet {
+multiclass SDBufferAtomic<string opcode> {
+ def "" : SDNode <opcode,
+ SDTypeProfile<1, 8,
+ [SDTCisVT<2, v4i32>, // rsrc
+ SDTCisVT<3, i32>, // vindex(VGPR)
+ SDTCisVT<4, i32>, // voffset(VGPR)
+ SDTCisVT<5, i32>, // soffset(SGPR)
+ SDTCisVT<6, i32>, // offset(imm)
+ SDTCisVT<7, i32>, // cachepolicy(imm)
+ SDTCisVT<8, i1>]>, // idxen(imm)
+ [SDNPMemOperand, SDNPHasChain, SDNPMayLoad, SDNPMayStore]
+ >;
def "_noret" : PatFrag<
(ops node:$vdata_in, node:$rsrc, node:$vindex, node:$voffset, node:$soffset,
node:$offset, node:$cachepolicy, node:$idxen),
@@ -198,28 +181,26 @@ multiclass SDBufferAtomicNoRet {
}
}
-defm SIbuffer_atomic_swap : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_add : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_sub : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_smin : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_umin : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_smax : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_umax : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_and : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_or : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_xor : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_inc : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_dec : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_fadd : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_fmin : SDBufferAtomicNoRet;
-defm SIbuffer_atomic_fmax : SDBufferAtomicNoRet;
+defm SIbuffer_atomic_swap : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SWAP">;
+defm SIbuffer_atomic_add : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_ADD">;
+defm SIbuffer_atomic_sub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SUB">;
+defm SIbuffer_atomic_smin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMIN">;
+defm SIbuffer_atomic_umin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMIN">;
+defm SIbuffer_atomic_smax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_SMAX">;
+defm SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">;
+defm SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">;
+defm SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
+defm SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
+defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
+defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
+defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
+defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
+defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
+defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
- [SDTCisVT<0, i32>, // dst
- SDTCisVT<1, i32>, // src
- SDTCisVT<2, i32>, // cmp
- SDTCisVT<3, v4i32>, // rsrc
+ [SDTCisVT<3, v4i32>, // rsrc
SDTCisVT<4, i32>, // vindex(VGPR)
SDTCisVT<5, i32>, // voffset(VGPR)
SDTCisVT<6, i32>, // soffset(SGPR)
@@ -604,14 +585,14 @@ def atomic_store_64_glue : PatFrag <
}
let IsAtomic = 1, AddressSpaces = StoreAddress_local.AddrSpaces in {
-def atomic_store_8_local_m0 : PatFrag<(ops node:$ptr, node:$val),
- (atomic_store_8_glue node:$ptr, node:$val)>;
-def atomic_store_16_local_m0 : PatFrag<(ops node:$ptr, node:$val),
- (atomic_store_16_glue node:$ptr, node:$val)>;
-def atomic_store_32_local_m0 : PatFrag<(ops node:$ptr, node:$val),
- (atomic_store_32_glue node:$ptr, node:$val)>;
-def atomic_store_64_local_m0 : PatFrag<(ops node:$ptr, node:$val),
- (atomic_store_64_glue node:$ptr, node:$val)>;
+def atomic_store_8_local_m0 : PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store_8_glue node:$val, node:$ptr)>;
+def atomic_store_16_local_m0 : PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store_16_glue node:$val, node:$ptr)>;
+def atomic_store_32_local_m0 : PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store_32_glue node:$val, node:$ptr)>;
+def atomic_store_64_local_m0 : PatFrag<(ops node:$val, node:$ptr),
+ (atomic_store_64_glue node:$val, node:$ptr)>;
} // End let IsAtomic = 1, AddressSpaces = StoreAddress_local.AddrSpaces
@@ -906,11 +887,19 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
//===----------------------------------------------------------------------===//
def extract_cpol : SDNodeXForm<timm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & AMDGPU::CPol::ALL, SDLoc(N), MVT::i8);
+ return CurDAG->getTargetConstant(
+ N->getZExtValue() & (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12
+ ? AMDGPU::CPol::ALL
+ : AMDGPU::CPol::ALL_pregfx12),
+ SDLoc(N), MVT::i8);
}]>;
def extract_swz : SDNodeXForm<timm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
+ const bool Swizzle =
+ N->getZExtValue() & (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12
+ ? AMDGPU::CPol::SWZ
+ : AMDGPU::CPol::SWZ_pregfx12);
+ return CurDAG->getTargetConstant(Swizzle, SDLoc(N), MVT::i8);
}]>;
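A note on the two SDNodeXForms above: the cache-policy and swizzle bits are now selected through a generation-dependent mask instead of a fixed bit position. The following C++ sketch only mirrors that shape; the mask values are placeholders, not the real AMDGPU::CPol encodings.

    #include <cstdint>

    // Placeholder masks; the actual CPol bit layouts differ between generations.
    constexpr uint32_t SWZ_PREGFX12 = 1u << 3;
    constexpr uint32_t SWZ_GFX12    = 1u << 6;

    // Same shape as extract_swz: pick the mask by generation, then test it.
    bool extractSwz(uint32_t CachePolicy, bool IsGFX12Plus) {
      const uint32_t SwzMask = IsGFX12Plus ? SWZ_GFX12 : SWZ_PREGFX12;
      return (CachePolicy & SwzMask) != 0;
    }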
def set_glc : SDNodeXForm<timm, [{
@@ -938,6 +927,13 @@ def InterpAttr : CustomOperand<i32>;
def InterpAttrChan : ImmOperand<i32>;
+def SplitBarrier : ImmOperand<i32> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = "OPERAND_INLINE_SPLIT_BARRIER_INT32";
+ let DecoderMethod = "decodeSplitBarrier";
+ let PrintMethod = "printOperand";
+}
+
def VReg32OrOffClass : AsmOperandClass {
let Name = "VReg32OrOff";
let ParserMethod = "parseVReg32OrOff";
@@ -1044,6 +1040,7 @@ class NamedIntOperand<ValueType Type, string Prefix, string Name = NAME,
class NamedBitOperand<string Id, string Name = NAME>
: CustomOperand<i1, 1, Name> {
+ let PredicateMethod = "isImmTy<AMDGPUOperand::"#ImmTy#">";
let ParserMethod =
"[this](OperandVector &Operands) -> ParseStatus { "#
"return parseNamedBit(\""#Id#"\", Operands, AMDGPUOperand::"#ImmTy#"); }";
@@ -1054,8 +1051,8 @@ class NamedBitOperand<string Id, string Name = NAME>
class DefaultOperand<CustomOperand Op, int Value>
: OperandWithDefaultOps<Op.Type, (ops (Op.Type Value))>,
- CustomOperandProps<1, Op.ParserMatchClass.Name> {
- let ParserMethod = Op.ParserMatchClass.ParserMethod;
+ CustomOperandProps<1> {
+ let ParserMatchClass = Op.ParserMatchClass;
let PrintMethod = Op.PrintMethod;
}
@@ -1096,6 +1093,10 @@ def highmod : NamedBitOperand<"high", "High">;
def CPol : CustomOperand<i32, 1>;
def CPol_0 : DefaultOperand<CPol, 0>;
def CPol_GLC1 : DefaultOperand<CPol, 1>;
+def CPol_GLC : ValuePredicatedOperand<CPol, "Op.getImm() & CPol::GLC">;
+def CPol_NonGLC : ValuePredicatedOperand<CPol, "!(Op.getImm() & CPol::GLC)", 1>;
+def CPol_GLC_WithDefault : DefaultOperand<CPol_GLC, !shl(1, CPolBit.GLC)>;
+def CPol_NonGLC_WithDefault : DefaultOperand<CPol_NonGLC, 0>;
def TFE : NamedBitOperand<"tfe">;
def UNorm : NamedBitOperand<"unorm">;
@@ -1170,6 +1171,10 @@ class FPVCSrcInputModsMatchClass <int opSize> : FPInputModsMatchClass <opSize> {
}
def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
+def FPT16InputModsMatchClass : FPInputModsMatchClass<16> {
+ let Name = "RegOrImmWithFPT16InputMods";
+ let PredicateMethod = "isRegOrImmWithFPT16InputMods";
+}
def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
@@ -1187,6 +1192,7 @@ class FPInputMods <FPInputModsMatchClass matchClass> : InputMods <matchClass> {
}
def FP16InputMods : FPInputMods<FP16InputModsMatchClass>;
+def FPT16InputMods : FPInputMods<FPT16InputModsMatchClass>;
def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
@@ -1202,6 +1208,10 @@ class IntVCSrcInputModsMatchClass <int opSize> : IntInputModsMatchClass <opSize>
let Name = "RegOrInlineImmWithInt"#opSize#"InputMods";
let PredicateMethod = "isRegOrInlineImmWithInt"#opSize#"InputMods";
}
+def IntT16InputModsMatchClass : IntInputModsMatchClass<16> {
+ let Name = "RegOrImmWithIntT16InputMods";
+ let PredicateMethod = "isRegOrImmWithIntT16InputMods";
+}
def Int32InputModsMatchClass : IntInputModsMatchClass<32>;
def Int64InputModsMatchClass : IntInputModsMatchClass<64>;
def Int32VCSrcInputModsMatchClass : IntVCSrcInputModsMatchClass<32>;
@@ -1209,6 +1219,7 @@ def Int32VCSrcInputModsMatchClass : IntVCSrcInputModsMatchClass<32>;
class IntInputMods <IntInputModsMatchClass matchClass> : InputMods <matchClass> {
let PrintMethod = "printOperandAndIntInputMods";
}
+def IntT16InputMods : IntInputMods<IntT16InputModsMatchClass>;
def Int32InputMods : IntInputMods<Int32InputModsMatchClass>;
def Int64InputMods : IntInputMods<Int64InputModsMatchClass>;
def Int32VCSrcInputMods : IntInputMods<Int32VCSrcInputModsMatchClass>;
@@ -1463,15 +1474,18 @@ class getNumSrcArgs<ValueType Src0, ValueType Src1, ValueType Src2> {
// Returns the register class to use for the destination of VOP[123C]
// instructions for the given VT.
-class getVALUDstForVT<ValueType VT> {
+class getVALUDstForVT<ValueType VT, bit IsTrue16 = 0, bit IsVOP3Encoding = 0> {
+ defvar op16 = !if(IsTrue16, !if (IsVOP3Encoding, VOPDstOperand_t16,
+ VOPDstOperand_t16Lo128),
+ VOPDstOperand<VGPR_32>);
RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>,
!if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
!if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
- !if(!eq(VT.Size, 16), VOPDstOperand<VGPR_32>,
+ !if(!eq(VT.Size, 16), op16,
VOPDstS64orS32)))); // else VT == i1
}
-class getVALUDstForVT_t16<ValueType VT> {
+class getVALUDstForVT_fake16<ValueType VT> {
RegisterOperand ret = !if(!eq(VT.Size, 32), VOPDstOperand<VGPR_32>,
!if(!eq(VT.Size, 128), VOPDstOperand<VReg_128>,
!if(!eq(VT.Size, 64), VOPDstOperand<VReg_64>,
@@ -1489,7 +1503,7 @@ class getSDWADstForVT<ValueType VT> {
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
-class getVOPSrc0ForVT<ValueType VT, bit IsTrue16> {
+class getVOPSrc0ForVT<ValueType VT, bit IsTrue16, bit IsFake16 = 1> {
bit isFP = isFloatType<VT>.ret;
RegisterOperand ret =
@@ -1498,7 +1512,7 @@ class getVOPSrc0ForVT<ValueType VT, bit IsTrue16> {
VSrc_f64,
!if(!eq(VT.Value, f16.Value),
!if(IsTrue16,
- VSrcT_f16_Lo128,
+ !if(IsFake16, VSrcFake16_f16_Lo128, VSrcT_f16_Lo128),
VSrc_f16
),
!if(!eq(VT.Value, v2f16.Value),
@@ -1514,7 +1528,7 @@ class getVOPSrc0ForVT<ValueType VT, bit IsTrue16> {
VSrc_b64,
!if(!eq(VT.Value, i16.Value),
!if(IsTrue16,
- VSrcT_b16_Lo128,
+ !if(IsFake16, VSrcFake16_b16_Lo128, VSrcT_b16_Lo128),
VSrc_b16
),
!if(!eq(VT.Value, v2i16.Value),
@@ -1539,13 +1553,17 @@ class getVregSrcForVT<ValueType VT> {
VGPR_32))));
}
-class getVregSrcForVT_t16<ValueType VT> {
+class getVregSrcForVT_t16<ValueType VT, bit IsFake16 = 1> {
RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
!if(!eq(VT.Size, 96), VReg_96,
!if(!eq(VT.Size, 64), VReg_64,
!if(!eq(VT.Size, 48), VReg_64,
- !if(!eq(VT.Size, 16), VGPR_32_Lo128,
+ !if(!eq(VT.Size, 16),
+ !if(IsFake16, VGPR_32_Lo128, VGPR_16_Lo128),
VGPR_32)))));
+
+ RegisterOperand op = !if (!and(!eq(VT.Size, 16), !not(IsFake16)),
+ VGPRSrc_16_Lo128, RegisterOperand<ret>);
}
class getSDWASrcForVT <ValueType VT> {
@@ -1557,7 +1575,7 @@ class getSDWASrcForVT <ValueType VT> {
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
-class getVOP3SrcForVT<ValueType VT> {
+class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
bit isFP = isFloatType<VT>.ret;
RegisterOperand ret =
!if(!eq(VT.Size, 128),
@@ -1574,7 +1592,7 @@ class getVOP3SrcForVT<ValueType VT> {
SSrc_i1,
!if(isFP,
!if(!eq(VT.Value, f16.Value),
- VSrc_f16,
+ !if(IsTrue16, VSrcT_f16, VSrc_f16),
!if(!eq(VT.Value, v2f16.Value),
VSrc_v2f16,
!if(!eq(VT.Value, v4f16.Value),
@@ -1584,7 +1602,7 @@ class getVOP3SrcForVT<ValueType VT> {
)
),
!if(!eq(VT.Value, i16.Value),
- VSrc_b16,
+ !if(IsTrue16, VSrcT_b16, VSrc_b16),
!if(!eq(VT.Value, v2i16.Value),
VSrc_v2b16,
VSrc_b32
@@ -1631,18 +1649,15 @@ class isModifierType<ValueType SrcVT> {
}
// Return type of input modifiers operand for specified input operand
-class getSrcMod <ValueType VT> {
+class getSrcMod <ValueType VT, bit IsTrue16 = 0> {
bit isFP = isFloatType<VT>.ret;
bit isPacked = isPackedType<VT>.ret;
Operand ret = !if(!eq(VT.Size, 64),
!if(isFP, FP64InputMods, Int64InputMods),
- !if(isFP,
- !if(!eq(VT.Value, f16.Value),
- FP16InputMods,
- FP32InputMods
- ),
- Int32InputMods)
- );
+ !if(!eq(VT.Size, 16),
+ !if(isFP, !if(IsTrue16, FPT16InputMods, FP16InputMods),
+ !if(IsTrue16, IntT16InputMods, IntOpSelMods)),
+ !if(isFP, FP32InputMods, Int32InputMods)));
}
class getOpSelMod <ValueType VT> {
@@ -2262,6 +2277,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field list<ValueType> ArgVT = _ArgVT;
field bit EnableClamp = _EnableClamp;
field bit IsTrue16 = 0;
+ field bit IsRealTrue16 = 0;
field ValueType DstVT = ArgVT[0];
field ValueType Src0VT = ArgVT[1];
@@ -2281,7 +2297,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret;
field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
- field RegisterOperand Src1VOP3DPP = VRegSrc_32;
+ field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
@@ -2454,8 +2470,32 @@ class VOP_PAT_GEN <VOPProfile p, int mode=PatGenMode.NoPattern> : VOPProfile <p.
// class, so copy changes to this class in those profiles
class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
let IsTrue16 = 1;
+ let IsRealTrue16 = 1;
+ // Most DstVT are 16-bit, but not all.
+ let DstRC = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 0 /*IsVOP3Encoding*/>.ret;
+ let DstRC64 = getVALUDstForVT<DstVT>.ret;
+ let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret;
+ let Src1RC32 = getVregSrcForVT_t16<Src1VT, 0 /*IsFake16*/>.op;
+ let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
+ let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret;
+ let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret;
+ let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
+ let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
+ let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
+
+ let DstRC64 = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
+ let Src0RC64 = getVOP3SrcForVT<Src0VT, 1 /*IsTrue16*/>.ret;
+ let Src1RC64 = getVOP3SrcForVT<Src1VT, 1 /*IsTrue16*/>.ret;
+ let Src2RC64 = getVOP3SrcForVT<Src2VT, 1 /*IsTrue16*/>.ret;
+ let Src0Mod = getSrcMod<Src0VT, 1 /*IsTrue16*/>.ret;
+ let Src1Mod = getSrcMod<Src1VT, 1 /*IsTrue16*/>.ret;
+ let Src2Mod = getSrcMod<Src2VT, 1 /*IsTrue16*/>.ret;
+}
+
+class VOPProfile_Fake16<VOPProfile P> : VOPProfile<P.ArgVT> {
+ let IsTrue16 = 1;
// Most DstVT are 16-bit, but not all
- let DstRC = getVALUDstForVT_t16<DstVT>.ret;
+ let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
let DstRC64 = getVALUDstForVT<DstVT>.ret;
let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>;
let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
@@ -2733,7 +2773,8 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.SDWA10)],
[!cast<string>(SIEncodingFamily.GFX90A)],
[!cast<string>(SIEncodingFamily.GFX940)],
- [!cast<string>(SIEncodingFamily.GFX11)]];
+ [!cast<string>(SIEncodingFamily.GFX11)],
+ [!cast<string>(SIEncodingFamily.GFX12)]];
}
// Get equivalent SOPK instruction.
@@ -2872,14 +2913,14 @@ def getVOPDBaseFromComponent : SearchIndex {
def VOPDPairs : GenericTable {
let FilterClass = "VOPD_Base";
let CppTypeName = "VOPDInfo";
- let Fields = ["Opcode", "OpX", "OpY"];
+ let Fields = ["Opcode", "OpX", "OpY", "SubTgt"];
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getVOPDOpcodeHelper";
}
def getVOPDInfoFromComponentOpcodes : SearchIndex {
let Table = VOPDPairs;
- let Key = ["OpX", "OpY"];
+ let Key = ["OpX", "OpY", "SubTgt"];
}
include "SIInstructions.td"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
index 2edebccef7d8..f9bc623abcd0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -132,7 +132,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
let isAsCheapAsAMove = 1;
let isMoveImm = 1;
let SchedRW = [Write64Bit];
- let Size = 16; // Needs maximum 2 v_mov_b32 instructions 8 byte long each.
+ let Size = 4;
let UseNamedOperandTable = 1;
}
@@ -149,8 +149,9 @@ def S_MOV_B64_IMM_PSEUDO : SPseudoInstSI <(outs SReg_64:$sdst),
let isAsCheapAsAMove = 1;
let isMoveImm = 1;
let SchedRW = [WriteSALU, Write64Bit];
- let Size = 16; // Needs maximum 2 s_mov_b32 instructions 8 byte long each.
+ let Size = 4;
let Uses = [];
+ let UseNamedOperandTable = 1;
}
// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
@@ -172,6 +173,13 @@ def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+def WWM_COPY : SPseudoInstSI <
+ (outs unknown:$dst), (ins unknown:$src)> {
+ let hasSideEffects = 0;
+ let isAsCheapAsAMove = 1;
+ let isConvergent = 1;
+}
+
def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
let Uses = [EXEC];
let Defs = [EXEC, SCC];
@@ -251,6 +259,12 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
}
} // End Defs = [SCC]
+def : GCNPat<(i32 (int_amdgcn_set_inactive_chain_arg i32:$src, i32:$inactive)),
+ (V_SET_INACTIVE_B32 VGPR_32:$src, VGPR_32:$inactive)>;
+
+def : GCNPat<(i64 (int_amdgcn_set_inactive_chain_arg i64:$src, i64:$inactive)),
+ (V_SET_INACTIVE_B64 VReg_64:$src, VReg_64:$inactive)>;
+
let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
(ins VSrc_b32: $src, VSrc_b32:$strategy),
@@ -263,7 +277,7 @@ let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses
}
}
-let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
+let usesCustomInserter = 1, Defs = [VCC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
(outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
[(set VReg_64:$vdst, (DivergentBinFrag<add> i64:$src0, i64:$src1))]
@@ -273,7 +287,7 @@ def V_SUB_U64_PSEUDO : VPseudoInstSI <
(outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
[(set VReg_64:$vdst, (DivergentBinFrag<sub> i64:$src0, i64:$src1))]
>;
-} // End usesCustomInserter = 1, Defs = [VCC, EXEC]
+} // End usesCustomInserter = 1, Defs = [VCC]
let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
@@ -657,6 +671,50 @@ def : GCNPat<
(SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
>;
+// Pseudo for the llvm.amdgcn.cs.chain intrinsic.
+// This is essentially a tail call, but it also takes a mask to put in EXEC
+// right before jumping to the callee.
+class SI_CS_CHAIN_TC<
+ ValueType execvt, Predicate wavesizepred,
+ RegisterOperand execrc = getSOPSrcForVT<execvt>.ret>
+ : SPseudoInstSI <(outs),
+ (ins CCR_SGPR_64:$src0, unknown:$callee, i32imm:$fpdiff, execrc:$exec)> {
+ let FixedSize = 0;
+ let isCall = 1;
+ let isTerminator = 1;
+ let isBarrier = 1;
+ let isReturn = 1;
+ let UseNamedOperandTable = 1;
+ let SchedRW = [WriteBranch];
+ let isConvergent = 1;
+
+ let WaveSizePredicate = wavesizepred;
+}
+
+def SI_CS_CHAIN_TC_W32 : SI_CS_CHAIN_TC<i32, isWave32>;
+def SI_CS_CHAIN_TC_W64 : SI_CS_CHAIN_TC<i64, isWave64>;
+
+// Handle selecting direct & indirect calls via SI_CS_CHAIN_TC_W32/64
+multiclass si_cs_chain_tc_pattern<
+ dag callee, ValueType execvt, RegisterOperand execrc, Instruction tc> {
+def : GCNPat<
+ (AMDGPUtc_return_chain i64:$src0, callee, (i32 timm:$fpdiff), execvt:$exec),
+ (tc CCR_SGPR_64:$src0, callee, i32imm:$fpdiff, execrc:$exec)
+>;
+}
+
+multiclass si_cs_chain_tc_patterns<
+ ValueType execvt,
+ RegisterOperand execrc = getSOPSrcForVT<execvt>.ret,
+ Instruction tc = !if(!eq(execvt, i32), SI_CS_CHAIN_TC_W32, SI_CS_CHAIN_TC_W64)
+ > {
+ defm direct: si_cs_chain_tc_pattern<(tglobaladdr:$callee), execvt, execrc, tc>;
+ defm indirect: si_cs_chain_tc_pattern<(i64 0), execvt, execrc, tc>;
+}
+
+defm : si_cs_chain_tc_patterns<i32>;
+defm : si_cs_chain_tc_patterns<i64>;
+
def ADJCALLSTACKUP : SPseudoInstSI<
(outs), (ins i32imm:$amt0, i32imm:$amt1),
[(callseq_start timm:$amt0, timm:$amt1)],
@@ -867,6 +925,28 @@ defm SI_SPILL_S384 : SI_SPILL_SGPR <SReg_384>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
+let SGPRSpill = 1, VALU = 1, isConvergent = 1 in {
+def SI_SPILL_S32_TO_VGPR : PseudoInstSI <(outs VGPR_32:$vdst),
+ (ins SReg_32:$src0, i32imm:$src1, VGPR_32:$vdst_in)> {
+ let Size = 4;
+ let FixedSize = 1;
+ let IsNeverUniform = 1;
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+ let Constraints = "$vdst = $vdst_in";
+}
+
+def SI_RESTORE_S32_FROM_VGPR : PseudoInstSI <(outs SReg_32:$sdst),
+ (ins VGPR_32:$src0, i32imm:$src1)> {
+ let Size = 4;
+ let FixedSize = 1;
+ let hasSideEffects = 0;
+ let mayLoad = 0;
+ let mayStore = 0;
+}
+} // End SGPRSpill = 1, VALU = 1, isConvergent = 1
+
// VGPR or AGPR spill instructions. In case of AGPR spilling a temp register
// needs to be used and an extra instruction to move between VGPR and AGPR.
// UsesTmp adds to the total size of an expanded spill in this case.
@@ -945,8 +1025,10 @@ defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>;
defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;
-let isConvergent = 1 in
-defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>;
+let isConvergent = 1 in {
+ defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>;
+ defm SI_SPILL_WWM_AV32 : SI_SPILL_VGPR <AV_32, 1>;
+}
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
@@ -1587,6 +1669,16 @@ def : BitConvert <v12i32, v12f32, VReg_384>;
def : BitConvert <v12f32, v12i32, VReg_384>;
// 512-bit bitcast
+def : BitConvert <v32f16, v32i16, VReg_512>;
+def : BitConvert <v32i16, v32f16, VReg_512>;
+def : BitConvert <v32f16, v16i32, VReg_512>;
+def : BitConvert <v32f16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32f16, VReg_512>;
+def : BitConvert <v16i32, v32f16, VReg_512>;
+def : BitConvert <v32i16, v16i32, VReg_512>;
+def : BitConvert <v32i16, v16f32, VReg_512>;
+def : BitConvert <v16f32, v32i16, VReg_512>;
+def : BitConvert <v16i32, v32i16, VReg_512>;
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
def : BitConvert <v8i64, v8f64, VReg_512>;
@@ -1632,8 +1724,10 @@ def : ClampPat<V_MAX_F32_e64, f32>;
def : ClampPat<V_MAX_F64_e64, f64>;
let SubtargetPredicate = NotHasTrue16BitInsts in
def : ClampPat<V_MAX_F16_e64, f16>;
-let SubtargetPredicate = HasTrue16BitInsts in
+let SubtargetPredicate = UseRealTrue16Insts in
def : ClampPat<V_MAX_F16_t16_e64, f16>;
+let SubtargetPredicate = UseFakeTrue16Insts in
+def : ClampPat<V_MAX_F16_fake16_e64, f16>;
let SubtargetPredicate = HasVOP3PInsts in {
def : GCNPat <
@@ -1922,6 +2016,29 @@ def : GCNPat <
(V_MOV_B32_e32 (f16 (bitcast_fpimm_to_i32 $imm)))
>;
+// V_MOV_B64_PSEUDO and S_MOV_B64_IMM_PSEUDO can be used with any 64-bit
+// immediate and will be expanded as needed, but we will only use these patterns
+// for values which can be encoded.
+def : GCNPat <
+ (VGPRImm<(i64 imm)>:$imm),
+ (V_MOV_B64_PSEUDO imm:$imm)
+>;
+
+def : GCNPat <
+ (VGPRImm<(f64 fpimm)>:$imm),
+ (V_MOV_B64_PSEUDO (f64 (bitcast_fpimm_to_i64 $imm)))
+>;
+
+def : GCNPat <
+ (i64 imm:$imm),
+ (S_MOV_B64_IMM_PSEUDO imm:$imm)
+>;
+
+def : GCNPat <
+ (f64 fpimm:$imm),
+ (S_MOV_B64_IMM_PSEUDO (i64 (bitcast_fpimm_to_i64 fpimm:$imm)))
+>;
+
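The bitcast_fpimm_to_i64 transform used in the patterns above reinterprets the FP immediate's bits as an integer payload for the move. A one-function C++ analogue, purely illustrative (the helper name is made up):

    #include <bit>
    #include <cstdint>

    // Conceptual analogue of bitcast_fpimm_to_i64: same bits, integer view.
    uint64_t fpImmToMovPayload(double Imm) {
      return std::bit_cast<uint64_t>(Imm);
    }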
def : GCNPat <
(f32 fpimm:$imm),
(S_MOV_B32 (f32 (bitcast_fpimm_to_i32 $imm)))
@@ -2306,8 +2423,16 @@ class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, S
(i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE))
>;
-def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
-def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
+let OtherPredicates = [NotHasTrue16BitInsts] in {
+ def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
+ def : FPToI1Pat<V_CMP_EQ_F16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
+} // end OtherPredicates = [NotHasTrue16BitInsts]
+
+let OtherPredicates = [HasTrue16BitInsts] in {
+ def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_ONE, i16, f16, fp_to_uint>;
+ def : FPToI1Pat<V_CMP_EQ_F16_t16_e64, CONST.FP16_NEG_ONE, i16, f16, fp_to_sint>;
+} // end OtherPredicates = [HasTrue16BitInsts]
+
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
@@ -2679,12 +2804,12 @@ def : GCNPat<
let OtherPredicates = [HasTrue16BitInsts] in {
def : GCNPat<
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
+ (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
>;
def : GCNPat<
(fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
- (V_MUL_F16_t16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
+ (V_MUL_F16_fake16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
>;
} // End OtherPredicates
@@ -2703,6 +2828,13 @@ def : GCNPat<
(V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src)
>;
+let SubtargetPredicate = HasPackedFP32Ops in {
+def : GCNPat<
+ (fcanonicalize (v2f32 (VOP3PMods v2f32:$src, i32:$src_mods))),
+ (V_PK_MUL_F32 0, CONST.FP32_ONE, $src_mods, $src)
+>;
+}
+
// TODO: Handle fneg like other types.
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
@@ -2734,7 +2866,7 @@ multiclass SelectCanonicalizeAsMax<
def : GCNPat<
(fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MAX_F16_t16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
+ (V_MAX_F16_fake16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
let OtherPredicates = !listconcat(f16_preds, [Has16BitInsts, HasTrue16BitInsts]);
}
@@ -3309,6 +3441,81 @@ defm : Int16Med3Pat<V_MED3_I16_e64, smin, smax>;
defm : Int16Med3Pat<V_MED3_U16_e64, umin, umax>;
} // End Predicates = [isGFX9Plus]
+let OtherPredicates = [isGFX12Plus] in {
+def : FPMinMaxPat<V_MINIMUMMAXIMUM_F32_e64, f32, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinMaxPat<V_MAXIMUMMINIMUM_F32_e64, f32, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+def : FPMinMaxPat<V_MINIMUMMAXIMUM_F16_e64, f16, DivergentBinFrag<fmaximum>, fminimum_oneuse>;
+def : FPMinMaxPat<V_MAXIMUMMINIMUM_F16_e64, f16, DivergentBinFrag<fminimum>, fmaximum_oneuse>;
+}
+
+// Convert a floating-point power of 2 to the integer exponent.
+def FPPow2ToExponentXForm : SDNodeXForm<fpimm, [{
+ const auto &APF = N->getValueAPF();
+ int Log2 = APF.getExactLog2Abs();
+ assert(Log2 != INT_MIN);
+ return CurDAG->getTargetConstant(Log2, SDLoc(N), MVT::i32);
+}]>;
+
+// Check if a floating-point value is a power-of-2 floating-point
+// immediate where it is preferable to emit the multiply as an
+// ldexp. We skip over 0.5 to 4.0 as those are inline immediates
+// anyway.
+def fpimm_pos_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
+ if (Imm.isNegative())
+ return false;
+
+ int Exp = Imm.getExactLog2Abs();
+ // Prefer leaving the FP inline immediates as they are.
+ // 0.5, 1.0, 2.0, 4.0
+
+ // For f64 ldexp is always better than materializing a 64-bit
+ // constant.
+ return Exp != INT_MIN && (Exp < -1 || Exp > 2);
+ }], FPPow2ToExponentXForm
+>;
+
+def fpimm_neg_pow2_prefer_ldexp_f64 : FPImmLeaf<f64, [{
+ if (!Imm.isNegative())
+ return false;
+ int Exp = Imm.getExactLog2Abs();
+ // Prefer leaving the FP inline immediates as they are.
+ // 0.5, 1.0, 2.0, 4.0
+
+ // For f64 ldexp is always better than materializing a 64-bit
+ // constant.
+ return Exp != INT_MIN && (Exp < -1 || Exp > 2);
+ }], FPPow2ToExponentXForm
+>;
+
+// f64 is different because we also want to handle cases that may
+// require materialization of the exponent.
+// TODO: If we know f64 ops are fast, prefer add (ldexp x, N), y over fma
+// TODO: For f32/f16, it's not a clear win on code size to use ldexp
+// in place of mul since we have to use the vop3 form. Are there power
+// savings or some other reason to prefer ldexp over mul?
+def : GCNPat<
+ (any_fmul (f64 (VOP3Mods f64:$src0, i32:$src0_mods)),
+ fpimm_pos_pow2_prefer_ldexp_f64:$src1),
+ (V_LDEXP_F64_e64 i32:$src0_mods, VSrc_b64:$src0,
+ 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
+>;
+
+def : GCNPat<
+ (any_fmul f64:$src0, fpimm_neg_pow2_prefer_ldexp_f64:$src1),
+ (V_LDEXP_F64_e64 SRCMODS.NEG, VSrc_b64:$src0,
+ 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
+>;
+
+// We want to avoid using VOP3Mods, which could pull in another fneg
+// that would then need to be re-negated (which should never happen in
+// practice). I don't see a way to apply an SDNodeXForm that accounts
+// for a second operand.
+def : GCNPat<
+ (any_fmul (fabs f64:$src0), fpimm_neg_pow2_prefer_ldexp_f64:$src1),
+ (V_LDEXP_F64_e64 SRCMODS.NEG_ABS, VSrc_b64:$src0,
+ 0, (S_MOV_B32 (i32 (FPPow2ToExponentXForm $src1))))
+>;
+
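To see why a multiply by a power of two can be rewritten as an ldexp, which is what the three patterns above do for f64, here is a small standalone numeric check; it is an illustration only, not part of the patch.

    #include <cassert>
    #include <cmath>

    int main() {
      double x = 1.2345;
      // 8.0 is 2^3, so multiplying by it only adds 3 to the exponent.
      assert(x * 8.0 == std::ldexp(x, 3));
      // Negative powers of two work the same way: 0.125 is 2^-3.
      assert(x * 0.125 == std::ldexp(x, -3));
      // The inline-immediate range 0.5..4.0 is deliberately left as a plain mul.
      return 0;
    }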
class AMDGPUGenericInstruction : GenericInstruction {
let Namespace = "AMDGPU";
}
@@ -3477,8 +3684,8 @@ def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
}
-class BufferAtomicGenericInstruction<bit NoRtn = 0> : AMDGPUGenericInstruction {
- let OutOperandList = !if(NoRtn, (outs), (outs type0:$dst));
+class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
type2:$soffset, untyped_imm_0:$offset,
untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index df522a9099c0..abb72e8e63c3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -30,6 +30,7 @@ private:
const SIInstrInfo *TII = nullptr;
MachineDominatorTree *MDT = nullptr;
+ void expandChainCall(MachineInstr &MI);
void earlyTerm(MachineInstr &MI, MachineBasicBlock *EarlyExitBlock);
public:
@@ -116,6 +117,18 @@ static void splitBlock(MachineBasicBlock &MBB, MachineInstr &MI,
MDT->getBase().applyUpdates(DTUpdates);
}
+void SILateBranchLowering::expandChainCall(MachineInstr &MI) {
+ // This is a tail call that needs to be expanded into at least
+ // 2 instructions, one for setting EXEC and one for the actual tail call.
+ constexpr unsigned ExecIdx = 3;
+
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(MovOpc), ExecReg)
+ ->addOperand(MI.getOperand(ExecIdx));
+ MI.removeOperand(ExecIdx);
+
+ MI.setDesc(TII->get(AMDGPU::SI_TCRETURN));
+}
+
void SILateBranchLowering::earlyTerm(MachineInstr &MI,
MachineBasicBlock *EarlyExitBlock) {
MachineBasicBlock &MBB = *MI.getParent();
@@ -158,6 +171,12 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
}
break;
+ case AMDGPU::SI_CS_CHAIN_TC_W32:
+ case AMDGPU::SI_CS_CHAIN_TC_W64:
+ expandChainCall(MI);
+ MadeChange = true;
+ break;
+
case AMDGPU::SI_EARLY_TERMINATE_SCC0:
EarlyTermInstrs.push_back(&MI);
break;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index c252d30e250e..9c85ff3c43e2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -161,8 +161,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
if (!AddrOp->isReg())
return false;
- // TODO: We should be able to merge physical reg addresses.
- if (AddrOp->getReg().isPhysical())
+ // TODO: We should be able to merge instructions with other physical reg
+ // addresses too.
+ if (AddrOp->getReg().isPhysical() &&
+ AddrOp->getReg() != AMDGPU::SGPR_NULL)
return false;
// If an address has only one use then there will be no other
@@ -320,7 +322,7 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
// FIXME: Handle d16 correctly
return AMDGPU::getMUBUFElements(Opc);
}
- if (TII.isMIMG(MI)) {
+ if (TII.isImage(MI)) {
uint64_t DMaskImm =
TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
return llvm::popcount(DMaskImm);
@@ -350,6 +352,9 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::FLAT_LOAD_DWORDX2:
case AMDGPU::FLAT_STORE_DWORDX2:
return 2;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::GLOBAL_LOAD_DWORDX3:
case AMDGPU::GLOBAL_LOAD_DWORDX3_SADDR:
case AMDGPU::GLOBAL_STORE_DWORDX3:
@@ -398,15 +403,23 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
+ case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN:
+ case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFEN_exact:
+ case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET:
+ case AMDGPU::BUFFER_LOAD_DWORD_VBUFFER_OFFSET_exact:
return BUFFER_LOAD;
case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+ case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN:
+ case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFEN_exact:
+ case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET:
+ case AMDGPU::BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact:
return BUFFER_STORE;
}
}
- if (TII.isMIMG(Opc)) {
+ if (TII.isImage(Opc)) {
// Ignore instructions encoded without vaddr.
if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr) &&
!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vaddr0))
@@ -424,35 +437,50 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
switch (AMDGPU::getMTBUFBaseOpcode(Opc)) {
default:
return UNKNOWN;
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN:
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
- case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
- case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
- case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
- case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_BOTHEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_IDXEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_VBUFFER_OFFSET_exact:
return TBUFFER_LOAD;
case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET:
case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFSET_exact:
+ case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN:
+ case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFEN_exact:
+ case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET:
+ case AMDGPU::TBUFFER_STORE_FORMAT_X_VBUFFER_OFFSET_exact:
return TBUFFER_STORE;
}
}
return UNKNOWN;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return S_BUFFER_LOAD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
return S_BUFFER_LOAD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
return S_LOAD_IMM;
@@ -505,7 +533,7 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
default:
if (TII.isMUBUF(Opc))
return AMDGPU::getMUBUFBaseOpcode(Opc);
- if (TII.isMIMG(Opc)) {
+ if (TII.isImage(Opc)) {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
assert(Info);
return Info->BaseOpcode;
@@ -524,16 +552,19 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
return Opc;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM;
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
return AMDGPU::S_LOAD_DWORD_IMM;
@@ -600,11 +631,13 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
return Result;
}
- if (TII.isMIMG(Opc)) {
+ if (TII.isImage(Opc)) {
int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
if (VAddr0Idx >= 0) {
- int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
- Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
+ int RsrcName =
+ TII.isMIMG(Opc) ? AMDGPU::OpName::srsrc : AMDGPU::OpName::rsrc;
+ int RsrcIdx = AMDGPU::getNamedOperandIdx(Opc, RsrcName);
+ Result.NumVAddrs = RsrcIdx - VAddr0Idx;
} else {
Result.VAddr = true;
}
@@ -631,16 +664,19 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
return Result;
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
Result.SOffset = true;
[[fallthrough]];
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_LOAD_DWORDX3_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
Result.SBase = true;
@@ -739,6 +775,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
}
AddressRegs Regs = getRegs(Opc, *LSO.TII);
+ bool isVIMAGEorVSAMPLE = LSO.TII->isVIMAGE(*I) || LSO.TII->isVSAMPLE(*I);
NumAddresses = 0;
for (unsigned J = 0; J < Regs.NumVAddrs; J++)
@@ -751,8 +788,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
AddrIdx[NumAddresses++] =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
if (Regs.SRsrc)
- AddrIdx[NumAddresses++] =
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
+ Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::rsrc : AMDGPU::OpName::srsrc);
if (Regs.SOffset)
AddrIdx[NumAddresses++] =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
@@ -763,8 +800,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
AddrIdx[NumAddresses++] =
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
if (Regs.SSamp)
- AddrIdx[NumAddresses++] =
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
+ AddrIdx[NumAddresses++] = AMDGPU::getNamedOperandIdx(
+ Opc, isVIMAGEorVSAMPLE ? AMDGPU::OpName::samp : AMDGPU::OpName::ssamp);
assert(NumAddresses <= MaxAddressRegs);
for (unsigned J = 0; J < NumAddresses; J++)
@@ -871,6 +908,9 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
unsigned MinMask = std::min(CI.DMask, Paired.DMask);
+ if (!MaxMask)
+ return false;
+
unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
if ((1u << AllowedBitsForMin) <= MinMask)
return false;
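The new !MaxMask guard protects the countr_zero-based check that follows. Restated as a standalone predicate (same rule as the surrounding function, assuming nothing else about dmasksCanBeCombined): the smaller dmask must fit entirely below the lowest set bit of the larger one, and an all-zero mask can never be merged.

    #include <algorithm>
    #include <bit>

    bool dmasksCombinable(unsigned DMask0, unsigned DMask1) {
      const unsigned MaxMask = std::max(DMask0, DMask1);
      const unsigned MinMask = std::min(DMask0, DMask1);
      if (!MaxMask) // an all-zero mask has no lowest set bit to anchor the check
        return false;
      const unsigned AllowedBitsForMin = std::countr_zero(MaxMask);
      return (1u << AllowedBitsForMin) > MinMask;
    }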
@@ -964,6 +1004,17 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
return false;
if (CI.CPol != Paired.CPol)
return false;
+ if (CI.InstClass == S_LOAD_IMM || CI.InstClass == S_BUFFER_LOAD_IMM ||
+ CI.InstClass == S_BUFFER_LOAD_SGPR_IMM) {
+ // Reject cases like:
+ // dword + dwordx2 -> dwordx3
+ // dword + dwordx3 -> dwordx4
+ // If we tried to combine these cases, we would fail to extract a subreg
+ // for the result of the second load due to SGPR alignment requirements.
+ if (CI.Width != Paired.Width &&
+ (CI.Width < Paired.Width) == (CI.Offset < Paired.Offset))
+ return false;
+ }
return true;
}
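The width/offset restriction added above can be read as a small predicate: when the widths differ, the wider scalar load must sit at the lower offset, otherwise the second load's result would need a subregister that violates SGPR alignment. A standalone sketch of just that rule:

    #include <cstdint>

    // Mirrors the rejection rule for merging scalar loads of unequal width:
    // reject when the narrower load is at the lower offset (e.g. dword at
    // offset 0 followed by dwordx2), because the merged register would not
    // have a properly aligned subregister for the wider load's result.
    bool widthsAndOffsetsMergeable(unsigned Width0, unsigned Width1,
                                   int64_t Offset0, int64_t Offset1) {
      if (Width0 == Width1)
        return true;
      return (Width0 < Width1) != (Offset0 < Offset1);
    }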
@@ -1043,6 +1094,8 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
case 4:
case 8:
return true;
+ case 3:
+ return STM.hasScalarDwordx3Loads();
}
}
}
@@ -1671,6 +1724,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return 0;
case 2:
return AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM;
+ case 3:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX3_IMM;
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
case 8:
@@ -1682,6 +1737,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return 0;
case 2:
return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+ case 3:
+ return AMDGPU::S_BUFFER_LOAD_DWORDX3_SGPR_IMM;
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
case 8:
@@ -1693,6 +1750,8 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
return 0;
case 2:
return AMDGPU::S_LOAD_DWORDX2_IMM;
+ case 3:
+ return AMDGPU::S_LOAD_DWORDX3_IMM;
case 4:
return AMDGPU::S_LOAD_DWORDX4_IMM;
case 8:
@@ -1814,6 +1873,8 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
return nullptr;
case 2:
return &AMDGPU::SReg_64_XEXECRegClass;
+ case 3:
+ return &AMDGPU::SGPR_96RegClass;
case 4:
return &AMDGPU::SGPR_128RegClass;
case 8:
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 00cb5b2878f4..f178324dbbe2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -79,6 +79,7 @@ private:
SetVector<MachineInstr*> LoweredEndCf;
DenseSet<Register> LoweredIf;
SmallSet<MachineBasicBlock *, 4> KillBlocks;
+ SmallSet<Register, 8> RecomputeRegs;
const TargetRegisterClass *BoolRC = nullptr;
unsigned AndOpc;
@@ -297,8 +298,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// FIXME: Is there a better way of adjusting the liveness? It shouldn't be
// hard to add another def here but I'm not sure how to correctly update the
// valno.
- LIS->removeInterval(SaveExecReg);
- LIS->createAndComputeVirtRegInterval(SaveExecReg);
+ RecomputeRegs.insert(SaveExecReg);
LIS->createAndComputeVirtRegInterval(Tmp);
if (!SimpleIf)
LIS->createAndComputeVirtRegInterval(CopyReg);
@@ -309,6 +309,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
const DebugLoc &DL = MI.getDebugLoc();
Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
MachineBasicBlock::iterator Start = MBB.begin();
@@ -319,7 +320,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
BuildMI(MBB, Start, DL, TII->get(OrSaveExecOpc), SaveReg)
.add(MI.getOperand(1)); // Saved EXEC
if (LV)
- LV->replaceKillInstruction(MI.getOperand(1).getReg(), MI, *OrSaveExec);
+ LV->replaceKillInstruction(SrcReg, MI, *OrSaveExec);
MachineBasicBlock *DestBB = MI.getOperand(2).getMBB();
@@ -331,9 +332,6 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
.addReg(Exec)
.addReg(SaveReg);
- if (LIS)
- LIS->InsertMachineInstrInMaps(*And);
-
MachineInstr *Xor =
BuildMI(MBB, ElsePt, DL, TII->get(XorTermrOpc), Exec)
.addReg(Exec)
@@ -356,12 +354,13 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
MI.eraseFromParent();
LIS->InsertMachineInstrInMaps(*OrSaveExec);
+ LIS->InsertMachineInstrInMaps(*And);
LIS->InsertMachineInstrInMaps(*Xor);
LIS->InsertMachineInstrInMaps(*Branch);
- LIS->removeInterval(DstReg);
- LIS->createAndComputeVirtRegInterval(DstReg);
+ RecomputeRegs.insert(SrcReg);
+ RecomputeRegs.insert(DstReg);
LIS->createAndComputeVirtRegInterval(SaveReg);
// Let this be recomputed.
@@ -388,8 +387,9 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
// AND the break condition operand with exec, then OR that into the "loop
// exit" mask.
MachineInstr *And = nullptr, *Or = nullptr;
+ Register AndReg;
if (!SkipAnding) {
- Register AndReg = MRI->createVirtualRegister(BoolRC);
+ AndReg = MRI->createVirtualRegister(BoolRC);
And = BuildMI(MBB, &MI, DL, TII->get(AndOpc), AndReg)
.addReg(Exec)
.add(MI.getOperand(1));
@@ -398,8 +398,6 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
.addReg(AndReg)
.add(MI.getOperand(2));
- if (LIS)
- LIS->createAndComputeVirtRegInterval(AndReg);
} else {
Or = BuildMI(MBB, &MI, DL, TII->get(OrOpc), Dst)
.add(MI.getOperand(1))
@@ -411,9 +409,13 @@ void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
LV->replaceKillInstruction(MI.getOperand(2).getReg(), MI, *Or);
if (LIS) {
- if (And)
- LIS->InsertMachineInstrInMaps(*And);
LIS->ReplaceMachineInstrInMaps(MI, *Or);
+ if (And) {
+ // The read of original operand 1 is now on And, not Or.
+ RecomputeRegs.insert(And->getOperand(2).getReg());
+ LIS->InsertMachineInstrInMaps(*And);
+ LIS->createAndComputeVirtRegInterval(AndReg);
+ }
}
MI.eraseFromParent();
@@ -436,6 +438,7 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
.add(MI.getOperand(1));
if (LIS) {
+ RecomputeRegs.insert(MI.getOperand(0).getReg());
LIS->ReplaceMachineInstrInMaps(MI, *AndN2);
LIS->InsertMachineInstrInMaps(*Branch);
}
@@ -714,11 +717,13 @@ void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
if (MI.getOpcode() == AMDGPU::SI_INIT_EXEC) {
// This should be before all vector instructions.
- BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
+ MachineInstr *InitMI = BuildMI(*MBB, MBB->begin(), MI.getDebugLoc(),
TII->get(IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64), Exec)
.addImm(MI.getOperand(0).getImm());
- if (LIS)
+ if (LIS) {
LIS->RemoveMachineInstrFromMaps(MI);
+ LIS->InsertMachineInstrInMaps(*InitMI);
+ }
MI.eraseFromParent();
return;
}
@@ -789,8 +794,7 @@ void SILowerControlFlow::lowerInitExec(MachineBasicBlock *MBB,
LIS->InsertMachineInstrInMaps(*CmpMI);
LIS->InsertMachineInstrInMaps(*CmovMI);
- LIS->removeInterval(InputReg);
- LIS->createAndComputeVirtRegInterval(InputReg);
+ RecomputeRegs.insert(InputReg);
LIS->createAndComputeVirtRegInterval(CountReg);
}
@@ -807,7 +811,7 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
while (!MBB.predecessors().empty()) {
MachineBasicBlock *P = *MBB.pred_begin();
- if (P->getFallThrough() == &MBB)
+ if (P->getFallThrough(false) == &MBB)
FallThrough = P;
P->ReplaceUsesOfBlockWith(&MBB, Succ);
}
@@ -828,14 +832,13 @@ bool SILowerControlFlow::removeMBBifRedundant(MachineBasicBlock &MBB) {
MBB.clear();
MBB.eraseFromParent();
if (FallThrough && !FallThrough->isLayoutSuccessor(Succ)) {
- if (!Succ->canFallThrough()) {
- MachineFunction *MF = FallThrough->getParent();
- MachineFunction::iterator FallThroughPos(FallThrough);
- MF->splice(std::next(FallThroughPos), Succ);
- } else
- BuildMI(*FallThrough, FallThrough->end(),
- FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH))
- .addMBB(Succ);
+ // Note: we cannot update block layout and preserve live intervals;
+ // hence we must insert a branch.
+ MachineInstr *BranchMI = BuildMI(*FallThrough, FallThrough->end(),
+ FallThrough->findBranchDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(Succ);
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*BranchMI);
}
return true;
@@ -845,8 +848,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
- EnableOptimizeEndCf =
- RemoveRedundantEndcf && MF.getTarget().getOptLevel() > CodeGenOpt::None;
+ EnableOptimizeEndCf = RemoveRedundantEndcf &&
+ MF.getTarget().getOptLevel() > CodeGenOptLevel::None;
// This doesn't actually need LiveIntervals, but we can preserve them.
LIS = getAnalysisIfAvailable<LiveIntervals>();
@@ -947,6 +950,14 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
optimizeEndCf();
+ if (LIS) {
+ for (Register Reg : RecomputeRegs) {
+ LIS->removeInterval(Reg);
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ }
+
+ RecomputeRegs.clear();
LoweredEndCf.clear();
LoweredIf.clear();
KillBlocks.clear();
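
The SILowerControlFlow hunks above replace on-the-spot removeInterval/createAndComputeVirtRegInterval calls with a pass-wide RecomputeRegs set that is drained once at the end of runOnMachineFunction. The standalone C++ sketch below illustrates that deferred-recompute pattern; the toy names (ToyLiveIntervals, Reg) are assumptions for illustration only and are not LLVM API or part of the patch.

#include <cstdio>
#include <set>

using Reg = unsigned;

struct ToyLiveIntervals {
  void removeInterval(Reg R) { std::printf("remove interval of %u\n", R); }
  void createAndComputeVirtRegInterval(Reg R) {
    std::printf("recompute interval of %u\n", R);
  }
};

int main() {
  ToyLiveIntervals LIS;
  std::set<Reg> RecomputeRegs;

  // Lowering sites only record which registers had their liveness disturbed.
  RecomputeRegs.insert(5);
  RecomputeRegs.insert(7);
  RecomputeRegs.insert(5); // duplicates are coalesced by the set

  // Single flush at the end of the pass, mirroring runOnMachineFunction above.
  for (Reg R : RecomputeRegs) {
    LIS.removeInterval(R);
    LIS.createAndComputeVirtRegInterval(R);
  }
  RecomputeRegs.clear();
  return 0;
}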
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index d4f0906f020a..cfa0c21def79 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -21,21 +21,19 @@
//
//===----------------------------------------------------------------------===//
+#include "SILowerI1Copies.h"
#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineSSAUpdater.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Target/CGPassBuilderOption.h"
#define DEBUG_TYPE "si-i1-copies"
using namespace llvm;
-static unsigned createLaneMaskReg(MachineFunction &MF);
-static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);
+static Register insertUndefLaneMask(MachineBasicBlock *MBB,
+ MachineRegisterInfo *MRI,
+ Register LaneMaskRegAttrs);
namespace {
@@ -43,26 +41,6 @@ class SILowerI1Copies : public MachineFunctionPass {
public:
static char ID;
-private:
- bool IsWave32 = false;
- MachineFunction *MF = nullptr;
- MachineDominatorTree *DT = nullptr;
- MachinePostDominatorTree *PDT = nullptr;
- MachineRegisterInfo *MRI = nullptr;
- const GCNSubtarget *ST = nullptr;
- const SIInstrInfo *TII = nullptr;
-
- unsigned ExecReg;
- unsigned MovOp;
- unsigned AndOp;
- unsigned OrOp;
- unsigned XorOp;
- unsigned AndN2Op;
- unsigned OrN2Op;
-
- DenseSet<unsigned> ConstrainRegs;
-
-public:
SILowerI1Copies() : MachineFunctionPass(ID) {
initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
}
@@ -77,29 +55,53 @@ public:
AU.addRequired<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
+};
+
+class Vreg1LoweringHelper : public PhiLoweringHelper {
+public:
+ Vreg1LoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT);
private:
- bool lowerCopiesFromI1();
- bool lowerPhis();
- bool lowerCopiesToI1();
- bool isConstantLaneMask(Register Reg, bool &Val) const;
+ DenseSet<Register> ConstrainRegs;
+
+public:
+ void markAsLaneMask(Register DstReg) const override;
+ void getCandidatesForLowering(
+ SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override;
+ void collectIncomingValuesFromPhi(
+ const MachineInstr *MI,
+ SmallVectorImpl<Incoming> &Incomings) const override;
+ void replaceDstReg(Register NewReg, Register OldReg,
+ MachineBasicBlock *MBB) override;
void buildMergeLaneMasks(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned DstReg, unsigned PrevReg, unsigned CurReg);
- MachineBasicBlock::iterator
- getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+ Register DstReg, Register PrevReg,
+ Register CurReg) override;
+ void constrainIncomingRegisterTakenAsIs(Incoming &In) override;
+ bool lowerCopiesFromI1();
+ bool lowerCopiesToI1();
+ bool cleanConstrainRegs(bool Changed);
bool isVreg1(Register Reg) const {
return Reg.isVirtual() && MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
}
-
- bool isLaneMaskReg(unsigned Reg) const {
- return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
- TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
- ST->getWavefrontSize();
- }
};
+Vreg1LoweringHelper::Vreg1LoweringHelper(MachineFunction *MF,
+ MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT)
+ : PhiLoweringHelper(MF, DT, PDT) {}
+
+bool Vreg1LoweringHelper::cleanConstrainRegs(bool Changed) {
+ assert(Changed || ConstrainRegs.empty());
+ for (Register Reg : ConstrainRegs)
+ MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
+ ConstrainRegs.clear();
+
+ return Changed;
+}
+
/// Helper class that determines the relationship between incoming values of a
/// phi in the control flow graph to determine where an incoming value can
/// simply be taken as a scalar lane mask as-is, and where it needs to be
@@ -145,8 +147,7 @@ public:
ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }
- void analyze(MachineBasicBlock &DefBlock,
- ArrayRef<MachineBasicBlock *> IncomingBlocks) {
+ void analyze(MachineBasicBlock &DefBlock, ArrayRef<Incoming> Incomings) {
assert(Stack.empty());
ReachableMap.clear();
ReachableOrdered.clear();
@@ -157,7 +158,8 @@ public:
ReachableMap.try_emplace(&DefBlock, false);
ReachableOrdered.push_back(&DefBlock);
- for (MachineBasicBlock *MBB : IncomingBlocks) {
+ for (auto Incoming : Incomings) {
+ MachineBasicBlock *MBB = Incoming.Block;
if (MBB == &DefBlock) {
ReachableMap[&DefBlock] = true; // self-loop on DefBlock
continue;
@@ -302,34 +304,38 @@ public:
/// blocks, so that the SSA updater doesn't have to search all the way to the
/// function entry.
void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
- ArrayRef<MachineBasicBlock *> Blocks = {}) {
+ MachineRegisterInfo &MRI, Register LaneMaskRegAttrs,
+ ArrayRef<Incoming> Incomings = {}) {
assert(LoopLevel < CommonDominators.size());
MachineBasicBlock *Dom = CommonDominators[LoopLevel];
- for (MachineBasicBlock *MBB : Blocks)
- Dom = DT.findNearestCommonDominator(Dom, MBB);
+ for (auto &Incoming : Incomings)
+ Dom = DT.findNearestCommonDominator(Dom, Incoming.Block);
- if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
- SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
+ if (!inLoopLevel(*Dom, LoopLevel, Incomings)) {
+ SSAUpdater.AddAvailableValue(
+ Dom, insertUndefLaneMask(Dom, &MRI, LaneMaskRegAttrs));
} else {
// The dominator is part of the loop or the given blocks, so add the
// undef value to unreachable predecessors instead.
for (MachineBasicBlock *Pred : Dom->predecessors()) {
- if (!inLoopLevel(*Pred, LoopLevel, Blocks))
- SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
+ if (!inLoopLevel(*Pred, LoopLevel, Incomings))
+ SSAUpdater.AddAvailableValue(
+ Pred, insertUndefLaneMask(Pred, &MRI, LaneMaskRegAttrs));
}
}
}
private:
bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
- ArrayRef<MachineBasicBlock *> Blocks) const {
+ ArrayRef<Incoming> Incomings) const {
auto DomIt = Visited.find(&MBB);
if (DomIt != Visited.end() && DomIt->second <= LoopLevel)
return true;
- if (llvm::is_contained(Blocks, &MBB))
- return true;
+ for (auto &Incoming : Incomings)
+ if (Incoming.Block == &MBB)
+ return true;
return false;
}
@@ -405,19 +411,19 @@ FunctionPass *llvm::createSILowerI1CopiesPass() {
return new SILowerI1Copies();
}
-static unsigned createLaneMaskReg(MachineFunction &MF) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- return MRI.createVirtualRegister(ST.isWave32() ? &AMDGPU::SReg_32RegClass
- : &AMDGPU::SReg_64RegClass);
+Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI,
+ Register LaneMaskRegAttrs) {
+ return MRI->cloneVirtualRegister(LaneMaskRegAttrs);
}
-static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
- MachineFunction &MF = *MBB.getParent();
+static Register insertUndefLaneMask(MachineBasicBlock *MBB,
+ MachineRegisterInfo *MRI,
+ Register LaneMaskRegAttrs) {
+ MachineFunction &MF = *MBB->getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- unsigned UndefReg = createLaneMaskReg(MF);
- BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
+ Register UndefReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+ BuildMI(*MBB, MBB->getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
UndefReg);
return UndefReg;
}
@@ -434,47 +440,17 @@ static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
// Only need to run this in SelectionDAG path.
if (TheMF.getProperties().hasProperty(
- MachineFunctionProperties::Property::Selected))
+ MachineFunctionProperties::Property::Selected))
return false;
- MF = &TheMF;
- MRI = &MF->getRegInfo();
- DT = &getAnalysis<MachineDominatorTree>();
- PDT = &getAnalysis<MachinePostDominatorTree>();
-
- ST = &MF->getSubtarget<GCNSubtarget>();
- TII = ST->getInstrInfo();
- IsWave32 = ST->isWave32();
-
- if (IsWave32) {
- ExecReg = AMDGPU::EXEC_LO;
- MovOp = AMDGPU::S_MOV_B32;
- AndOp = AMDGPU::S_AND_B32;
- OrOp = AMDGPU::S_OR_B32;
- XorOp = AMDGPU::S_XOR_B32;
- AndN2Op = AMDGPU::S_ANDN2_B32;
- OrN2Op = AMDGPU::S_ORN2_B32;
- } else {
- ExecReg = AMDGPU::EXEC;
- MovOp = AMDGPU::S_MOV_B64;
- AndOp = AMDGPU::S_AND_B64;
- OrOp = AMDGPU::S_OR_B64;
- XorOp = AMDGPU::S_XOR_B64;
- AndN2Op = AMDGPU::S_ANDN2_B64;
- OrN2Op = AMDGPU::S_ORN2_B64;
- }
+ Vreg1LoweringHelper Helper(&TheMF, &getAnalysis<MachineDominatorTree>(),
+ &getAnalysis<MachinePostDominatorTree>());
bool Changed = false;
- Changed |= lowerCopiesFromI1();
- Changed |= lowerPhis();
- Changed |= lowerCopiesToI1();
-
- assert(Changed || ConstrainRegs.empty());
- for (unsigned Reg : ConstrainRegs)
- MRI->constrainRegClass(Reg, &AMDGPU::SReg_1_XEXECRegClass);
- ConstrainRegs.clear();
-
- return Changed;
+ Changed |= Helper.lowerCopiesFromI1();
+ Changed |= Helper.lowerPhis();
+ Changed |= Helper.lowerCopiesToI1();
+ return Helper.cleanConstrainRegs(Changed);
}
#ifndef NDEBUG
@@ -486,7 +462,7 @@ static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
}
#endif
-bool SILowerI1Copies::lowerCopiesFromI1() {
+bool Vreg1LoweringHelper::lowerCopiesFromI1() {
bool Changed = false;
SmallVector<MachineInstr *, 4> DeadCopies;
@@ -529,27 +505,47 @@ bool SILowerI1Copies::lowerCopiesFromI1() {
return Changed;
}
-bool SILowerI1Copies::lowerPhis() {
+PhiLoweringHelper::PhiLoweringHelper(MachineFunction *MF,
+ MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT)
+ : MF(MF), DT(DT), PDT(PDT) {
+ MRI = &MF->getRegInfo();
+
+ ST = &MF->getSubtarget<GCNSubtarget>();
+ TII = ST->getInstrInfo();
+ IsWave32 = ST->isWave32();
+
+ if (IsWave32) {
+ ExecReg = AMDGPU::EXEC_LO;
+ MovOp = AMDGPU::S_MOV_B32;
+ AndOp = AMDGPU::S_AND_B32;
+ OrOp = AMDGPU::S_OR_B32;
+ XorOp = AMDGPU::S_XOR_B32;
+ AndN2Op = AMDGPU::S_ANDN2_B32;
+ OrN2Op = AMDGPU::S_ORN2_B32;
+ } else {
+ ExecReg = AMDGPU::EXEC;
+ MovOp = AMDGPU::S_MOV_B64;
+ AndOp = AMDGPU::S_AND_B64;
+ OrOp = AMDGPU::S_OR_B64;
+ XorOp = AMDGPU::S_XOR_B64;
+ AndN2Op = AMDGPU::S_ANDN2_B64;
+ OrN2Op = AMDGPU::S_ORN2_B64;
+ }
+}
+
+bool PhiLoweringHelper::lowerPhis() {
MachineSSAUpdater SSAUpdater(*MF);
LoopFinder LF(*DT, *PDT);
PhiIncomingAnalysis PIA(*PDT, TII);
SmallVector<MachineInstr *, 4> Vreg1Phis;
- SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
- SmallVector<unsigned, 4> IncomingRegs;
- SmallVector<unsigned, 4> IncomingUpdated;
-#ifndef NDEBUG
- DenseSet<unsigned> PhiRegisters;
-#endif
+ SmallVector<Incoming, 4> Incomings;
- for (MachineBasicBlock &MBB : *MF) {
- for (MachineInstr &MI : MBB.phis()) {
- if (isVreg1(MI.getOperand(0).getReg()))
- Vreg1Phis.push_back(&MI);
- }
- }
+ getCandidatesForLowering(Vreg1Phis);
if (Vreg1Phis.empty())
return false;
+ DT->getBase().updateDFSNumbers();
MachineBasicBlock *PrevMBB = nullptr;
for (MachineInstr *MI : Vreg1Phis) {
MachineBasicBlock &MBB = *MI->getParent();
@@ -561,29 +557,19 @@ bool SILowerI1Copies::lowerPhis() {
LLVM_DEBUG(dbgs() << "Lower PHI: " << *MI);
Register DstReg = MI->getOperand(0).getReg();
- MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
- : &AMDGPU::SReg_64RegClass);
-
- // Collect incoming values.
- for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
- assert(i + 1 < MI->getNumOperands());
- Register IncomingReg = MI->getOperand(i).getReg();
- MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
- MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
-
- if (IncomingDef->getOpcode() == AMDGPU::COPY) {
- IncomingReg = IncomingDef->getOperand(1).getReg();
- assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
- assert(!IncomingDef->getOperand(1).getSubReg());
- } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
- continue;
- } else {
- assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
- }
+ markAsLaneMask(DstReg);
+ initializeLaneMaskRegisterAttributes(DstReg);
- IncomingBlocks.push_back(IncomingMBB);
- IncomingRegs.push_back(IncomingReg);
- }
+ collectIncomingValuesFromPhi(MI, Incomings);
+
+ // Sort the incomings such that incoming values that dominate other incoming
+ // values are sorted earlier. This allows us to do some amount of on-the-fly
+ // constant folding.
+ // An incoming with a smaller DFSNumIn goes first; DFSNumIn is 0 for the entry block.
+ llvm::sort(Incomings, [this](Incoming LHS, Incoming RHS) {
+ return DT->getNode(LHS.Block)->getDFSNumIn() <
+ DT->getNode(RHS.Block)->getDFSNumIn();
+ });
#ifndef NDEBUG
PhiRegisters.insert(DstReg);
@@ -607,64 +593,63 @@ bool SILowerI1Copies::lowerPhis() {
SSAUpdater.Initialize(DstReg);
if (FoundLoopLevel) {
- LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
+ LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs,
+ Incomings);
- for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
- IncomingUpdated.push_back(createLaneMaskReg(*MF));
- SSAUpdater.AddAvailableValue(IncomingBlocks[i],
- IncomingUpdated.back());
+ for (auto &Incoming : Incomings) {
+ Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+ SSAUpdater.AddAvailableValue(Incoming.Block, Incoming.UpdatedReg);
}
- for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
- MachineBasicBlock &IMBB = *IncomingBlocks[i];
+ for (auto &Incoming : Incomings) {
+ MachineBasicBlock &IMBB = *Incoming.Block;
buildMergeLaneMasks(
- IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
- SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+ IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
+ SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
}
} else {
// The phi is not observed from outside a loop. Use a more accurate
// lowering.
- PIA.analyze(MBB, IncomingBlocks);
+ PIA.analyze(MBB, Incomings);
for (MachineBasicBlock *MBB : PIA.predecessors())
- SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));
+ SSAUpdater.AddAvailableValue(
+ MBB, insertUndefLaneMask(MBB, MRI, LaneMaskRegAttrs));
- for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
- MachineBasicBlock &IMBB = *IncomingBlocks[i];
+ for (auto &Incoming : Incomings) {
+ MachineBasicBlock &IMBB = *Incoming.Block;
if (PIA.isSource(IMBB)) {
- IncomingUpdated.push_back(0);
- SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
+ constrainIncomingRegisterTakenAsIs(Incoming);
+ SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg);
} else {
- IncomingUpdated.push_back(createLaneMaskReg(*MF));
- SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
+ Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
+ SSAUpdater.AddAvailableValue(&IMBB, Incoming.UpdatedReg);
}
}
- for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
- if (!IncomingUpdated[i])
+ for (auto &Incoming : Incomings) {
+ if (!Incoming.UpdatedReg.isValid())
continue;
- MachineBasicBlock &IMBB = *IncomingBlocks[i];
+ MachineBasicBlock &IMBB = *Incoming.Block;
buildMergeLaneMasks(
- IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
- SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
+ IMBB, getSaluInsertionAtEnd(IMBB), {}, Incoming.UpdatedReg,
+ SSAUpdater.GetValueInMiddleOfBlock(&IMBB), Incoming.Reg);
}
}
Register NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
if (NewReg != DstReg) {
- MRI->replaceRegWith(NewReg, DstReg);
+ replaceDstReg(NewReg, DstReg, &MBB);
MI->eraseFromParent();
}
- IncomingBlocks.clear();
- IncomingRegs.clear();
- IncomingUpdated.clear();
+ Incomings.clear();
}
return true;
}
-bool SILowerI1Copies::lowerCopiesToI1() {
+bool Vreg1LoweringHelper::lowerCopiesToI1() {
bool Changed = false;
MachineSSAUpdater SSAUpdater(*MF);
LoopFinder LF(*DT, *PDT);
@@ -691,8 +676,9 @@ bool SILowerI1Copies::lowerCopiesToI1() {
LLVM_DEBUG(dbgs() << "Lower Other: " << MI);
- MRI->setRegClass(DstReg, IsWave32 ? &AMDGPU::SReg_32RegClass
- : &AMDGPU::SReg_64RegClass);
+ markAsLaneMask(DstReg);
+ initializeLaneMaskRegisterAttributes(DstReg);
+
if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
continue;
@@ -702,12 +688,15 @@ bool SILowerI1Copies::lowerCopiesToI1() {
if (!SrcReg.isVirtual() || (!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
- unsigned TmpReg = createLaneMaskReg(*MF);
+ Register TmpReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
.addReg(SrcReg)
.addImm(0);
MI.getOperand(1).setReg(TmpReg);
SrcReg = TmpReg;
+ } else {
+ // SrcReg needs to be live beyond copy.
+ MI.getOperand(1).setIsKill(false);
}
// Defs in a loop that are observed outside the loop must be transformed
@@ -722,7 +711,7 @@ bool SILowerI1Copies::lowerCopiesToI1() {
if (FoundLoopLevel) {
SSAUpdater.Initialize(DstReg);
SSAUpdater.AddAvailableValue(&MBB, DstReg);
- LF.addLoopEntries(FoundLoopLevel, SSAUpdater);
+ LF.addLoopEntries(FoundLoopLevel, SSAUpdater, *MRI, LaneMaskRegAttrs);
buildMergeLaneMasks(MBB, MI, DL, DstReg,
SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
@@ -737,7 +726,7 @@ bool SILowerI1Copies::lowerCopiesToI1() {
return Changed;
}
-bool SILowerI1Copies::isConstantLaneMask(Register Reg, bool &Val) const {
+bool PhiLoweringHelper::isConstantLaneMask(Register Reg, bool &Val) const {
const MachineInstr *MI;
for (;;) {
MI = MRI->getUniqueVRegDef(Reg);
@@ -790,7 +779,7 @@ static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
/// Return a point at the end of the given \p MBB to insert SALU instructions
/// for lane mask calculation. Take terminators and SCC into account.
MachineBasicBlock::iterator
-SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
+PhiLoweringHelper::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
auto InsertionPt = MBB.getFirstTerminator();
bool TerminatorsUseSCC = false;
for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
@@ -816,10 +805,53 @@ SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
llvm_unreachable("SCC used by terminator but no def in block");
}
-void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DstReg,
- unsigned PrevReg, unsigned CurReg) {
+// VReg_1 -> SReg_32 or SReg_64
+void Vreg1LoweringHelper::markAsLaneMask(Register DstReg) const {
+ MRI->setRegClass(DstReg, ST->getBoolRC());
+}
+
+void Vreg1LoweringHelper::getCandidatesForLowering(
+ SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB.phis()) {
+ if (isVreg1(MI.getOperand(0).getReg()))
+ Vreg1Phis.push_back(&MI);
+ }
+ }
+}
+
+void Vreg1LoweringHelper::collectIncomingValuesFromPhi(
+ const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
+ for (unsigned i = 1; i < MI->getNumOperands(); i += 2) {
+ assert(i + 1 < MI->getNumOperands());
+ Register IncomingReg = MI->getOperand(i).getReg();
+ MachineBasicBlock *IncomingMBB = MI->getOperand(i + 1).getMBB();
+ MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
+
+ if (IncomingDef->getOpcode() == AMDGPU::COPY) {
+ IncomingReg = IncomingDef->getOperand(1).getReg();
+ assert(isLaneMaskReg(IncomingReg) || isVreg1(IncomingReg));
+ assert(!IncomingDef->getOperand(1).getSubReg());
+ } else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
+ continue;
+ } else {
+ assert(IncomingDef->isPHI() || PhiRegisters.count(IncomingReg));
+ }
+
+ Incomings.emplace_back(IncomingReg, IncomingMBB, Register());
+ }
+}
+
+void Vreg1LoweringHelper::replaceDstReg(Register NewReg, Register OldReg,
+ MachineBasicBlock *MBB) {
+ MRI->replaceRegWith(NewReg, OldReg);
+}
+
+void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register DstReg, Register PrevReg,
+ Register CurReg) {
bool PrevVal = false;
bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
bool CurVal = false;
@@ -838,13 +870,13 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
return;
}
- unsigned PrevMaskedReg = 0;
- unsigned CurMaskedReg = 0;
+ Register PrevMaskedReg;
+ Register CurMaskedReg;
if (!PrevConstant) {
if (CurConstant && CurVal) {
PrevMaskedReg = PrevReg;
} else {
- PrevMaskedReg = createLaneMaskReg(*MF);
+ PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
BuildMI(MBB, I, DL, TII->get(AndN2Op), PrevMaskedReg)
.addReg(PrevReg)
.addReg(ExecReg);
@@ -855,7 +887,7 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
if (PrevConstant && PrevVal) {
CurMaskedReg = CurReg;
} else {
- CurMaskedReg = createLaneMaskReg(*MF);
+ CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs);
BuildMI(MBB, I, DL, TII->get(AndOp), CurMaskedReg)
.addReg(CurReg)
.addReg(ExecReg);
@@ -878,3 +910,7 @@ void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
.addReg(CurMaskedReg ? CurMaskedReg : ExecReg);
}
}
+
+void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) {
+ return;
+}
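
The SILowerI1Copies changes above keep the core lane-mask merge that buildMergeLaneMasks emits with S_ANDN2/S_AND/S_OR: lanes active in EXEC take the current value, inactive lanes keep the previous one. A minimal standalone sketch of that bit math follows, with plain uint64_t values standing in for wave64 lane-mask registers; it is an illustration of the formula, not the pass itself.

#include <cassert>
#include <cstdint>

// Dst = (Prev & ~Exec) | (Cur & Exec), i.e. S_ANDN2 + S_AND + S_OR.
static uint64_t mergeLaneMasks(uint64_t Prev, uint64_t Cur, uint64_t Exec) {
  uint64_t PrevMasked = Prev & ~Exec; // keep lanes that are inactive now
  uint64_t CurMasked = Cur & Exec;    // take lanes that are active now
  return PrevMasked | CurMasked;
}

int main() {
  uint64_t Exec = 0x00000000FFFFFFFFULL; // lower half of the wave is active
  uint64_t Prev = 0xAAAAAAAAAAAAAAAAULL;
  uint64_t Cur  = 0x5555555555555555ULL;
  assert(mergeLaneMasks(Prev, Cur, Exec) == 0xAAAAAAAA55555555ULL);
  return 0;
}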
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
new file mode 100644
index 000000000000..5099d39c2d14
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerI1Copies.h
@@ -0,0 +1,97 @@
+//===-- SILowerI1Copies.h --------------------------------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Interface definition of the PhiLoweringHelper class that implements the
+/// lane mask merging algorithm for divergent i1 phis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+
+namespace llvm {
+
+/// Incoming value for a lane mask phi: the incoming register \p Reg and the
+/// incoming block \p Block are taken from the machine instruction.
+/// \p UpdatedReg (if valid) is \p Reg lane mask merged with another lane mask.
+struct Incoming {
+ Register Reg;
+ MachineBasicBlock *Block;
+ Register UpdatedReg;
+
+ Incoming(Register Reg, MachineBasicBlock *Block, Register UpdatedReg)
+ : Reg(Reg), Block(Block), UpdatedReg(UpdatedReg) {}
+};
+
+Register createLaneMaskReg(MachineRegisterInfo *MRI, Register LaneMaskRegAttrs);
+
+class PhiLoweringHelper {
+public:
+ PhiLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT,
+ MachinePostDominatorTree *PDT);
+ virtual ~PhiLoweringHelper() = default;
+
+protected:
+ bool IsWave32 = false;
+ MachineFunction *MF = nullptr;
+ MachineDominatorTree *DT = nullptr;
+ MachinePostDominatorTree *PDT = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ const GCNSubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ Register LaneMaskRegAttrs;
+
+#ifndef NDEBUG
+ DenseSet<Register> PhiRegisters;
+#endif
+
+ Register ExecReg;
+ unsigned MovOp;
+ unsigned AndOp;
+ unsigned OrOp;
+ unsigned XorOp;
+ unsigned AndN2Op;
+ unsigned OrN2Op;
+
+public:
+ bool lowerPhis();
+ bool isConstantLaneMask(Register Reg, bool &Val) const;
+ MachineBasicBlock::iterator
+ getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
+
+ void initializeLaneMaskRegisterAttributes(Register LaneMask) {
+ LaneMaskRegAttrs = LaneMask;
+ }
+
+ bool isLaneMaskReg(Register Reg) const {
+ return TII->getRegisterInfo().isSGPRReg(*MRI, Reg) &&
+ TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) ==
+ ST->getWavefrontSize();
+ }
+
+ // Helpers from lowerPhis that are different between sdag and global-isel.
+
+ virtual void markAsLaneMask(Register DstReg) const = 0;
+ virtual void getCandidatesForLowering(
+ SmallVectorImpl<MachineInstr *> &Vreg1Phis) const = 0;
+ virtual void
+ collectIncomingValuesFromPhi(const MachineInstr *MI,
+ SmallVectorImpl<Incoming> &Incomings) const = 0;
+ virtual void replaceDstReg(Register NewReg, Register OldReg,
+ MachineBasicBlock *MBB) = 0;
+ virtual void buildMergeLaneMasks(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, Register DstReg,
+ Register PrevReg, Register CurReg) = 0;
+ virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0;
+};
+
+} // end namespace llvm
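
The new SILowerI1Copies.h above splits the pass into an abstract PhiLoweringHelper that drives lowerPhis() and subclasses that supply the target-flavour hooks (Vreg1LoweringHelper in the SelectionDAG path). The toy sketch below mirrors only that template-method shape; ToyPhi, ToyPhiLoweringHelper and ToyVreg1Helper are made-up illustrative types, not the LLVM classes.

#include <cstdio>
#include <vector>

struct ToyPhi { int Id; };

class ToyPhiLoweringHelper {
public:
  virtual ~ToyPhiLoweringHelper() = default;

  // Shared driver, analogous to PhiLoweringHelper::lowerPhis().
  bool lowerPhis() {
    std::vector<ToyPhi> Phis;
    getCandidatesForLowering(Phis); // hook: which phis are lane-mask phis
    for (const ToyPhi &P : Phis)
      markAsLaneMask(P);            // hook: retarget the result register
    return !Phis.empty();
  }

protected:
  virtual void getCandidatesForLowering(std::vector<ToyPhi> &Phis) const = 0;
  virtual void markAsLaneMask(const ToyPhi &P) const = 0;
};

class ToyVreg1Helper : public ToyPhiLoweringHelper {
  void getCandidatesForLowering(std::vector<ToyPhi> &Phis) const override {
    Phis.push_back({1}); // pretend one vreg_1 phi was found
  }
  void markAsLaneMask(const ToyPhi &P) const override {
    std::printf("phi %d now uses the bool register class\n", P.Id);
  }
};

int main() {
  ToyVreg1Helper Helper;
  return Helper.lowerPhis() ? 0 : 1;
}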
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index d21107c02ef7..0ba7792ac436 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -50,7 +50,9 @@ public:
SILowerSGPRSpills() : MachineFunctionPass(ID) {}
void calculateSaveRestoreBlocks(MachineFunction &MF);
- bool spillCalleeSavedRegs(MachineFunction &MF);
+ bool spillCalleeSavedRegs(MachineFunction &MF,
+ SmallVectorImpl<int> &CalleeSavedFIs);
+ void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -58,6 +60,13 @@ public:
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+ MachineFunctionProperties getClearedProperties() const override {
+ // SILowerSGPRSpills introduces new virtual VGPRs for spilling SGPRs.
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA)
+ .set(MachineFunctionProperties::Property::NoVRegs);
+ }
};
} // end anonymous namespace
@@ -197,7 +206,8 @@ static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
EntryBB.sortUniqueLiveIns();
}
-bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
+bool SILowerSGPRSpills::spillCalleeSavedRegs(
+ MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -228,6 +238,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
TRI->getSpillAlign(*RC), true);
CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
+ CalleeSavedFIs.push_back(JunkFI);
}
}
@@ -248,6 +259,52 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
return false;
}
+void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
+ LiveIntervals *LIS) {
+ // TODO: This is a workaround to avoid the unmodelled liveness computed with
+ // whole-wave virtual registers when allocated together with the regular VGPR
+ // virtual registers. Presently, the liveness computed during the regalloc is
+ // only uniform (or single-lane aware) and it doesn't take into account the
+ // divergent control flow that exists for our GPUs. Since the WWM registers
+ // can modify inactive lanes, the wave-aware liveness should be computed for
+ // the virtual registers to accurately plot their interferences. Without
+ // having the divergent CFG for the function, it is difficult to implement the
+ // wave-aware liveness info. Until then, we conservatively extend the liveness
+ // of the wwm registers into the entire function so that they won't be reused
+ // without first spilling/splitting their live ranges.
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks.
+ for (auto Reg : MFI->getSGPRSpillVGPRs()) {
+ for (MachineBasicBlock *SaveBlock : SaveBlocks) {
+ MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
+ auto MIB = BuildMI(*SaveBlock, *InsertBefore, InsertBefore->getDebugLoc(),
+ TII->get(AMDGPU::IMPLICIT_DEF), Reg);
+ MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
+ // Set SGPR_SPILL asm printer flag
+ MIB->setAsmPrinterFlag(AMDGPU::SGPR_SPILL);
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*MIB);
+ }
+ }
+ }
+
+ // Insert the KILL in the return blocks to extend their liveness until the
+ // end of the function. Insert a separate KILL for each VGPR.
+ for (MachineBasicBlock *RestoreBlock : RestoreBlocks) {
+ MachineBasicBlock::iterator InsertBefore =
+ RestoreBlock->getFirstTerminator();
+ for (auto Reg : MFI->getSGPRSpillVGPRs()) {
+ auto MIB =
+ BuildMI(*RestoreBlock, *InsertBefore, InsertBefore->getDebugLoc(),
+ TII->get(TargetOpcode::KILL));
+ MIB.addReg(Reg);
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*MIB);
+ }
+ }
+}
+
bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
@@ -261,7 +318,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// First, expose any CSR SGPR spills. This is mostly the same as what PEI
// does, but somewhat simpler.
calculateSaveRestoreBlocks(MF);
- bool HasCSRs = spillCalleeSavedRegs(MF);
+ SmallVector<int> CalleeSavedFIs;
+ bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs);
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -275,6 +333,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
bool MadeChange = false;
bool NewReservedRegs = false;
+ bool SpilledToVirtVGPRLanes = false;
// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
// handled as SpilledToReg in regular PrologEpilogInserter.
@@ -297,23 +356,51 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
- if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
- NewReservedRegs = true;
- bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
- MI, FI, nullptr, Indexes, LIS);
- (void)Spilled;
- assert(Spilled && "failed to spill SGPR to VGPR when allocated");
- SpillFIs.set(FI);
+
+ bool IsCalleeSaveSGPRSpill = llvm::is_contained(CalleeSavedFIs, FI);
+ if (IsCalleeSaveSGPRSpill) {
+ // Spill callee-saved SGPRs into physical VGPR lanes.
+
+ // TODO: This is to ensure the CFIs are static for efficient frame
+ // unwinding in the debugger. Spilling them into virtual VGPR lanes
+ // involves regalloc allocating the physical VGPRs, and that might cause
+ // intermediate spills/splits of such live ranges for a successful
+ // allocation. This would break the CFI encoding unless regalloc-aware
+ // CFI generation inserts new CFIs along with the intermediate spills;
+ // no such support currently exists in the LLVM compiler.
+ if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
+ NewReservedRegs = true;
+ bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
+ MI, FI, nullptr, Indexes, LIS, true);
+ if (!Spilled)
+ llvm_unreachable(
+ "failed to spill SGPR to physical VGPR lane when allocated");
+ }
+ } else {
+ if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
+ bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
+ MI, FI, nullptr, Indexes, LIS);
+ if (!Spilled)
+ llvm_unreachable(
+ "failed to spill SGPR to virtual VGPR lane when allocated");
+ SpillFIs.set(FI);
+ SpilledToVirtVGPRLanes = true;
+ }
}
}
}
- // FIXME: Adding to live-ins redundant with reserving registers.
- for (MachineBasicBlock &MBB : MF) {
- for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
- MBB.addLiveIn(Reg);
- MBB.sortUniqueLiveIns();
+ if (SpilledToVirtVGPRLanes) {
+ extendWWMVirtRegLiveness(MF, LIS);
+ if (LIS) {
+ // Compute the LiveInterval for the newly created virtual registers.
+ for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ }
+ for (MachineBasicBlock &MBB : MF) {
// FIXME: The dead frame indices are replaced with a null register from
// the debug value instructions. We should instead, update it with the
// correct register value. But not sure the register value alone is
@@ -334,6 +421,10 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// lane".
FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false);
+ MadeChange = true;
+ }
+
+ if (SpilledToVirtVGPRLanes) {
const TargetRegisterClass *RC = TRI->getWaveMaskRegClass();
// Shift back the reserved SGPR for EXEC copy into the lowest range.
// This SGPR is reserved to handle the whole-wave spill/copy operations
@@ -342,20 +433,21 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) <
TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy()))
FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR);
-
- MadeChange = true;
} else {
- // No SGPR spills and hence there won't be any WWM spills/copies. Reset the
- // SGPR reserved for EXEC copy.
+ // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM
+ // spills/copies. Reset the SGPR reserved for EXEC copy.
FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister);
}
SaveBlocks.clear();
RestoreBlocks.clear();
- // Updated the reserved registers with any VGPRs added for SGPR spills.
- if (NewReservedRegs)
- MRI.freezeReservedRegs(MF);
+ // Update the reserved registers with any physical VGPRs added for SGPR
+ // spills.
+ if (NewReservedRegs) {
+ for (Register Reg : FuncInfo->getWWMReservedRegs())
+ MRI.reserveReg(Reg, TRI);
+ }
return MadeChange;
}
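
The SILowerSGPRSpills changes above route each SGPR spill frame index either to physical VGPR lanes (callee-saved spills, so the CFI stays static) or to virtual VGPR lanes assigned later by the register allocator; the real code checks llvm::is_contained(CalleeSavedFIs, FI) and calls allocateSGPRSpillToVGPRLane. Below is a toy, self-contained model of just that routing decision; the names are illustrative and not part of the patch.

#include <algorithm>
#include <cstdio>
#include <vector>

enum class LaneKind { PhysicalVGPR, VirtualVGPR };

static LaneKind classifySpill(int FI, const std::vector<int> &CalleeSavedFIs) {
  bool IsCalleeSave = std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(),
                                FI) != CalleeSavedFIs.end();
  return IsCalleeSave ? LaneKind::PhysicalVGPR : LaneKind::VirtualVGPR;
}

int main() {
  std::vector<int> CalleeSavedFIs = {0, 1};
  for (int FI : {0, 1, 2, 3})
    std::printf("FI %d -> %s\n", FI,
                classifySpill(FI, CalleeSavedFIs) == LaneKind::PhysicalVGPR
                    ? "physical VGPR lane"
                    : "virtual VGPR lane");
  return 0;
}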
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
new file mode 100644
index 000000000000..9c3cd1bbd6b0
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
@@ -0,0 +1,141 @@
+//===-- SILowerWWMCopies.cpp - Lower Copies after regalloc ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lowering the WWM_COPY instructions for various register classes.
+/// The AMDGPU target generates WWM_COPY instructions to differentiate WWM
+/// copies from regular COPYs. This pass emits the necessary exec mask
+/// manipulation instructions to replicate 'Whole Wave Mode' and lowers
+/// WWM_COPY back to COPY.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-lower-wwm-copies"
+
+namespace {
+
+class SILowerWWMCopies : public MachineFunctionPass {
+public:
+ static char ID;
+
+ SILowerWWMCopies() : MachineFunctionPass(ID) {
+ initializeSILowerWWMCopiesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "SI Lower WWM Copies"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool isSCCLiveAtMI(const MachineInstr &MI);
+ void addToWWMSpills(MachineFunction &MF, Register Reg);
+
+ LiveIntervals *LIS;
+ SlotIndexes *Indexes;
+ VirtRegMap *VRM;
+ const SIRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ SIMachineFunctionInfo *MFI;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", false,
+ false)
+
+char SILowerWWMCopies::ID = 0;
+
+char &llvm::SILowerWWMCopiesID = SILowerWWMCopies::ID;
+
+bool SILowerWWMCopies::isSCCLiveAtMI(const MachineInstr &MI) {
+ // We can't determine the liveness info if LIS isn't available. Early return
+ // in that case and always assume SCC is live.
+ if (!LIS)
+ return true;
+
+ LiveRange &LR =
+ LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+ SlotIndex Idx = LIS->getInstructionIndex(MI);
+ return LR.liveAt(Idx);
+}
+
+// If \p Reg is assigned a physical VGPR, add the latter to the wwm-spills
+// so that all of its lanes are preserved at the function prolog/epilog.
+void SILowerWWMCopies::addToWWMSpills(MachineFunction &MF, Register Reg) {
+ if (Reg.isPhysical())
+ return;
+
+ Register PhysReg = VRM->getPhys(Reg);
+ assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
+ "should have allocated a physical register");
+
+ MFI->allocateWWMSpill(MF, PhysReg);
+}
+
+bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ MFI = MF.getInfo<SIMachineFunctionInfo>();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+ Indexes = getAnalysisIfAvailable<SlotIndexes>();
+ VRM = getAnalysisIfAvailable<VirtRegMap>();
+ TRI = ST.getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ if (!MFI->hasVRegFlags())
+ return false;
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() != AMDGPU::WWM_COPY)
+ continue;
+
+ // TODO: Club adjacent WWM ops between same exec save/restore
+ assert(TII->isVGPRCopy(MI));
+
+ // For WWM vector copies, manipulate the exec mask around the copy
+ // instruction.
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator InsertPt = MI.getIterator();
+ Register RegForExecCopy = MFI->getSGPRForEXECCopy();
+ TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy,
+ isSCCLiveAtMI(MI), Indexes);
+ TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes);
+ addToWWMSpills(MF, MI.getOperand(0).getReg());
+ LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI);
+
+ // Lower WWM_COPY back to COPY
+ MI.setDesc(TII->get(AMDGPU::COPY));
+ Changed |= true;
+ }
+ }
+
+ return Changed;
+}
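
The new SILowerWWMCopies pass above wraps each WWM_COPY in an EXEC save/enable-all/restore sequence (via SIInstrInfo::insertScratchExecCopy and restoreExec) so the copy also writes lanes that are currently inactive. A standalone sketch of that idea follows, with a plain uint64_t standing in for a wave64 EXEC mask; it is illustrative only, not the pass.

#include <cassert>
#include <cstdint>

struct WaveState {
  uint64_t Exec; // which lanes are currently active
};

static void wholeWaveCopy(WaveState &W, uint64_t &Dst, uint64_t Src) {
  uint64_t SavedExec = W.Exec; // save EXEC into a scratch location
  W.Exec = ~0ULL;              // enable every lane for the copy
  Dst = Src;                   // the WWM_COPY, lowered back to a plain COPY
  W.Exec = SavedExec;          // restore the original mask
}

int main() {
  WaveState W{0x00FF00FF00FF00FFULL};
  uint64_t Dst = 0;
  uint64_t Src = 0x123456789ABCDEF0ULL;
  wholeWaveCopy(W, Dst, Src);
  assert(Dst == Src && W.Exec == 0x00FF00FF00FF00FFULL);
  return 0;
}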
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index c9376d0ea653..e8142244b7db 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -7,17 +7,18 @@
//===----------------------------------------------------------------------===//
#include "SIMachineFunctionInfo.h"
-#include "AMDGPUTargetMachine.h"
#include "AMDGPUSubtarget.h"
-#include "SIRegisterInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
@@ -36,28 +37,12 @@ const GCNTargetMachine &getTM(const GCNSubtarget *STI) {
SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
const GCNSubtarget *STI)
- : AMDGPUMachineFunction(F, *STI),
- Mode(F),
- GWSResourcePSV(getTM(STI)),
- PrivateSegmentBuffer(false),
- DispatchPtr(false),
- QueuePtr(false),
- KernargSegmentPtr(false),
- DispatchID(false),
- FlatScratchInit(false),
- WorkGroupIDX(false),
- WorkGroupIDY(false),
- WorkGroupIDZ(false),
- WorkGroupInfo(false),
- LDSKernelId(false),
- PrivateSegmentWaveByteOffset(false),
- WorkItemIDX(false),
- WorkItemIDY(false),
- WorkItemIDZ(false),
- ImplicitBufferPtr(false),
- ImplicitArgPtr(false),
- GITPtrHigh(0xffffffff),
- HighBitsOf32BitAddress(0) {
+ : AMDGPUMachineFunction(F, *STI), Mode(F, *STI), GWSResourcePSV(getTM(STI)),
+ UserSGPRInfo(F, *STI), WorkGroupIDX(false), WorkGroupIDY(false),
+ WorkGroupIDZ(false), WorkGroupInfo(false), LDSKernelId(false),
+ PrivateSegmentWaveByteOffset(false), WorkItemIDX(false),
+ WorkItemIDY(false), WorkItemIDZ(false), ImplicitArgPtr(false),
+ GITPtrHigh(0xffffffff), HighBitsOf32BitAddress(0) {
const GCNSubtarget &ST = *static_cast<const GCNSubtarget *>(STI);
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
@@ -67,16 +52,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
VRegFlags.reserve(1024);
- // FIXME: Should have analysis or something rather than attribute to detect
- // calls.
- const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
-
const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
CC == CallingConv::SPIR_KERNEL;
if (IsKernel) {
- if (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)
- KernargSegmentPtr = true;
WorkGroupIDX = true;
WorkItemIDX = true;
} else if (CC == CallingConv::AMDGPU_PS) {
@@ -85,7 +64,20 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MayNeedAGPRs = ST.hasMAIInsts();
- if (!isEntryFunction()) {
+ if (AMDGPU::isChainCC(CC)) {
+ // Chain functions don't receive an SP from their caller, but are free to
+ // set one up. For now, we can use s32 to match what amdgpu_gfx functions
+ // would use if called, but this can be revisited.
+ // FIXME: Only reserve this if we actually need it.
+ StackPtrOffsetReg = AMDGPU::SGPR32;
+
+ ScratchRSrcReg = AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51;
+
+ ArgInfo.PrivateSegmentBuffer =
+ ArgDescriptor::createRegister(ScratchRSrcReg);
+
+ ImplicitArgPtr = false;
+ } else if (!isEntryFunction()) {
if (CC != CallingConv::AMDGPU_Gfx)
ArgInfo = AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
@@ -115,12 +107,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
MayNeedAGPRs = false; // We will select all MAI with VGPR operands.
}
- bool isAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
- if (isAmdHsaOrMesa && !ST.enableFlatScratch())
- PrivateSegmentBuffer = true;
- else if (ST.isMesaGfxShader(F))
- ImplicitBufferPtr = true;
-
if (!AMDGPU::isGraphics(CC) ||
(CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) {
if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
@@ -145,33 +131,10 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
ST.getMaxWorkitemID(F, 2) != 0)
WorkItemIDZ = true;
- if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
- DispatchPtr = true;
-
- if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
- QueuePtr = true;
-
- if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
- DispatchID = true;
-
if (!IsKernel && !F.hasFnAttribute("amdgpu-no-lds-kernel-id"))
LDSKernelId = true;
}
- // FIXME: This attribute is a hack, we just need an analysis on the function
- // to look for allocas.
- bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
-
- // TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls or stack objects that may require it before argument
- // lowering.
- if (ST.hasFlatAddressSpace() && isEntryFunction() &&
- (isAmdHsaOrMesa || ST.enableFlatScratch()) &&
- (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
- !ST.flatScratchIsArchitected()) {
- FlatScratchInit = true;
- }
-
if (isEntryFunction()) {
// X, XY, and XYZ are the only supported combinations, so make sure Y is
// enabled if Z is.
@@ -280,12 +243,47 @@ Register SIMachineFunctionInfo::addLDSKernelId() {
return ArgInfo.LDSKernelId.getRegister();
}
+SmallVectorImpl<MCRegister> *SIMachineFunctionInfo::addPreloadedKernArg(
+ const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
+ unsigned AllocSizeDWord, int KernArgIdx, int PaddingSGPRs) {
+ assert(!ArgInfo.PreloadKernArgs.count(KernArgIdx) &&
+ "Preload kernel argument allocated twice.");
+ NumUserSGPRs += PaddingSGPRs;
+ // If the available register tuples are aligned with the kernarg to be
+ // preloaded, use that register; otherwise we need to use a set of SGPRs
+ // and merge them.
+ Register PreloadReg =
+ TRI.getMatchingSuperReg(getNextUserSGPR(), AMDGPU::sub0, RC);
+ if (PreloadReg &&
+ (RC == &AMDGPU::SReg_32RegClass || RC == &AMDGPU::SReg_64RegClass)) {
+ ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(PreloadReg);
+ NumUserSGPRs += AllocSizeDWord;
+ } else {
+ for (unsigned I = 0; I < AllocSizeDWord; ++I) {
+ ArgInfo.PreloadKernArgs[KernArgIdx].Regs.push_back(getNextUserSGPR());
+ NumUserSGPRs++;
+ }
+ }
+
+ // Track the actual number of SGPRs that HW will preload to.
+ UserSGPRInfo.allocKernargPreloadSGPRs(AllocSizeDWord + PaddingSGPRs);
+ return &ArgInfo.PreloadKernArgs[KernArgIdx].Regs;
+}
+
void SIMachineFunctionInfo::allocateWWMSpill(MachineFunction &MF, Register VGPR,
uint64_t Size, Align Alignment) {
// Skip if it is an entry function or the register is already added.
if (isEntryFunction() || WWMSpills.count(VGPR))
return;
+ // Skip if this is a function with the amdgpu_cs_chain or
+ // amdgpu_cs_chain_preserve calling convention and this is a scratch register.
+ // We never need to allocate a spill for these because we don't even need to
+ // restore the inactive lanes for them (they're scratchier than the usual
+ // scratch registers).
+ if (isChainFunction() && SIRegisterInfo::isChainScratchRegister(VGPR))
+ return;
+
WWMSpills.insert(std::make_pair(
VGPR, MF.getFrameInfo().CreateSpillStackObject(Size, Alignment)));
}
@@ -314,37 +312,23 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
return false;
}
-bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF,
- int FI,
- unsigned LaneIndex) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
+bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
+ MachineFunction &MF, int FI, unsigned LaneIndex) {
MachineRegisterInfo &MRI = MF.getRegInfo();
Register LaneVGPR;
if (!LaneIndex) {
- LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
- if (LaneVGPR == AMDGPU::NoRegister) {
- // We have no VGPRs left for spilling SGPRs. Reset because we will not
- // partially spill the SGPR to VGPRs.
- SGPRSpillToVGPRLanes.erase(FI);
- return false;
- }
-
+ LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
SpillVGPRs.push_back(LaneVGPR);
- // Add this register as live-in to all blocks to avoid machine verifier
- // complaining about use of an undefined physical register.
- for (MachineBasicBlock &BB : MF)
- BB.addLiveIn(LaneVGPR);
} else {
LaneVGPR = SpillVGPRs.back();
}
- SGPRSpillToVGPRLanes[FI].push_back(
+ SGPRSpillsToVirtualVGPRLanes[FI].push_back(
SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
return true;
}
-bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
+bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
MachineFunction &MF, int FI, unsigned LaneIndex) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -355,16 +339,22 @@ bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
// partially spill the SGPR to VGPRs.
- PrologEpilogSGPRSpillToVGPRLanes.erase(FI);
+ SGPRSpillsToPhysicalVGPRLanes.erase(FI);
return false;
}
allocateWWMSpill(MF, LaneVGPR);
+ reserveWWMRegister(LaneVGPR);
+ for (MachineBasicBlock &MBB : MF) {
+ MBB.addLiveIn(LaneVGPR);
+ MBB.sortUniqueLiveIns();
+ }
+ SpillPhysVGPRs.push_back(LaneVGPR);
} else {
- LaneVGPR = WWMSpills.back().first;
+ LaneVGPR = SpillPhysVGPRs.back();
}
- PrologEpilogSGPRSpillToVGPRLanes[FI].push_back(
+ SGPRSpillsToPhysicalVGPRLanes[FI].push_back(
SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
return true;
}
@@ -373,8 +363,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
int FI,
bool IsPrologEpilog) {
std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
- IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI]
- : SGPRSpillToVGPRLanes[FI];
+ IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
+ : SGPRSpillsToVirtualVGPRLanes[FI];
// This has already been allocated.
if (!SpillLanes.empty())
@@ -395,15 +385,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
"not spilling SGPRs to VGPRs");
unsigned &NumSpillLanes =
- IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes;
+ IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
unsigned LaneIndex = (NumSpillLanes % WaveSize);
- bool Allocated =
- IsPrologEpilog
- ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex)
- : allocateVGPRForSGPRSpills(MF, FI, LaneIndex);
+ bool Allocated = IsPrologEpilog
+ ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
+ : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
if (!Allocated) {
NumSpillLanes -= I;
return false;
@@ -484,16 +473,25 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
bool SIMachineFunctionInfo::removeDeadFrameIndices(
MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
- // Remove dead frame indices from function frame. And also make sure to remove
- // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it
- // could result in an unexpected side effect and bug, in case of any
- // re-mapping of freed frame indices by later pass(es) like "stack slot
+ // Remove dead frame indices from the function frame; however, keep FP & BP since
+ // spills for them haven't been inserted yet. And also make sure to remove the
+ // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
+ // otherwise, it could result in an unexpected side effect and bug, in case of
+ // any re-mapping of freed frame indices by later pass(es) like "stack slot
// coloring".
- for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) {
+ for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
MFI.RemoveStackObject(R.first);
- SGPRSpillToVGPRLanes.erase(R.first);
+ SGPRSpillsToVirtualVGPRLanes.erase(R.first);
}
+ // Remove the dead frame indices of CSR SGPRs which are spilled to physical
+ // VGPR lanes during the SILowerSGPRSpills pass.
+ if (!ResetSGPRSpillStackIDs) {
+ for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
+ MFI.RemoveStackObject(R.first);
+ SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
+ }
+ }
bool HaveSGPRToMemory = false;
if (ResetSGPRSpillStackIDs) {
@@ -522,7 +520,7 @@ int SIMachineFunctionInfo::getScavengeFI(MachineFrameInfo &MFI,
const SIRegisterInfo &TRI) {
if (ScavengeFI)
return *ScavengeFI;
- if (isEntryFunction()) {
+ if (isBottomOfStack()) {
ScavengeFI = MFI.CreateFixedObject(
TRI.getSpillSize(AMDGPU::SGPR_32RegClass), 0, false);
} else {
@@ -608,6 +606,7 @@ convertArgumentInfo(const AMDGPUFunctionArgInfo &ArgInfo,
return true;
};
+ // TODO: Need to serialize kernarg preloads.
bool Any = false;
Any |= convertArg(AI.PrivateSegmentBuffer, ArgInfo.PrivateSegmentBuffer);
Any |= convertArg(AI.DispatchPtr, ArgInfo.DispatchPtr);
@@ -730,7 +729,7 @@ bool SIMachineFunctionInfo::mayUseAGPRs(const Function &F) const {
for (const auto &CI : IA->ParseConstraints()) {
for (StringRef Code : CI.Codes) {
Code.consume_front("{");
- if (Code.startswith("a"))
+ if (Code.starts_with("a"))
return true;
}
}
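
In the SIMachineFunctionInfo changes above, allocateSGPRSpillToVGPRLane hands out lanes as LaneIndex = NumSpillLanes % WaveSize and requests a fresh VGPR whenever the lane index wraps back to zero. The small self-contained model below only illustrates that arithmetic; the numbers are illustrative assumptions, not values from the patch.

#include <cstdio>

int main() {
  const unsigned WaveSize = 32; // wave32; wave64 packs 64 lanes per VGPR
  unsigned NumSpillLanes = 0;
  unsigned NumVGPRs = 0;
  for (unsigned Spill = 0; Spill < 70; ++Spill, ++NumSpillLanes) {
    unsigned LaneIndex = NumSpillLanes % WaveSize;
    if (LaneIndex == 0)
      ++NumVGPRs; // first lane of a new spill VGPR
  }
  std::printf("70 SGPR spill lanes need %u VGPRs at wave32\n", NumVGPRs); // 3
  return 0;
}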
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 3b4747adf125..dc63ae44c528 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -16,10 +16,12 @@
#include "AMDGPUArgumentUsageInfo.h"
#include "AMDGPUMachineFunction.h"
#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIModeRegisterDefaults.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/Support/raw_ostream.h"
@@ -256,6 +258,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
uint32_t GDSSize = 0;
Align DynLDSAlign;
bool IsEntryFunction = false;
+ bool IsChainFunction = false;
bool NoSignedZerosFPMath = false;
bool MemoryBound = false;
bool WaveLimiter = false;
@@ -304,6 +307,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("gdsSize", MFI.GDSSize, 0u);
YamlIO.mapOptional("dynLDSAlign", MFI.DynLDSAlign, Align());
YamlIO.mapOptional("isEntryFunction", MFI.IsEntryFunction, false);
+ YamlIO.mapOptional("isChainFunction", MFI.IsChainFunction, false);
YamlIO.mapOptional("noSignedZerosFPMath", MFI.NoSignedZerosFPMath, false);
YamlIO.mapOptional("memoryBound", MFI.MemoryBound, false);
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
@@ -434,13 +438,9 @@ private:
unsigned NumSpilledSGPRs = 0;
unsigned NumSpilledVGPRs = 0;
- // Feature bits required for inputs passed in user SGPRs.
- bool PrivateSegmentBuffer : 1;
- bool DispatchPtr : 1;
- bool QueuePtr : 1;
- bool KernargSegmentPtr : 1;
- bool DispatchID : 1;
- bool FlatScratchInit : 1;
+  // Tracks information about user SGPRs that will be set up by hardware and
+  // that will apply to all wavefronts of the grid.
+ GCNUserSGPRUsageInfo UserSGPRInfo;
// Feature bits required for inputs passed in system SGPRs.
bool WorkGroupIDX : 1; // Always initialized.
@@ -454,11 +454,6 @@ private:
bool WorkItemIDY : 1;
bool WorkItemIDZ : 1;
- // Private memory buffer
- // Compute directly in sgpr[0:1]
- // Other shaders indirect 64-bits at sgpr[0:1]
- bool ImplicitBufferPtr : 1;
-
// Pointer to where the ABI inserts special kernel arguments separate from the
// user arguments. This is an offset from the KernargSegmentPtr.
bool ImplicitArgPtr : 1;
@@ -496,16 +491,18 @@ public:
};
private:
- // To track VGPR + lane index for each subregister of the SGPR spilled to
- // frameindex key during SILowerSGPRSpills pass.
- DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRSpillToVGPRLanes;
- // To track VGPR + lane index for spilling special SGPRs like Frame Pointer
- // identified during PrologEpilogInserter.
+ // To track virtual VGPR + lane index for each subregister of the SGPR spilled
+ // to frameindex key during SILowerSGPRSpills pass.
+ DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>>
+ SGPRSpillsToVirtualVGPRLanes;
+ // To track physical VGPR + lane index for CSR SGPR spills and special SGPRs
+ // like Frame Pointer identified during PrologEpilogInserter.
DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>>
- PrologEpilogSGPRSpillToVGPRLanes;
- unsigned NumVGPRSpillLanes = 0;
- unsigned NumVGPRPrologEpilogSpillLanes = 0;
+ SGPRSpillsToPhysicalVGPRLanes;
+ unsigned NumVirtualVGPRSpillLanes = 0;
+ unsigned NumPhysicalVGPRSpillLanes = 0;
SmallVector<Register, 2> SpillVGPRs;
+ SmallVector<Register, 2> SpillPhysVGPRs;
using WWMSpillsMap = MapVector<Register, int>;
// To track the registers used in instructions that can potentially modify the
// inactive lanes. The WWM instructions and the writelane instructions for
@@ -548,10 +545,10 @@ private:
private:
Register VGPRForAGPRCopy;
- bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI,
- unsigned LaneIndex);
- bool allocateVGPRForPrologEpilogSGPRSpills(MachineFunction &MF, int FI,
- unsigned LaneIndex);
+ bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
+ unsigned LaneIndex);
+ bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI,
+ unsigned LaneIndex);
public:
Register getVGPRForAGPRCopy() const {
@@ -583,9 +580,9 @@ public:
SIModeRegisterDefaults getMode() const { return Mode; }
ArrayRef<SIRegisterInfo::SpilledReg>
- getSGPRSpillToVGPRLanes(int FrameIndex) const {
- auto I = SGPRSpillToVGPRLanes.find(FrameIndex);
- return (I == SGPRSpillToVGPRLanes.end())
+ getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const {
+ auto I = SGPRSpillsToVirtualVGPRLanes.find(FrameIndex);
+ return (I == SGPRSpillsToVirtualVGPRLanes.end())
? ArrayRef<SIRegisterInfo::SpilledReg>()
: ArrayRef(I->second);
}
@@ -598,6 +595,10 @@ public:
return PrologEpilogSGPRSpills;
}
+ GCNUserSGPRUsageInfo &getUserSGPRInfo() { return UserSGPRInfo; }
+
+ const GCNUserSGPRUsageInfo &getUserSGPRInfo() const { return UserSGPRInfo; }
+
void addToPrologEpilogSGPRSpills(Register Reg,
PrologEpilogSGPRSaveRestoreInfo SI) {
PrologEpilogSGPRSpills.insert(std::make_pair(Reg, SI));
@@ -647,9 +648,9 @@ public:
}
ArrayRef<SIRegisterInfo::SpilledReg>
- getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const {
- auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex);
- return (I == PrologEpilogSGPRSpillToVGPRLanes.end())
+ getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const {
+ auto I = SGPRSpillsToPhysicalVGPRLanes.find(FrameIndex);
+ return (I == SGPRSpillsToPhysicalVGPRLanes.end())
? ArrayRef<SIRegisterInfo::SpilledReg>()
: ArrayRef(I->second);
}
@@ -667,6 +668,8 @@ public:
return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & Flag;
}
+ bool hasVRegFlags() { return VRegFlags.size(); }
+
void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4,
Align Alignment = Align(4));
@@ -728,6 +731,10 @@ public:
Register addFlatScratchInit(const SIRegisterInfo &TRI);
Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
Register addLDSKernelId();
+ SmallVectorImpl<MCRegister> *
+ addPreloadedKernArg(const SIRegisterInfo &TRI, const TargetRegisterClass *RC,
+ unsigned AllocSizeDWord, int KernArgIdx,
+ int PaddingSGPRs);
/// Increment user SGPRs used for padding the argument list only.
Register addReservedUserSGPR() {
@@ -775,6 +782,8 @@ public:
return ArgInfo.WorkGroupInfo.getRegister();
}
+ bool hasLDSKernelId() const { return LDSKernelId; }
+
// Add special VGPR inputs
void setWorkItemIDX(ArgDescriptor Arg) {
ArgInfo.WorkItemIDX = Arg;
@@ -799,30 +808,6 @@ public:
ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg);
}
- bool hasPrivateSegmentBuffer() const {
- return PrivateSegmentBuffer;
- }
-
- bool hasDispatchPtr() const {
- return DispatchPtr;
- }
-
- bool hasQueuePtr() const {
- return QueuePtr;
- }
-
- bool hasKernargSegmentPtr() const {
- return KernargSegmentPtr;
- }
-
- bool hasDispatchID() const {
- return DispatchID;
- }
-
- bool hasFlatScratchInit() const {
- return FlatScratchInit;
- }
-
bool hasWorkGroupIDX() const {
return WorkGroupIDX;
}
@@ -839,8 +824,6 @@ public:
return WorkGroupInfo;
}
- bool hasLDSKernelId() const { return LDSKernelId; }
-
bool hasPrivateSegmentWaveByteOffset() const {
return PrivateSegmentWaveByteOffset;
}
@@ -861,10 +844,6 @@ public:
return ImplicitArgPtr;
}
- bool hasImplicitBufferPtr() const {
- return ImplicitBufferPtr;
- }
-
AMDGPUFunctionArgInfo &getArgInfo() {
return ArgInfo;
}
@@ -901,6 +880,10 @@ public:
return NumUserSGPRs + NumSystemSGPRs;
}
+ unsigned getNumKernargPreloadedSGPRs() const {
+ return UserSGPRInfo.getNumKernargPreloadSGPRs();
+ }
+
Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
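SIMachineFunctionInfo now keeps two frame-index keyed maps, one for SGPR spills placed in virtual VGPR lanes and one for the CSR/prolog-epilog spills placed in physical VGPR lanes, and both accessors return an empty view when the frame index has no entry. A simplified sketch of that lookup shape, with std::unordered_map and a reference to an empty vector standing in for the ArrayRef return (the types here are stand-ins, not the LLVM ones):

#include <cassert>
#include <unordered_map>
#include <vector>

struct SpilledReg { unsigned VGPR = 0; unsigned Lane = 0; };
using LaneMap = std::unordered_map<int, std::vector<SpilledReg>>;

// Empty-on-miss lookup, analogous to returning an empty ArrayRef.
static const std::vector<SpilledReg> &lookupLanes(const LaneMap &M, int FI) {
  static const std::vector<SpilledReg> Empty;
  auto It = M.find(FI);
  return It == M.end() ? Empty : It->second;
}

int main() {
  LaneMap VirtualLanes, PhysicalLanes;
  VirtualLanes[5] = {{/*VGPR=*/0, /*Lane=*/3}};
  PhysicalLanes[9] = {{/*VGPR=*/1, /*Lane=*/0}};

  assert(lookupLanes(VirtualLanes, 5).size() == 1); // known frame index
  assert(lookupLanes(PhysicalLanes, 5).empty());    // miss -> empty view
  return 0;
}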
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index bc48f7b76c6d..10ec54d3317f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -1055,7 +1055,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
VMCnt ? 0 : getVmcntBitMask(IV),
getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+ .addImm(WaitCntImmediate);
Changed = true;
}
@@ -1963,14 +1964,15 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
VMCnt ? 0 : getVmcntBitMask(IV),
getExpcntBitMask(IV),
LGKMCnt ? 0 : getLgkmcntBitMask(IV));
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(WaitCntImmediate);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
+ .addImm(WaitCntImmediate);
Changed = true;
}
if (VSCnt) {
- BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
Changed = true;
}
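In both waitcnt hunks the immediate is built the same way as before: a counter the fence must drain is encoded as 0, while counters that may keep draining stay at their maximum "don't wait" mask; the only change is emitting the _soft pseudo, presumably so the later waitcnt-insertion pass is free to adjust or merge the wait. A toy encoder showing the zero-versus-max pattern (the 4/3/4-bit field widths here are illustrative only, not the real gfx layout):

#include <cassert>
#include <cstdint>

// Illustrative field widths only; the real widths depend on the subtarget.
constexpr unsigned VmcntBits = 4, ExpcntBits = 3, LgkmcntBits = 4;
constexpr unsigned vmcntMask()   { return (1u << VmcntBits) - 1; }
constexpr unsigned expcntMask()  { return (1u << ExpcntBits) - 1; }
constexpr unsigned lgkmcntMask() { return (1u << LgkmcntBits) - 1; }

constexpr uint32_t encodeWaitcnt(unsigned Vm, unsigned Exp, unsigned Lgkm) {
  return Vm | (Exp << VmcntBits) | (Lgkm << (VmcntBits + ExpcntBits));
}

int main() {
  bool VMCnt = true, LGKMCnt = false;
  // Wait for vmcnt==0; leave expcnt/lgkmcnt at their "don't wait" maximum.
  uint32_t Imm = encodeWaitcnt(VMCnt ? 0 : vmcntMask(), expcntMask(),
                               LGKMCnt ? 0 : lgkmcntMask());
  assert((Imm & vmcntMask()) == 0);                            // vmcnt drains
  assert(((Imm >> VmcntBits) & expcntMask()) == expcntMask()); // exp untouched
  return 0;
}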
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
index 413ef5d162a7..2684a1e3c335 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
@@ -7,20 +7,26 @@
//===----------------------------------------------------------------------===//
#include "SIModeRegisterDefaults.h"
+#include "GCNSubtarget.h"
using namespace llvm;
-SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F,
+ const GCNSubtarget &ST) {
*this = getDefaultForCallingConv(F.getCallingConv());
- StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
- if (!IEEEAttr.empty())
- IEEE = IEEEAttr == "true";
+ if (ST.hasIEEEMode()) {
+ StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
+ if (!IEEEAttr.empty())
+ IEEE = IEEEAttr == "true";
+ }
- StringRef DX10ClampAttr =
- F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
- if (!DX10ClampAttr.empty())
- DX10Clamp = DX10ClampAttr == "true";
+ if (ST.hasDX10ClampMode()) {
+ StringRef DX10ClampAttr =
+ F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
+ if (!DX10ClampAttr.empty())
+ DX10Clamp = DX10ClampAttr == "true";
+ }
StringRef DenormF32Attr =
F.getFnAttribute("denormal-fp-math-f32").getValueAsString();
@@ -36,3 +42,135 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
FP64FP16Denormals = DenormMode;
}
}
+
+using namespace AMDGPU;
+
+/// Combine f32 and f64 rounding modes into a combined rounding mode value.
+static constexpr uint32_t getModeRegisterRoundMode(uint32_t HWFP32Val,
+ uint32_t HWFP64Val) {
+ return HWFP32Val << F32FltRoundOffset | HWFP64Val << F64FltRoundOffset;
+}
+
+static constexpr uint64_t encodeFltRoundsTable(uint32_t FltRoundsVal,
+ uint32_t HWF32Val,
+ uint32_t HWF64Val) {
+ uint32_t ModeVal = getModeRegisterRoundMode(HWF32Val, HWF64Val);
+ if (FltRoundsVal > TowardNegative)
+ FltRoundsVal -= ExtendedFltRoundOffset;
+
+ uint32_t BitIndex = ModeVal << 2;
+ return static_cast<uint64_t>(FltRoundsVal) << BitIndex;
+}
+
+// Encode FLT_ROUNDS value where the two rounding modes are the same and use a
+// standard value
+static constexpr uint64_t
+encodeFltRoundsTableSame(AMDGPUFltRounds FltRoundsMode, uint32_t HWVal) {
+ return encodeFltRoundsTable(FltRoundsMode, HWVal, HWVal);
+}
+
+// Convert mode register encoded rounding mode to AMDGPUFltRounds
+static constexpr AMDGPUFltRounds
+decodeIndexFltRoundConversionTable(uint32_t HWMode) {
+ uint32_t TableRead = (FltRoundConversionTable >> (HWMode << 2)) & 0xf;
+ if (TableRead > TowardNegative)
+ TableRead += ExtendedFltRoundOffset;
+ return static_cast<AMDGPUFltRounds>(TableRead);
+}
+
+static constexpr uint32_t HWTowardZero = FP_ROUND_ROUND_TO_ZERO;
+static constexpr uint32_t HWNearestTiesToEven = FP_ROUND_ROUND_TO_NEAREST;
+static constexpr uint32_t HWTowardPositive = FP_ROUND_ROUND_TO_INF;
+static constexpr uint32_t HWTowardNegative = FP_ROUND_ROUND_TO_NEGINF;
+
+const uint64_t AMDGPU::FltRoundConversionTable =
+ encodeFltRoundsTableSame(TowardZeroF32_TowardZeroF64, HWTowardZero) |
+ encodeFltRoundsTableSame(NearestTiesToEvenF32_NearestTiesToEvenF64,
+ HWNearestTiesToEven) |
+ encodeFltRoundsTableSame(TowardPositiveF32_TowardPositiveF64,
+ HWTowardPositive) |
+ encodeFltRoundsTableSame(TowardNegativeF32_TowardNegativeF64,
+ HWTowardNegative) |
+
+ encodeFltRoundsTable(TowardZeroF32_NearestTiesToEvenF64, HWTowardZero,
+ HWNearestTiesToEven) |
+ encodeFltRoundsTable(TowardZeroF32_TowardPositiveF64, HWTowardZero,
+ HWTowardPositive) |
+ encodeFltRoundsTable(TowardZeroF32_TowardNegativeF64, HWTowardZero,
+ HWTowardNegative) |
+
+ encodeFltRoundsTable(NearestTiesToEvenF32_TowardZeroF64,
+ HWNearestTiesToEven, HWTowardZero) |
+ encodeFltRoundsTable(NearestTiesToEvenF32_TowardPositiveF64,
+ HWNearestTiesToEven, HWTowardPositive) |
+ encodeFltRoundsTable(NearestTiesToEvenF32_TowardNegativeF64,
+ HWNearestTiesToEven, HWTowardNegative) |
+
+ encodeFltRoundsTable(TowardPositiveF32_TowardZeroF64, HWTowardPositive,
+ HWTowardZero) |
+ encodeFltRoundsTable(TowardPositiveF32_NearestTiesToEvenF64,
+ HWTowardPositive, HWNearestTiesToEven) |
+ encodeFltRoundsTable(TowardPositiveF32_TowardNegativeF64, HWTowardPositive,
+ HWTowardNegative) |
+
+ encodeFltRoundsTable(TowardNegativeF32_TowardZeroF64, HWTowardNegative,
+ HWTowardZero) |
+ encodeFltRoundsTable(TowardNegativeF32_NearestTiesToEvenF64,
+ HWTowardNegative, HWNearestTiesToEven) |
+ encodeFltRoundsTable(TowardNegativeF32_TowardPositiveF64, HWTowardNegative,
+ HWTowardPositive);
+
+// Verify evaluation of FltRoundConversionTable
+
+// If both modes are the same, should return the standard values.
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWTowardZero, HWTowardZero)) == AMDGPUFltRounds::TowardZero);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWNearestTiesToEven, HWNearestTiesToEven)) ==
+ AMDGPUFltRounds::NearestTiesToEven);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWTowardPositive, HWTowardPositive)) ==
+ AMDGPUFltRounds::TowardPositive);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWTowardNegative, HWTowardNegative)) ==
+ AMDGPUFltRounds::TowardNegative);
+
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWTowardZero, HWNearestTiesToEven)) ==
+ TowardZeroF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(
+ getModeRegisterRoundMode(HWTowardZero, HWTowardPositive)) ==
+ TowardZeroF32_TowardPositiveF64);
+static_assert(decodeIndexFltRoundConversionTable(
+ getModeRegisterRoundMode(HWTowardZero, HWTowardNegative)) ==
+ TowardZeroF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWNearestTiesToEven, HWTowardZero)) ==
+ NearestTiesToEvenF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWNearestTiesToEven, HWTowardPositive)) ==
+ NearestTiesToEvenF32_TowardPositiveF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWNearestTiesToEven, HWTowardNegative)) ==
+ NearestTiesToEvenF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(
+ getModeRegisterRoundMode(HWTowardPositive, HWTowardZero)) ==
+ TowardPositiveF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWTowardPositive, HWNearestTiesToEven)) ==
+ TowardPositiveF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWTowardPositive, HWTowardNegative)) ==
+ TowardPositiveF32_TowardNegativeF64);
+
+static_assert(decodeIndexFltRoundConversionTable(
+ getModeRegisterRoundMode(HWTowardNegative, HWTowardZero)) ==
+ TowardNegativeF32_TowardZeroF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWTowardNegative, HWNearestTiesToEven)) ==
+ TowardNegativeF32_NearestTiesToEvenF64);
+static_assert(decodeIndexFltRoundConversionTable(getModeRegisterRoundMode(
+ HWTowardNegative, HWTowardPositive)) ==
+ TowardNegativeF32_TowardPositiveF64);
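The table built above packs one 4-bit entry per combined hardware rounding mode: the index is the f32 field in bits 1:0 plus the f64/f16 field in bits 3:2, and entries larger than TowardNegative are stored minus ExtendedFltRoundOffset so they still fit in 4 bits, then widened again on read. A cut-down standalone replica of that encode/decode round trip (only two table entries are populated, and every constant is a local copy rather than the LLVM one):

#include <cstdint>

// Local copies of the scheme in the patch, kept deliberately small.
enum Rounds : int8_t { TowardZero = 0, NearestTiesToEven = 1,
                       TowardPositive = 2, TowardNegative = 3,
                       // First extended (mixed f32/f64) value.
                       NearestTiesToEvenF32_TowardPositiveF64 = 8 };
constexpr uint32_t ExtendedOffset = 4;    // ExtendedFltRoundOffset
constexpr uint32_t F32Off = 0, F64Off = 2;

// Rounding-field values of the MODE register; self-consistent for the sketch.
constexpr uint32_t HWNearestEven = 0, HWPlusInf = 1, HWMinusInf = 2, HWZero = 3;

constexpr uint32_t hwMode(uint32_t F32, uint32_t F64) {
  return (F32 << F32Off) | (F64 << F64Off);
}
constexpr uint64_t encode(uint32_t Val, uint32_t F32, uint32_t F64) {
  if (Val > TowardNegative)
    Val -= ExtendedOffset;                       // compress into 4 bits
  return uint64_t(Val) << (hwMode(F32, F64) * 4);
}
constexpr int decode(uint64_t Table, uint32_t HW) {
  uint32_t Entry = (Table >> (HW * 4)) & 0xf;
  return Entry > TowardNegative ? int(Entry + ExtendedOffset) : int(Entry);
}

// A two-entry table: one matched mode, one mixed mode.
constexpr uint64_t Table =
    encode(TowardZero, HWZero, HWZero) |
    encode(NearestTiesToEvenF32_TowardPositiveF64, HWNearestEven, HWPlusInf);

static_assert(decode(Table, hwMode(HWZero, HWZero)) == TowardZero);
static_assert(decode(Table, hwMode(HWNearestEven, HWPlusInf)) ==
              NearestTiesToEvenF32_TowardPositiveF64);

int main() { return 0; }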
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
index df2e3f9bff32..9fbd74c3eede 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
@@ -14,6 +14,8 @@
namespace llvm {
+class GCNSubtarget;
+
// Track defaults for fields in the MODE register.
struct SIModeRegisterDefaults {
/// Floating point opcodes that support exception flag gathering quiet and
@@ -40,7 +42,7 @@ struct SIModeRegisterDefaults {
FP32Denormals(DenormalMode::getIEEE()),
FP64FP16Denormals(DenormalMode::getIEEE()) {}
- SIModeRegisterDefaults(const Function &F);
+ SIModeRegisterDefaults(const Function &F, const GCNSubtarget &ST);
static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
SIModeRegisterDefaults Mode;
@@ -85,6 +87,65 @@ struct SIModeRegisterDefaults {
}
};
+namespace AMDGPU {
+
+/// Return values used for llvm.get.rounding
+///
+/// When both the F32 and F64/F16 modes are the same, returns the standard
+/// values. If they differ, returns an extended mode starting at 8.
+enum AMDGPUFltRounds : int8_t {
+ // Inherit everything from RoundingMode
+ TowardZero = static_cast<int8_t>(RoundingMode::TowardZero),
+ NearestTiesToEven = static_cast<int8_t>(RoundingMode::NearestTiesToEven),
+ TowardPositive = static_cast<int8_t>(RoundingMode::TowardPositive),
+ TowardNegative = static_cast<int8_t>(RoundingMode::TowardNegative),
+ NearestTiesToAwayUnsupported =
+ static_cast<int8_t>(RoundingMode::NearestTiesToAway),
+
+ Dynamic = static_cast<int8_t>(RoundingMode::Dynamic),
+
+ // Permute the mismatched rounding mode cases. If the modes are the same, use
+ // the standard values, otherwise, these values are sorted such that higher
+ // hardware encoded values have higher enum values.
+ NearestTiesToEvenF32_NearestTiesToEvenF64 = NearestTiesToEven,
+ NearestTiesToEvenF32_TowardPositiveF64 = 8,
+ NearestTiesToEvenF32_TowardNegativeF64 = 9,
+ NearestTiesToEvenF32_TowardZeroF64 = 10,
+
+ TowardPositiveF32_NearestTiesToEvenF64 = 11,
+ TowardPositiveF32_TowardPositiveF64 = TowardPositive,
+ TowardPositiveF32_TowardNegativeF64 = 12,
+ TowardPositiveF32_TowardZeroF64 = 13,
+
+ TowardNegativeF32_NearestTiesToEvenF64 = 14,
+ TowardNegativeF32_TowardPositiveF64 = 15,
+ TowardNegativeF32_TowardNegativeF64 = TowardNegative,
+ TowardNegativeF32_TowardZeroF64 = 16,
+
+ TowardZeroF32_NearestTiesToEvenF64 = 17,
+ TowardZeroF32_TowardPositiveF64 = 18,
+ TowardZeroF32_TowardNegativeF64 = 19,
+ TowardZeroF32_TowardZeroF64 = TowardZero,
+
+ Invalid = static_cast<int8_t>(RoundingMode::Invalid)
+};
+
+/// Offset of nonstandard values for llvm.get.rounding results from the largest
+/// supported mode.
+static constexpr uint32_t ExtendedFltRoundOffset = 4;
+
+/// Offset in mode register of f32 rounding mode.
+static constexpr uint32_t F32FltRoundOffset = 0;
+
+/// Offset in mode register of f64/f16 rounding mode.
+static constexpr uint32_t F64FltRoundOffset = 2;
+
+// Bit indexed table to convert from hardware rounding mode values to FLT_ROUNDS
+// values.
+extern const uint64_t FltRoundConversionTable;
+
+} // end namespace AMDGPU
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 04c9a6457944..e3f54d01eb22 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -10,6 +10,7 @@
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -32,6 +33,7 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
DenseMap<MachineInstr *, MachineInstr *> SaveExecVCmpMapping;
SmallVector<std::pair<MachineInstr *, MachineInstr *>, 1> OrXors;
+ SmallVector<MachineOperand *, 1> KillFlagCandidates;
Register isCopyFromExec(const MachineInstr &MI) const;
Register isCopyToExec(const MachineInstr &MI) const;
@@ -41,15 +43,16 @@ class SIOptimizeExecMasking : public MachineFunctionPass {
MachineBasicBlock::reverse_iterator
findExecCopy(MachineBasicBlock &MBB,
MachineBasicBlock::reverse_iterator I) const;
-
bool isRegisterInUseBetween(MachineInstr &Stop, MachineInstr &Start,
MCRegister Reg, bool UseLiveOuts = false,
bool IgnoreStart = false) const;
bool isRegisterInUseAfter(MachineInstr &Stop, MCRegister Reg) const;
- MachineInstr *findInstrBackwards(MachineInstr &Origin,
- std::function<bool(MachineInstr *)> Pred,
- ArrayRef<MCRegister> NonModifiableRegs,
- unsigned MaxInstructions = 20) const;
+ MachineInstr *findInstrBackwards(
+ MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
+ ArrayRef<MCRegister> NonModifiableRegs,
+ MachineInstr *Terminator = nullptr,
+ SmallVectorImpl<MachineOperand *> *KillFlagCandidates = nullptr,
+ unsigned MaxInstructions = 20) const;
bool optimizeExecSequence();
void tryRecordVCmpxAndSaveexecSequence(MachineInstr &MI);
bool optimizeVCMPSaveExecSequence(MachineInstr &SaveExecInstr,
@@ -325,11 +328,13 @@ static bool isLiveOut(const MachineBasicBlock &MBB, unsigned Reg) {
// Backwards-iterate from Origin (for n=MaxInstructions iterations) until either
// the beginning of the BB is reached or Pred evaluates to true - which can be
// an arbitrary condition based on the current MachineInstr, for instance an
-// target instruction. Breaks prematurely by returning nullptr if one of the
+// target instruction. Breaks prematurely by returning nullptr if one of the
// registers given in NonModifiableRegs is modified by the current instruction.
MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
MachineInstr &Origin, std::function<bool(MachineInstr *)> Pred,
- ArrayRef<MCRegister> NonModifiableRegs, unsigned MaxInstructions) const {
+ ArrayRef<MCRegister> NonModifiableRegs, MachineInstr *Terminator,
+ SmallVectorImpl<MachineOperand *> *KillFlagCandidates,
+ unsigned MaxInstructions) const {
MachineBasicBlock::reverse_iterator A = Origin.getReverseIterator(),
E = Origin.getParent()->rend();
unsigned CurrentIteration = 0;
@@ -344,6 +349,21 @@ MachineInstr *SIOptimizeExecMasking::findInstrBackwards(
for (MCRegister Reg : NonModifiableRegs) {
if (A->modifiesRegister(Reg, TRI))
return nullptr;
+
+ // Check for kills that appear after the terminator instruction, that
+ // would not be detected by clearKillFlags, since they will cause the
+ // register to be dead at a later place, causing the verifier to fail.
+ // We use the candidates to clear the kill flags later.
+ if (Terminator && KillFlagCandidates && A != Terminator &&
+ A->killsRegister(Reg, TRI)) {
+ for (MachineOperand &MO : A->operands()) {
+ if (MO.isReg() && MO.isKill()) {
+ Register Candidate = MO.getReg();
+ if (Candidate != Reg && TRI->regsOverlap(Candidate, Reg))
+ KillFlagCandidates->push_back(&MO);
+ }
+ }
+ }
}
++CurrentIteration;
@@ -599,6 +619,9 @@ bool SIOptimizeExecMasking::optimizeVCMPSaveExecSequence(
if (Src1->isReg())
MRI->clearKillFlags(Src1->getReg());
+ for (MachineOperand *MO : KillFlagCandidates)
+ MO->setIsKill(false);
+
SaveExecInstr.eraseFromParent();
VCmp.eraseFromParent();
@@ -690,7 +713,8 @@ void SIOptimizeExecMasking::tryRecordVCmpxAndSaveexecSequence(
NonDefRegs.push_back(Src1->getReg());
if (!findInstrBackwards(
- MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs))
+ MI, [&](MachineInstr *Check) { return Check == VCmp; }, NonDefRegs,
+ VCmp, &KillFlagCandidates))
return;
if (VCmp)
@@ -777,6 +801,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
OrXors.clear();
SaveExecVCmpMapping.clear();
+ KillFlagCandidates.clear();
static unsigned SearchWindow = 10;
for (MachineBasicBlock &MBB : MF) {
unsigned SearchCount = 0;
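The new KillFlagCandidates plumbing exists because moving the saveexec past an instruction that still carries a kill flag for one of the watched registers would leave a stale flag behind and trip the verifier; the candidates are collected during the backwards scan and cleared only once the transformation actually fires. A much-reduced standalone model of that collection step (plain structs instead of MachineInstr/MachineOperand, and an exact register match instead of regsOverlap):

#include <cassert>
#include <vector>

struct Operand { unsigned Reg; bool IsKill; };
struct Instr   { std::vector<Operand> Ops; };

// Walk backwards from Origin (exclusive) and remember every kill of WatchedReg
// found outside the terminator, so the caller can clear those flags later.
static void collectKillCandidates(std::vector<Instr> &Block, unsigned OriginIdx,
                                  unsigned TerminatorIdx, unsigned WatchedReg,
                                  std::vector<Operand *> &KillFlagCandidates) {
  for (unsigned I = OriginIdx; I-- > 0;) {
    if (I == TerminatorIdx)
      continue;                        // the terminator itself is exempt
    for (Operand &MO : Block[I].Ops)
      if (MO.IsKill && MO.Reg == WatchedReg)
        KillFlagCandidates.push_back(&MO);
  }
}

int main() {
  std::vector<Instr> Block = {
      {{{/*Reg=*/1, /*IsKill=*/false}}},  // terminator (e.g. the v_cmp)
      {{{1, true}}},                      // kill of reg 1 -> candidate
      {{{2, true}}},                      // unrelated register
      {{{1, false}}},                     // origin of the backwards scan
  };
  std::vector<Operand *> Candidates;
  collectKillCandidates(Block, /*OriginIdx=*/3, /*TerminatorIdx=*/0,
                        /*WatchedReg=*/1, Candidates);
  assert(Candidates.size() == 1);
  for (Operand *MO : Candidates)
    MO->IsKill = false;                  // what the pass does on success
  return 0;
}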
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index e95abae88d7a..8204a70e72d9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -522,9 +522,11 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
auto *UseBlock = UseMI->getParent();
// Replace uses in Endif block
if (UseBlock == Endif) {
- if (UseMI->isPHI()) {
+ if (UseMI->isPHI())
O.setReg(NewReg);
- } else {
+ else if (UseMI->isDebugInstr())
+ continue;
+ else {
// DetectDeadLanes may mark register uses as undef without removing
// them, in which case a non-phi instruction using the original register
// may exist in the Endif block even though the register is not live
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 97b3161c7f98..53fc2c068624 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -546,7 +546,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
+ if (!Src1->isReg() || Src1->getReg().isPhysical() ||
+ Dst->getReg().isPhysical())
break;
if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
@@ -584,7 +585,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (Src1->getReg().isPhysical() || Dst->getReg().isPhysical())
+ if (!Src1->isReg() || Src1->getReg().isPhysical() ||
+ Dst->getReg().isPhysical())
break;
if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
@@ -647,7 +649,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (Src0->getReg().isPhysical() || Dst->getReg().isPhysical())
+ if (!Src0->isReg() || Src0->getReg().isPhysical() ||
+ Dst->getReg().isPhysical())
break;
return std::make_unique<SDWASrcOperand>(
@@ -675,7 +678,8 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (ValSrc->getReg().isPhysical() || Dst->getReg().isPhysical())
+ if (!ValSrc->isReg() || ValSrc->getReg().isPhysical() ||
+ Dst->getReg().isPhysical())
break;
return std::make_unique<SDWASrcOperand>(
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index c2ddfd7881ab..0c57110b4eb1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -28,6 +28,10 @@ using namespace llvm;
#define DEBUG_TYPE "si-pre-allocate-wwm-regs"
+static cl::opt<bool>
+ EnablePreallocateSGPRSpillVGPRs("amdgpu-prealloc-sgpr-spill-vgprs",
+ cl::init(false), cl::Hidden);
+
namespace {
class SIPreAllocateWWMRegs : public MachineFunctionPass {
@@ -56,11 +60,9 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LiveIntervals>();
- AU.addPreserved<LiveIntervals>();
AU.addRequired<VirtRegMap>();
AU.addRequired<LiveRegMatrix>();
- AU.addPreserved<SlotIndexes>();
- AU.setPreservesCFG();
+ AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -101,7 +103,7 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
LiveInterval &LI = LIS->getInterval(Reg);
for (MCRegister PhysReg : RegClassInfo.getOrder(MRI->getRegClass(Reg))) {
- if (!MRI->isPhysRegUsed(PhysReg) &&
+ if (!MRI->isPhysRegUsed(PhysReg, /*SkipRegMaskTest=*/true) &&
Matrix->checkInterference(LI, PhysReg) == LiveRegMatrix::IK_Free) {
Matrix->assign(LI, PhysReg);
assert(PhysReg != 0);
@@ -201,6 +203,10 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
RegClassInfo.runOnMachineFunction(MF);
+ bool PreallocateSGPRSpillVGPRs =
+ EnablePreallocateSGPRSpillVGPRs ||
+ MF.getFunction().hasFnAttribute("amdgpu-prealloc-sgpr-spill-vgprs");
+
bool RegsAssigned = false;
// We use a reverse post-order traversal of the control-flow graph to
@@ -217,6 +223,12 @@ bool SIPreAllocateWWMRegs::runOnMachineFunction(MachineFunction &MF) {
MI.getOpcode() == AMDGPU::V_SET_INACTIVE_B64)
RegsAssigned |= processDef(MI.getOperand(0));
+ if (MI.getOpcode() == AMDGPU::SI_SPILL_S32_TO_VGPR) {
+ if (!PreallocateSGPRSpillVGPRs)
+ continue;
+ RegsAssigned |= processDef(MI.getOperand(0));
+ }
+
if (MI.getOpcode() == AMDGPU::ENTER_STRICT_WWM ||
MI.getOpcode() == AMDGPU::ENTER_STRICT_WQM ||
MI.getOpcode() == AMDGPU::ENTER_PSEUDO_WM) {
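The pass now also claims the defs of SI_SPILL_S32_TO_VGPR, but only when pre-allocation was requested either through the hidden command-line option or through the "amdgpu-prealloc-sgpr-spill-vgprs" function attribute. A minimal sketch of that either/or gate, with plain stand-ins for the cl::opt and the attribute query:

#include <cassert>
#include <set>
#include <string>

// Stand-ins: the real values come from a cl::opt and Function::hasFnAttribute.
static bool EnablePreallocFlag = false;

struct FakeFunction {
  std::set<std::string> Attrs;
  bool hasFnAttribute(const std::string &Name) const {
    return Attrs.count(Name) != 0;
  }
};

static bool shouldPreallocate(const FakeFunction &F) {
  return EnablePreallocFlag ||
         F.hasFnAttribute("amdgpu-prealloc-sgpr-spill-vgprs");
}

int main() {
  FakeFunction Plain, Attributed;
  Attributed.Attrs.insert("amdgpu-prealloc-sgpr-spill-vgprs");

  assert(!shouldPreallocate(Plain));      // neither flag nor attribute
  assert(shouldPreallocate(Attributed));  // attribute alone is enough
  EnablePreallocFlag = true;
  assert(shouldPreallocate(Plain));       // flag alone is enough
  return 0;
}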
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
index 61444b14a56b..87242a4740c8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -320,6 +320,9 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
if (MI.isConditionalBranch())
return true;
+ if (MI.isMetaInstruction())
+ continue;
+
if (TII->hasUnwantedEffectsWhenEXECEmpty(MI))
return true;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index b6839c8308d8..9ed7aacc0538 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -15,27 +15,48 @@
//
#include "SIProgramInfo.h"
+#include "GCNSubtarget.h"
#include "SIDefines.h"
#include "Utils/AMDGPUBaseInfo.h"
using namespace llvm;
-uint64_t SIProgramInfo::getComputePGMRSrc1() const {
- return S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
- S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
- S_00B848_PRIV(Priv) | S_00B848_DX10_CLAMP(DX10Clamp) |
- S_00B848_DEBUG_MODE(DebugMode) | S_00B848_IEEE_MODE(IEEEMode) |
- S_00B848_WGP_MODE(WgpMode) | S_00B848_MEM_ORDERED(MemOrdered);
+uint64_t SIProgramInfo::getComputePGMRSrc1(const GCNSubtarget &ST) const {
+ uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
+ S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
+ S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode) |
+ S_00B848_WGP_MODE(WgpMode) | S_00B848_MEM_ORDERED(MemOrdered);
+
+ if (ST.hasDX10ClampMode())
+ Reg |= S_00B848_DX10_CLAMP(DX10Clamp);
+
+ if (ST.hasIEEEMode())
+ Reg |= S_00B848_IEEE_MODE(IEEEMode);
+
+ if (ST.hasRrWGMode())
+ Reg |= S_00B848_RR_WG_MODE(RrWgMode);
+
+ return Reg;
}
-uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC) const {
+uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC,
+ const GCNSubtarget &ST) const {
if (AMDGPU::isCompute(CC)) {
- return getComputePGMRSrc1();
+ return getComputePGMRSrc1(ST);
}
uint64_t Reg = S_00B848_VGPRS(VGPRBlocks) | S_00B848_SGPRS(SGPRBlocks) |
S_00B848_PRIORITY(Priority) | S_00B848_FLOAT_MODE(FloatMode) |
- S_00B848_PRIV(Priv) | S_00B848_DX10_CLAMP(DX10Clamp) |
- S_00B848_DEBUG_MODE(DebugMode) | S_00B848_IEEE_MODE(IEEEMode);
+ S_00B848_PRIV(Priv) | S_00B848_DEBUG_MODE(DebugMode);
+
+ if (ST.hasDX10ClampMode())
+ Reg |= S_00B848_DX10_CLAMP(DX10Clamp);
+
+ if (ST.hasIEEEMode())
+ Reg |= S_00B848_IEEE_MODE(IEEEMode);
+
+ if (ST.hasRrWGMode())
+ Reg |= S_00B848_RR_WG_MODE(RrWgMode);
+
switch (CC) {
case CallingConv::AMDGPU_PS:
Reg |= S_00B028_MEM_ORDERED(MemOrdered);
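getPGMRSrc1 now starts from the fields that exist on every generation and ORs in the DX10_CLAMP, IEEE_MODE and RR_WG_MODE bits only when the subtarget reports the corresponding mode. A stripped-down sketch of that conditional packing (the shift positions below are placeholders, not the real S_00B848_* layout):

#include <cassert>
#include <cstdint>

// Placeholder bit positions; the real ones come from the S_00B848_* macros.
constexpr uint64_t placeField(uint64_t Val, unsigned Shift) { return Val << Shift; }

struct FakeSubtarget {
  bool HasDX10Clamp, HasIEEEMode, HasRrWGMode;
};

static uint64_t computeRSrc1(const FakeSubtarget &ST, uint64_t VGPRBlocks,
                             uint64_t SGPRBlocks, bool DX10Clamp, bool IEEE,
                             bool RrWg) {
  uint64_t Reg = placeField(VGPRBlocks, 0) | placeField(SGPRBlocks, 6);
  if (ST.HasDX10Clamp)
    Reg |= placeField(DX10Clamp, 21);  // only written when the bit exists
  if (ST.HasIEEEMode)
    Reg |= placeField(IEEE, 23);       // likewise gated on the subtarget
  if (ST.HasRrWGMode)
    Reg |= placeField(RrWg, 29);       // RR_WG_MODE, GFX12+ per the header
  return Reg;
}

int main() {
  FakeSubtarget NewGen{/*DX10*/ false, /*IEEE*/ false, /*RrWG*/ true};
  FakeSubtarget OldGen{/*DX10*/ true,  /*IEEE*/ true,  /*RrWG*/ false};
  // When the subtarget lacks the mode, the bit must stay clear even if set.
  assert((computeRSrc1(NewGen, 1, 1, true, true, false) & (1ull << 21)) == 0);
  assert((computeRSrc1(OldGen, 1, 1, true, true, false) & (1ull << 21)) != 0);
  return 0;
}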
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index aab127e49463..8c26789f936c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -21,6 +21,8 @@
namespace llvm {
+class GCNSubtarget;
+
/// Track resource usage for kernels / entry functions.
struct SIProgramInfo {
// Fields set in PGM_RSRC1 pm4 packet.
@@ -34,6 +36,7 @@ struct SIProgramInfo {
uint32_t IEEEMode = 0;
uint32_t WgpMode = 0; // GFX10+
uint32_t MemOrdered = 0; // GFX10+
+ uint32_t RrWgMode = 0; // GFX12+
uint64_t ScratchSize = 0;
// State used to calculate fields set in PGM_RSRC2 pm4 packet.
@@ -85,8 +88,8 @@ struct SIProgramInfo {
SIProgramInfo() = default;
/// Compute the value of the ComputePGMRsrc1 register.
- uint64_t getComputePGMRSrc1() const;
- uint64_t getPGMRSrc1(CallingConv::ID CC) const;
+ uint64_t getComputePGMRSrc1(const GCNSubtarget &ST) const;
+ uint64_t getPGMRSrc1(CallingConv::ID CC, const GCNSubtarget &ST) const;
/// Compute the value of the ComputePGMRsrc2 register.
uint64_t getComputePGMRSrc2() const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index c2a272166241..021d797344c5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -19,7 +19,7 @@
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
@@ -397,6 +397,8 @@ const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
case CallingConv::AMDGPU_Gfx:
return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_SaveList
: CSR_AMDGPU_SI_Gfx_SaveList;
+ case CallingConv::AMDGPU_CS_ChainPreserve:
+ return CSR_AMDGPU_CS_ChainPreserve_SaveList;
default: {
// Dummy to not crash RegisterClassInfo.
static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
@@ -421,6 +423,11 @@ const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
case CallingConv::AMDGPU_Gfx:
return ST.hasGFX90AInsts() ? CSR_AMDGPU_SI_Gfx_GFX90AInsts_RegMask
: CSR_AMDGPU_SI_Gfx_RegMask;
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
+ // Calls to these functions never return, so we can pretend everything is
+ // preserved.
+ return AMDGPU_AllVGPRs_RegMask;
default:
return nullptr;
}
@@ -430,6 +437,10 @@ const uint32_t *SIRegisterInfo::getNoPreservedMask() const {
return CSR_AMDGPU_NoRegs_RegMask;
}
+bool SIRegisterInfo::isChainScratchRegister(Register VGPR) {
+ return VGPR >= AMDGPU::VGPR0 && VGPR < AMDGPU::VGPR8;
+}
+
const TargetRegisterClass *
SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const {
@@ -488,11 +499,11 @@ SIRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const SIFrameLowering *TFI = ST.getFrameLowering();
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- // During ISel lowering we always reserve the stack pointer in entry
+ // During ISel lowering we always reserve the stack pointer in entry and chain
// functions, but never actually want to reference it when accessing our own
// frame. If we need a frame pointer we use it, but otherwise we can just use
// an immediate "0" which we represent by returning NoRegister.
- if (FuncInfo->isEntryFunction()) {
+ if (FuncInfo->isBottomOfStack()) {
return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
}
return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
@@ -712,9 +723,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
reserveRegisterTuples(Reserved, Reg);
- for (auto Reg : MFI->getSGPRSpillVGPRs())
- reserveRegisterTuples(Reserved, Reg);
-
return Reserved;
}
@@ -725,12 +733,12 @@ bool SIRegisterInfo::isAsmClobberable(const MachineFunction &MF,
bool SIRegisterInfo::shouldRealignStack(const MachineFunction &MF) const {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- // On entry, the base address is 0, so it can't possibly need any more
- // alignment.
+ // On entry or in chain functions, the base address is 0, so it can't possibly
+ // need any more alignment.
// FIXME: Should be able to specify the entry frame alignment per calling
// convention instead.
- if (Info->isEntryFunction())
+ if (Info->isBottomOfStack())
return false;
return TargetRegisterInfo::shouldRealignStack(MF);
@@ -796,10 +804,10 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
int64_t FullOffset = Offset + getScratchInstrOffset(MI);
+ const SIInstrInfo *TII = ST.getInstrInfo();
if (SIInstrInfo::isMUBUF(*MI))
- return !SIInstrInfo::isLegalMUBUFImmOffset(FullOffset);
+ return !TII->isLegalMUBUFImmOffset(FullOffset);
- const SIInstrInfo *TII = ST.getInstrInfo();
return !TII->isLegalFLATOffset(FullOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch);
}
@@ -897,8 +905,7 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
assert(SOffset->isImm() && SOffset->getImm() == 0);
#endif
- assert(SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
- "offset should be legal");
+ assert(TII->isLegalMUBUFImmOffset(NewOffset) && "offset should be legal");
FIOp->ChangeToRegister(BaseReg, false);
OffsetOp->setImm(NewOffset);
@@ -912,10 +919,10 @@ bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
int64_t NewOffset = Offset + getScratchInstrOffset(MI);
+ const SIInstrInfo *TII = ST.getInstrInfo();
if (SIInstrInfo::isMUBUF(*MI))
- return SIInstrInfo::isLegalMUBUFImmOffset(NewOffset);
+ return TII->isLegalMUBUFImmOffset(NewOffset);
- const SIInstrInfo *TII = ST.getInstrInfo();
return TII->isLegalFLATOffset(NewOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch);
}
@@ -1068,6 +1075,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_AV32_RESTORE:
case AMDGPU::SI_SPILL_WWM_V32_SAVE:
case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
+ case AMDGPU::SI_SPILL_WWM_AV32_SAVE:
+ case AMDGPU::SI_SPILL_WWM_AV32_RESTORE:
return 1;
default: llvm_unreachable("Invalid spill opcode");
}
@@ -1310,8 +1319,8 @@ void SIRegisterInfo::buildSpillLoadStore(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL,
unsigned LoadStoreOp, int Index, Register ValueReg, bool IsKill,
MCRegister ScratchOffsetReg, int64_t InstOffset, MachineMemOperand *MMO,
- RegScavenger *RS, LivePhysRegs *LiveRegs) const {
- assert((!RS || !LiveRegs) && "Only RS or LiveRegs can be set but not both");
+ RegScavenger *RS, LiveRegUnits *LiveUnits) const {
+ assert((!RS || !LiveUnits) && "Only RS or LiveUnits can be set but not both");
MachineFunction *MF = MBB.getParent();
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -1394,12 +1403,12 @@ void SIRegisterInfo::buildSpillLoadStore(
bool IsOffsetLegal =
IsFlat ? TII->isLegalFLATOffset(MaxOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch)
- : SIInstrInfo::isLegalMUBUFImmOffset(MaxOffset);
+ : TII->isLegalMUBUFImmOffset(MaxOffset);
if (!IsOffsetLegal || (IsFlat && !SOffset && !ST.hasFlatScratchSTMode())) {
SOffset = MCRegister();
// We don't have access to the register scavenger if this function is called
- // during PEI::scavengeFrameVirtualRegs() so use LiveRegs in this case.
+ // during PEI::scavengeFrameVirtualRegs() so use LiveUnits in this case.
// TODO: Clobbering SCC is not necessary for scratch instructions in the
// entry.
if (RS) {
@@ -1407,10 +1416,10 @@ void SIRegisterInfo::buildSpillLoadStore(
// Piggy back on the liveness scan we just did see if SCC is dead.
CanClobberSCC = !RS->isRegUsed(AMDGPU::SCC);
- } else if (LiveRegs) {
- CanClobberSCC = !LiveRegs->contains(AMDGPU::SCC);
+ } else if (LiveUnits) {
+ CanClobberSCC = LiveUnits->available(AMDGPU::SCC);
for (MCRegister Reg : AMDGPU::SGPR_32RegClass) {
- if (LiveRegs->available(MF->getRegInfo(), Reg)) {
+ if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
SOffset = Reg;
break;
}
@@ -1426,9 +1435,9 @@ void SIRegisterInfo::buildSpillLoadStore(
if (RS) {
TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false, 0);
} else {
- assert(LiveRegs);
+ assert(LiveUnits);
for (MCRegister Reg : AMDGPU::VGPR_32RegClass) {
- if (LiveRegs->available(MF->getRegInfo(), Reg)) {
+ if (LiveUnits->available(Reg) && !MF->getRegInfo().isReserved(Reg)) {
TmpOffsetVGPR = Reg;
break;
}
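Switching from LivePhysRegs to LiveRegUnits changes the query: availability is asked of the unit set directly, and reserved registers now have to be rejected explicitly, which is why the !isReserved check appears in both scans above. A small standalone model of that search for a free register (bitsets stand in for the live-unit and reserved sets):

#include <bitset>
#include <cassert>
#include <optional>

constexpr unsigned NumSGPRs = 16;  // tiny register file for the sketch

// LiveUnits->available(Reg) becomes !Live[Reg]; isReserved(Reg) stays a
// separate test, which is why the patch adds it explicitly.
static std::optional<unsigned> pickFreeSGPR(const std::bitset<NumSGPRs> &Live,
                                            const std::bitset<NumSGPRs> &Reserved) {
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg)
    if (!Live[Reg] && !Reserved[Reg])
      return Reg;
  return std::nullopt;
}

int main() {
  std::bitset<NumSGPRs> Live, Reserved;
  Live.set(0);      // s0 is live across the spill point
  Reserved.set(1);  // s1 is reserved (e.g. stack pointer)
  auto SOffset = pickFreeSGPR(Live, Reserved);
  assert(SOffset && *SOffset == 2);  // first register that is neither
  return 0;
}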
@@ -1639,7 +1648,7 @@ void SIRegisterInfo::buildSpillLoadStore(
if (UseVGPROffset && ScratchOffsetReg) {
MIB.addReg(ScratchOffsetReg);
} else {
- assert(FuncInfo->isEntryFunction());
+ assert(FuncInfo->isBottomOfStack());
MIB.addImm(0);
}
}
@@ -1736,10 +1745,13 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
RegScavenger *RS, SlotIndexes *Indexes,
- LiveIntervals *LIS, bool OnlyToVGPR) const {
+ LiveIntervals *LIS, bool OnlyToVGPR,
+ bool SpillToPhysVGPRLane) const {
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index);
+ ArrayRef<SpilledReg> VGPRSpills =
+ SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
+ : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
@@ -1767,7 +1779,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
// Mark the "old value of vgpr" input undef only if this is the first sgpr
// spill to this specific vgpr in the first basic block.
auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
- SB.TII.get(AMDGPU::V_WRITELANE_B32), Spill.VGPR)
+ SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), Spill.VGPR)
.addReg(SubReg, getKillRegState(UseKill))
.addImm(Spill.Lane)
.addReg(Spill.VGPR);
@@ -1813,8 +1825,8 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
MachineInstrBuilder WriteLane =
- BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_WRITELANE_B32),
- SB.TmpVGPR)
+ BuildMI(*SB.MBB, MI, SB.DL,
+ SB.TII.get(AMDGPU::SI_SPILL_S32_TO_VGPR), SB.TmpVGPR)
.addReg(SubReg, SubKillState)
.addImm(i % PVD.PerVGPR)
.addReg(SB.TmpVGPR, TmpVGPRFlags);
@@ -1856,10 +1868,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
RegScavenger *RS, SlotIndexes *Indexes,
- LiveIntervals *LIS, bool OnlyToVGPR) const {
+ LiveIntervals *LIS, bool OnlyToVGPR,
+ bool SpillToPhysVGPRLane) const {
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index);
+ ArrayRef<SpilledReg> VGPRSpills =
+ SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
+ : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
@@ -1872,8 +1887,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
: Register(getSubReg(SB.SuperReg, SB.SplitParts[i]));
SpilledReg Spill = VGPRSpills[i];
- auto MIB = BuildMI(*SB.MBB, MI, SB.DL, SB.TII.get(AMDGPU::V_READLANE_B32),
- SubReg)
+ auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
+ SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
.addReg(Spill.VGPR)
.addImm(Spill.Lane);
if (SB.NumSubRegs > 1 && i == 0)
@@ -1906,7 +1921,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
bool LastSubReg = (i + 1 == e);
auto MIB = BuildMI(*SB.MBB, MI, SB.DL,
- SB.TII.get(AMDGPU::V_READLANE_B32), SubReg)
+ SB.TII.get(AMDGPU::SI_RESTORE_S32_FROM_VGPR), SubReg)
.addReg(SB.TmpVGPR, getKillRegState(LastSubReg))
.addImm(i);
if (SB.NumSubRegs > 1 && i == 0)
@@ -2005,7 +2020,7 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
/// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
- SlotIndexes *Indexes, LiveIntervals *LIS) const {
+ SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
switch (MI->getOpcode()) {
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
@@ -2021,7 +2036,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
- return spillSGPR(MI, FI, RS, Indexes, LIS, true);
+ return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S384_RESTORE:
@@ -2036,7 +2051,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE:
- return restoreSGPR(MI, FI, RS, Indexes, LIS, true);
+ return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
default:
llvm_unreachable("not an SGPR spill instruction");
}
@@ -2141,7 +2156,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_AV96_SAVE:
case AMDGPU::SI_SPILL_AV64_SAVE:
case AMDGPU::SI_SPILL_AV32_SAVE:
- case AMDGPU::SI_SPILL_WWM_V32_SAVE: {
+ case AMDGPU::SI_SPILL_WWM_V32_SAVE:
+ case AMDGPU::SI_SPILL_WWM_AV32_SAVE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
@@ -2208,7 +2224,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_AV384_RESTORE:
case AMDGPU::SI_SPILL_AV512_RESTORE:
case AMDGPU::SI_SPILL_AV1024_RESTORE:
- case AMDGPU::SI_SPILL_WWM_V32_RESTORE: {
+ case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
+ case AMDGPU::SI_SPILL_WWM_AV32_RESTORE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
@@ -2406,7 +2423,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
bool IsMUBUF = TII->isMUBUF(*MI);
- if (!IsMUBUF && !MFI->isEntryFunction()) {
+ if (!IsMUBUF && !MFI->isBottomOfStack()) {
// Convert to a swizzled stack address by scaling by the wave size.
// In an entry function/kernel the offset is already swizzled.
bool IsSALU = isSGPRClass(TII->getOpRegClass(*MI, FIOperandNum));
@@ -2425,10 +2442,13 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (Offset == 0) {
unsigned OpCode = IsSALU && !LiveSCC ? AMDGPU::S_LSHR_B32
: AMDGPU::V_LSHRREV_B32_e64;
- // XXX - This never happens because of emergency scavenging slot at 0?
- auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg)
- .addImm(ST.getWavefrontSizeLog2())
- .addReg(FrameReg);
+ auto Shift = BuildMI(*MBB, MI, DL, TII->get(OpCode), ResultReg);
+ if (OpCode == AMDGPU::V_LSHRREV_B32_e64)
+ // For V_LSHRREV, the operands are reversed (the shift count goes
+ // first).
+ Shift.addImm(ST.getWavefrontSizeLog2()).addReg(FrameReg);
+ else
+ Shift.addReg(FrameReg).addImm(ST.getWavefrontSizeLog2());
if (IsSALU && !LiveSCC)
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
if (IsSALU && LiveSCC) {
@@ -2541,7 +2561,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
= TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm();
int64_t NewOffset = OldImm + Offset;
- if (SIInstrInfo::isLegalMUBUFImmOffset(NewOffset) &&
+ if (TII->isLegalMUBUFImmOffset(NewOffset) &&
buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
MI->eraseFromParent();
return true;
@@ -2568,6 +2588,10 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
return AMDGPUInstPrinter::getRegisterName(Reg);
}
+unsigned AMDGPU::getRegBitWidth(const TargetRegisterClass &RC) {
+ return getRegBitWidth(RC.getID());
+}
+
static const TargetRegisterClass *
getAnyVGPRClassForBitWidth(unsigned BitWidth) {
if (BitWidth == 64)
@@ -3059,7 +3083,8 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
const RegisterBank &RB) const {
switch (RB.getID()) {
case AMDGPU::VGPRRegBankID:
- return getVGPRClassForBitWidth(std::max(32u, Size));
+ return getVGPRClassForBitWidth(
+ std::max(ST.useRealTrue16Insts() ? 16u : 32u, Size));
case AMDGPU::VCCRegBankID:
assert(Size == 1);
return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 2120b47c581e..88d568672098 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -14,6 +14,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
+#include "llvm/ADT/BitVector.h"
+
#define GET_REGINFO_HEADER
#include "AMDGPUGenRegisterInfo.inc"
@@ -23,7 +25,7 @@ namespace llvm {
class GCNSubtarget;
class LiveIntervals;
-class LivePhysRegs;
+class LiveRegUnits;
class RegisterBank;
struct SGPRSpillBuilder;
@@ -90,6 +92,11 @@ public:
CallingConv::ID) const override;
const uint32_t *getNoPreservedMask() const override;
+ // Functions with the amdgpu_cs_chain or amdgpu_cs_chain_preserve calling
+ // conventions are free to use certain VGPRs without saving and restoring any
+ // lanes (not even inactive ones).
+ static bool isChainScratchRegister(Register VGPR);
+
// Stack access is very expensive. CSRs are also the high registers, and we
// want to minimize the number of used registers.
unsigned getCSRFirstUseCost() const override {
@@ -142,31 +149,30 @@ public:
void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset,
bool IsLoad, bool IsKill = true) const;
- /// If \p OnlyToVGPR is true, this will only succeed if this
+ /// If \p OnlyToVGPR is true, this will only succeed if this manages to find a
+ /// free VGPR lane to spill.
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
- bool OnlyToVGPR = false) const;
+ bool OnlyToVGPR = false,
+ bool SpillToPhysVGPRLane = false) const;
bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
- bool OnlyToVGPR = false) const;
+ bool OnlyToVGPR = false,
+ bool SpillToPhysVGPRLane = false) const;
bool spillEmergencySGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock &RestoreMBB, Register SGPR,
RegScavenger *RS) const;
- bool supportsBackwardScavenger() const override {
- return true;
- }
-
bool eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS) const override;
- bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
- int FI, RegScavenger *RS,
- SlotIndexes *Indexes = nullptr,
- LiveIntervals *LIS = nullptr) const;
+ bool eliminateSGPRToVGPRSpillFrameIndex(
+ MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
+ SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
+ bool SpillToPhysVGPRLane = false) const;
StringRef getRegAsmName(MCRegister Reg) const override;
@@ -416,14 +422,14 @@ public:
// Insert spill or restore instructions.
// When lowering spill pseudos, the RegScavenger should be set.
// For creating spill instructions during frame lowering, where no scavenger
- // is available, LiveRegs can be used.
+ // is available, LiveUnits can be used.
void buildSpillLoadStore(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, const DebugLoc &DL,
unsigned LoadStoreOp, int Index, Register ValueReg,
bool ValueIsKill, MCRegister ScratchOffsetReg,
int64_t InstrOffset, MachineMemOperand *MMO,
RegScavenger *RS,
- LivePhysRegs *LiveRegs = nullptr) const;
+ LiveRegUnits *LiveUnits = nullptr) const;
// Return alignment in register file of first register in a register tuple.
unsigned getRegClassAlignmentNumBits(const TargetRegisterClass *RC) const {
@@ -445,6 +451,11 @@ public:
unsigned SubReg) const;
};
+namespace AMDGPU {
+/// Get the size in bits of a register from the register class \p RC.
+unsigned getRegBitWidth(const TargetRegisterClass &RC);
+} // namespace AMDGPU
+
} // End namespace llvm
#endif
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index b2b1b458a63a..981da13fe089 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -122,10 +122,18 @@ class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC,
//===----------------------------------------------------------------------===//
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
-class SIReg <string n, bits<16> regIdx = 0> :
- Register<n> {
+class SIReg <string n, bits<8> regIdx = 0, bit isAGPROrVGPR = 0,
+ bit isHi = 0> : Register<n> {
let Namespace = "AMDGPU";
- let HWEncoding = regIdx;
+
+ // These are generic helper values we use to form actual register
+ // codes. They should not be assumed to match any particular register
+ // encodings on any particular subtargets.
+ let HWEncoding{7-0} = regIdx;
+ let HWEncoding{8} = isAGPROrVGPR;
+ let HWEncoding{9} = isHi;
+
+ int Index = !cast<int>(regIdx);
}
// For register classes that use TSFlags.
@@ -148,28 +156,22 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
let TSFlags{4} = HasSGPR;
}
-multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1,
- bit HWEncodingHigh = 0> {
- // There is no special encoding for 16 bit subregs, these are not real
- // registers but rather operands for instructions preserving other 16 bits
- // of the result or reading just 16 bits of a 32 bit VGPR.
- // It is encoded as a corresponding 32 bit register.
- // Non-VGPR register classes use it as we need to have matching subregisters
- // to move instructions and data between ALUs.
- def _LO16 : SIReg<n#".l", regIdx> {
- let HWEncoding{8} = HWEncodingHigh;
- }
- def _HI16 : SIReg<!if(ArtificialHigh, "", n#".h"), regIdx> {
+multiclass SIRegLoHi16 <string n, bits<8> regIdx, bit ArtificialHigh = 1,
+ bit isAGPROrVGPR = 0> {
+ def _LO16 : SIReg<n#".l", regIdx, isAGPROrVGPR>;
+ def _HI16 : SIReg<!if(ArtificialHigh, "", n#".h"), regIdx, isAGPROrVGPR,
+ /* isHi */ 1> {
let isArtificial = ArtificialHigh;
- let HWEncoding{8} = HWEncodingHigh;
}
def "" : RegisterWithSubRegs<n, [!cast<Register>(NAME#"_LO16"),
!cast<Register>(NAME#"_HI16")]> {
let Namespace = "AMDGPU";
let SubRegIndices = [lo16, hi16];
let CoveredBySubRegs = !not(ArtificialHigh);
- let HWEncoding = regIdx;
- let HWEncoding{8} = HWEncodingHigh;
+ let HWEncoding{7-0} = regIdx;
+ let HWEncoding{8} = isAGPROrVGPR;
+
+ int Index = !cast<int>(regIdx);
}
}
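The HWEncoding change narrows regIdx to 8 bits and moves the AGPR/VGPR and hi-half markers into dedicated bits 8 and 9; per the comment in the class itself, these are helper values rather than a literal hardware encoding. A tiny C++ pack/unpack of that 10-bit layout:

#include <cassert>
#include <cstdint>

// Bit layout mirroring the .td: [7:0] regIdx, [8] isAGPROrVGPR, [9] isHi.
constexpr uint16_t packHWEncoding(uint8_t RegIdx, bool IsAGPROrVGPR, bool IsHi) {
  return uint16_t(RegIdx) | (uint16_t(IsAGPROrVGPR) << 8) | (uint16_t(IsHi) << 9);
}
constexpr uint8_t regIdx(uint16_t Enc)    { return Enc & 0xff; }
constexpr bool isAGPROrVGPR(uint16_t Enc) { return (Enc >> 8) & 1; }
constexpr bool isHi(uint16_t Enc)         { return (Enc >> 9) & 1; }

// v3.h would carry regIdx 3 with both marker bits set.
static_assert(regIdx(packHWEncoding(3, true, true)) == 3);
static_assert(isAGPROrVGPR(packHWEncoding(3, true, true)));
static_assert(isHi(packHWEncoding(3, true, true)));
static_assert(!isHi(packHWEncoding(3, true, false)));  // the .l half

int main() { return 0; }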
@@ -247,7 +249,7 @@ def SGPR_NULL64 :
// the high 32 bits. The lower 32 bits are always zero (for base) or
// -1 (for limit). Since we cannot access the high 32 bits, when we
// need them, we need to do a 64 bit load and extract the bits manually.
-multiclass ApertureRegister<string name, bits<16> regIdx> {
+multiclass ApertureRegister<string name, bits<8> regIdx> {
let isConstant = true in {
// FIXME: We shouldn't need to define subregisters for these (nor add them to any 16 bit
// register classes), but if we don't it seems to confuse the TableGen
@@ -315,7 +317,7 @@ foreach Index = 0...15 in {
defm TTMP#Index : SIRegLoHi16<"ttmp"#Index, 0>;
}
-multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
+multiclass FLAT_SCR_LOHI_m <string n, bits<8> ci_e, bits<8> vi_e> {
defm _ci : SIRegLoHi16<n, ci_e>;
defm _vi : SIRegLoHi16<n, vi_e>;
defm "" : SIRegLoHi16<n, 0>;
@@ -412,7 +414,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
// SGPR 64-bit registers
def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">;
-// SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs.
+// SGPR 96-bit registers.
def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 4, 3, "s">;
// SGPR 128-bit registers
@@ -591,7 +593,6 @@ def VGPR_LO16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
let AllocationPriority = 0;
let Size = 16;
let GeneratePressureSet = 0;
- let BaseClassOrder = 16;
}
def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
@@ -599,9 +600,34 @@ def VGPR_HI16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
let AllocationPriority = 0;
let Size = 16;
let GeneratePressureSet = 0;
+}
+
+// VOP3 and VINTERP can access 256 lo and 256 hi registers.
+def VGPR_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
+ (add (interleave (sequence "VGPR%u_LO16", 0, 255),
+ (sequence "VGPR%u_HI16", 0, 255)))> {
+ let AllocationPriority = 2;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+
+ // This is the base class for VGPR{128..255}_{LO16,HI16}.
let BaseClassOrder = 17;
}
+// VOP1/2/C can access the first 128 lo and 128 hi registers.
+// The order of registers in the class determines order of allocation, so it is
+// important to interleave lo and hi registers.
+def VGPR_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
+ (add (interleave (sequence "VGPR%u_LO16", 0, 127),
+ (sequence "VGPR%u_HI16", 0, 127)))> {
+ let Size = 16;
+ let GeneratePressureSet = 0;
+ let isAllocatable = 0;
+
+ // This is the base class for VGPR{0..127}_{LO16,HI16}.
+ let BaseClassOrder = 16;
+}
+
// VGPR 32-bit registers
// i16/f16 only on VI+
def VGPR_32 : SIRegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
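Interleaving matters because allocation order follows member order in the class, so alternating lo and hi halves presumably lets both halves of one 32-bit VGPR be used before the allocator touches the next one. A quick sketch of what the interleave of the two sequences yields for the first few members (plain C++ string generation, names only):

#include <cstdio>
#include <string>
#include <vector>

// Emulates TableGen's interleave of two equally sized sequences.
static std::vector<std::string> interleave(const std::vector<std::string> &A,
                                           const std::vector<std::string> &B) {
  std::vector<std::string> Out;
  for (size_t I = 0; I < A.size(); ++I) {
    Out.push_back(A[I]);
    Out.push_back(B[I]);
  }
  return Out;
}

int main() {
  std::vector<std::string> Lo, Hi;
  for (int I = 0; I < 3; ++I) {
    Lo.push_back("VGPR" + std::to_string(I) + "_LO16");
    Hi.push_back("VGPR" + std::to_string(I) + "_HI16");
  }
  // Prints VGPR0_LO16 VGPR0_HI16 VGPR1_LO16 VGPR1_HI16 ...
  for (const std::string &Name : interleave(Lo, Hi))
    std::printf("%s ", Name.c_str());
  std::printf("\n");
  return 0;
}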
@@ -904,7 +930,7 @@ defm "" : SRegClass<11, [v11i32, v11f32], SGPR_352Regs, TTMP_352Regs>;
defm "" : SRegClass<12, [v12i32, v12f32], SGPR_384Regs, TTMP_384Regs>;
let GlobalPriority = true in {
-defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64], SGPR_512Regs, TTMP_512Regs>;
+defm "" : SRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], SGPR_512Regs, TTMP_512Regs>;
defm "" : SRegClass<32, [v32i32, v32f32, v16i64, v16f64], SGPR_1024Regs>;
}
@@ -958,7 +984,7 @@ defm VReg_352 : VRegClass<11, [v11i32, v11f32], (add VGPR_352)>;
defm VReg_384 : VRegClass<12, [v12i32, v12f32], (add VGPR_384)>;
let GlobalPriority = true in {
-defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
+defm VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64, v32i16, v32f16], (add VGPR_512)>;
defm VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
}
@@ -1008,6 +1034,18 @@ def VReg_1 : SIRegisterClass<"AMDGPU", [i1], 32, (add)> {
let HasVGPR = 1;
}
+def VS_16 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
+ (add VGPR_16, SReg_32, LDS_DIRECT_CLASS)> {
+ let isAllocatable = 0;
+ let HasVGPR = 1;
+}
+
+def VS_16_Lo128 : SIRegisterClass<"AMDGPU", Reg16Types.types, 16,
+ (add VGPR_16_Lo128, SReg_32, LDS_DIRECT_CLASS)> {
+ let isAllocatable = 0;
+ let HasVGPR = 1;
+}
+
def VS_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
@@ -1094,6 +1132,30 @@ class RegOrF16 <string RegisterClass, string OperandTypePrefix>
: RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16",
!subst("_f16", "F16", NAME), "_Imm16">;
+class RegOrB16T <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT16",
+ !subst("_b16", "B16", NAME), "_Imm16"> {
+ let EncoderMethod = "getMachineOpValueT16";
+}
+
+class RegOrF16T <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16",
+ !subst("_f16", "F16", NAME), "_Imm16"> {
+ let EncoderMethod = "getMachineOpValueT16";
+}
+
+class RegOrB16_Lo128T <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT16",
+ !subst("_b16_Lo128", "B16_Lo128", NAME), "_Imm16"> {
+ let EncoderMethod = "getMachineOpValueT16Lo128";
+}
+
+class RegOrF16_Lo128T <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16",
+ !subst("_f16_Lo128", "F16_Lo128", NAME), "_Imm16"> {
+ let EncoderMethod = "getMachineOpValueT16Lo128";
+}
+
class RegOrB32 <string RegisterClass, string OperandTypePrefix>
: RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT32",
!subst("_b32", "B32", NAME), "_Imm32">;
@@ -1149,10 +1211,13 @@ class RegOrF16_Lo128_Deferred <string RegisterClass,
: RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16_DEFERRED",
!subst("_f16_Lo128_Deferred", "F16_Lo128", NAME),
"_Deferred_Imm16">;
+
//===----------------------------------------------------------------------===//
// SSrc_* Operands with an SGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
+def SSrc_b16 : RegOrB16 <"SReg_32", "OPERAND_REG_IMM">;
+def SSrc_f16 : RegOrF16 <"SReg_32", "OPERAND_REG_IMM">;
def SSrc_b32 : RegOrB32 <"SReg_32", "OPERAND_REG_IMM">;
def SSrc_f32 : RegOrF32 <"SReg_32", "OPERAND_REG_IMM">;
def SSrc_b64 : RegOrB64 <"SReg_64", "OPERAND_REG_IMM">;
@@ -1160,6 +1225,13 @@ def SSrc_b64 : RegOrB64 <"SReg_64", "OPERAND_REG_IMM">;
def SSrcOrLds_b32 : RegOrB32 <"SRegOrLds_32", "OPERAND_REG_IMM">;
//===----------------------------------------------------------------------===//
+// SSrc_32_Deferred Operands with an SGPR or a 32-bit immediate for use with
+// FMAMK/FMAAK
+//===----------------------------------------------------------------------===//
+
+def SSrc_f32_Deferred : RegOrF32_Deferred<"SReg_32", "OPERAND_REG_IMM">;
+
+//===----------------------------------------------------------------------===//
// SCSrc_* Operands with an SGPR or a inline constant
//===----------------------------------------------------------------------===//
@@ -1170,20 +1242,41 @@ def SCSrc_b64 : RegOrB64 <"SReg_64", "OPERAND_REG_INLINE_C">;
// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
+// The current (and, for now, future) default use case for VOP3.
def VSrc_b16 : RegOrB16 <"VS_32", "OPERAND_REG_IMM">;
def VSrc_f16 : RegOrF16 <"VS_32", "OPERAND_REG_IMM">;
+
+// True16 VOP3 operands.
+def VSrcT_b16 : RegOrB16T <"VS_16", "OPERAND_REG_IMM"> {
+ let DecoderMethod = "decodeOperand_VSrcT16";
+}
+def VSrcT_f16 : RegOrF16T <"VS_16", "OPERAND_REG_IMM"> {
+ let DecoderMethod = "decodeOperand_VSrcT16";
+}
+
+// True16 VOP1/2/C operands.
+def VSrcT_b16_Lo128 : RegOrB16_Lo128T <"VS_16_Lo128", "OPERAND_REG_IMM"> {
+ let DecoderMethod = "decodeOperand_VSrcT16_Lo128";
+}
+def VSrcT_f16_Lo128 : RegOrF16_Lo128T <"VS_16_Lo128", "OPERAND_REG_IMM"> {
+ let DecoderMethod = "decodeOperand_VSrcT16_Lo128";
+}
+
+// The current (and, for now, future) default use case for fake VOP1/2/C.
+def VSrcFake16_b16_Lo128 : RegOrB16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
+def VSrcFake16_f16_Lo128 : RegOrF16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
+
def VSrc_b32 : RegOrB32 <"VS_32", "OPERAND_REG_IMM">;
def VSrc_f32 : RegOrF32 <"VS_32", "OPERAND_REG_IMM">;
def VSrc_v2b16 : RegOrV2B16 <"VS_32", "OPERAND_REG_IMM">;
def VSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_IMM">;
def VSrc_b64 : RegOrB64 <"VS_64", "OPERAND_REG_IMM">;
-def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM">;
+def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM"> {
+ let DecoderMethod = "decodeOperand_VSrc_f64";
+}
def VSrc_v2b32 : RegOrV2B32 <"VS_64", "OPERAND_REG_IMM">;
def VSrc_v2f32 : RegOrV2F32 <"VS_64", "OPERAND_REG_IMM">;
-def VSrcT_b16_Lo128 : RegOrB16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
-def VSrcT_f16_Lo128 : RegOrF16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
-
//===----------------------------------------------------------------------===//
// VSrc_*_Deferred Operands with an SGPR, VGPR or a 32-bit immediate for use
// with FMAMK/FMAAK
@@ -1192,8 +1285,8 @@ def VSrcT_f16_Lo128 : RegOrF16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
def VSrc_f16_Deferred : RegOrF16_Deferred<"VS_32", "OPERAND_REG_IMM">;
def VSrc_f32_Deferred : RegOrF32_Deferred<"VS_32", "OPERAND_REG_IMM">;
-def VSrcT_f16_Lo128_Deferred : RegOrF16_Lo128_Deferred<"VS_32_Lo128",
- "OPERAND_REG_IMM">;
+def VSrcFake16_f16_Lo128_Deferred : RegOrF16_Lo128_Deferred<"VS_32_Lo128",
+ "OPERAND_REG_IMM">;
//===----------------------------------------------------------------------===//
// VRegSrc_* Operands with a VGPR
@@ -1233,6 +1326,11 @@ def VGPRSrc_32_Lo128 : RegisterOperand<VGPR_32_Lo128> {
let DecoderMethod = "DecodeVGPR_32RegisterClass";
}
+def VGPRSrc_16_Lo128 : RegisterOperand<VGPR_16_Lo128> {
+ let DecoderMethod = "DecodeVGPR_16_Lo128RegisterClass";
+ let EncoderMethod = "getMachineOpValueT16Lo128";
+}
+
//===----------------------------------------------------------------------===//
// ASrc_* Operands with an AccVGPR
//===----------------------------------------------------------------------===//
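A minimal C++ sketch of the allocation order implied by the interleave comment on the VGPR_16 / VGPR_16_Lo128 classes above, assuming the (interleave ...) set operator alternates elements of its two register sequences; the printed names are the register definitions from this file:

#include <cstdio>

int main() {
  // Both 16-bit halves of a VGPR are adjacent in the interleaved order, so
  // the allocator fills a whole VGPR before moving on to the next one.
  for (int Reg = 0; Reg < 4; ++Reg)
    std::printf("VGPR%d_LO16 VGPR%d_HI16 ", Reg, Reg);
  std::printf("...\n");
  return 0;
}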
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td
index 53441b5a4ced..b0e8e4112254 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -65,6 +65,12 @@ def Write16PassMAI : SchedWrite;
def Write4PassDGEMM : SchedWrite;
def Write8PassDGEMM : SchedWrite;
+// Scalar float instructions
+def WriteSFPU : SchedWrite;
+
+// F16 or F32 pseudo scalar transcendental instructions
+def WritePseudoScalarTrans : SchedWrite;
+
// FIXME: Should there be a class for instructions which are VALU
// instructions and have VALU rates, but write to the SALU (i.e. VOPC
// instructions)
@@ -90,6 +96,7 @@ def SIDPFullSpeedModel : SISchedMachineModel;
def SIDPGFX940FullSpeedModel : SISchedMachineModel;
def GFX10SpeedModel : SISchedMachineModel;
def GFX11SpeedModel : SISchedMachineModel;
+def GFX12SpeedModel : SISchedMachineModel;
// XXX: Are the resource counts correct?
def HWBranch : ProcResource<1> {
@@ -128,6 +135,10 @@ class HWWriteRes<SchedWrite write, list<ProcResourceKind> resources,
class HWVALUWriteRes<SchedWrite write, int latency> :
HWWriteRes<write, [HWVALU], latency>;
+class UnsupportedWriteRes<SchedWrite write> : WriteRes<write, []> {
+ let Unsupported = 1;
+}
+
def PredMIReadVGPR : SchedPredicate<[{TII->hasVGPRUses(*MI)}]>;
def MIReadVGPR : SchedReadVariant<[
@@ -157,14 +168,17 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<Write4PassDGEMM, 4>;
def : HWVALUWriteRes<Write8PassDGEMM, 16>;
- let ResourceCycles = [2] in
+ let ReleaseAtCycles = [2] in
def : HWWriteRes<Write2PassMAI, [HWXDL], 2>;
- let ResourceCycles = [4] in
+ let ReleaseAtCycles = [4] in
def : HWWriteRes<Write4PassMAI, [HWXDL], 4>;
- let ResourceCycles = [8] in
+ let ReleaseAtCycles = [8] in
def : HWWriteRes<Write8PassMAI, [HWXDL], 8>;
- let ResourceCycles = [16] in
+ let ReleaseAtCycles = [16] in
def : HWWriteRes<Write16PassMAI, [HWXDL], 16>;
+
+ def : UnsupportedWriteRes<WriteSFPU>;
+ def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1
def : ReadAdvance<MIVGPRRead, -2>;
@@ -307,6 +321,9 @@ def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
+
+def : UnsupportedWriteRes<WriteSFPU>;
+def : UnsupportedWriteRes<WritePseudoScalarTrans>;
} // End RetireOOO = 1
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -315,26 +332,61 @@ def : InstRW<[WriteCopy], (instrs COPY)>;
let SchedModel = GFX11SpeedModel in {
+// The latency values are 1 / (operations / cycle).
+// Add 1 stall cycle for VGPR read.
+let RetireOOO = 1 in { // llvm-mca specific flag
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
-def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
+def : HWWriteRes<WriteTrans32, [HWTransVALU, HWRC], 10>;
def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>;
def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
-def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWTransVALU, HWRC], 40>;
def : HWWriteRes<WriteBranch, [HWBranch], 32>;
def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
+def : HWWriteRes<WriteSFPU, [HWSALU, HWRC], 4>;
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
+} // End RetireOOO = 1
+
+def : UnsupportedWriteRes<WritePseudoScalarTrans>;
def : InstRW<[WriteCopy], (instrs COPY)>;
} // End SchedModel = GFX11SpeedModel
+
+let SchedModel = GFX12SpeedModel in {
+
+def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
+def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
+def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
+def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
+def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 38>;
+def : HWWriteRes<WriteIntMul, [HWVALU, HWRC], 8>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 40>;
+def : HWWriteRes<WritePseudoScalarTrans, [HWVALU, HWRC], 7>;
+
+def : HWWriteRes<WriteBranch, [HWBranch], 32>;
+def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
+def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
+def : HWWriteRes<WriteSFPU, [HWSALU, HWRC], 4>;
+def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
+def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
+def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
+
+def : InstRW<[WriteCopy], (instrs COPY)>;
+
+} // End SchedModel = GFX12SpeedModel
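A small worked instance of the latency rule stated above the GFX11 model, latency = 1 / (operations per cycle) plus one stall cycle for the VGPR read; the rates used here are illustrative inputs only, not claims about any particular write resource:

#include <cassert>

// latency = 1 / (operations per cycle) + 1 stall cycle for the VGPR read.
constexpr int schedLatency(double OpsPerCycle, int VGPRReadStall = 1) {
  return static_cast<int>(1.0 / OpsPerCycle) + VGPRReadStall;
}

static_assert(schedLatency(0.25) == 5, "0.25 ops/cycle -> 4 cycles + 1 stall");
static_assert(schedLatency(0.125) == 9, "0.125 ops/cycle -> 8 cycles + 1 stall");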
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 4159dc694c1e..d290dd82b760 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -104,8 +104,7 @@ bool SIShrinkInstructions::foldImmediates(MachineInstr &MI,
bool ConstantFolded = false;
if (TII->isOperandLegal(MI, Src0Idx, &MovSrc)) {
- if (MovSrc.isImm() &&
- (isInt<32>(MovSrc.getImm()) || isUInt<32>(MovSrc.getImm()))) {
+ if (MovSrc.isImm()) {
Src0.ChangeToImmediate(MovSrc.getImm());
ConstantFolded = true;
} else if (MovSrc.isFI()) {
@@ -160,7 +159,7 @@ bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
}
bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
- return isInt<16>(Src.getImm()) &&
+ return isInt<16>(SignExtend64(Src.getImm(), 32)) &&
!TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}
@@ -171,7 +170,7 @@ bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
bool &IsUnsigned) const {
- if (isInt<16>(Src.getImm())) {
+ if (isInt<16>(SignExtend64(Src.getImm(), 32))) {
IsUnsigned = false;
return !TII->isInlineConstant(Src);
}
@@ -212,6 +211,9 @@ void SIShrinkInstructions::copyExtraImplicitOps(MachineInstr &NewMI,
}
void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
+ if (!ST->hasSCmpK())
+ return;
+
// cmpk instructions do scc = dst <cc op> imm16, so commute the instruction to
// get constants on the RHS.
if (!MI.getOperand(0).isReg())
@@ -222,7 +224,7 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
if (!Src0.isReg())
return;
- const MachineOperand &Src1 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(1);
if (!Src1.isImm())
return;
@@ -238,6 +240,7 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
if (!HasUImm) {
SOPKOpc = (SOPKOpc == AMDGPU::S_CMPK_EQ_U32) ?
AMDGPU::S_CMPK_EQ_I32 : AMDGPU::S_CMPK_LG_I32;
+ Src1.setImm(SignExtend32(Src1.getImm(), 32));
}
MI.setDesc(TII->get(SOPKOpc));
@@ -250,6 +253,8 @@ void SIShrinkInstructions::shrinkScalarCompare(MachineInstr &MI) const {
if ((TII->sopkIsZext(SOPKOpc) && isKUImmOperand(Src1)) ||
(!TII->sopkIsZext(SOPKOpc) && isKImmOperand(Src1))) {
+ if (!TII->sopkIsZext(SOPKOpc))
+ Src1.setImm(SignExtend64(Src1.getImm(), 32));
MI.setDesc(NewDesc);
}
}
@@ -839,6 +844,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_I32) ?
AMDGPU::S_ADDK_I32 : AMDGPU::S_MULK_I32;
+ Src1->setImm(SignExtend64(Src1->getImm(), 32));
MI.setDesc(TII->get(Opc));
MI.tieOperands(0, 1);
}
@@ -858,9 +864,10 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
if (Src.isImm() && Dst.getReg().isPhysical()) {
int32_t ReverseImm;
- if (isKImmOperand(Src))
+ if (isKImmOperand(Src)) {
MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
- else if (isReverseInlineImm(Src, ReverseImm)) {
+ Src.setImm(SignExtend64(Src.getImm(), 32));
+ } else if (isReverseInlineImm(Src, ReverseImm)) {
MI.setDesc(TII->get(AMDGPU::S_BREV_B32));
Src.setImm(ReverseImm);
}
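A standalone sketch of why the shrink pass now sign-extends from bit 31 before its 16-bit checks; the helpers below are local stand-ins mirroring llvm::SignExtend64 and llvm::isInt<16>, not the in-tree code:

#include <cassert>
#include <cstdint>

constexpr int64_t signExtend64(uint64_t X, unsigned B) {
  return static_cast<int64_t>(X << (64 - B)) >> (64 - B);
}
constexpr bool isInt16(int64_t X) { return X >= -32768 && X <= 32767; }

int main() {
  // -32768 stored as a zero-extended 32-bit immediate in the MachineOperand.
  const uint64_t RawImm = 0xFFFF8000u;
  assert(!isInt16(static_cast<int64_t>(RawImm))); // raw check would reject it
  assert(isInt16(signExtend64(RawImm, 32)));      // sign-extended: fits s_movk_i32
  return 0;
}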
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 3143d437e370..59d6ccf513bb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -238,9 +238,7 @@ public:
AU.addRequired<LiveIntervals>();
AU.addPreserved<SlotIndexes>();
AU.addPreserved<LiveIntervals>();
- AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
- AU.addRequired<MachinePostDominatorTree>();
AU.addPreserved<MachinePostDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -1320,7 +1318,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
auto II = MBB.getFirstNonPHI(), IE = MBB.end();
if (IsEntry) {
// Skip the instruction that saves LiveMask
- if (II != IE && II->getOpcode() == AMDGPU::COPY)
+ if (II != IE && II->getOpcode() == AMDGPU::COPY &&
+ II->getOperand(1).getReg() == TRI->getExec())
++II;
}
@@ -1594,8 +1593,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
- MDT = &getAnalysis<MachineDominatorTree>();
- PDT = &getAnalysis<MachinePostDominatorTree>();
+ MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ PDT = getAnalysisIfAvailable<MachinePostDominatorTree>();
if (ST->isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
index 7ca685a0cc5d..3297847b0360 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -74,7 +74,7 @@ class SM_Real <SM_Pseudo ps, string opName = ps.Mnemonic>
bits<7> sdst;
bits<32> offset;
bits<8> soffset;
- bits<5> cpol;
+ bits<5> cpol;
}
class OffsetMode<bit hasOffset, bit hasSOffset, string variant,
@@ -211,6 +211,23 @@ class SM_WaveId_Pseudo<string opName, SDPatternOperator node> : SM_Pseudo<
let has_sbase = 0;
}
+class SM_Prefetch_Pseudo <string opName, RegisterClass baseClass, bit hasSBase>
+ : SM_Pseudo<opName, (outs), !con(!if(hasSBase, (ins baseClass:$sbase), (ins)),
+ (ins smem_offset:$offset, SReg_32:$soffset, i8imm:$sdata)),
+ !if(hasSBase, " $sbase,", "") # " $offset, $soffset, $sdata"> {
+ // Mark prefetches as both load and store to prevent reordering with loads
+ // and stores. This is also needed for the pattern to match the prefetch intrinsic.
+ let mayLoad = 1;
+ let mayStore = 1;
+ let has_glc = 0;
+ let LGKM_CNT = 0;
+ let has_sbase = hasSBase;
+ let ScalarStore = 0;
+ let has_offset = 1;
+ let has_soffset = 1;
+ let PseudoInstr = opName;
+}
+
//===----------------------------------------------------------------------===//
// Scalar Atomic Memory Classes
//===----------------------------------------------------------------------===//
@@ -234,8 +251,6 @@ class SM_Atomic_Pseudo <string opName,
let IsAtomicNoRet = !not(isRet);
let IsAtomicRet = isRet;
-
- let AsmMatchConverter = "cvtSMEMAtomic";
}
class SM_Pseudo_Atomic<string opName,
@@ -245,7 +260,7 @@ class SM_Pseudo_Atomic<string opName,
bit isRet,
string opNameWithSuffix =
opName # offsets.Variant # !if(isRet, "_RTN", ""),
- Operand CPolTy = !if(isRet, CPol_GLC1, CPol)> :
+ Operand CPolTy = !if(isRet, CPol_GLC, CPol_NonGLC)> :
SM_Atomic_Pseudo<opName,
!if(isRet, (outs dataClass:$sdst), (outs)),
!con((ins dataClass:$sdata, baseClass:$sbase), offsets.Ins,
@@ -285,6 +300,8 @@ multiclass SM_Pseudo_Atomics<RegisterClass baseClass,
// does sdst for SMRD on SI/CI?
defm S_LOAD_DWORD : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_64, SReg_64_XEXEC>;
+let SubtargetPredicate = HasScalarDwordx3Loads in
+ defm S_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_64, SReg_96>;
defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_64, SReg_128>;
defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_64, SReg_256>;
defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_64, SReg_512>;
@@ -294,6 +311,8 @@ defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
// FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on
// SI/CI, but disallowed for SMEM on VI.
defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128, SReg_64_XEXEC>;
+let SubtargetPredicate = HasScalarDwordx3Loads in
+ defm S_BUFFER_LOAD_DWORDX3 : SM_Pseudo_Loads <SReg_128, SReg_96>;
defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>;
defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>;
defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>;
@@ -417,6 +436,16 @@ defm S_DCACHE_DISCARD : SM_Pseudo_Discards;
defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards;
}
+let SubtargetPredicate = isGFX12Plus in {
+def S_PREFETCH_INST : SM_Prefetch_Pseudo <"s_prefetch_inst", SReg_64, 1>;
+def S_PREFETCH_INST_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_inst_pc_rel", SReg_64, 0>;
+def S_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_prefetch_data", SReg_64, 1>;
+def S_PREFETCH_DATA_PC_REL : SM_Prefetch_Pseudo <"s_prefetch_data_pc_rel", SReg_64, 0>;
+def S_BUFFER_PREFETCH_DATA : SM_Prefetch_Pseudo <"s_buffer_prefetch_data", SReg_128, 1> {
+ let is_buffer = 1;
+}
+} // end let SubtargetPredicate = isGFX12Plus
+
//===----------------------------------------------------------------------===//
// Targets
//===----------------------------------------------------------------------===//
@@ -789,6 +818,14 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL
}];
}
+def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
+ (prefetch node:$ptr, node:$rw, node:$loc, node:$type),
+ [{ return !N->getOperand(1)->isDivergent();}]> {
+ let GISelPredicateCode = [{
+ return isInstrUniform(MI);
+ }];
+}
+
def SMRDImm : ComplexPattern<iPTR, 2, "SelectSMRDImm">;
def SMRDImm32 : ComplexPattern<iPTR, 2, "SelectSMRDImm32">;
def SMRDSgpr : ComplexPattern<iPTR, 2, "SelectSMRDSgpr">;
@@ -797,7 +834,7 @@ def SMRDBufferImm : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm">;
def SMRDBufferImm32 : ComplexPattern<iPTR, 1, "SelectSMRDBufferImm32">;
def SMRDBufferSgprImm : ComplexPattern<iPTR, 2, "SelectSMRDBufferSgprImm">;
-multiclass SMRD_Pattern <string Instr, ValueType vt> {
+multiclass SMRD_Pattern <string Instr, ValueType vt, bit immci = true> {
// 1. IMM offset
def : GCNPat <
@@ -806,7 +843,7 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
>;
// 2. 32-bit IMM offset on CI
- def : GCNPat <
+ if immci then def : GCNPat <
(smrd_load (SMRDImm32 i64:$sbase, i32:$offset)),
(vt (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, 0))> {
let OtherPredicates = [isGFX7Only];
@@ -838,7 +875,7 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
>;
}
-multiclass SMLoad_Pattern <string Instr, ValueType vt> {
+multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
// 1. Offset as an immediate
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
@@ -847,7 +884,7 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> {
}
// 2. 32-bit IMM offset on CI
- def : GCNPat <
+ if immci then def : GCNPat <
(vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)),
(!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset,
(extract_cpol $cachepolicy))> {
@@ -890,6 +927,10 @@ foreach vt = SReg_64.RegTypes in {
defm : SMRD_Pattern <"S_LOAD_DWORDX2", vt>;
}
+foreach vt = SReg_96.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX3", vt, false>;
+}
+
foreach vt = SReg_128.RegTypes in {
defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>;
}
@@ -906,12 +947,14 @@ defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX3", v3i32, false>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>;
+defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX3", v3f32, false>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
@@ -934,6 +977,21 @@ def : GCNPat <
}
} // let OtherPredicates = [HasShaderCyclesRegister]
+multiclass SMPrefetchPat<string type, int cache_type> {
+ def : GCNPat <
+ (smrd_prefetch (SMRDImm i64:$sbase, i32:$offset), timm, timm, (i32 cache_type)),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, $offset, (i32 SGPR_NULL), (i8 0))
+ >;
+
+ def : GCNPat <
+ (smrd_prefetch (i64 SReg_64:$sbase), timm, timm, (i32 cache_type)),
+ (!cast<SM_Prefetch_Pseudo>("S_PREFETCH_"#type) $sbase, 0, (i32 SGPR_NULL), (i8 0))
+ >;
+}
+
+defm : SMPrefetchPat<"INST", 0>;
+defm : SMPrefetchPat<"DATA", 1>;
+
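A hedged example of source that can reach these patterns: __builtin_prefetch lowers to llvm.prefetch with cache-type operand 1 (data), which SMPrefetchPat<"DATA", 1> matches, and the smrd_prefetch predicate additionally requires the address to be uniform before the scalar form is selected:

// Plain C++; whether s_prefetch_data is actually chosen depends on targeting
// GFX12 and on divergence analysis proving `p` uniform (e.g. a kernel argument).
void warm_line(const int *p) {
  __builtin_prefetch(p, /*rw=*/0, /*locality=*/3);
}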
//===----------------------------------------------------------------------===//
// GFX10.
//===----------------------------------------------------------------------===//
@@ -1154,7 +1212,7 @@ def SMInfoTable : GenericTable {
class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
SMEM_Real_10Plus_common<op, ps, opName, SIEncodingFamily.GFX11,
SGPR_NULL_gfx11plus> {
- let AssemblerPredicate = isGFX11Plus;
+ let AssemblerPredicate = isGFX11Only;
let DecoderNamespace = "GFX11";
let Inst{13} = !if(ps.has_dlc, cpol{CPolBit.DLC}, 0);
let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
@@ -1205,3 +1263,84 @@ multiclass SM_Real_Probe_gfx11<bits<8> op> {
defm S_ATC_PROBE : SM_Real_Probe_gfx11 <0x22>;
defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23>;
+
+//===----------------------------------------------------------------------===//
+// GFX12.
+//===----------------------------------------------------------------------===//
+
+class SMEM_Real_gfx12Plus<bits<6> op, SM_Pseudo ps, string opName,
+ int subtarget, RegisterWithSubRegs sgpr_null> :
+ SM_Real<ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>, Enc64 {
+
+ let Inst{18-13} = op;
+ let Inst{31-26} = 0x3d;
+
+ let Inst{55-32} = !if(ps.has_offset, offset{23-0}, !if(ps.has_soffset, 0, ?));
+ let Inst{63-57} = !if(ps.has_soffset, soffset{6-0},
+ !if(ps.has_offset, sgpr_null.HWEncoding{6-0}, ?));
+}
+
+class SMEM_Real_gfx12<bits<6> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
+ SMEM_Real_gfx12Plus<op, ps, opName, SIEncodingFamily.GFX12,
+ SGPR_NULL_gfx11plus> {
+ let AssemblerPredicate = isGFX12Plus;
+ let DecoderNamespace = "GFX12";
+
+ let Inst{5-0} = !if(ps.has_sbase, sbase{6-1}, ?);
+ let Inst{12-6} = !if(ps.has_sdst, sdst{6-0}, ?);
+}
+
+class SMEM_Real_Prefetch_gfx12<bits<6> op, SM_Pseudo ps> :
+ SMEM_Real_gfx12<op, ps> {
+ bits<7> sdata; // Only 5 bits of sdata are supported.
+
+ let sdst = ?;
+ let Inst{12-11} = 0; // Unused sdata bits.
+ let Inst{10-6} = !if(ps.has_sdst, sdata{4-0}, ?);
+}
+
+class SMEM_Real_Load_gfx12<bits<6> op, string ps, string opName, OffsetMode offsets> :
+ SMEM_Real_gfx12<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> {
+ RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
+ let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
+
+ let Inst{22-21} = cpol{4-3}; // scope
+ let Inst{24-23} = cpol{1-0}; // th - only lower 2 bits are supported
+}
+
+multiclass SM_Real_Loads_gfx12<bits<6> op, string ps = NAME> {
+ defvar opName = !tolower(NAME);
+ def _IMM_gfx12 : SMEM_Real_Load_gfx12<op, ps, opName, IMM_Offset>;
+ def _SGPR_IMM_gfx12 : SMEM_Real_Load_gfx12<op, ps, opName, SGPR_IMM_Offset>;
+}
+
+defm S_LOAD_B32 : SM_Real_Loads_gfx12<0x00, "S_LOAD_DWORD">;
+defm S_LOAD_B64 : SM_Real_Loads_gfx12<0x01, "S_LOAD_DWORDX2">;
+defm S_LOAD_B96 : SM_Real_Loads_gfx12<0x05, "S_LOAD_DWORDX3">;
+defm S_LOAD_B128 : SM_Real_Loads_gfx12<0x02, "S_LOAD_DWORDX4">;
+defm S_LOAD_B256 : SM_Real_Loads_gfx12<0x03, "S_LOAD_DWORDX8">;
+defm S_LOAD_B512 : SM_Real_Loads_gfx12<0x04, "S_LOAD_DWORDX16">;
+
+defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx12<0x10, "S_BUFFER_LOAD_DWORD">;
+defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx12<0x11, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_B96 : SM_Real_Loads_gfx12<0x15, "S_BUFFER_LOAD_DWORDX3">;
+defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx12<0x12, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx12<0x13, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx12<0x14, "S_BUFFER_LOAD_DWORDX16">;
+
+def S_DCACHE_INV_gfx12 : SMEM_Real_gfx12<0x021, S_DCACHE_INV>;
+
+def S_PREFETCH_INST_gfx12 : SMEM_Real_Prefetch_gfx12<0x24, S_PREFETCH_INST>;
+def S_PREFETCH_INST_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x25, S_PREFETCH_INST_PC_REL>;
+def S_PREFETCH_DATA_gfx12 : SMEM_Real_Prefetch_gfx12<0x26, S_PREFETCH_DATA>;
+def S_BUFFER_PREFETCH_DATA_gfx12 : SMEM_Real_Prefetch_gfx12<0x27, S_BUFFER_PREFETCH_DATA>;
+def S_PREFETCH_DATA_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x28, S_PREFETCH_DATA_PC_REL>;
+
+multiclass SMEM_Real_Probe_gfx12<bits<6> op> {
+ defvar ps = NAME;
+ def _IMM_gfx12 : SMEM_Real_Prefetch_gfx12<op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
+ def _SGPR_IMM_gfx12 : SMEM_Real_Prefetch_gfx12<op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>;
+}
+
+defm S_ATC_PROBE : SMEM_Real_Probe_gfx12<0x22>;
+defm S_ATC_PROBE_BUFFER : SMEM_Real_Probe_gfx12<0x23>;
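A minimal sketch, not the MC encoder, of the 64-bit GFX12 SMEM load layout spelled out by SMEM_Real_gfx12Plus, SMEM_Real_gfx12 and SMEM_Real_Load_gfx12 above; the function and parameter names are illustrative:

#include <cstdint>

constexpr uint64_t encodeSMEMLoadGfx12(unsigned Op, unsigned SBase,
                                       unsigned SDst, uint32_t Offset,
                                       unsigned SOffset, unsigned CPol) {
  uint64_t Inst = 0;
  Inst |= uint64_t(SBase >> 1) & 0x3F;         // Inst{5-0}   = sbase{6-1}
  Inst |= (uint64_t(SDst) & 0x7F) << 6;        // Inst{12-6}  = sdst
  Inst |= (uint64_t(Op) & 0x3F) << 13;         // Inst{18-13} = op
  Inst |= (uint64_t(CPol >> 3) & 0x3) << 21;   // Inst{22-21} = cpol scope
  Inst |= (uint64_t(CPol) & 0x3) << 23;        // Inst{24-23} = cpol th (low 2 bits)
  Inst |= uint64_t(0x3D) << 26;                // Inst{31-26} = SMEM encoding
  Inst |= (uint64_t(Offset) & 0xFFFFFF) << 32; // Inst{55-32} = offset
  Inst |= (uint64_t(SOffset) & 0x7F) << 57;    // Inst{63-57} = soffset
  return Inst;
}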
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
index bee996d1b0df..c9687ac368d3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -15,6 +15,7 @@ class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps,
let isPseudo = 1;
let isCodeGenOnly = 1;
+ let Size = 4;
string Mnemonic = opName;
string AsmOperands = asmOps;
@@ -36,7 +37,6 @@ class SOP1_Pseudo <string opName, dag outs, dag ins,
let SALU = 1;
let SOP1 = 1;
let SchedRW = [WriteSALU];
- let Size = 4;
let UseNamedOperandTable = 1;
bits<1> has_src0 = 1;
@@ -216,8 +216,10 @@ let Defs = [SCC] in {
def S_NOT_B64 : SOP1_64 <"s_not_b64",
[(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))]
>;
- def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
- def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
+ def S_WQM_B32 : SOP1_32 <"s_wqm_b32",
+ [(set i32:$sdst, (int_amdgcn_s_wqm i32:$src0))]>;
+ def S_WQM_B64 : SOP1_64 <"s_wqm_b64",
+ [(set i64:$sdst, (int_amdgcn_s_wqm i64:$src0))]>;
} // End Defs = [SCC]
@@ -290,6 +292,7 @@ def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>;
def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>;
def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>;
+let isReMaterializable = 1 in
def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64",
[(set i64:$sdst, (int_amdgcn_s_getpc))]
>;
@@ -326,8 +329,10 @@ def S_XNOR_SAVEEXEC_B64 : SOP1_64 <"s_xnor_saveexec_b64">;
} // End hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC, SCC]
-def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32">;
-def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64">;
+def S_QUADMASK_B32 : SOP1_32 <"s_quadmask_b32",
+ [(set i32:$sdst, (int_amdgcn_s_quadmask i32:$src0))]>;
+def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64",
+ [(set i64:$sdst, (int_amdgcn_s_quadmask i64:$src0))]>;
let Uses = [M0] in {
def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">;
@@ -362,7 +367,8 @@ let SubtargetPredicate = isGFX9Plus in {
} // End hasSideEffects = 1, Defs = [EXEC, SCC], Uses = [EXEC]
let isReMaterializable = 1 in
- def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32">;
+ def S_BITREPLICATE_B64_B32 : SOP1_64_32<"s_bitreplicate_b64_b32",
+ [(set i64:$sdst, (int_amdgcn_s_bitreplicate i32:$src0))]>;
} // End SubtargetPredicate = isGFX9Plus
let SubtargetPredicate = isGFX10Plus in {
@@ -401,6 +407,120 @@ let SubtargetPredicate = isGFX11Plus in {
}
} // End SubtargetPredicate = isGFX11Plus
+class SOP1_F32_Inst<string opName, SDPatternOperator Op, ValueType vt0=f32,
+ ValueType vt1=vt0> :
+ SOP1_32<opName, [(set vt0:$sdst, (UniformUnaryFrag<Op> vt1:$src0))]>;
+
+let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
+ SchedRW = [WriteSFPU], isReMaterializable = 1 in {
+ def S_CVT_F32_I32 : SOP1_F32_Inst<"s_cvt_f32_i32", sint_to_fp, f32, i32>;
+ def S_CVT_F32_U32 : SOP1_F32_Inst<"s_cvt_f32_u32", uint_to_fp, f32, i32>;
+
+ let mayRaiseFPException = 1 in {
+ def S_CVT_I32_F32 : SOP1_F32_Inst<"s_cvt_i32_f32", fp_to_sint, i32, f32>;
+ def S_CVT_U32_F32 : SOP1_F32_Inst<"s_cvt_u32_f32", fp_to_uint, i32, f32>;
+ def S_CVT_F32_F16 : SOP1_F32_Inst<"s_cvt_f32_f16", fpextend, f32, f16>;
+ def S_CVT_HI_F32_F16 : SOP1_32<"s_cvt_hi_f32_f16">;
+
+ def S_CEIL_F32 : SOP1_F32_Inst<"s_ceil_f32", fceil>;
+ def S_FLOOR_F32 : SOP1_F32_Inst<"s_floor_f32", ffloor>;
+ def S_TRUNC_F32 : SOP1_F32_Inst<"s_trunc_f32", ftrunc>;
+ def S_RNDNE_F32 : SOP1_F32_Inst<"s_rndne_f32", froundeven>;
+
+ let FPDPRounding = 1 in
+ def S_CVT_F16_F32 : SOP1_F32_Inst<"s_cvt_f16_f32", fpround, f16, f32>;
+
+ def S_CEIL_F16 : SOP1_F32_Inst<"s_ceil_f16", fceil, f16>;
+ def S_FLOOR_F16 : SOP1_F32_Inst<"s_floor_f16", ffloor, f16>;
+ def S_TRUNC_F16 : SOP1_F32_Inst<"s_trunc_f16", ftrunc, f16>;
+ def S_RNDNE_F16 : SOP1_F32_Inst<"s_rndne_f16", froundeven, f16>;
+ } // End mayRaiseFPException = 1
+} // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE]
+ // SchedRW = [WriteSFPU], isReMaterializable = 1
+
+let hasSideEffects = 1 in {
+let has_sdst = 0 in {
+let Uses = [M0] in {
+def S_BARRIER_SIGNAL_M0 : SOP1_Pseudo <"s_barrier_signal m0", (outs), (ins),
+ "", [(int_amdgcn_s_barrier_signal_var M0)]>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_Pseudo <"s_barrier_signal_isfirst m0", (outs), (ins),
+ "", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst_var M0))]>{
+ let Defs = [SCC];
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_INIT_M0 : SOP1_Pseudo <"s_barrier_init m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_INIT_IMM : SOP1_Pseudo <"s_barrier_init", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_JOIN_M0 : SOP1_Pseudo <"s_barrier_join m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_WAKEUP_BARRIER_M0 : SOP1_Pseudo <"s_wakeup_barrier m0", (outs), (ins),
+ "", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+} // End Uses = [M0]
+
+def S_BARRIER_SIGNAL_IMM : SOP1_Pseudo <"s_barrier_signal", (outs),
+ (ins SplitBarrier:$src0), "$src0", [(int_amdgcn_s_barrier_signal timm:$src0)]>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Pseudo <"s_barrier_signal_isfirst", (outs),
+ (ins SplitBarrier:$src0), "$src0", [(set SCC, (int_amdgcn_s_barrier_signal_isfirst timm:$src0))]>{
+ let Defs = [SCC];
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_JOIN_IMM : SOP1_Pseudo <"s_barrier_join", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs),
+ (ins SplitBarrier:$src0), "$src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+} // End has_sdst = 0
+
+def S_GET_BARRIER_STATE_IMM : SOP1_Pseudo <"s_get_barrier_state", (outs SSrc_b32:$sdst),
+ (ins SplitBarrier:$src0), "$sdst, $src0", []>{
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_GET_BARRIER_STATE_M0 : SOP1_Pseudo <"s_get_barrier_state $sdst, m0", (outs SSrc_b32:$sdst),
+ (ins), "", []>{
+ let Uses = [M0];
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+} // End hasSideEffects = 1
+
//===----------------------------------------------------------------------===//
// SOP2 Instructions
//===----------------------------------------------------------------------===//
@@ -424,13 +544,11 @@ class SOP2_Pseudo<string opName, dag outs, dag ins,
// let sdst = xxx in {
// for multiclasses that include both real and pseudo instructions.
// field bits<7> sdst = 0;
- // let Size = 4; // Do we need size here?
}
-class SOP2_Real<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> :
+class SOP2_Real<SOP_Pseudo ps, string real_name = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList,
- real_name # ps.AsmOperands>,
- Enc32 {
+ real_name # ps.AsmOperands> {
let SALU = 1;
let SOP2 = 1;
let isPseudo = 0;
@@ -444,12 +562,18 @@ class SOP2_Real<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> :
let SchedRW = ps.SchedRW;
let mayLoad = ps.mayLoad;
let mayStore = ps.mayStore;
+ let Constraints = ps.Constraints;
+ let DisableEncoding = ps.DisableEncoding;
// encoding
bits<7> sdst;
bits<8> src0;
bits<8> src1;
+ bits<32> imm;
+}
+class SOP2_Real32<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> :
+ SOP2_Real<ps, real_name>, Enc32 {
let Inst{7-0} = src0;
let Inst{15-8} = src1;
let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
@@ -457,12 +581,31 @@ class SOP2_Real<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> :
let Inst{31-30} = 0x2; // encoding
}
+class SOP2_Real64<bits<7> op, SOP_Pseudo ps, string real_name = ps.Mnemonic> :
+ SOP2_Real<ps, real_name>, Enc64 {
+ let Inst{7-0} = src0;
+ let Inst{15-8} = src1;
+ let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
+ let Inst{29-23} = op;
+ let Inst{31-30} = 0x2; // encoding
+ let Inst{63-32} = imm;
+}
+
+class SOP2_F16 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
+ opName, (outs SReg_32:$sdst), (ins SSrc_f16:$src0, SSrc_f16:$src1),
+ "$sdst, $src0, $src1", pattern
+>;
class SOP2_32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
opName, (outs SReg_32:$sdst), (ins SSrc_b32:$src0, SSrc_b32:$src1),
"$sdst, $src0, $src1", pattern
>;
+class SOP2_F32 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
+ opName, (outs SReg_32:$sdst), (ins SSrc_f32:$src0, SSrc_f32:$src1),
+ "$sdst, $src0, $src1", pattern
+>;
+
class SOP2_64 <string opName, list<dag> pattern=[]> : SOP2_Pseudo <
opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
"$sdst, $src0, $src1", pattern
@@ -518,19 +661,22 @@ def S_MAX_U32 : SOP2_32 <"s_max_u32",
} // End isCommutable = 1
} // End Defs = [SCC]
-def SelectPat : PatFrag <
- (ops node:$src1, node:$src2),
- (select SCC, $src1, $src2),
- [{ return !N->isDivergent(); }]
->;
+let SubtargetPredicate = isGFX12Plus in {
+ def S_ADD_U64 : SOP2_64<"s_add_u64">{
+ let isCommutable = 1;
+ }
-let Uses = [SCC] in {
- let AddedComplexity = 20 in {
- def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
- [(set i32:$sdst, (SelectPat i32:$src0, i32:$src1))]
- >;
+ def S_SUB_U64 : SOP2_64<"s_sub_u64">;
+
+ def S_MUL_U64 : SOP2_64 <"s_mul_u64",
+ [(set i64:$sdst, (UniformBinFrag<mul> i64:$src0, i64:$src1))]> {
+ let isCommutable = 1;
}
+} // End SubtargetPredicate = isGFX12Plus
+
+let Uses = [SCC] in {
+ def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
} // End Uses = [SCC]
@@ -705,6 +851,83 @@ let SubtargetPredicate = isGFX11Plus in {
def S_PACK_HL_B32_B16 : SOP2_32<"s_pack_hl_b32_b16">;
} // End SubtargetPredicate = isGFX11Plus
+class SOP2_F32_Inst<string opName, SDPatternOperator Op, ValueType dstVt=f32> :
+ SOP2_F32<opName,
+ [(set dstVt:$sdst, (UniformBinFrag<Op> SSrc_f32:$src0, SSrc_f32:$src1))]>;
+
+class SOP2_F16_Inst<string opName, SDPatternOperator Op> :
+ SOP2_F16<opName,
+ [(set f16:$sdst, (UniformBinFrag<Op> SSrc_f16:$src0, SSrc_f16:$src1))]>;
+
+let SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
+ Uses = [MODE], SchedRW = [WriteSFPU] in {
+ let isReMaterializable = 1 in {
+ let isCommutable = 1 in {
+ def S_ADD_F32 : SOP2_F32_Inst<"s_add_f32", any_fadd>;
+ def S_MIN_F32 : SOP2_F32_Inst<"s_min_f32", fminnum_like>;
+ def S_MAX_F32 : SOP2_F32_Inst<"s_max_f32", fmaxnum_like>;
+ def S_MUL_F32 : SOP2_F32_Inst<"s_mul_f32", any_fmul>;
+
+ let FixedSize = 1 in
+ def S_FMAAK_F32 : SOP2_Pseudo<
+ "s_fmaak_f32", (outs SReg_32:$sdst),
+ (ins SSrc_f32_Deferred:$src0, SSrc_f32_Deferred:$src1, KImmFP32:$imm),
+ "$sdst, $src0, $src1, $imm"
+ >;
+
+ let FPDPRounding = 1 in {
+ def S_ADD_F16 : SOP2_F16_Inst<"s_add_f16", any_fadd>;
+ def S_MUL_F16 : SOP2_F16_Inst<"s_mul_f16", any_fmul>;
+ } // End FPDPRounding
+
+ def S_MIN_F16 : SOP2_F16_Inst<"s_min_f16", fminnum_like>;
+ def S_MAX_F16 : SOP2_F16_Inst<"s_max_f16", fmaxnum_like>;
+ } // End isCommutable = 1
+
+ let FPDPRounding = 1 in
+ def S_SUB_F16 : SOP2_F16_Inst<"s_sub_f16", any_fsub>;
+
+ def S_SUB_F32 : SOP2_F32_Inst<"s_sub_f32", any_fsub>;
+ def S_CVT_PK_RTZ_F16_F32 : SOP2_F32_Inst<"s_cvt_pk_rtz_f16_f32",
+ AMDGPUpkrtz_f16_f32, v2f16>;
+
+ let FixedSize = 1 in
+ def S_FMAMK_F32 : SOP2_Pseudo<
+ "s_fmamk_f32", (outs SReg_32:$sdst),
+ (ins SSrc_f32_Deferred:$src0, KImmFP32:$imm, SSrc_f32_Deferred:$src1),
+ "$sdst, $src0, $imm, $src1"
+ >;
+ } // End isReMaterializable = 1
+
+ let Constraints = "$sdst = $src2", DisableEncoding="$src2",
+ isCommutable = 1, AddedComplexity = 20 in {
+ def S_FMAC_F32 : SOP2_Pseudo<
+ "s_fmac_f32", (outs SReg_32:$sdst),
+ (ins SSrc_f32:$src0, SSrc_f32:$src1, SReg_32:$src2),
+ "$sdst, $src0, $src1",
+ [(set f32:$sdst, (UniformTernaryFrag<any_fma> SSrc_f32:$src0, SSrc_f32:$src1, SReg_32:$src2))]
+ >;
+
+ def S_FMAC_F16 : SOP2_Pseudo<
+ "s_fmac_f16", (outs SReg_32:$sdst),
+ (ins SSrc_f16:$src0, SSrc_f16:$src1, SReg_32:$src2),
+ "$sdst, $src0, $src1",
+ [(set f16:$sdst, (UniformTernaryFrag<any_fma> SSrc_f16:$src0, SSrc_f16:$src1, SReg_32:$src2))]
+ >;
+ } // End Constraints = "$sdst = $src2", DisableEncoding="$src2",
+ // isCommutable = 1, AddedComplexity = 20
+} // End SubtargetPredicate = HasSALUFloatInsts, mayRaiseFPException = 1,
+ // Uses = [MODE], SchedRW = [WriteSFPU]
+
+// On GFX12 MIN/MAX instructions do not read the MODE register.
+let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 1, isCommutable = 1,
+ isReMaterializable = 1, SchedRW = [WriteSFPU] in {
+ def S_MINIMUM_F32 : SOP2_F32_Inst<"s_minimum_f32", fminimum>;
+ def S_MAXIMUM_F32 : SOP2_F32_Inst<"s_maximum_f32", fmaximum>;
+ def S_MINIMUM_F16 : SOP2_F16_Inst<"s_minimum_f16", fminimum>;
+ def S_MAXIMUM_F16 : SOP2_F16_Inst<"s_maximum_f16", fmaximum>;
+}
+
//===----------------------------------------------------------------------===//
// SOPK Instructions
//===----------------------------------------------------------------------===//
@@ -724,9 +947,9 @@ class SOPK_Pseudo <string opName, dag outs, dag ins,
let has_sdst = 1;
}
-class SOPK_Real<SOPK_Pseudo ps> :
+class SOPK_Real<SOPK_Pseudo ps, string real_name = ps.Mnemonic> :
InstSI <ps.OutOperandList, ps.InOperandList,
- ps.Mnemonic # ps.AsmOperands> {
+ real_name # ps.AsmOperands> {
let SALU = 1;
let SOPK = 1;
let isPseudo = 0;
@@ -750,8 +973,8 @@ class SOPK_Real<SOPK_Pseudo ps> :
bits<32> imm;
}
-class SOPK_Real32<bits<5> op, SOPK_Pseudo ps> :
- SOPK_Real <ps>,
+class SOPK_Real32<bits<5> op, SOPK_Pseudo ps, string real_name = ps.Mnemonic> :
+ SOPK_Real <ps, real_name>,
Enc32 {
let Inst{15-0} = simm16;
let Inst{22-16} = !if(ps.has_sdst, sdst, ?);
@@ -870,6 +1093,8 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
// This is hasSideEffects to allow its use in readcyclecounter selection.
// FIXME: Need to truncate immediate to 16-bits.
+// FIXME: Missing mode register use. Should have separate pseudos for
+// known may read MODE and only read MODE.
def S_GETREG_B32 : SOPK_Pseudo <
"s_getreg_b32",
(outs SReg_32:$sdst), (ins hwreg:$simm16),
@@ -956,10 +1181,14 @@ let SubtargetPredicate = isGFX10Plus in {
"$simm16"> {
let has_sdst = 0;
}
+} // End SubtargetPredicate = isGFX10Plus
+let SubtargetPredicate = isGFX10GFX11 in {
def S_SUBVECTOR_LOOP_BEGIN : SOPK_32_BR<"s_subvector_loop_begin">;
def S_SUBVECTOR_LOOP_END : SOPK_32_BR<"s_subvector_loop_end">;
+} // End SubtargetPredicate = isGFX10GFX11
+let SubtargetPredicate = isGFX10Plus in {
def S_WAITCNT_VSCNT : SOPK_WAITCNT<"s_waitcnt_vscnt">;
def S_WAITCNT_VMCNT : SOPK_WAITCNT<"s_waitcnt_vmcnt">;
def S_WAITCNT_EXPCNT : SOPK_WAITCNT<"s_waitcnt_expcnt">;
@@ -1033,6 +1262,30 @@ class SOPC_CMP_32<string opName,
let isCommutable = 1;
}
+class SOPC_CMP_F32<string opName,
+ SDPatternOperator cond = COND_NULL, string revOp = opName>
+ : SOPC_Helper<SSrc_b32, f32, opName, cond>,
+ Commutable_REV<revOp, !eq(revOp, opName)>,
+ SOPKInstTable<0, opName> {
+ let isCompare = 1;
+ let isCommutable = 1;
+ let mayRaiseFPException = 1;
+ let Uses = [MODE];
+ let SchedRW = [WriteSFPU];
+}
+
+class SOPC_CMP_F16<string opName,
+ SDPatternOperator cond = COND_NULL, string revOp = opName>
+ : SOPC_Helper<SSrc_b16, f16, opName, cond>,
+ Commutable_REV<revOp, !eq(revOp, opName)>,
+ SOPKInstTable<0, opName> {
+ let isCompare = 1;
+ let isCommutable = 1;
+ let mayRaiseFPException = 1;
+ let Uses = [MODE];
+ let SchedRW = [WriteSFPU];
+}
+
class SOPC_CMP_64<string opName,
SDPatternOperator cond = COND_NULL, string revOp = opName>
: SOPC_Helper<SSrc_b64, i64, opName, cond>,
@@ -1089,6 +1342,40 @@ def S_SET_GPR_IDX_ON : SOPC_Pseudo <
}
}
+let SubtargetPredicate = HasSALUFloatInsts in {
+
+def S_CMP_LT_F32 : SOPC_CMP_F32<"s_cmp_lt_f32", COND_OLT, "s_cmp_gt_f32">;
+def S_CMP_EQ_F32 : SOPC_CMP_F32<"s_cmp_eq_f32", COND_OEQ>;
+def S_CMP_LE_F32 : SOPC_CMP_F32<"s_cmp_le_f32", COND_OLE, "s_cmp_ge_f32">;
+def S_CMP_GT_F32 : SOPC_CMP_F32<"s_cmp_gt_f32", COND_OGT>;
+def S_CMP_LG_F32 : SOPC_CMP_F32<"s_cmp_lg_f32", COND_ONE>;
+def S_CMP_GE_F32 : SOPC_CMP_F32<"s_cmp_ge_f32", COND_OGE>;
+def S_CMP_O_F32 : SOPC_CMP_F32<"s_cmp_o_f32", COND_O>;
+def S_CMP_U_F32 : SOPC_CMP_F32<"s_cmp_u_f32", COND_UO>;
+def S_CMP_NGE_F32 : SOPC_CMP_F32<"s_cmp_nge_f32", COND_ULT, "s_cmp_nle_f32">;
+def S_CMP_NLG_F32 : SOPC_CMP_F32<"s_cmp_nlg_f32", COND_UEQ>;
+def S_CMP_NGT_F32 : SOPC_CMP_F32<"s_cmp_ngt_f32", COND_ULE, "s_cmp_nlt_f32">;
+def S_CMP_NLE_F32 : SOPC_CMP_F32<"s_cmp_nle_f32", COND_UGT>;
+def S_CMP_NEQ_F32 : SOPC_CMP_F32<"s_cmp_neq_f32", COND_UNE>;
+def S_CMP_NLT_F32 : SOPC_CMP_F32<"s_cmp_nlt_f32", COND_UGE>;
+
+def S_CMP_LT_F16 : SOPC_CMP_F16<"s_cmp_lt_f16", COND_OLT, "s_cmp_gt_f16">;
+def S_CMP_EQ_F16 : SOPC_CMP_F16<"s_cmp_eq_f16", COND_OEQ>;
+def S_CMP_LE_F16 : SOPC_CMP_F16<"s_cmp_le_f16", COND_OLE, "s_cmp_ge_f16">;
+def S_CMP_GT_F16 : SOPC_CMP_F16<"s_cmp_gt_f16", COND_OGT>;
+def S_CMP_LG_F16 : SOPC_CMP_F16<"s_cmp_lg_f16", COND_ONE>;
+def S_CMP_GE_F16 : SOPC_CMP_F16<"s_cmp_ge_f16", COND_OGE>;
+def S_CMP_O_F16 : SOPC_CMP_F16<"s_cmp_o_f16", COND_O>;
+def S_CMP_U_F16 : SOPC_CMP_F16<"s_cmp_u_f16", COND_UO>;
+def S_CMP_NGE_F16 : SOPC_CMP_F16<"s_cmp_nge_f16", COND_ULT, "s_cmp_nle_f16">;
+def S_CMP_NLG_F16 : SOPC_CMP_F16<"s_cmp_nlg_f16", COND_UEQ>;
+def S_CMP_NGT_F16 : SOPC_CMP_F16<"s_cmp_ngt_f16", COND_ULE, "s_cmp_nlt_f16">;
+def S_CMP_NLE_F16 : SOPC_CMP_F16<"s_cmp_nle_f16", COND_UGT>;
+def S_CMP_NEQ_F16 : SOPC_CMP_F16<"s_cmp_neq_f16", COND_UNE>;
+def S_CMP_NLT_F16 : SOPC_CMP_F16<"s_cmp_nlt_f16", COND_UGE>;
+
+} // End SubtargetPredicate = HasSALUFloatInsts
+
//===----------------------------------------------------------------------===//
// SOPP Instructions
//===----------------------------------------------------------------------===//
@@ -1161,7 +1448,10 @@ multiclass SOPP_With_Relaxation <string opName, dag ins,
def _pad_s_nop : SOPP_Pseudo <opName # "_pad_s_nop", ins, asmOps, pattern, " ", opName>;
}
-def S_NOP : SOPP_Pseudo<"s_nop" , (ins i16imm:$simm16), "$simm16">;
+def S_NOP : SOPP_Pseudo<"s_nop" , (ins i16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_nop timm:$simm16)]> {
+ let hasSideEffects = 1;
+}
let isTerminator = 1 in {
def S_ENDPGM : SOPP_Pseudo<"s_endpgm", (ins Endpgm:$simm16), "$simm16", [], ""> {
@@ -1264,6 +1554,21 @@ def S_BARRIER : SOPP_Pseudo <"s_barrier", (ins), "",
let isConvergent = 1;
}
+def S_BARRIER_WAIT : SOPP_Pseudo <"s_barrier_wait", (ins i16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_barrier_wait timm:$simm16)]> {
+ let SchedRW = [WriteBarrier];
+ let isConvergent = 1;
+}
+
+def S_BARRIER_LEAVE : SOPP_Pseudo <"s_barrier_leave", (ins), "",
+ [(set SCC, (int_amdgcn_s_barrier_leave))]> {
+ let SchedRW = [WriteBarrier];
+ let simm16 = 0;
+ let fixed_imm = 1;
+ let isConvergent = 1;
+ let Defs = [SCC];
+}
+
def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
let SubtargetPredicate = isGFX8Plus;
let simm16 = 0;
@@ -1272,9 +1577,19 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
let mayStore = 1;
}
-let hasSideEffects = 1 in
def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
+
+// "_soft" waitcnts are waitcnts that are either relaxed into their non-soft
+// counterpart, or completely removed.
+//
+// These are inserted by SIMemoryLegalizer to resolve memory dependencies
+// and later optimized by SIInsertWaitcnts.
+// For example, an S_WAITCNT_soft 0 can be completely removed in a function
+// that doesn't access memory.
+def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
+def S_WAITCNT_VSCNT_soft : SOPK_WAITCNT<"s_soft_waitcnt_vscnt">;
+
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
@@ -1285,23 +1600,23 @@ def S_SETKILL : SOPP_Pseudo <"s_setkill" , (ins i16imm:$simm16), "$simm16">;
// maximum really 15 on VI?
def S_SLEEP : SOPP_Pseudo <"s_sleep", (ins i32imm:$simm16),
"$simm16", [(int_amdgcn_s_sleep timm:$simm16)]> {
+}
+
+def S_SLEEP_VAR : SOP1_0_32 <"s_sleep_var", [(int_amdgcn_s_sleep_var SSrc_b32:$src0)]> {
let hasSideEffects = 1;
}
def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16",
[(int_amdgcn_s_setprio timm:$simm16)]> {
- let hasSideEffects = 1;
}
let Uses = [EXEC, M0] in {
def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsg:$simm16), "$simm16",
[(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> {
- let hasSideEffects = 1;
}
def S_SENDMSGHALT : SOPP_Pseudo <"s_sendmsghalt" , (ins SendMsg:$simm16), "$simm16",
[(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]> {
- let hasSideEffects = 1;
}
} // End Uses = [EXEC, M0]
@@ -1316,13 +1631,14 @@ def S_ICACHE_INV : SOPP_Pseudo <"s_icache_inv", (ins)> {
}
def S_INCPERFLEVEL : SOPP_Pseudo <"s_incperflevel", (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_incperflevel timm:$simm16)]> {
- let hasSideEffects = 1;
}
def S_DECPERFLEVEL : SOPP_Pseudo <"s_decperflevel", (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_decperflevel timm:$simm16)]> {
- let hasSideEffects = 1;
}
-def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins)> {
+
+let Uses = [M0] in
+def S_TTRACEDATA : SOPP_Pseudo <"s_ttracedata", (ins), "",
+ [(int_amdgcn_s_ttracedata M0)]> {
let simm16 = 0;
let fixed_imm = 1;
}
@@ -1366,8 +1682,10 @@ let SubtargetPredicate = isGFX10Plus in {
[(SIdenorm_mode (i32 timm:$simm16))]>;
}
+ let hasSideEffects = 1 in
def S_TTRACEDATA_IMM :
- SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16">;
+ SOPP_Pseudo<"s_ttracedata_imm", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_ttracedata_imm timm:$simm16)]>;
} // End SubtargetPredicate = isGFX10Plus
let SubtargetPredicate = isGFX11Plus in {
@@ -1379,6 +1697,11 @@ let SubtargetPredicate = isGFX11Plus in {
"$simm16">;
} // End SubtargetPredicate = isGFX11Plus
+let SubtargetPredicate = HasVGPRSingleUseHintInsts in {
+ def S_SINGLEUSE_VDST :
+ SOPP_Pseudo<"s_singleuse_vdst", (ins s16imm:$simm16), "$simm16">;
+} // End SubtargetPredicate = HasVGPRSingleUseHintInsts
+
//===----------------------------------------------------------------------===//
// SOP1 Patterns
//===----------------------------------------------------------------------===//
@@ -1421,10 +1744,91 @@ def : GCNPat <
(S_WAIT_EVENT (i16 0))
>;
+// The first 10 bits of the mode register are the core FP mode on all
+// subtargets.
+//
+// The high bits include additional fields, intermixed with some
+// non-floating point environment information. We extract the full
+// register and clear non-relevant bits.
+//
+// EXCP_EN covers floating point exceptions, but also some other
+// non-FP exceptions.
+//
+// Bits 12-18 cover the relevant exception mask on all subtargets.
+//
+// FIXME: Bit 18 is int_div0, should this be in the FP environment? I
+// think the only source is v_rcp_iflag_i32.
+//
+// On GFX9+:
+// Bit 23 is the additional FP16_OVFL mode.
+//
+// Bits 19, 20, and 21 cover non-FP exceptions and differ between
+// gfx9/10/11, so we ignore them here.
+
+// TODO: Would it be cheaper to emit multiple s_getreg_b32 calls for
+// the ranges and combine the results?
+
+defvar fp_round_mask = !add(!shl(1, 4), -1);
+defvar fp_denorm_mask = !shl(!add(!shl(1, 4), -1), 4);
+defvar dx10_clamp_mask = !shl(1, 8);
+defvar ieee_mode_mask = !shl(1, 9);
+
+// Covers fp_round, fp_denorm, dx10_clamp, and IEEE bit.
+defvar fpmode_mask =
+ !or(fp_round_mask, fp_denorm_mask, dx10_clamp_mask, ieee_mode_mask);
+
+defvar fp_excp_en_mask = !shl(!add(!shl(1, 7), -1), 12);
+defvar fp16_ovfl = !shl(1, 23);
+defvar fpmode_mask_gfx6plus = !or(fpmode_mask, fp_excp_en_mask);
+defvar fpmode_mask_gfx9plus = !or(fpmode_mask_gfx6plus, fp16_ovfl);
+
+class GetFPModePat<int fpmode_mask> : GCNPat<
+ (i32 get_fpmode),
+ (S_AND_B32 (i32 fpmode_mask),
+ (S_GETREG_B32 getHwRegImm<
+ HWREG.MODE, 0,
+ !add(!logtwo(fpmode_mask), 1)>.ret))
+>;
+
+// TODO: Might be worth moving to custom lowering so the and is
+// exposed to demanded bits optimizations. Most users probably only
+// care about the rounding or denorm mode bits. We also can reduce the
+// demanded read from the getreg immediate.
+let SubtargetPredicate = isGFX9Plus in {
+// Last bit = FP16_OVFL
+def : GetFPModePat<fpmode_mask_gfx9plus>;
+}
+
+// Last bit = EXCP_EN.int_div0
+let SubtargetPredicate = isNotGFX9Plus in {
+def : GetFPModePat<fpmode_mask_gfx6plus>;
+}
+
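Evaluating the defvar arithmetic above in plain C++, a sketch that spells out the constants the !shl/!add/!or folds produce and the s_getreg_b32 widths implied by !logtwo(mask) + 1:

#include <cstdint>

constexpr uint32_t FPRoundMask   = (1u << 4) - 1;                 // 0x0000000F
constexpr uint32_t FPDenormMask  = ((1u << 4) - 1) << 4;          // 0x000000F0
constexpr uint32_t DX10ClampMask = 1u << 8;                       // 0x00000100
constexpr uint32_t IEEEModeMask  = 1u << 9;                       // 0x00000200
constexpr uint32_t FPModeMask    = FPRoundMask | FPDenormMask |
                                   DX10ClampMask | IEEEModeMask;  // 0x000003FF
constexpr uint32_t FPExcpEnMask  = ((1u << 7) - 1) << 12;         // 0x0007F000
constexpr uint32_t FP16OvflMask  = 1u << 23;                      // 0x00800000

constexpr uint32_t FPModeMaskGFX6Plus = FPModeMask | FPExcpEnMask;         // 19-bit read
constexpr uint32_t FPModeMaskGFX9Plus = FPModeMaskGFX6Plus | FP16OvflMask; // 24-bit read

static_assert(FPModeMaskGFX6Plus == 0x0007F3FF, "gfx6+: s_getreg_b32 width 19");
static_assert(FPModeMaskGFX9Plus == 0x0087F3FF, "gfx9+: s_getreg_b32 width 24");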
//===----------------------------------------------------------------------===//
// SOP2 Patterns
//===----------------------------------------------------------------------===//
+def UniformSelect : PatFrag<
+ (ops node:$src0, node:$src1),
+ (select SCC, $src0, $src1),
+ [{ return !N->isDivergent(); }]
+>;
+
+let AddedComplexity = 20 in {
+ def : GCNPat<
+ (i32 (UniformSelect i32:$src0, i32:$src1)),
+ (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
+ >;
+
+ // TODO: The predicate should not be necessary, but enabling this pattern for
+ // all subtargets generates worse code in some cases.
+ let OtherPredicates = [HasPseudoScalarTrans] in
+ def : GCNPat<
+ (f32 (UniformSelect f32:$src0, f32:$src1)),
+ (S_CSELECT_B32 SSrc_b32:$src0, SSrc_b32:$src1)
+ >;
+}
+
// V_ADD_I32_e32/S_ADD_U32 produces carry in VCC/SCC. For the vector
// case, the sgpr-copies pass will fix this to use the vector version.
def : GCNPat <
@@ -1476,6 +1880,11 @@ def : ScalarNot2Pat<S_ORN2_B64, or, v2i32>;
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
+class Select_gfx12<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX12> {
+ Predicate AssemblerPredicate = isGFX12Only;
+ string DecoderNamespace = "GFX12";
+}
+
class Select_gfx11<string opName> : SIMCInstr<opName, SIEncodingFamily.GFX11> {
Predicate AssemblerPredicate = isGFX11Only;
string DecoderNamespace = "GFX11";
@@ -1497,85 +1906,143 @@ class Select_gfx6_gfx7<string opName> : SIMCInstr<opName, SIEncodingFamily.SI> {
}
//===----------------------------------------------------------------------===//
-// GFX11.
+// SOP1 - GFX11, GFX12
//===----------------------------------------------------------------------===//
+multiclass SOP1_Real_gfx12<bits<8> op> {
+ def _gfx12 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
+ Select_gfx12<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOP1_M0_Real_gfx12<bits<8> op> {
+ def _gfx12 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
+ Select_gfx12<!cast<SOP1_Pseudo>(NAME).Mnemonic> {
+ let Inst{7-0} = M0_gfx11plus.HWEncoding{7-0}; // Set Src0 encoding to M0
+ }
+}
+
multiclass SOP1_Real_gfx11<bits<8> op> {
def _gfx11 : SOP1_Real<op, !cast<SOP1_Pseudo>(NAME)>,
Select_gfx11<!cast<SOP1_Pseudo>(NAME).Mnemonic>;
}
+multiclass SOP1_Real_Renamed_gfx12<bits<8> op, SOP1_Pseudo backing_pseudo, string real_name> {
+ def _gfx12 : SOP1_Real<op, backing_pseudo, real_name>,
+ Select_gfx12<backing_pseudo.Mnemonic>,
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX12Plus]>;
+}
+
multiclass SOP1_Real_Renamed_gfx11<bits<8> op, SOP1_Pseudo backing_pseudo, string real_name> {
def _gfx11 : SOP1_Real<op, backing_pseudo, real_name>,
Select_gfx11<backing_pseudo.Mnemonic>,
- MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
-}
-
-defm S_MOV_B32 : SOP1_Real_gfx11<0x000>;
-defm S_MOV_B64 : SOP1_Real_gfx11<0x001>;
-defm S_CMOV_B32 : SOP1_Real_gfx11<0x002>;
-defm S_CMOV_B64 : SOP1_Real_gfx11<0x003>;
-defm S_BREV_B32 : SOP1_Real_gfx11<0x004>;
-defm S_BREV_B64 : SOP1_Real_gfx11<0x005>;
-defm S_CTZ_I32_B32 : SOP1_Real_Renamed_gfx11<0x008, S_FF1_I32_B32, "s_ctz_i32_b32">;
-defm S_CTZ_I32_B64 : SOP1_Real_Renamed_gfx11<0x009, S_FF1_I32_B64, "s_ctz_i32_b64">;
-defm S_CLZ_I32_U32 : SOP1_Real_Renamed_gfx11<0x00a, S_FLBIT_I32_B32, "s_clz_i32_u32">;
-defm S_CLZ_I32_U64 : SOP1_Real_Renamed_gfx11<0x00b, S_FLBIT_I32_B64, "s_clz_i32_u64">;
-defm S_CLS_I32 : SOP1_Real_Renamed_gfx11<0x00c, S_FLBIT_I32, "s_cls_i32">;
-defm S_CLS_I32_I64 : SOP1_Real_Renamed_gfx11<0x00d, S_FLBIT_I32_I64, "s_cls_i32_i64">;
-defm S_SEXT_I32_I8 : SOP1_Real_gfx11<0x00e>;
-defm S_SEXT_I32_I16 : SOP1_Real_gfx11<0x00f>;
-defm S_BITSET0_B32 : SOP1_Real_gfx11<0x010>;
-defm S_BITSET0_B64 : SOP1_Real_gfx11<0x011>;
-defm S_BITSET1_B32 : SOP1_Real_gfx11<0x012>;
-defm S_BITSET1_B64 : SOP1_Real_gfx11<0x013>;
-defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx11<0x014>;
-defm S_ABS_I32 : SOP1_Real_gfx11<0x015>;
-defm S_BCNT0_I32_B32 : SOP1_Real_gfx11<0x016>;
-defm S_BCNT0_I32_B64 : SOP1_Real_gfx11<0x017>;
-defm S_BCNT1_I32_B32 : SOP1_Real_gfx11<0x018>;
-defm S_BCNT1_I32_B64 : SOP1_Real_gfx11<0x019>;
-defm S_QUADMASK_B32 : SOP1_Real_gfx11<0x01a>;
-defm S_QUADMASK_B64 : SOP1_Real_gfx11<0x01b>;
-defm S_WQM_B32 : SOP1_Real_gfx11<0x01c>;
-defm S_WQM_B64 : SOP1_Real_gfx11<0x01d>;
-defm S_NOT_B32 : SOP1_Real_gfx11<0x01e>;
-defm S_NOT_B64 : SOP1_Real_gfx11<0x01f>;
-defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx11<0x020>;
-defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx11<0x021>;
-defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x022>;
-defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x023>;
-defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x024>;
-defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x025>;
-defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx11<0x026>;
-defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx11<0x027>;
-defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x028>;
-defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x029>;
-defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx11<0x02a>;
-/*defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx11<0x02b>; //same as older arch, handled there*/
-defm S_AND_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x02c, S_ANDN1_SAVEEXEC_B32, "s_and_not0_saveexec_b32">;
-defm S_AND_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x02d, S_ANDN1_SAVEEXEC_B64, "s_and_not0_saveexec_b64">;
-defm S_OR_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x02e, S_ORN1_SAVEEXEC_B32, "s_or_not0_saveexec_b32">;
-defm S_OR_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x02f, S_ORN1_SAVEEXEC_B64, "s_or_not0_saveexec_b64">;
-defm S_AND_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x030, S_ANDN2_SAVEEXEC_B32, "s_and_not1_saveexec_b32">;
-defm S_AND_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x031, S_ANDN2_SAVEEXEC_B64, "s_and_not1_saveexec_b64">;
-defm S_OR_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11<0x032, S_ORN2_SAVEEXEC_B32, "s_or_not1_saveexec_b32">;
-defm S_OR_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11<0x033, S_ORN2_SAVEEXEC_B64, "s_or_not1_saveexec_b64">;
-defm S_AND_NOT0_WREXEC_B32 : SOP1_Real_Renamed_gfx11<0x034, S_ANDN1_WREXEC_B32, "s_and_not0_wrexec_b32">;
-defm S_AND_NOT0_WREXEC_B64 : SOP1_Real_Renamed_gfx11<0x035, S_ANDN1_WREXEC_B64, "s_and_not0_wrexec_b64">;
-defm S_AND_NOT1_WREXEC_B32 : SOP1_Real_Renamed_gfx11<0x036, S_ANDN2_WREXEC_B32, "s_and_not1_wrexec_b32">;
-defm S_AND_NOT1_WREXEC_B64 : SOP1_Real_Renamed_gfx11<0x037, S_ANDN2_WREXEC_B64, "s_and_not1_wrexec_b64">;
-defm S_MOVRELS_B32 : SOP1_Real_gfx11<0x040>;
-defm S_MOVRELS_B64 : SOP1_Real_gfx11<0x041>;
-defm S_MOVRELD_B32 : SOP1_Real_gfx11<0x042>;
-defm S_MOVRELD_B64 : SOP1_Real_gfx11<0x043>;
-defm S_MOVRELSD_2_B32 : SOP1_Real_gfx11<0x044>;
-defm S_GETPC_B64 : SOP1_Real_gfx11<0x047>;
-defm S_SETPC_B64 : SOP1_Real_gfx11<0x048>;
-defm S_SWAPPC_B64 : SOP1_Real_gfx11<0x049>;
-defm S_RFE_B64 : SOP1_Real_gfx11<0x04a>;
-defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11<0x04c>;
-defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11<0x04d>;
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Only]>;
+}
+
+multiclass SOP1_Real_gfx11_gfx12<bits<8> op> :
+ SOP1_Real_gfx11<op>, SOP1_Real_gfx12<op>;
+
+multiclass SOP1_Real_Renamed_gfx11_gfx12<bits<8> op, SOP1_Pseudo backing_pseudo, string real_name> :
+ SOP1_Real_Renamed_gfx11<op, backing_pseudo, real_name>,
+ SOP1_Real_Renamed_gfx12<op, backing_pseudo, real_name>;
+
+defm S_MOV_B32 : SOP1_Real_gfx11_gfx12<0x000>;
+defm S_MOV_B64 : SOP1_Real_gfx11_gfx12<0x001>;
+defm S_CMOV_B32 : SOP1_Real_gfx11_gfx12<0x002>;
+defm S_CMOV_B64 : SOP1_Real_gfx11_gfx12<0x003>;
+defm S_BREV_B32 : SOP1_Real_gfx11_gfx12<0x004>;
+defm S_BREV_B64 : SOP1_Real_gfx11_gfx12<0x005>;
+defm S_CTZ_I32_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x008, S_FF1_I32_B32, "s_ctz_i32_b32">;
+defm S_CTZ_I32_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x009, S_FF1_I32_B64, "s_ctz_i32_b64">;
+defm S_CLZ_I32_U32 : SOP1_Real_Renamed_gfx11_gfx12<0x00a, S_FLBIT_I32_B32, "s_clz_i32_u32">;
+defm S_CLZ_I32_U64 : SOP1_Real_Renamed_gfx11_gfx12<0x00b, S_FLBIT_I32_B64, "s_clz_i32_u64">;
+defm S_CLS_I32 : SOP1_Real_Renamed_gfx11_gfx12<0x00c, S_FLBIT_I32, "s_cls_i32">;
+defm S_CLS_I32_I64 : SOP1_Real_Renamed_gfx11_gfx12<0x00d, S_FLBIT_I32_I64, "s_cls_i32_i64">;
+defm S_SEXT_I32_I8 : SOP1_Real_gfx11_gfx12<0x00e>;
+defm S_SEXT_I32_I16 : SOP1_Real_gfx11_gfx12<0x00f>;
+defm S_BITSET0_B32 : SOP1_Real_gfx11_gfx12<0x010>;
+defm S_BITSET0_B64 : SOP1_Real_gfx11_gfx12<0x011>;
+defm S_BITSET1_B32 : SOP1_Real_gfx11_gfx12<0x012>;
+defm S_BITSET1_B64 : SOP1_Real_gfx11_gfx12<0x013>;
+defm S_BITREPLICATE_B64_B32 : SOP1_Real_gfx11_gfx12<0x014>;
+defm S_ABS_I32 : SOP1_Real_gfx11_gfx12<0x015>;
+defm S_BCNT0_I32_B32 : SOP1_Real_gfx11_gfx12<0x016>;
+defm S_BCNT0_I32_B64 : SOP1_Real_gfx11_gfx12<0x017>;
+defm S_BCNT1_I32_B32 : SOP1_Real_gfx11_gfx12<0x018>;
+defm S_BCNT1_I32_B64 : SOP1_Real_gfx11_gfx12<0x019>;
+defm S_QUADMASK_B32 : SOP1_Real_gfx11_gfx12<0x01a>;
+defm S_QUADMASK_B64 : SOP1_Real_gfx11_gfx12<0x01b>;
+defm S_WQM_B32 : SOP1_Real_gfx11_gfx12<0x01c>;
+defm S_WQM_B64 : SOP1_Real_gfx11_gfx12<0x01d>;
+defm S_NOT_B32 : SOP1_Real_gfx11_gfx12<0x01e>;
+defm S_NOT_B64 : SOP1_Real_gfx11_gfx12<0x01f>;
+defm S_AND_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x020>;
+defm S_AND_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x021>;
+defm S_OR_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x022>;
+defm S_OR_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x023>;
+defm S_XOR_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x024>;
+defm S_XOR_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x025>;
+defm S_NAND_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x026>;
+defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x027>;
+defm S_NOR_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x028>;
+defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x029>;
+defm S_XNOR_SAVEEXEC_B32 : SOP1_Real_gfx11_gfx12<0x02a>;
+/*defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx11_gfx12<0x02b>; //same as older arch, handled there*/
+defm S_AND_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x02c, S_ANDN1_SAVEEXEC_B32, "s_and_not0_saveexec_b32">;
+defm S_AND_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x02d, S_ANDN1_SAVEEXEC_B64, "s_and_not0_saveexec_b64">;
+defm S_OR_NOT0_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x02e, S_ORN1_SAVEEXEC_B32, "s_or_not0_saveexec_b32">;
+defm S_OR_NOT0_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x02f, S_ORN1_SAVEEXEC_B64, "s_or_not0_saveexec_b64">;
+defm S_AND_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x030, S_ANDN2_SAVEEXEC_B32, "s_and_not1_saveexec_b32">;
+defm S_AND_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x031, S_ANDN2_SAVEEXEC_B64, "s_and_not1_saveexec_b64">;
+defm S_OR_NOT1_SAVEEXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x032, S_ORN2_SAVEEXEC_B32, "s_or_not1_saveexec_b32">;
+defm S_OR_NOT1_SAVEEXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x033, S_ORN2_SAVEEXEC_B64, "s_or_not1_saveexec_b64">;
+defm S_AND_NOT0_WREXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x034, S_ANDN1_WREXEC_B32, "s_and_not0_wrexec_b32">;
+defm S_AND_NOT0_WREXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x035, S_ANDN1_WREXEC_B64, "s_and_not0_wrexec_b64">;
+defm S_AND_NOT1_WREXEC_B32 : SOP1_Real_Renamed_gfx11_gfx12<0x036, S_ANDN2_WREXEC_B32, "s_and_not1_wrexec_b32">;
+defm S_AND_NOT1_WREXEC_B64 : SOP1_Real_Renamed_gfx11_gfx12<0x037, S_ANDN2_WREXEC_B64, "s_and_not1_wrexec_b64">;
+defm S_MOVRELS_B32 : SOP1_Real_gfx11_gfx12<0x040>;
+defm S_MOVRELS_B64 : SOP1_Real_gfx11_gfx12<0x041>;
+defm S_MOVRELD_B32 : SOP1_Real_gfx11_gfx12<0x042>;
+defm S_MOVRELD_B64 : SOP1_Real_gfx11_gfx12<0x043>;
+defm S_MOVRELSD_2_B32 : SOP1_Real_gfx11_gfx12<0x044>;
+defm S_GETPC_B64 : SOP1_Real_gfx11_gfx12<0x047>;
+defm S_SETPC_B64 : SOP1_Real_gfx11_gfx12<0x048>;
+defm S_SWAPPC_B64 : SOP1_Real_gfx11_gfx12<0x049>;
+defm S_RFE_B64 : SOP1_Real_gfx11_gfx12<0x04a>;
+defm S_SENDMSG_RTN_B32 : SOP1_Real_gfx11_gfx12<0x04c>;
+defm S_SENDMSG_RTN_B64 : SOP1_Real_gfx11_gfx12<0x04d>;
+defm S_BARRIER_SIGNAL_M0 : SOP1_M0_Real_gfx12<0x04e>;
+defm S_BARRIER_SIGNAL_ISFIRST_M0 : SOP1_M0_Real_gfx12<0x04f>;
+defm S_GET_BARRIER_STATE_M0 : SOP1_M0_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_M0 : SOP1_M0_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_M0 : SOP1_M0_Real_gfx12<0x052>;
+defm S_WAKEUP_BARRIER_M0 : SOP1_M0_Real_gfx12<0x057>;
+defm S_BARRIER_SIGNAL_IMM : SOP1_Real_gfx12<0x04e>;
+defm S_BARRIER_SIGNAL_ISFIRST_IMM : SOP1_Real_gfx12<0x04f>;
+defm S_GET_BARRIER_STATE_IMM : SOP1_Real_gfx12<0x050>;
+defm S_BARRIER_INIT_IMM : SOP1_Real_gfx12<0x051>;
+defm S_BARRIER_JOIN_IMM : SOP1_Real_gfx12<0x052>;
+defm S_WAKEUP_BARRIER_IMM : SOP1_Real_gfx12<0x057>;
+defm S_SLEEP_VAR : SOP1_Real_gfx12<0x058>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 - GFX1150, GFX12
+//===----------------------------------------------------------------------===//
+
+defm S_CEIL_F32 : SOP1_Real_gfx11_gfx12<0x060>;
+defm S_FLOOR_F32 : SOP1_Real_gfx11_gfx12<0x061>;
+defm S_TRUNC_F32 : SOP1_Real_gfx11_gfx12<0x062>;
+defm S_RNDNE_F32 : SOP1_Real_gfx11_gfx12<0x063>;
+defm S_CVT_F32_I32 : SOP1_Real_gfx11_gfx12<0x064>;
+defm S_CVT_F32_U32 : SOP1_Real_gfx11_gfx12<0x065>;
+defm S_CVT_I32_F32 : SOP1_Real_gfx11_gfx12<0x066>;
+defm S_CVT_U32_F32 : SOP1_Real_gfx11_gfx12<0x067>;
+defm S_CVT_F16_F32 : SOP1_Real_gfx11_gfx12<0x068>;
+defm S_CVT_F32_F16 : SOP1_Real_gfx11_gfx12<0x069>;
+defm S_CVT_HI_F32_F16 : SOP1_Real_gfx11_gfx12<0x06a>;
+defm S_CEIL_F16 : SOP1_Real_gfx11_gfx12<0x06b>;
+defm S_FLOOR_F16 : SOP1_Real_gfx11_gfx12<0x06c>;
+defm S_TRUNC_F16 : SOP1_Real_gfx11_gfx12<0x06d>;
+defm S_RNDNE_F16 : SOP1_Real_gfx11_gfx12<0x06e>;
//===----------------------------------------------------------------------===//
// SOP1 - GFX10.
@@ -1587,8 +2054,8 @@ multiclass SOP1_Real_gfx10<bits<8> op> {
Select_gfx10<ps.Mnemonic>;
}
-multiclass SOP1_Real_gfx10_gfx11<bits<8> op> :
- SOP1_Real_gfx10<op>, SOP1_Real_gfx11<op>;
+multiclass SOP1_Real_gfx10_gfx11_gfx12<bits<8> op> :
+ SOP1_Real_gfx10<op>, SOP1_Real_gfx11_gfx12<op>;
defm S_ANDN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x037>;
defm S_ORN1_SAVEEXEC_B64 : SOP1_Real_gfx10<0x038>;
@@ -1623,8 +2090,8 @@ multiclass SOP1_Real_gfx6_gfx7<bits<8> op> {
multiclass SOP1_Real_gfx6_gfx7_gfx10<bits<8> op> :
SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10<op>;
-multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11<bits<8> op> :
- SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10_gfx11<op>;
+multiclass SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op> :
+ SOP1_Real_gfx6_gfx7<op>, SOP1_Real_gfx10_gfx11_gfx12<op>;
defm S_CBRANCH_JOIN : SOP1_Real_gfx6_gfx7<0x032>;
@@ -1667,7 +2134,7 @@ defm S_ANDN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x027>;
defm S_ORN2_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x028>;
defm S_NAND_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x029>;
defm S_NOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02a>;
-defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11<0x02b>;
+defm S_XNOR_SAVEEXEC_B64 : SOP1_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02b>;
defm S_QUADMASK_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02c>;
defm S_QUADMASK_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x02d>;
defm S_MOVRELS_B32 : SOP1_Real_gfx6_gfx7_gfx10<0x02e>;
@@ -1677,63 +2144,142 @@ defm S_MOVRELD_B64 : SOP1_Real_gfx6_gfx7_gfx10<0x031>;
defm S_ABS_I32 : SOP1_Real_gfx6_gfx7_gfx10<0x034>;
//===----------------------------------------------------------------------===//
-// SOP2 - GFX11.
+// SOP2 - GFX12
+//===----------------------------------------------------------------------===//
+
+multiclass SOP2_Real_gfx12<bits<7> op> {
+ def _gfx12 : SOP2_Real32<op, !cast<SOP2_Pseudo>(NAME)>,
+ Select_gfx12<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOP2_Real_Renamed_gfx12<bits<7> op, SOP2_Pseudo backing_pseudo, string real_name> {
+ def _gfx12 : SOP2_Real32<op, backing_pseudo, real_name>,
+ Select_gfx12<backing_pseudo.Mnemonic>,
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX12Plus]>;
+}
+
+defm S_MIN_NUM_F32 : SOP2_Real_Renamed_gfx12<0x042, S_MIN_F32, "s_min_num_f32">;
+defm S_MAX_NUM_F32 : SOP2_Real_Renamed_gfx12<0x043, S_MAX_F32, "s_max_num_f32">;
+defm S_MIN_NUM_F16 : SOP2_Real_Renamed_gfx12<0x04b, S_MIN_F16, "s_min_num_f16">;
+defm S_MAX_NUM_F16 : SOP2_Real_Renamed_gfx12<0x04c, S_MAX_F16, "s_max_num_f16">;
+defm S_MINIMUM_F32 : SOP2_Real_gfx12<0x04f>;
+defm S_MAXIMUM_F32 : SOP2_Real_gfx12<0x050>;
+defm S_MINIMUM_F16 : SOP2_Real_gfx12<0x051>;
+defm S_MAXIMUM_F16 : SOP2_Real_gfx12<0x052>;
+
+defm S_ADD_CO_U32 : SOP2_Real_Renamed_gfx12<0x000, S_ADD_U32, "s_add_co_u32">;
+defm S_SUB_CO_U32 : SOP2_Real_Renamed_gfx12<0x001, S_SUB_U32, "s_sub_co_u32">;
+defm S_ADD_CO_I32 : SOP2_Real_Renamed_gfx12<0x002, S_ADD_I32, "s_add_co_i32">;
+defm S_SUB_CO_I32 : SOP2_Real_Renamed_gfx12<0x003, S_SUB_I32, "s_sub_co_i32">;
+defm S_ADD_CO_CI_U32 : SOP2_Real_Renamed_gfx12<0x004, S_ADDC_U32, "s_add_co_ci_u32">;
+defm S_SUB_CO_CI_U32 : SOP2_Real_Renamed_gfx12<0x005, S_SUBB_U32, "s_sub_co_ci_u32">;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX11, GFX12.
//===----------------------------------------------------------------------===//
multiclass SOP2_Real_gfx11<bits<7> op> {
- def _gfx11 : SOP2_Real<op, !cast<SOP2_Pseudo>(NAME)>,
+ def _gfx11 : SOP2_Real32<op, !cast<SOP2_Pseudo>(NAME)>,
Select_gfx11<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
}
multiclass SOP2_Real_Renamed_gfx11<bits<7> op, SOP2_Pseudo backing_pseudo, string real_name> {
- def _gfx11 : SOP2_Real<op, backing_pseudo, real_name>,
+ def _gfx11 : SOP2_Real32<op, backing_pseudo, real_name>,
Select_gfx11<backing_pseudo.Mnemonic>,
- MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
-}
-
-defm S_ABSDIFF_I32 : SOP2_Real_gfx11<0x006>;
-defm S_LSHL_B32 : SOP2_Real_gfx11<0x008>;
-defm S_LSHL_B64 : SOP2_Real_gfx11<0x009>;
-defm S_LSHR_B32 : SOP2_Real_gfx11<0x00a>;
-defm S_LSHR_B64 : SOP2_Real_gfx11<0x00b>;
-defm S_ASHR_I32 : SOP2_Real_gfx11<0x00c>;
-defm S_ASHR_I64 : SOP2_Real_gfx11<0x00d>;
-defm S_LSHL1_ADD_U32 : SOP2_Real_gfx11<0x00e>;
-defm S_LSHL2_ADD_U32 : SOP2_Real_gfx11<0x00f>;
-defm S_LSHL3_ADD_U32 : SOP2_Real_gfx11<0x010>;
-defm S_LSHL4_ADD_U32 : SOP2_Real_gfx11<0x011>;
-defm S_MIN_I32 : SOP2_Real_gfx11<0x012>;
-defm S_MIN_U32 : SOP2_Real_gfx11<0x013>;
-defm S_MAX_I32 : SOP2_Real_gfx11<0x014>;
-defm S_MAX_U32 : SOP2_Real_gfx11<0x015>;
-defm S_AND_B32 : SOP2_Real_gfx11<0x016>;
-defm S_AND_B64 : SOP2_Real_gfx11<0x017>;
-defm S_OR_B32 : SOP2_Real_gfx11<0x018>;
-defm S_OR_B64 : SOP2_Real_gfx11<0x019>;
-defm S_XOR_B32 : SOP2_Real_gfx11<0x01a>;
-defm S_XOR_B64 : SOP2_Real_gfx11<0x01b>;
-defm S_NAND_B32 : SOP2_Real_gfx11<0x01c>;
-defm S_NAND_B64 : SOP2_Real_gfx11<0x01d>;
-defm S_NOR_B32 : SOP2_Real_gfx11<0x01e>;
-defm S_NOR_B64 : SOP2_Real_gfx11<0x01f>;
-defm S_XNOR_B32 : SOP2_Real_gfx11<0x020>;
-defm S_XNOR_B64 : SOP2_Real_gfx11<0x021>;
-defm S_AND_NOT1_B32 : SOP2_Real_Renamed_gfx11<0x022, S_ANDN2_B32, "s_and_not1_b32">;
-defm S_AND_NOT1_B64 : SOP2_Real_Renamed_gfx11<0x023, S_ANDN2_B64, "s_and_not1_b64">;
-defm S_OR_NOT1_B32 : SOP2_Real_Renamed_gfx11<0x024, S_ORN2_B32, "s_or_not1_b32">;
-defm S_OR_NOT1_B64 : SOP2_Real_Renamed_gfx11<0x025, S_ORN2_B64, "s_or_not1_b64">;
-defm S_BFE_U32 : SOP2_Real_gfx11<0x026>;
-defm S_BFE_I32 : SOP2_Real_gfx11<0x027>;
-defm S_BFE_U64 : SOP2_Real_gfx11<0x028>;
-defm S_BFE_I64 : SOP2_Real_gfx11<0x029>;
-defm S_BFM_B32 : SOP2_Real_gfx11<0x02a>;
-defm S_BFM_B64 : SOP2_Real_gfx11<0x02b>;
-defm S_MUL_I32 : SOP2_Real_gfx11<0x02c>;
-defm S_MUL_HI_U32 : SOP2_Real_gfx11<0x02d>;
-defm S_MUL_HI_I32 : SOP2_Real_gfx11<0x02e>;
-defm S_CSELECT_B32 : SOP2_Real_gfx11<0x030>;
-defm S_CSELECT_B64 : SOP2_Real_gfx11<0x031>;
-defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11<0x035>;
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Only]>;
+}
+
+multiclass SOP2_Real_gfx11_gfx12<bits<7> op> :
+ SOP2_Real_gfx11<op>, SOP2_Real_gfx12<op>;
+
+multiclass SOP2_Real_Renamed_gfx11_gfx12<bits<8> op, SOP2_Pseudo backing_pseudo, string real_name> :
+ SOP2_Real_Renamed_gfx11<op, backing_pseudo, real_name>,
+ SOP2_Real_Renamed_gfx12<op, backing_pseudo, real_name>;
+
+defm S_ABSDIFF_I32 : SOP2_Real_gfx11_gfx12<0x006>;
+defm S_LSHL_B32 : SOP2_Real_gfx11_gfx12<0x008>;
+defm S_LSHL_B64 : SOP2_Real_gfx11_gfx12<0x009>;
+defm S_LSHR_B32 : SOP2_Real_gfx11_gfx12<0x00a>;
+defm S_LSHR_B64 : SOP2_Real_gfx11_gfx12<0x00b>;
+defm S_ASHR_I32 : SOP2_Real_gfx11_gfx12<0x00c>;
+defm S_ASHR_I64 : SOP2_Real_gfx11_gfx12<0x00d>;
+defm S_LSHL1_ADD_U32 : SOP2_Real_gfx11_gfx12<0x00e>;
+defm S_LSHL2_ADD_U32 : SOP2_Real_gfx11_gfx12<0x00f>;
+defm S_LSHL3_ADD_U32 : SOP2_Real_gfx11_gfx12<0x010>;
+defm S_LSHL4_ADD_U32 : SOP2_Real_gfx11_gfx12<0x011>;
+defm S_MIN_I32 : SOP2_Real_gfx11_gfx12<0x012>;
+defm S_MIN_U32 : SOP2_Real_gfx11_gfx12<0x013>;
+defm S_MAX_I32 : SOP2_Real_gfx11_gfx12<0x014>;
+defm S_MAX_U32 : SOP2_Real_gfx11_gfx12<0x015>;
+defm S_AND_B32 : SOP2_Real_gfx11_gfx12<0x016>;
+defm S_AND_B64 : SOP2_Real_gfx11_gfx12<0x017>;
+defm S_OR_B32 : SOP2_Real_gfx11_gfx12<0x018>;
+defm S_OR_B64 : SOP2_Real_gfx11_gfx12<0x019>;
+defm S_XOR_B32 : SOP2_Real_gfx11_gfx12<0x01a>;
+defm S_XOR_B64 : SOP2_Real_gfx11_gfx12<0x01b>;
+defm S_NAND_B32 : SOP2_Real_gfx11_gfx12<0x01c>;
+defm S_NAND_B64 : SOP2_Real_gfx11_gfx12<0x01d>;
+defm S_NOR_B32 : SOP2_Real_gfx11_gfx12<0x01e>;
+defm S_NOR_B64 : SOP2_Real_gfx11_gfx12<0x01f>;
+defm S_XNOR_B32 : SOP2_Real_gfx11_gfx12<0x020>;
+defm S_XNOR_B64 : SOP2_Real_gfx11_gfx12<0x021>;
+defm S_AND_NOT1_B32 : SOP2_Real_Renamed_gfx11_gfx12<0x022, S_ANDN2_B32, "s_and_not1_b32">;
+defm S_AND_NOT1_B64 : SOP2_Real_Renamed_gfx11_gfx12<0x023, S_ANDN2_B64, "s_and_not1_b64">;
+defm S_OR_NOT1_B32 : SOP2_Real_Renamed_gfx11_gfx12<0x024, S_ORN2_B32, "s_or_not1_b32">;
+defm S_OR_NOT1_B64 : SOP2_Real_Renamed_gfx11_gfx12<0x025, S_ORN2_B64, "s_or_not1_b64">;
+defm S_BFE_U32 : SOP2_Real_gfx11_gfx12<0x026>;
+defm S_BFE_I32 : SOP2_Real_gfx11_gfx12<0x027>;
+defm S_BFE_U64 : SOP2_Real_gfx11_gfx12<0x028>;
+defm S_BFE_I64 : SOP2_Real_gfx11_gfx12<0x029>;
+defm S_BFM_B32 : SOP2_Real_gfx11_gfx12<0x02a>;
+defm S_BFM_B64 : SOP2_Real_gfx11_gfx12<0x02b>;
+defm S_MUL_I32 : SOP2_Real_gfx11_gfx12<0x02c>;
+defm S_MUL_HI_U32 : SOP2_Real_gfx11_gfx12<0x02d>;
+defm S_MUL_HI_I32 : SOP2_Real_gfx11_gfx12<0x02e>;
+defm S_CSELECT_B32 : SOP2_Real_gfx11_gfx12<0x030>;
+defm S_CSELECT_B64 : SOP2_Real_gfx11_gfx12<0x031>;
+defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11_gfx12<0x035>;
+defm S_ADD_NC_U64 : SOP2_Real_Renamed_gfx12<0x053, S_ADD_U64, "s_add_nc_u64">;
+defm S_SUB_NC_U64 : SOP2_Real_Renamed_gfx12<0x054, S_SUB_U64, "s_sub_nc_u64">;
+defm S_MUL_U64 : SOP2_Real_gfx12<0x055>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX1150, GFX12
+//===----------------------------------------------------------------------===//
+
+multiclass SOP2_Real_FMAK_gfx12<bits<7> op> {
+ def _gfx12 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>,
+ Select_gfx12<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOP2_Real_FMAK_gfx11<bits<7> op> {
+ def _gfx11 : SOP2_Real64<op, !cast<SOP2_Pseudo>(NAME)>,
+ Select_gfx11<!cast<SOP2_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOP2_Real_FMAK_gfx11_gfx12<bits<7> op> :
+ SOP2_Real_FMAK_gfx11<op>, SOP2_Real_FMAK_gfx12<op>;
+
+defm S_ADD_F32 : SOP2_Real_gfx11_gfx12<0x040>;
+defm S_SUB_F32 : SOP2_Real_gfx11_gfx12<0x041>;
+defm S_MUL_F32 : SOP2_Real_gfx11_gfx12<0x044>;
+defm S_FMAAK_F32 : SOP2_Real_FMAK_gfx11_gfx12<0x045>;
+defm S_FMAMK_F32 : SOP2_Real_FMAK_gfx11_gfx12<0x046>;
+defm S_FMAC_F32 : SOP2_Real_gfx11_gfx12<0x047>;
+defm S_CVT_PK_RTZ_F16_F32 : SOP2_Real_gfx11_gfx12<0x048>;
+defm S_ADD_F16 : SOP2_Real_gfx11_gfx12<0x049>;
+defm S_SUB_F16 : SOP2_Real_gfx11_gfx12<0x04a>;
+defm S_MUL_F16 : SOP2_Real_gfx11_gfx12<0x04d>;
+defm S_FMAC_F16 : SOP2_Real_gfx11_gfx12<0x04e>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 - GFX1150
+//===----------------------------------------------------------------------===//
+
+defm S_MIN_F32 : SOP2_Real_gfx11<0x042>;
+defm S_MAX_F32 : SOP2_Real_gfx11<0x043>;
+defm S_MIN_F16 : SOP2_Real_gfx11<0x04b>;
+defm S_MAX_F16 : SOP2_Real_gfx11<0x04c>;
//===----------------------------------------------------------------------===//
// SOP2 - GFX10.
@@ -1741,20 +2287,20 @@ defm S_PACK_HL_B32_B16 : SOP2_Real_gfx11<0x035>;
multiclass SOP2_Real_gfx10<bits<7> op> {
defvar ps = !cast<SOP2_Pseudo>(NAME);
- def _gfx10 : SOP2_Real<op, ps>,
+ def _gfx10 : SOP2_Real32<op, ps>,
Select_gfx10<ps.Mnemonic>;
}
-multiclass SOP2_Real_gfx10_gfx11<bits<7> op> :
- SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>;
+multiclass SOP2_Real_gfx10_gfx11_gfx12<bits<7> op> :
+ SOP2_Real_gfx10<op>, SOP2_Real_gfx11_gfx12<op>;
defm S_LSHL1_ADD_U32 : SOP2_Real_gfx10<0x02e>;
defm S_LSHL2_ADD_U32 : SOP2_Real_gfx10<0x02f>;
defm S_LSHL3_ADD_U32 : SOP2_Real_gfx10<0x030>;
defm S_LSHL4_ADD_U32 : SOP2_Real_gfx10<0x031>;
-defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11<0x032>;
-defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11<0x033>;
-defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11<0x034>;
+defm S_PACK_LL_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x032>;
+defm S_PACK_LH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x033>;
+defm S_PACK_HH_B32_B16 : SOP2_Real_gfx10_gfx11_gfx12<0x034>;
defm S_MUL_HI_U32 : SOP2_Real_gfx10<0x035>;
defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
@@ -1764,7 +2310,7 @@ defm S_MUL_HI_I32 : SOP2_Real_gfx10<0x036>;
multiclass SOP2_Real_gfx6_gfx7<bits<7> op> {
defvar ps = !cast<SOP_Pseudo>(NAME);
- def _gfx6_gfx7 : SOP2_Real<op, ps>,
+ def _gfx6_gfx7 : SOP2_Real32<op, ps>,
Select_gfx6_gfx7<ps.Mnemonic>;
}
@@ -1772,7 +2318,10 @@ multiclass SOP2_Real_gfx6_gfx7_gfx10<bits<7> op> :
SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>;
multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11<bits<7> op> :
- SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10_gfx11<op>;
+ SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10<op>, SOP2_Real_gfx11<op>;
+
+multiclass SOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<7> op> :
+ SOP2_Real_gfx6_gfx7<op>, SOP2_Real_gfx10_gfx11_gfx12<op>;
defm S_CBRANCH_G_FORK : SOP2_Real_gfx6_gfx7<0x02b>;
@@ -1820,29 +2369,52 @@ defm S_BFE_I64 : SOP2_Real_gfx6_gfx7_gfx10<0x02a>;
defm S_ABSDIFF_I32 : SOP2_Real_gfx6_gfx7_gfx10<0x02c>;
//===----------------------------------------------------------------------===//
-// SOPK - GFX11.
+// SOPK - GFX11, GFX12.
//===----------------------------------------------------------------------===//
+multiclass SOPK_Real32_gfx12<bits<5> op> {
+ def _gfx12 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx12<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
+multiclass SOPK_Real32_Renamed_gfx12<bits<5> op, SOPK_Pseudo backing_pseudo, string real_name> {
+ def _gfx12 : SOPK_Real32<op, backing_pseudo, real_name>,
+ Select_gfx12<backing_pseudo.Mnemonic>,
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX12Plus]>;
+}
+
multiclass SOPK_Real32_gfx11<bits<5> op> {
def _gfx11 : SOPK_Real32<op, !cast<SOPK_Pseudo>(NAME)>,
Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
}
+multiclass SOPK_Real64_gfx12<bits<5> op> {
+ def _gfx12 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
+ Select_gfx12<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
+}
+
multiclass SOPK_Real64_gfx11<bits<5> op> {
def _gfx11 : SOPK_Real64<op, !cast<SOPK_Pseudo>(NAME)>,
Select_gfx11<!cast<SOPK_Pseudo>(NAME).Mnemonic>;
}
-defm S_GETREG_B32 : SOPK_Real32_gfx11<0x011>;
-defm S_SETREG_B32 : SOPK_Real32_gfx11<0x012>;
-defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11<0x013>;
-defm S_CALL_B64 : SOPK_Real32_gfx11<0x014>;
+multiclass SOPK_Real32_gfx11_gfx12<bits<5> op> :
+ SOPK_Real32_gfx11<op>, SOPK_Real32_gfx12<op>;
+
+multiclass SOPK_Real64_gfx11_gfx12<bits<5> op> :
+ SOPK_Real64_gfx11<op>, SOPK_Real64_gfx12<op>;
+
+defm S_ADDK_CO_I32 : SOPK_Real32_Renamed_gfx12<0x00f, S_ADDK_I32, "s_addk_co_i32">;
+defm S_GETREG_B32 : SOPK_Real32_gfx11_gfx12<0x011>;
+defm S_SETREG_B32 : SOPK_Real32_gfx11_gfx12<0x012>;
+defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx11_gfx12<0x013>;
+defm S_CALL_B64 : SOPK_Real32_gfx11_gfx12<0x014>;
defm S_SUBVECTOR_LOOP_BEGIN : SOPK_Real32_gfx11<0x016>;
defm S_SUBVECTOR_LOOP_END : SOPK_Real32_gfx11<0x017>;
-defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11<0x018>;
-defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11<0x019>;
-defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11<0x01a>;
-defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11<0x01b>;
+defm S_WAITCNT_VSCNT : SOPK_Real32_gfx11_gfx12<0x018>;
+defm S_WAITCNT_VMCNT : SOPK_Real32_gfx11_gfx12<0x019>;
+defm S_WAITCNT_EXPCNT : SOPK_Real32_gfx11_gfx12<0x01a>;
+defm S_WAITCNT_LGKMCNT : SOPK_Real32_gfx11_gfx12<0x01b>;
//===----------------------------------------------------------------------===//
// SOPK - GFX10.
@@ -1863,7 +2435,10 @@ multiclass SOPK_Real64_gfx10<bits<5> op> {
multiclass SOPK_Real32_gfx10_gfx11<bits<5> op> :
SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11<op>;
-defm S_VERSION : SOPK_Real32_gfx10_gfx11<0x001>;
+multiclass SOPK_Real32_gfx10_gfx11_gfx12<bits<5> op> :
+ SOPK_Real32_gfx10<op>, SOPK_Real32_gfx11_gfx12<op>;
+
+defm S_VERSION : SOPK_Real32_gfx10_gfx11_gfx12<0x001>;
defm S_CALL_B64 : SOPK_Real32_gfx10<0x016>;
defm S_WAITCNT_VSCNT : SOPK_Real32_gfx10<0x017>;
defm S_WAITCNT_VMCNT : SOPK_Real32_gfx10<0x018>;
@@ -1897,10 +2472,13 @@ multiclass SOPK_Real64_gfx6_gfx7_gfx10<bits<5> op> :
multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11<bits<5> op> :
SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11<op>;
+multiclass SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<bits<5> op> :
+ SOPK_Real32_gfx6_gfx7<op>, SOPK_Real32_gfx10_gfx11_gfx12<op>;
+
defm S_CBRANCH_I_FORK : SOPK_Real32_gfx6_gfx7<0x011>;
-defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x000>;
-defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x002>;
+defm S_MOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x000>;
+defm S_CMOVK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x002>;
defm S_CMPK_EQ_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x003>;
defm S_CMPK_LG_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x004>;
defm S_CMPK_GT_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x005>;
@@ -1914,21 +2492,48 @@ defm S_CMPK_GE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00c>;
defm S_CMPK_LT_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00d>;
defm S_CMPK_LE_U32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00e>;
defm S_ADDK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x00f>;
-defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11<0x010>;
+defm S_MULK_I32 : SOPK_Real32_gfx6_gfx7_gfx10_gfx11_gfx12<0x010>;
defm S_GETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x012>;
defm S_SETREG_B32 : SOPK_Real32_gfx6_gfx7_gfx10<0x013>;
defm S_SETREG_IMM32_B32 : SOPK_Real64_gfx6_gfx7_gfx10<0x015>;
//===----------------------------------------------------------------------===//
-// SOPP - GFX11
+// SOPP - GFX12 only.
//===----------------------------------------------------------------------===//
+multiclass SOPP_Real_32_gfx12<bits<7> op> {
+ def _gfx12 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ Select_gfx12<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">;
+}
+
+multiclass SOPP_Real_32_Renamed_gfx12<bits<7> op, SOPP_Pseudo backing_pseudo, string real_name> {
+ def _gfx12 : SOPP_Real_32<op, backing_pseudo, real_name>,
+ Select_gfx12<backing_pseudo.Mnemonic>,
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX12Plus]>;
+}
+
+defm S_WAIT_ALU : SOPP_Real_32_Renamed_gfx12<0x008, S_WAITCNT_DEPCTR, "s_wait_alu">;
+defm S_BARRIER_WAIT : SOPP_Real_32_gfx12<0x014>;
+defm S_BARRIER_LEAVE : SOPP_Real_32_gfx12<0x015>;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX11, GFX12.
+//===----------------------------------------------------------------------===//
+
+
multiclass SOPP_Real_32_gfx11<bits<7> op> {
def _gfx11 : SOPP_Real_32<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
SOPPRelaxTable<0, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx11">;
}
+multiclass SOPP_Real_64_gfx12<bits<7> op> {
+ def _gfx12 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ Select_gfx12<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
+ SOPPRelaxTable<1, !cast<SOPP_Pseudo>(NAME).KeyName, "_gfx12">;
+}
+
multiclass SOPP_Real_64_gfx11<bits<7> op> {
def _gfx11 : SOPP_Real_64<op, !cast<SOPP_Pseudo>(NAME), !cast<SOPP_Pseudo>(NAME).Mnemonic>,
Select_gfx11<!cast<SOPP_Pseudo>(NAME).Mnemonic>,
@@ -1938,7 +2543,22 @@ multiclass SOPP_Real_64_gfx11<bits<7> op> {
multiclass SOPP_Real_32_Renamed_gfx11<bits<7> op, SOPP_Pseudo backing_pseudo, string real_name> {
def _gfx11 : SOPP_Real_32<op, backing_pseudo, real_name>,
Select_gfx11<backing_pseudo.Mnemonic>,
- MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Plus]>;
+ MnemonicAlias<backing_pseudo.Mnemonic, real_name>, Requires<[isGFX11Only]>;
+}
+
+multiclass SOPP_Real_32_gfx11_gfx12<bits<7> op> :
+ SOPP_Real_32_gfx11<op>, SOPP_Real_32_gfx12<op>;
+
+multiclass SOPP_Real_64_gfx11_gfx12<bits<7> op> :
+ SOPP_Real_64_gfx11<op>, SOPP_Real_64_gfx12<op>;
+
+multiclass SOPP_Real_32_Renamed_gfx11_gfx12<bits<7> op, SOPP_Pseudo backing_pseudo, string real_name> :
+ SOPP_Real_32_Renamed_gfx11<op, backing_pseudo, real_name>,
+ SOPP_Real_32_Renamed_gfx12<op, backing_pseudo, real_name>;
+
+multiclass SOPP_Real_With_Relaxation_gfx12<bits<7> op> {
+ defm "" : SOPP_Real_32_gfx12<op>;
+ defm _pad_s_nop : SOPP_Real_64_gfx12<op>;
}
multiclass SOPP_Real_With_Relaxation_gfx11<bits<7> op> {
@@ -1946,42 +2566,51 @@ multiclass SOPP_Real_With_Relaxation_gfx11<bits<7> op> {
defm _pad_s_nop : SOPP_Real_64_gfx11<op>;
}
-defm S_SETKILL : SOPP_Real_32_gfx11<0x001>;
-defm S_SETHALT : SOPP_Real_32_gfx11<0x002>;
-defm S_SLEEP : SOPP_Real_32_gfx11<0x003>;
-defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">;
-defm S_CLAUSE : SOPP_Real_32_gfx11<0x005>;
-defm S_DELAY_ALU : SOPP_Real_32_gfx11<0x007>;
+multiclass SOPP_Real_With_Relaxation_gfx11_gfx12<bits<7>op> :
+ SOPP_Real_With_Relaxation_gfx11<op>, SOPP_Real_With_Relaxation_gfx12<op>;
+
+defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>;
+defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>;
+defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>;
+defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11_gfx12<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">;
+defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>;
+defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>;
defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11<0x008>;
-defm S_WAITCNT : SOPP_Real_32_gfx11<0x009>;
-defm S_WAIT_IDLE : SOPP_Real_32_gfx11<0x00a>;
-defm S_WAIT_EVENT : SOPP_Real_32_gfx11<0x00b>;
-defm S_TRAP : SOPP_Real_32_gfx11<0x010>;
-defm S_ROUND_MODE : SOPP_Real_32_gfx11<0x011>;
-defm S_DENORM_MODE : SOPP_Real_32_gfx11<0x012>;
-defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11<0x020>;
-defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11<0x021>;
-defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11<0x022>;
-defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11<0x023>;
-defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11<0x024>;
-defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11<0x025>;
-defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11<0x026>;
+defm S_WAITCNT : SOPP_Real_32_gfx11_gfx12<0x009>;
+defm S_WAIT_IDLE : SOPP_Real_32_gfx11_gfx12<0x00a>;
+defm S_WAIT_EVENT : SOPP_Real_32_gfx11_gfx12<0x00b>;
+defm S_TRAP : SOPP_Real_32_gfx11_gfx12<0x010>;
+defm S_ROUND_MODE : SOPP_Real_32_gfx11_gfx12<0x011>;
+defm S_DENORM_MODE : SOPP_Real_32_gfx11_gfx12<0x012>;
+defm S_BRANCH : SOPP_Real_With_Relaxation_gfx11_gfx12<0x020>;
+defm S_CBRANCH_SCC0 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x021>;
+defm S_CBRANCH_SCC1 : SOPP_Real_With_Relaxation_gfx11_gfx12<0x022>;
+defm S_CBRANCH_VCCZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x023>;
+defm S_CBRANCH_VCCNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x024>;
+defm S_CBRANCH_EXECZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x025>;
+defm S_CBRANCH_EXECNZ : SOPP_Real_With_Relaxation_gfx11_gfx12<0x026>;
defm S_CBRANCH_CDBGSYS : SOPP_Real_With_Relaxation_gfx11<0x027>;
defm S_CBRANCH_CDBGUSER : SOPP_Real_With_Relaxation_gfx11<0x028>;
defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_Real_With_Relaxation_gfx11<0x029>;
defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx11<0x02a>;
-defm S_ENDPGM : SOPP_Real_32_gfx11<0x030>;
-defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11<0x031>;
-defm S_WAKEUP : SOPP_Real_32_gfx11<0x034>;
-defm S_SETPRIO : SOPP_Real_32_gfx11<0x035>;
-defm S_SENDMSG : SOPP_Real_32_gfx11<0x036>;
-defm S_SENDMSGHALT : SOPP_Real_32_gfx11<0x037>;
-defm S_INCPERFLEVEL : SOPP_Real_32_gfx11<0x038>;
-defm S_DECPERFLEVEL : SOPP_Real_32_gfx11<0x039>;
-defm S_TTRACEDATA : SOPP_Real_32_gfx11<0x03a>;
-defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11<0x03b>;
-defm S_ICACHE_INV : SOPP_Real_32_gfx11<0x03c>;
-defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>;
+defm S_ENDPGM : SOPP_Real_32_gfx11_gfx12<0x030>;
+defm S_ENDPGM_SAVED : SOPP_Real_32_gfx11_gfx12<0x031>;
+defm S_WAKEUP : SOPP_Real_32_gfx11_gfx12<0x034>;
+defm S_SETPRIO : SOPP_Real_32_gfx11_gfx12<0x035>;
+defm S_SENDMSG : SOPP_Real_32_gfx11_gfx12<0x036>;
+defm S_SENDMSGHALT : SOPP_Real_32_gfx11_gfx12<0x037>;
+defm S_INCPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x038>;
+defm S_DECPERFLEVEL : SOPP_Real_32_gfx11_gfx12<0x039>;
+defm S_TTRACEDATA : SOPP_Real_32_gfx11_gfx12<0x03a>;
+defm S_TTRACEDATA_IMM : SOPP_Real_32_gfx11_gfx12<0x03b>;
+defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>;
+defm S_BARRIER : SOPP_Real_32_gfx11_gfx12<0x03d>;
+
+//===----------------------------------------------------------------------===//
+// SOPP - GFX1150, GFX12.
+//===----------------------------------------------------------------------===//
+
+defm S_SINGLEUSE_VDST : SOPP_Real_32_gfx11_gfx12<0x013>;
//===----------------------------------------------------------------------===//
// SOPP - GFX6, GFX7, GFX8, GFX9, GFX10
@@ -2017,11 +2646,11 @@ multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<bits<7> op> :
multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> :
SOPP_Real_32_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_32_gfx10<op>;
-multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<bits<7> op> :
- SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_32_gfx11<op>;
+multiclass SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<bits<7> op> :
+ SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_32_gfx11_gfx12<op>;
-multiclass SOPP_Real_32_gfx10_gfx11<bits<7> op> :
- SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx11<op>;
+multiclass SOPP_Real_32_gfx10_gfx11_gfx12<bits<7> op> :
+ SOPP_Real_32_gfx10<op>, SOPP_Real_32_gfx11_gfx12<op>;
//64 bit encodings, for Relaxation
multiclass SOPP_Real_64_gfx6_gfx7<bits<7> op> {
@@ -2054,8 +2683,8 @@ multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<bits<7> op> :
multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> :
SOPP_Real_64_gfx6_gfx7_gfx8_gfx9<op>, SOPP_Real_64_gfx10<op>;
-multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<bits<7> op> :
- SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_64_gfx11<op>;
+multiclass SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<bits<7> op> :
+ SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>, SOPP_Real_64_gfx11_gfx12<op>;
//relaxation for insts with no operands not implemented
multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> {
@@ -2063,7 +2692,7 @@ multiclass SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_gfx10<bits<7> op> {
defm _pad_s_nop : SOPP_Real_64_gfx6_gfx7_gfx8_gfx9_gfx10<op>;
}
-defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11<0x000>;
+defm S_NOP : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10_gfx11_gfx12<0x000>;
defm S_ENDPGM : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x001>;
defm S_WAKEUP : SOPP_Real_32_gfx8_gfx9_gfx10<0x003>;
defm S_BARRIER : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x00a>;
@@ -2083,7 +2712,7 @@ defm S_ENDPGM_SAVED : SOPP_Real_32_gfx6_gfx7_gfx8_gfx9_gfx10<0x01B>;
defm S_SET_GPR_IDX_OFF : SOPP_Real_32_gfx8_gfx9<0x01c>;
defm S_SET_GPR_IDX_MODE : SOPP_Real_32_gfx8_gfx9<0x01d>;
defm S_ENDPGM_ORDERED_PS_DONE : SOPP_Real_32_gfx8_gfx9_gfx10<0x01e>;
-defm S_CODE_END : SOPP_Real_32_gfx10_gfx11<0x01f>;
+defm S_CODE_END : SOPP_Real_32_gfx10_gfx11_gfx12<0x01f>;
defm S_INST_PREFETCH : SOPP_Real_32_gfx10<0x020>;
defm S_CLAUSE : SOPP_Real_32_gfx10<0x021>;
defm S_WAIT_IDLE : SOPP_Real_32_gfx10<0x022>;
@@ -2107,32 +2736,74 @@ defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_Real_With_Relaxation_gfx6_gfx7_gfx8_gfx9_
}
//===----------------------------------------------------------------------===//
-// SOPC - GFX11
+// SOPC - GFX11, GFX12.
//===----------------------------------------------------------------------===//
+multiclass SOPC_Real_gfx12<bits<7> op> {
+ def _gfx12 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
+ Select_gfx12<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
+}
+
multiclass SOPC_Real_gfx11<bits<7> op> {
def _gfx11 : SOPC_Real<op, !cast<SOPC_Pseudo>(NAME)>,
Select_gfx11<!cast<SOPC_Pseudo>(NAME).Mnemonic>;
}
-defm S_CMP_EQ_I32 : SOPC_Real_gfx11<0x00>;
-defm S_CMP_LG_I32 : SOPC_Real_gfx11<0x01>;
-defm S_CMP_GT_I32 : SOPC_Real_gfx11<0x02>;
-defm S_CMP_GE_I32 : SOPC_Real_gfx11<0x03>;
-defm S_CMP_LT_I32 : SOPC_Real_gfx11<0x04>;
-defm S_CMP_LE_I32 : SOPC_Real_gfx11<0x05>;
-defm S_CMP_EQ_U32 : SOPC_Real_gfx11<0x06>;
-defm S_CMP_LG_U32 : SOPC_Real_gfx11<0x07>;
-defm S_CMP_GT_U32 : SOPC_Real_gfx11<0x08>;
-defm S_CMP_GE_U32 : SOPC_Real_gfx11<0x09>;
-defm S_CMP_LT_U32 : SOPC_Real_gfx11<0x0a>;
-defm S_CMP_LE_U32 : SOPC_Real_gfx11<0x0b>;
-defm S_BITCMP0_B32 : SOPC_Real_gfx11<0x0c>;
-defm S_BITCMP1_B32 : SOPC_Real_gfx11<0x0d>;
-defm S_BITCMP0_B64 : SOPC_Real_gfx11<0x0e>;
-defm S_BITCMP1_B64 : SOPC_Real_gfx11<0x0f>;
-defm S_CMP_EQ_U64 : SOPC_Real_gfx11<0x10>;
-defm S_CMP_LG_U64 : SOPC_Real_gfx11<0x11>;
+multiclass SOPC_Real_gfx11_gfx12<bits<7> op> :
+ SOPC_Real_gfx11<op>, SOPC_Real_gfx12<op>;
+
+defm S_CMP_EQ_I32 : SOPC_Real_gfx11_gfx12<0x00>;
+defm S_CMP_LG_I32 : SOPC_Real_gfx11_gfx12<0x01>;
+defm S_CMP_GT_I32 : SOPC_Real_gfx11_gfx12<0x02>;
+defm S_CMP_GE_I32 : SOPC_Real_gfx11_gfx12<0x03>;
+defm S_CMP_LT_I32 : SOPC_Real_gfx11_gfx12<0x04>;
+defm S_CMP_LE_I32 : SOPC_Real_gfx11_gfx12<0x05>;
+defm S_CMP_EQ_U32 : SOPC_Real_gfx11_gfx12<0x06>;
+defm S_CMP_LG_U32 : SOPC_Real_gfx11_gfx12<0x07>;
+defm S_CMP_GT_U32 : SOPC_Real_gfx11_gfx12<0x08>;
+defm S_CMP_GE_U32 : SOPC_Real_gfx11_gfx12<0x09>;
+defm S_CMP_LT_U32 : SOPC_Real_gfx11_gfx12<0x0a>;
+defm S_CMP_LE_U32 : SOPC_Real_gfx11_gfx12<0x0b>;
+defm S_BITCMP0_B32 : SOPC_Real_gfx11_gfx12<0x0c>;
+defm S_BITCMP1_B32 : SOPC_Real_gfx11_gfx12<0x0d>;
+defm S_BITCMP0_B64 : SOPC_Real_gfx11_gfx12<0x0e>;
+defm S_BITCMP1_B64 : SOPC_Real_gfx11_gfx12<0x0f>;
+defm S_CMP_EQ_U64 : SOPC_Real_gfx11_gfx12<0x10>;
+defm S_CMP_LG_U64 : SOPC_Real_gfx11_gfx12<0x11>;
+
+//===----------------------------------------------------------------------===//
+// SOPC - GFX1150, GFX12
+//===----------------------------------------------------------------------===//
+
+defm S_CMP_LT_F32 : SOPC_Real_gfx11_gfx12<0x41>;
+defm S_CMP_EQ_F32 : SOPC_Real_gfx11_gfx12<0x42>;
+defm S_CMP_LE_F32 : SOPC_Real_gfx11_gfx12<0x43>;
+defm S_CMP_GT_F32 : SOPC_Real_gfx11_gfx12<0x44>;
+defm S_CMP_LG_F32 : SOPC_Real_gfx11_gfx12<0x45>;
+defm S_CMP_GE_F32 : SOPC_Real_gfx11_gfx12<0x46>;
+defm S_CMP_O_F32 : SOPC_Real_gfx11_gfx12<0x47>;
+defm S_CMP_U_F32 : SOPC_Real_gfx11_gfx12<0x48>;
+defm S_CMP_NGE_F32 : SOPC_Real_gfx11_gfx12<0x49>;
+defm S_CMP_NLG_F32 : SOPC_Real_gfx11_gfx12<0x4a>;
+defm S_CMP_NGT_F32 : SOPC_Real_gfx11_gfx12<0x4b>;
+defm S_CMP_NLE_F32 : SOPC_Real_gfx11_gfx12<0x4c>;
+defm S_CMP_NEQ_F32 : SOPC_Real_gfx11_gfx12<0x4d>;
+defm S_CMP_NLT_F32 : SOPC_Real_gfx11_gfx12<0x4e>;
+
+defm S_CMP_LT_F16 : SOPC_Real_gfx11_gfx12<0x51>;
+defm S_CMP_EQ_F16 : SOPC_Real_gfx11_gfx12<0x52>;
+defm S_CMP_LE_F16 : SOPC_Real_gfx11_gfx12<0x53>;
+defm S_CMP_GT_F16 : SOPC_Real_gfx11_gfx12<0x54>;
+defm S_CMP_LG_F16 : SOPC_Real_gfx11_gfx12<0x55>;
+defm S_CMP_GE_F16 : SOPC_Real_gfx11_gfx12<0x56>;
+defm S_CMP_O_F16 : SOPC_Real_gfx11_gfx12<0x57>;
+defm S_CMP_U_F16 : SOPC_Real_gfx11_gfx12<0x58>;
+defm S_CMP_NGE_F16 : SOPC_Real_gfx11_gfx12<0x59>;
+defm S_CMP_NLG_F16 : SOPC_Real_gfx11_gfx12<0x5a>;
+defm S_CMP_NGT_F16 : SOPC_Real_gfx11_gfx12<0x5b>;
+defm S_CMP_NLE_F16 : SOPC_Real_gfx11_gfx12<0x5c>;
+defm S_CMP_NEQ_F16 : SOPC_Real_gfx11_gfx12<0x5d>;
+defm S_CMP_NLT_F16 : SOPC_Real_gfx11_gfx12<0x5e>;
//===----------------------------------------------------------------------===//
// SOPC - GFX6, GFX7, GFX8, GFX9, GFX10
@@ -2194,9 +2865,8 @@ class SOP1_Real_vi<bits<8> op, SOP1_Pseudo ps> :
SOP1_Real<op, ps>,
Select_vi<ps.Mnemonic>;
-
class SOP2_Real_vi<bits<7> op, SOP2_Pseudo ps> :
- SOP2_Real<op, ps>,
+ SOP2_Real32<op, ps>,
Select_vi<ps.Mnemonic>;
class SOPK_Real_vi<bits<5> op, SOPK_Pseudo ps> :
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index ce40d82021cf..23434d2de0fc 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -36,14 +36,15 @@ namespace SendMsg {
// Disable lint checking for this block since it makes the table unreadable.
// NOLINTBEGIN
+// clang-format off
const CustomOperand<const MCSubtargetInfo &> Msg[] = {
{{""}},
{{"MSG_INTERRUPT"}, ID_INTERRUPT},
{{"MSG_GS"}, ID_GS_PreGFX11, isNotGFX11Plus},
{{"MSG_GS_DONE"}, ID_GS_DONE_PreGFX11, isNotGFX11Plus},
{{"MSG_SAVEWAVE"}, ID_SAVEWAVE, isGFX8_GFX9_GFX10},
- {{"MSG_STALL_WAVE_GEN"}, ID_STALL_WAVE_GEN, isGFX9Plus},
- {{"MSG_HALT_WAVES"}, ID_HALT_WAVES, isGFX9Plus},
+ {{"MSG_STALL_WAVE_GEN"}, ID_STALL_WAVE_GEN, isGFX9_GFX10_GFX11},
+ {{"MSG_HALT_WAVES"}, ID_HALT_WAVES, isGFX9_GFX10_GFX11},
{{"MSG_ORDERED_PS_DONE"}, ID_ORDERED_PS_DONE, isGFX9_GFX10},
{{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10},
{{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus},
@@ -59,7 +60,9 @@ const CustomOperand<const MCSubtargetInfo &> Msg[] = {
{{"MSG_RTN_GET_REALTIME"}, ID_RTN_GET_REALTIME, isGFX11Plus},
{{"MSG_RTN_SAVE_WAVE"}, ID_RTN_SAVE_WAVE, isGFX11Plus},
{{"MSG_RTN_GET_TBA"}, ID_RTN_GET_TBA, isGFX11Plus},
+ {{"MSG_RTN_GET_SE_AID_ID"}, ID_RTN_GET_SE_AID_ID, isGFX12Plus},
};
+// clang-format on
// NOLINTEND
const int MSG_SIZE = static_cast<int>(
@@ -87,41 +90,56 @@ namespace Hwreg {
// Disable lint checking for this block since it makes the table unreadable.
// NOLINTBEGIN
+// clang-format off
const CustomOperand<const MCSubtargetInfo &> Opr[] = {
{{""}},
{{"HW_REG_MODE"}, ID_MODE},
{{"HW_REG_STATUS"}, ID_STATUS},
- {{"HW_REG_TRAPSTS"}, ID_TRAPSTS},
+ {{"HW_REG_TRAPSTS"}, ID_TRAPSTS, isNotGFX12Plus},
{{"HW_REG_HW_ID"}, ID_HW_ID, isNotGFX10Plus},
{{"HW_REG_GPR_ALLOC"}, ID_GPR_ALLOC},
{{"HW_REG_LDS_ALLOC"}, ID_LDS_ALLOC},
{{"HW_REG_IB_STS"}, ID_IB_STS},
{{""}},
{{""}},
+ {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx12, isGFX12Plus},
+ {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO_gfx12, isGFX12Plus},
+ {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI_gfx12, isGFX12Plus},
{{""}},
{{""}},
- {{""}},
- {{""}},
- {{""}},
- {{"HW_REG_SH_MEM_BASES"}, ID_MEM_BASES, isGFX9Plus},
+ {{"HW_REG_SH_MEM_BASES"}, ID_MEM_BASES, isGFX9_GFX10_GFX11},
{{"HW_REG_TBA_LO"}, ID_TBA_LO, isGFX9_GFX10},
{{"HW_REG_TBA_HI"}, ID_TBA_HI, isGFX9_GFX10},
{{"HW_REG_TMA_LO"}, ID_TMA_LO, isGFX9_GFX10},
{{"HW_REG_TMA_HI"}, ID_TMA_HI, isGFX9_GFX10},
- {{"HW_REG_FLAT_SCR_LO"}, ID_FLAT_SCR_LO, isGFX10Plus},
- {{"HW_REG_FLAT_SCR_HI"}, ID_FLAT_SCR_HI, isGFX10Plus},
+ {{"HW_REG_FLAT_SCR_LO"}, ID_FLAT_SCR_LO, isGFX10_GFX11},
+ {{"HW_REG_FLAT_SCR_HI"}, ID_FLAT_SCR_HI, isGFX10_GFX11},
{{"HW_REG_XNACK_MASK"}, ID_XNACK_MASK, isGFX10Before1030},
{{"HW_REG_HW_ID1"}, ID_HW_ID1, isGFX10Plus},
{{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus},
{{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10},
{{""}},
- {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA, isGFX11Plus},
+ {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA_gfx11, isGFX11},
{{""}},
- {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_BEncoding},
-
- // Register numbers reused in GFX11+
- {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO, isGFX11Plus},
- {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI, isGFX11Plus},
+ {{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_3_GFX11},
+ {{"HW_REG_SHADER_CYCLES_HI"}, ID_SHADER_CYCLES_HI, isGFX12Plus},
+ {{"HW_REG_DVGPR_ALLOC_LO"}, ID_DVGPR_ALLOC_LO, isGFX12Plus},
+ {{"HW_REG_DVGPR_ALLOC_HI"}, ID_DVGPR_ALLOC_HI, isGFX12Plus},
+
+ // Register numbers reused in GFX11
+ {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO_gfx11, isGFX11},
+ {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI_gfx11, isGFX11},
+
+ // Register numbers reused in GFX12+
+ {{"HW_REG_STATE_PRIV"}, ID_STATE_PRIV, isGFX12Plus},
+ {{"HW_REG_PERF_SNAPSHOT_DATA1"}, ID_PERF_SNAPSHOT_DATA1, isGFX12Plus},
+ {{"HW_REG_PERF_SNAPSHOT_DATA2"}, ID_PERF_SNAPSHOT_DATA2, isGFX12Plus},
+ {{"HW_REG_EXCP_FLAG_PRIV"}, ID_EXCP_FLAG_PRIV, isGFX12Plus},
+ {{"HW_REG_EXCP_FLAG_USER"}, ID_EXCP_FLAG_USER, isGFX12Plus},
+ {{"HW_REG_TRAP_CTRL"}, ID_TRAP_CTRL, isGFX12Plus},
+ {{"HW_REG_SCRATCH_BASE_LO"}, ID_FLAT_SCR_LO, isGFX12Plus},
+ {{"HW_REG_SCRATCH_BASE_HI"}, ID_FLAT_SCR_HI, isGFX12Plus},
+ {{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus},
// GFX940 specific registers
{{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940},
@@ -133,6 +151,7 @@ const CustomOperand<const MCSubtargetInfo &> Opr[] = {
// Aliases
{{"HW_REG_HW_ID"}, ID_HW_ID1, isGFX10},
};
+// clang-format on
// NOLINTEND
const int OPR_SIZE = static_cast<int>(
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 296ea18b2a8d..0f92a56237ac 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -12,7 +12,6 @@
#include "AMDKernelCodeT.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -119,15 +118,16 @@ namespace llvm {
namespace AMDGPU {
+/// \returns True if \p STI is AMDHSA.
+bool isHsaAbi(const MCSubtargetInfo &STI) {
+ return STI.getTargetTriple().getOS() == Triple::AMDHSA;
+}
+
std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
if (STI && STI->getTargetTriple().getOS() != Triple::AMDHSA)
return std::nullopt;
switch (AmdhsaCodeObjectVersion) {
- case 2:
- return ELF::ELFABIVERSION_AMDGPU_HSA_V2;
- case 3:
- return ELF::ELFABIVERSION_AMDGPU_HSA_V3;
case 4:
return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
case 5:
@@ -138,18 +138,6 @@ std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
}
}
-bool isHsaAbiVersion2(const MCSubtargetInfo *STI) {
- if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
- return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V2;
- return false;
-}
-
-bool isHsaAbiVersion3(const MCSubtargetInfo *STI) {
- if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
- return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V3;
- return false;
-}
-
bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4;
@@ -162,11 +150,6 @@ bool isHsaAbiVersion5(const MCSubtargetInfo *STI) {
return false;
}
-bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI) {
- return isHsaAbiVersion3(STI) || isHsaAbiVersion4(STI) ||
- isHsaAbiVersion5(STI);
-}
-
unsigned getAmdhsaCodeObjectVersion() {
return AmdhsaCodeObjectVersion;
}
@@ -183,8 +166,6 @@ unsigned getCodeObjectVersion(const Module &M) {
unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) {
switch (CodeObjectVersion) {
- case AMDHSA_COV2:
- case AMDHSA_COV3:
case AMDHSA_COV4:
return 48;
case AMDHSA_COV5:
@@ -198,8 +179,6 @@ unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) {
// central TD file.
unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) {
switch (CodeObjectVersion) {
- case AMDHSA_COV2:
- case AMDHSA_COV3:
case AMDHSA_COV4:
return 24;
case AMDHSA_COV5:
@@ -210,8 +189,6 @@ unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) {
unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) {
switch (CodeObjectVersion) {
- case AMDHSA_COV2:
- case AMDHSA_COV3:
case AMDHSA_COV4:
return 32;
case AMDHSA_COV5:
@@ -222,8 +199,6 @@ unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) {
unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) {
switch (CodeObjectVersion) {
- case AMDHSA_COV2:
- case AMDHSA_COV3:
case AMDHSA_COV4:
return 40;
case AMDHSA_COV5:
@@ -334,6 +309,7 @@ struct VOPDInfo {
uint16_t Opcode;
uint16_t OpX;
uint16_t OpY;
+ uint16_t Subtarget;
};
struct VOPTrue16Info {
@@ -468,6 +444,14 @@ bool getMAIIsGFX940XDL(unsigned Opc) {
return Info ? Info->is_gfx940_xdl : false;
}
+unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST) {
+ if (ST.hasFeature(AMDGPU::FeatureGFX12Insts))
+ return SIEncodingFamily::GFX12;
+ if (ST.hasFeature(AMDGPU::FeatureGFX11Insts))
+ return SIEncodingFamily::GFX11;
+ llvm_unreachable("Subtarget generation does not support VOPD!");
+}
+
CanBeVOPD getCanBeVOPD(unsigned Opc) {
const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc);
if (Info)
@@ -495,11 +479,13 @@ bool isMAC(unsigned Opc) {
Opc == AMDGPU::V_FMAC_F64_e64_gfx90a ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_F32_e64_gfx11 ||
+ Opc == AMDGPU::V_FMAC_F32_e64_gfx12 ||
Opc == AMDGPU::V_FMAC_F32_e64_vi ||
Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 ||
Opc == AMDGPU::V_FMAC_F16_e64_gfx10 ||
Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx11 ||
+ Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx12 ||
Opc == AMDGPU::V_DOT2C_F32_F16_e64_vi ||
Opc == AMDGPU::V_DOT2C_I32_I16_e64_vi ||
Opc == AMDGPU::V_DOT4C_I32_I8_e64_vi ||
@@ -510,7 +496,33 @@ bool isPermlane16(unsigned Opc) {
return Opc == AMDGPU::V_PERMLANE16_B32_gfx10 ||
Opc == AMDGPU::V_PERMLANEX16_B32_gfx10 ||
Opc == AMDGPU::V_PERMLANE16_B32_e64_gfx11 ||
- Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx11;
+ Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx11 ||
+ Opc == AMDGPU::V_PERMLANE16_B32_e64_gfx12 ||
+ Opc == AMDGPU::V_PERMLANEX16_B32_e64_gfx12 ||
+ Opc == AMDGPU::V_PERMLANE16_VAR_B32_e64_gfx12 ||
+ Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12;
+}
+
+bool isGenericAtomic(unsigned Opc) {
+ return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
+ Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP ||
+ Opc == AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG;
}
bool isTrue16Inst(unsigned Opc) {
@@ -535,8 +547,9 @@ int getMCOpcode(uint16_t Opcode, unsigned Gen) {
return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
}
-int getVOPDFull(unsigned OpX, unsigned OpY) {
- const VOPDInfo *Info = getVOPDInfoFromComponentOpcodes(OpX, OpY);
+int getVOPDFull(unsigned OpX, unsigned OpY, unsigned EncodingFamily) {
+ const VOPDInfo *Info =
+ getVOPDInfoFromComponentOpcodes(OpX, OpY, EncodingFamily);
return Info ? Info->Opcode : -1;
}
@@ -588,13 +601,15 @@ unsigned ComponentInfo::getIndexInParsedOperands(unsigned CompOprIdx) const {
}
std::optional<unsigned> InstInfo::getInvalidCompOperandIndex(
- std::function<unsigned(unsigned, unsigned)> GetRegIdx) const {
+ std::function<unsigned(unsigned, unsigned)> GetRegIdx, bool SkipSrc) const {
auto OpXRegs = getRegIndices(ComponentIndex::X, GetRegIdx);
auto OpYRegs = getRegIndices(ComponentIndex::Y, GetRegIdx);
+ const unsigned CompOprNum =
+ SkipSrc ? Component::DST_NUM : Component::MAX_OPR_NUM;
unsigned CompOprIdx;
- for (CompOprIdx = 0; CompOprIdx < Component::MAX_OPR_NUM; ++CompOprIdx) {
+ for (CompOprIdx = 0; CompOprIdx < CompOprNum; ++CompOprIdx) {
unsigned BanksMasks = VOPD_VGPR_BANK_MASKS[CompOprIdx];
if (OpXRegs[CompOprIdx] && OpYRegs[CompOprIdx] &&
((OpXRegs[CompOprIdx] & BanksMasks) ==
@@ -719,9 +734,9 @@ void AMDGPUTargetID::setTargetIDFromFeaturesString(StringRef FS) {
static TargetIDSetting
getTargetIDSettingFromFeatureString(StringRef FeatureString) {
- if (FeatureString.endswith("-"))
+ if (FeatureString.ends_with("-"))
return TargetIDSetting::Off;
- if (FeatureString.endswith("+"))
+ if (FeatureString.ends_with("+"))
return TargetIDSetting::On;
llvm_unreachable("Malformed feature string");
@@ -732,9 +747,9 @@ void AMDGPUTargetID::setTargetIDFromTargetIDStream(StringRef TargetID) {
TargetID.split(TargetIDSplit, ':');
for (const auto &FeatureString : TargetIDSplit) {
- if (FeatureString.startswith("xnack"))
+ if (FeatureString.starts_with("xnack"))
XnackSetting = getTargetIDSettingFromFeatureString(FeatureString);
- if (FeatureString.startswith("sramecc"))
+ if (FeatureString.starts_with("sramecc"))
SramEccSetting = getTargetIDSettingFromFeatureString(FeatureString);
}
}
@@ -765,63 +780,6 @@ std::string AMDGPUTargetID::toString() const {
std::string Features;
if (STI.getTargetTriple().getOS() == Triple::AMDHSA) {
switch (CodeObjectVersion) {
- case AMDGPU::AMDHSA_COV2:
- // Code object V2 only supported specific processors and had fixed
- // settings for the XNACK.
- if (Processor == "gfx600") {
- } else if (Processor == "gfx601") {
- } else if (Processor == "gfx602") {
- } else if (Processor == "gfx700") {
- } else if (Processor == "gfx701") {
- } else if (Processor == "gfx702") {
- } else if (Processor == "gfx703") {
- } else if (Processor == "gfx704") {
- } else if (Processor == "gfx705") {
- } else if (Processor == "gfx801") {
- if (!isXnackOnOrAny())
- report_fatal_error(
- "AMD GPU code object V2 does not support processor " +
- Twine(Processor) + " without XNACK");
- } else if (Processor == "gfx802") {
- } else if (Processor == "gfx803") {
- } else if (Processor == "gfx805") {
- } else if (Processor == "gfx810") {
- if (!isXnackOnOrAny())
- report_fatal_error(
- "AMD GPU code object V2 does not support processor " +
- Twine(Processor) + " without XNACK");
- } else if (Processor == "gfx900") {
- if (isXnackOnOrAny())
- Processor = "gfx901";
- } else if (Processor == "gfx902") {
- if (isXnackOnOrAny())
- Processor = "gfx903";
- } else if (Processor == "gfx904") {
- if (isXnackOnOrAny())
- Processor = "gfx905";
- } else if (Processor == "gfx906") {
- if (isXnackOnOrAny())
- Processor = "gfx907";
- } else if (Processor == "gfx90c") {
- if (isXnackOnOrAny())
- report_fatal_error(
- "AMD GPU code object V2 does not support processor " +
- Twine(Processor) + " with XNACK being ON or ANY");
- } else {
- report_fatal_error(
- "AMD GPU code object V2 does not support processor " +
- Twine(Processor));
- }
- break;
- case AMDGPU::AMDHSA_COV3:
- // xnack.
- if (isXnackOnOrAny())
- Features += "+xnack";
- // In code object v2 and v3, "sramecc" feature was spelled with a
- // hyphen ("sram-ecc").
- if (isSramEccOnOrAny())
- Features += "+sram-ecc";
- break;
case AMDGPU::AMDHSA_COV4:
case AMDGPU::AMDHSA_COV5:
// sramecc.
@@ -1191,10 +1149,17 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
amdhsa::COMPUTE_PGM_RSRC1_FLOAT_DENORM_MODE_16_64,
amdhsa::FLOAT_DENORM_MODE_FLUSH_NONE);
- AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_ENABLE_DX10_CLAMP, 1);
- AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_ENABLE_IEEE_MODE, 1);
+ if (Version.Major >= 12) {
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_ENABLE_WG_RR_EN, 0);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_GFX12_PLUS_DISABLE_PERF, 0);
+ } else {
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_DX10_CLAMP, 1);
+ AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
+ amdhsa::COMPUTE_PGM_RSRC1_GFX6_GFX11_ENABLE_IEEE_MODE, 1);
+ }
AMDHSA_BITS_SET(KD.compute_pgm_rsrc2,
amdhsa::COMPUTE_PGM_RSRC2_ENABLE_SGPR_WORKGROUP_ID_X, 1);
if (Version.Major >= 10) {
@@ -1202,10 +1167,10 @@ amdhsa::kernel_descriptor_t getDefaultAmdhsaKernelDescriptor(
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32,
STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1 : 0);
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_WGP_MODE,
+ amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_WGP_MODE,
STI->getFeatureBits().test(FeatureCuMode) ? 0 : 1);
AMDHSA_BITS_SET(KD.compute_pgm_rsrc1,
- amdhsa::COMPUTE_PGM_RSRC1_MEM_ORDERED, 1);
+ amdhsa::COMPUTE_PGM_RSRC1_GFX10_PLUS_MEM_ORDERED, 1);
}
if (AMDGPU::isGFX90A(*STI)) {
AMDHSA_BITS_SET(KD.compute_pgm_rsrc3,
@@ -1638,7 +1603,7 @@ unsigned getTgtId(const StringRef Name) {
if (Val.MaxIndex == 0 && Name == Val.Name)
return Val.Tgt;
- if (Val.MaxIndex > 0 && Name.startswith(Val.Name)) {
+ if (Val.MaxIndex > 0 && Name.starts_with(Val.Name)) {
StringRef Suffix = Name.drop_front(Val.Name.size());
unsigned Id;
@@ -1931,6 +1896,8 @@ bool isShader(CallingConv::ID cc) {
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
case CallingConv::AMDGPU_CS:
return true;
default:
@@ -1968,7 +1935,17 @@ bool isModuleEntryFunctionCC(CallingConv::ID CC) {
case CallingConv::AMDGPU_Gfx:
return true;
default:
- return isEntryFunctionCC(CC);
+ return isEntryFunctionCC(CC) || isChainCC(CC);
+ }
+}
+
+bool isChainCC(CallingConv::ID CC) {
+ switch (CC) {
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
+ return true;
+ default:
+ return false;
}
}
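
Aside (not part of the diff): the hunk introduces isChainCC for the two chain calling conventions and folds it into the module-entry check. A standalone sketch with a hypothetical enum standing in for llvm::CallingConv, only to show how the predicates compose:

```cpp
#include <cassert>

enum class CC { C, AMDGPU_KERNEL, AMDGPU_Gfx, AMDGPU_CS_Chain, AMDGPU_CS_ChainPreserve };

static bool isChainCC(CC C) {
  return C == CC::AMDGPU_CS_Chain || C == CC::AMDGPU_CS_ChainPreserve;
}

static bool isEntryFunctionCC(CC C) {
  // Simplified stand-in: only kernels count as entry functions here.
  return C == CC::AMDGPU_KERNEL;
}

static bool isModuleEntryFunctionCC(CC C) {
  // As in the hunk: explicit cases first, then entry or chain CCs.
  if (C == CC::AMDGPU_Gfx)
    return true;
  return isEntryFunctionCC(C) || isChainCC(C);
}

int main() {
  assert(isChainCC(CC::AMDGPU_CS_ChainPreserve));
  assert(isModuleEntryFunctionCC(CC::AMDGPU_CS_Chain));
  assert(!isModuleEntryFunctionCC(CC::C));
  return 0;
}
```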
@@ -2001,15 +1978,23 @@ bool hasPackedD16(const MCSubtargetInfo &STI) {
!isSI(STI);
}
-unsigned getNSAMaxSize(const MCSubtargetInfo &STI) {
+bool hasGDS(const MCSubtargetInfo &STI) {
+ return STI.hasFeature(AMDGPU::FeatureGDS);
+}
+
+unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler) {
auto Version = getIsaVersion(STI.getCPU());
if (Version.Major == 10)
return Version.Minor >= 3 ? 13 : 5;
if (Version.Major == 11)
return 5;
+ if (Version.Major >= 12)
+ return HasSampler ? 4 : 5;
return 0;
}
+unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI) { return 16; }
+
bool isSI(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureSouthernIslands);
}
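
Aside (not part of the diff): getNSAMaxSize now takes HasSampler and adds a GFX12 case. A standalone restatement of the limits encoded above, with ISA detection reduced to explicit major/minor numbers for illustration:

```cpp
#include <cassert>

static unsigned nsaMaxSize(unsigned Major, unsigned Minor, bool HasSampler) {
  if (Major == 10)
    return Minor >= 3 ? 13 : 5;
  if (Major == 11)
    return 5;
  if (Major >= 12)
    return HasSampler ? 4 : 5;
  return 0; // NSA encoding is not available before GFX10.
}

int main() {
  assert(nsaMaxSize(10, 1, /*HasSampler=*/false) == 5);
  assert(nsaMaxSize(10, 3, /*HasSampler=*/false) == 13);
  assert(nsaMaxSize(12, 0, /*HasSampler=*/true) == 4);
  assert(nsaMaxSize(9, 0, /*HasSampler=*/false) == 0);
  return 0;
}
```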
@@ -2030,6 +2015,10 @@ bool isGFX9_GFX10(const MCSubtargetInfo &STI) {
return isGFX9(STI) || isGFX10(STI);
}
+bool isGFX9_GFX10_GFX11(const MCSubtargetInfo &STI) {
+ return isGFX9(STI) || isGFX10(STI) || isGFX11(STI);
+}
+
bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI) {
return isVI(STI) || isGFX9(STI) || isGFX10(STI);
}
@@ -2046,6 +2035,10 @@ bool isGFX10(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureGFX10);
}
+bool isGFX10_GFX11(const MCSubtargetInfo &STI) {
+ return isGFX10(STI) || isGFX11(STI);
+}
+
bool isGFX10Plus(const MCSubtargetInfo &STI) {
return isGFX10(STI) || isGFX11Plus(STI);
}
@@ -2055,9 +2048,17 @@ bool isGFX11(const MCSubtargetInfo &STI) {
}
bool isGFX11Plus(const MCSubtargetInfo &STI) {
- return isGFX11(STI);
+ return isGFX11(STI) || isGFX12Plus(STI);
}
+bool isGFX12(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX12];
+}
+
+bool isGFX12Plus(const MCSubtargetInfo &STI) { return isGFX12(STI); }
+
+bool isNotGFX12Plus(const MCSubtargetInfo &STI) { return !isGFX12Plus(STI); }
+
bool isNotGFX11Plus(const MCSubtargetInfo &STI) {
return !isGFX11Plus(STI);
}
@@ -2086,6 +2087,10 @@ bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureGFX10_3Insts);
}
+bool isGFX10_3_GFX11(const MCSubtargetInfo &STI) {
+ return isGFX10_BEncoding(STI) && !isGFX12Plus(STI);
+}
+
bool isGFX90A(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureGFX90AInsts);
}
@@ -2106,6 +2111,14 @@ bool hasVOPD(const MCSubtargetInfo &STI) {
return STI.hasFeature(AMDGPU::FeatureVOPD);
}
+bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI) {
+ return STI.hasFeature(AMDGPU::FeatureDPPSrc1SGPR);
+}
+
+unsigned hasKernargPreload(const MCSubtargetInfo &STI) {
+ return STI.hasFeature(AMDGPU::FeatureKernargPreload);
+}
+
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR,
int32_t ArgNumVGPR) {
if (has90AInsts && ArgNumAGPR)
@@ -2120,6 +2133,10 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
Reg == AMDGPU::SCC;
}
+bool isHi(unsigned Reg, const MCRegisterInfo &MRI) {
+ return MRI.getEncodingValue(Reg) & AMDGPU::HWEncoding::IS_HI;
+}
+
#define MAP_REG2REG \
using namespace AMDGPU; \
switch(Reg) { \
@@ -2250,16 +2267,13 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
@@ -2272,8 +2286,10 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
assert(OpNo < Desc.NumOperands);
unsigned OpType = Desc.operands()[OpNo].OperandType;
- return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
- OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
+ return (OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
+ OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST) ||
+ (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
+ OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST);
}
// Avoid using MCRegisterClass::getSize, since that function will go away
@@ -2423,10 +2439,6 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) {
return getRegBitWidth(RC.getID());
}
-unsigned getRegBitWidth(const TargetRegisterClass &RC) {
- return getRegBitWidth(RC.getID());
-}
-
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo) {
assert(OpNo < Desc.NumOperands);
@@ -2522,6 +2534,16 @@ bool isInlinableIntLiteralV216(int32_t Literal) {
return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
}
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType) {
+ switch (OpType) {
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ return isInlinableLiteralV216(Literal, HasInv2Pi);
+ default:
+ return isInlinableIntLiteralV216(Literal);
+ }
+}
+
bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
assert(HasInv2Pi);
@@ -2535,6 +2557,13 @@ bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi) {
return Lo16 == Hi16;
}
+bool isValid32BitLiteral(uint64_t Val, bool IsFP64) {
+ if (IsFP64)
+ return !(Val & 0xffffffffu);
+
+ return isUInt<32>(Val) || isInt<32>(Val);
+}
+
bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
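
Aside (not part of the diff): the new isValid32BitLiteral above encodes that a 64-bit FP operand only takes a 32-bit literal whose low 32 bits are zero (the literal supplies the high half), while everything else merely has to fit in 32 bits, signed or unsigned. A standalone sketch of that rule, with a hand-rolled stand-in for isUInt<32>/isInt<32>:

```cpp
#include <cassert>
#include <cstdint>
#include <limits>

static bool fitsIn32Bits(uint64_t Val) {
  // isUInt<32>(Val) || isInt<32>(Val), spelled out.
  int64_t S = static_cast<int64_t>(Val);
  return Val <= std::numeric_limits<uint32_t>::max() ||
         (S >= std::numeric_limits<int32_t>::min() &&
          S <= std::numeric_limits<int32_t>::max());
}

static bool isValid32BitLiteral(uint64_t Val, bool IsFP64) {
  if (IsFP64)
    return (Val & 0xffffffffu) == 0;
  return fitsIn32Bits(Val);
}

int main() {
  assert(isValid32BitLiteral(0x3ff0000000000000ull, /*IsFP64=*/true));  // 1.0
  assert(!isValid32BitLiteral(0x3ff0000000000001ull, /*IsFP64=*/true)); // dirty low half
  assert(isValid32BitLiteral(0xffffffffull, /*IsFP64=*/false));
  assert(isValid32BitLiteral(static_cast<uint64_t>(-1), /*IsFP64=*/false)); // isInt<32>(-1)
  return 0;
}
```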
@@ -2552,13 +2581,15 @@ bool isArgPassedInSGPR(const Argument *A) {
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
// For non-compute shaders, SGPR inputs are marked with either inreg or
// byval. Everything else is in VGPRs.
return A->hasAttribute(Attribute::InReg) ||
A->hasAttribute(Attribute::ByVal);
default:
- // TODO: Should calls support inreg for SGPR inputs?
- return false;
+ // TODO: treat i1 as divergent?
+ return A->hasAttribute(Attribute::InReg);
}
}
@@ -2577,13 +2608,14 @@ bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) {
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_Gfx:
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
// For non-compute shaders, SGPR inputs are marked with either inreg or
// byval. Everything else is in VGPRs.
return CB->paramHasAttr(ArgNo, Attribute::InReg) ||
CB->paramHasAttr(ArgNo, Attribute::ByVal);
default:
- // TODO: Should calls support inreg for SGPR inputs?
- return false;
+ return CB->paramHasAttr(ArgNo, Attribute::InReg);
}
}
@@ -2597,6 +2629,9 @@ static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
int64_t EncodedOffset) {
+ if (isGFX12Plus(ST))
+ return isUInt<23>(EncodedOffset);
+
return hasSMEMByteOffset(ST) ? isUInt<20>(EncodedOffset)
: isUInt<8>(EncodedOffset);
}
@@ -2604,6 +2639,9 @@ bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST,
int64_t EncodedOffset,
bool IsBuffer) {
+ if (isGFX12Plus(ST))
+ return isInt<24>(EncodedOffset);
+
return !IsBuffer &&
hasSMRDSignedImmOffset(ST) &&
isInt<21>(EncodedOffset);
@@ -2624,6 +2662,10 @@ uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST,
std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
int64_t ByteOffset, bool IsBuffer) {
+ if (isGFX12Plus(ST)) // 24 bit signed offsets
+ return isInt<24>(ByteOffset) ? std::optional<int64_t>(ByteOffset)
+ : std::nullopt;
+
// The signed version is always a byte offset.
if (!IsBuffer && hasSMRDSignedImmOffset(ST)) {
assert(hasSMEMByteOffset(ST));
@@ -2651,10 +2693,11 @@ std::optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
}
unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST) {
- // Address offset is 12-bit signed for GFX10, 13-bit for GFX9 and GFX11+.
if (AMDGPU::isGFX10(ST))
return 12;
+ if (AMDGPU::isGFX12(ST))
+ return 24;
return 13;
}
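
Aside (not part of the diff): per the hunk above, the flat signed-offset field is 12 bits on GFX10, 24 bits on GFX12, and 13 bits otherwise. A minimal sketch of the legality check those widths imply; generation detection is reduced to an enum and the signed-range math is spelled out, so this is illustrative only:

```cpp
#include <cassert>
#include <cstdint>

enum class Gen { GFX9, GFX10, GFX11, GFX12 };

static unsigned numFlatOffsetBits(Gen G) {
  if (G == Gen::GFX10)
    return 12;
  if (G == Gen::GFX12)
    return 24;
  return 13;
}

static bool isLegalSignedFlatOffset(int64_t Offset, Gen G) {
  // A signed N-bit field covers [-2^(N-1), 2^(N-1) - 1].
  unsigned N = numFlatOffsetBits(G);
  int64_t Lo = -(int64_t(1) << (N - 1));
  int64_t Hi = (int64_t(1) << (N - 1)) - 1;
  return Offset >= Lo && Offset <= Hi;
}

int main() {
  assert(isLegalSignedFlatOffset(4095, Gen::GFX11));    // fits in 13 bits
  assert(!isLegalSignedFlatOffset(4096, Gen::GFX10));   // 12-bit field
  assert(isLegalSignedFlatOffset(1 << 20, Gen::GFX12)); // 24-bit field
  return 0;
}
```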
@@ -2707,6 +2750,25 @@ const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t Format,
: getGfx9BufferFormatInfo(Format);
}
+bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc) {
+ for (auto OpName : { OpName::vdst, OpName::src0, OpName::src1,
+ OpName::src2 }) {
+ int Idx = getNamedOperandIdx(OpDesc.getOpcode(), OpName);
+ if (Idx == -1)
+ continue;
+
+ if (OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64RegClassID ||
+ OpDesc.operands()[Idx].RegClass == AMDGPU::VReg_64_Align2RegClassID)
+ return true;
+ }
+
+ return false;
+}
+
+bool isDPALU_DPP(const MCInstrDesc &OpDesc) {
+ return hasAny64BitVGPROperands(OpDesc);
+}
+
} // namespace AMDGPU
raw_ostream &operator<<(raw_ostream &OS,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index bdf7ccad9c76..3c9f330cbcde 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -31,7 +31,6 @@ class MCRegisterClass;
class MCRegisterInfo;
class MCSubtargetInfo;
class StringRef;
-class TargetRegisterClass;
class Triple;
class raw_ostream;
@@ -43,30 +42,18 @@ namespace AMDGPU {
struct IsaVersion;
-enum {
- AMDHSA_COV2 = 2,
- AMDHSA_COV3 = 3,
- AMDHSA_COV4 = 4,
- AMDHSA_COV5 = 5
-};
+enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5 };
+/// \returns True if \p STI is AMDHSA.
+bool isHsaAbi(const MCSubtargetInfo &STI);
/// \returns HSA OS ABI Version identification.
std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI);
-/// \returns True if HSA OS ABI Version identification is 2,
-/// false otherwise.
-bool isHsaAbiVersion2(const MCSubtargetInfo *STI);
-/// \returns True if HSA OS ABI Version identification is 3,
-/// false otherwise.
-bool isHsaAbiVersion3(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 4,
/// false otherwise.
bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 5,
/// false otherwise.
bool isHsaAbiVersion5(const MCSubtargetInfo *STI);
-/// \returns True if HSA OS ABI Version identification is 3 and above,
-/// false otherwise.
-bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI);
/// \returns The offset of the multigrid_sync_arg argument from implicitarg_ptr
unsigned getMultigridSyncArgImplicitArgPosition(unsigned COV);
@@ -518,6 +505,10 @@ struct CanBeVOPD {
bool Y;
};
+/// \returns SIEncodingFamily used for VOPD encoding on a \p ST.
+LLVM_READONLY
+unsigned getVOPDEncodingFamily(const MCSubtargetInfo &ST);
+
LLVM_READONLY
CanBeVOPD getCanBeVOPD(unsigned Opc);
@@ -537,7 +528,7 @@ LLVM_READONLY
unsigned getVOPDOpcode(unsigned Opc);
LLVM_READONLY
-int getVOPDFull(unsigned OpX, unsigned OpY);
+int getVOPDFull(unsigned OpX, unsigned OpY, unsigned EncodingFamily);
LLVM_READONLY
bool isVOPD(unsigned Opc);
@@ -548,6 +539,9 @@ bool isMAC(unsigned Opc);
LLVM_READNONE
bool isPermlane16(unsigned Opc);
+LLVM_READNONE
+bool isGenericAtomic(unsigned Opc);
+
namespace VOPD {
enum Component : unsigned {
@@ -757,15 +751,20 @@ public:
// GetRegIdx(Component, MCOperandIdx) must return a VGPR register index
// for the specified component and MC operand. The callback must return 0
// if the operand is not a register or not a VGPR.
- bool hasInvalidOperand(
- std::function<unsigned(unsigned, unsigned)> GetRegIdx) const {
- return getInvalidCompOperandIndex(GetRegIdx).has_value();
+ // If \p SkipSrc is set to true then constraints for source operands are not
+ // checked.
+ bool hasInvalidOperand(std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+ bool SkipSrc = false) const {
+ return getInvalidCompOperandIndex(GetRegIdx, SkipSrc).has_value();
}
// Check VOPD operands constraints.
// Return the index of an invalid component operand, if any.
+ // If \p SkipSrc is set to true then constraints for source operands are not
+ // checked.
std::optional<unsigned> getInvalidCompOperandIndex(
- std::function<unsigned(unsigned, unsigned)> GetRegIdx) const;
+ std::function<unsigned(unsigned, unsigned)> GetRegIdx,
+ bool SkipSrc = false) const;
private:
RegIndices
@@ -1121,6 +1120,9 @@ bool isEntryFunctionCC(CallingConv::ID CC);
LLVM_READNONE
bool isModuleEntryFunctionCC(CallingConv::ID CC);
+LLVM_READNONE
+bool isChainCC(CallingConv::ID CC);
+
bool isKernelCC(const Function *Func);
// FIXME: Remove this when calling conventions cleaned up
@@ -1141,37 +1143,51 @@ bool hasMIMG_R128(const MCSubtargetInfo &STI);
bool hasA16(const MCSubtargetInfo &STI);
bool hasG16(const MCSubtargetInfo &STI);
bool hasPackedD16(const MCSubtargetInfo &STI);
-unsigned getNSAMaxSize(const MCSubtargetInfo &STI);
+bool hasGDS(const MCSubtargetInfo &STI);
+unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler = false);
+unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI);
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
bool isGFX9_GFX10(const MCSubtargetInfo &STI);
+bool isGFX9_GFX10_GFX11(const MCSubtargetInfo &STI);
bool isGFX8_GFX9_GFX10(const MCSubtargetInfo &STI);
bool isGFX8Plus(const MCSubtargetInfo &STI);
bool isGFX9Plus(const MCSubtargetInfo &STI);
bool isGFX10(const MCSubtargetInfo &STI);
+bool isGFX10_GFX11(const MCSubtargetInfo &STI);
bool isGFX10Plus(const MCSubtargetInfo &STI);
bool isNotGFX10Plus(const MCSubtargetInfo &STI);
bool isGFX10Before1030(const MCSubtargetInfo &STI);
bool isGFX11(const MCSubtargetInfo &STI);
bool isGFX11Plus(const MCSubtargetInfo &STI);
+bool isGFX12(const MCSubtargetInfo &STI);
+bool isGFX12Plus(const MCSubtargetInfo &STI);
+bool isNotGFX12Plus(const MCSubtargetInfo &STI);
bool isNotGFX11Plus(const MCSubtargetInfo &STI);
bool isGCN3Encoding(const MCSubtargetInfo &STI);
bool isGFX10_AEncoding(const MCSubtargetInfo &STI);
bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
+bool isGFX10_3_GFX11(const MCSubtargetInfo &STI);
bool isGFX90A(const MCSubtargetInfo &STI);
bool isGFX940(const MCSubtargetInfo &STI);
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI);
bool hasMAIInsts(const MCSubtargetInfo &STI);
bool hasVOPD(const MCSubtargetInfo &STI);
+bool hasDPPSrc1SGPR(const MCSubtargetInfo &STI);
int getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR, int32_t ArgNumVGPR);
+unsigned hasKernargPreload(const MCSubtargetInfo &STI);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
+/// \returns true if \p Reg occupies the high 16 bits of a 32-bit register.
+/// The bit indicating isHi is the LSB of the encoding.
+bool isHi(unsigned Reg, const MCRegisterInfo &MRI);
+
/// If \p Reg is a pseudo reg, return the correct hardware register given
/// \p STI otherwise return \p Reg.
unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
@@ -1202,9 +1218,6 @@ unsigned getRegBitWidth(unsigned RCID);
/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC);
-/// Get the size in bits of a register from the register class \p RC.
-unsigned getRegBitWidth(const TargetRegisterClass &RC);
-
/// Get size of register operand
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo);
@@ -1225,6 +1238,7 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
case AMDGPU::OPERAND_KIMM32:
case AMDGPU::OPERAND_KIMM16: // mandatory literal is always size 4
+ case AMDGPU::OPERAND_INLINE_SPLIT_BARRIER_INT32:
return 4;
case AMDGPU::OPERAND_REG_IMM_INT64:
@@ -1283,8 +1297,14 @@ LLVM_READNONE
bool isInlinableIntLiteralV216(int32_t Literal);
LLVM_READNONE
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi, uint8_t OpType);
+
+LLVM_READNONE
bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
+LLVM_READNONE
+bool isValid32BitLiteral(uint64_t Val, bool IsFP64);
+
bool isArgPassedInSGPR(const Argument *Arg);
bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);
@@ -1314,7 +1334,7 @@ std::optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
std::optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
int64_t ByteOffset);
-/// For FLAT segment the offset must be positive;
+/// For pre-GFX12 FLAT instructions the offset must be positive;
/// MSB is ignored and forced to zero.
///
/// \return The number of bits available for the signed offset field in flat
@@ -1328,10 +1348,16 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST);
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
LLVM_READNONE
-inline bool isLegal64BitDPPControl(unsigned DC) {
+inline bool isLegalDPALU_DPPControl(unsigned DC) {
return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
}
+/// \returns true if an instruction may have a 64-bit VGPR operand.
+bool hasAny64BitVGPROperands(const MCInstrDesc &OpDesc);
+
+/// \returns true if an instruction is a DP ALU DPP.
+bool isDPALU_DPP(const MCInstrDesc &OpDesc);
+
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index cbdbf1c16f9f..25e628e5cbc5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -74,6 +74,16 @@ bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_s_barrier:
+ case Intrinsic::amdgcn_s_barrier_signal:
+ case Intrinsic::amdgcn_s_barrier_signal_var:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst:
+ case Intrinsic::amdgcn_s_barrier_signal_isfirst_var:
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_barrier_wait:
+ case Intrinsic::amdgcn_s_barrier_leave:
+ case Intrinsic::amdgcn_s_get_barrier_state:
+ case Intrinsic::amdgcn_s_wakeup_barrier:
case Intrinsic::amdgcn_wave_barrier:
case Intrinsic::amdgcn_sched_barrier:
case Intrinsic::amdgcn_sched_group_barrier:
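
Aside (not part of the diff): this hunk adds the new split/named-barrier intrinsics to the same case list that already carries amdgcn_s_barrier inside isReallyAClobber; the shared return sits outside the hunk, so the exact handling is not visible here, but the pattern is an ID-based filter for barrier-style intrinsics. A hedged, standalone sketch of such a filter, with illustrative names and a string set instead of intrinsic IDs:

```cpp
#include <cassert>
#include <string>
#include <unordered_set>

static bool isBarrierLikeIntrinsic(const std::string &Name) {
  static const std::unordered_set<std::string> Barriers = {
      "amdgcn.s.barrier",      "amdgcn.s.barrier.signal",
      "amdgcn.s.barrier.wait", "amdgcn.s.barrier.init",
      "amdgcn.s.barrier.join", "amdgcn.s.barrier.leave",
      "amdgcn.wave.barrier",   "amdgcn.sched.barrier",
  };
  return Barriers.count(Name) != 0;
}

int main() {
  assert(isBarrierLikeIntrinsic("amdgcn.s.barrier.signal"));
  assert(!isBarrierLikeIntrinsic("amdgcn.raw.buffer.store"));
  return 0;
}
```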
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index df37c420fa72..e42b27f8e09e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -9,19 +9,15 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUMEMORYUTILS_H
-#include <vector>
-
namespace llvm {
struct Align;
class AAResults;
class DataLayout;
-class Function;
class GlobalVariable;
class LoadInst;
class MemoryDef;
class MemorySSA;
-class Module;
class Value;
namespace AMDGPU {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index a92d574b1848..0fa67c559cb2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -18,7 +18,6 @@
#include "AMDGPUPTNote.h"
#include "SIDefines.h"
#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/AMDGPUMetadata.h"
@@ -84,7 +83,6 @@ bool AMDGPUPALMetadata::setFromLegacyBlob(StringRef Blob) {
// Set PAL metadata from msgpack blob.
bool AMDGPUPALMetadata::setFromMsgPackBlob(StringRef Blob) {
- msgpack::Reader Reader(Blob);
return MsgPackDoc.readFromBlob(Blob, /*Multi=*/false);
}
@@ -242,30 +240,29 @@ void AMDGPUPALMetadata::setScratchSize(CallingConv::ID CC, unsigned Val) {
}
// Set the stack frame size of a function in the metadata.
-void AMDGPUPALMetadata::setFunctionScratchSize(const MachineFunction &MF,
- unsigned Val) {
- auto Node = getShaderFunction(MF.getFunction().getName());
+void AMDGPUPALMetadata::setFunctionScratchSize(StringRef FnName, unsigned Val) {
+ auto Node = getShaderFunction(FnName);
Node[".stack_frame_size_in_bytes"] = MsgPackDoc.getNode(Val);
+ Node[".backend_stack_size"] = MsgPackDoc.getNode(Val);
}
// Set the amount of LDS used in bytes in the metadata.
-void AMDGPUPALMetadata::setFunctionLdsSize(const MachineFunction &MF,
- unsigned Val) {
- auto Node = getShaderFunction(MF.getFunction().getName());
+void AMDGPUPALMetadata::setFunctionLdsSize(StringRef FnName, unsigned Val) {
+ auto Node = getShaderFunction(FnName);
Node[".lds_size"] = MsgPackDoc.getNode(Val);
}
// Set the number of used vgprs in the metadata.
-void AMDGPUPALMetadata::setFunctionNumUsedVgprs(const MachineFunction &MF,
+void AMDGPUPALMetadata::setFunctionNumUsedVgprs(StringRef FnName,
unsigned Val) {
- auto Node = getShaderFunction(MF.getFunction().getName());
+ auto Node = getShaderFunction(FnName);
Node[".vgpr_count"] = MsgPackDoc.getNode(Val);
}
// Set the number of used vgprs in the metadata.
-void AMDGPUPALMetadata::setFunctionNumUsedSgprs(const MachineFunction &MF,
+void AMDGPUPALMetadata::setFunctionNumUsedSgprs(StringRef FnName,
unsigned Val) {
- auto Node = getShaderFunction(MF.getFunction().getName());
+ auto Node = getShaderFunction(FnName);
Node[".sgpr_count"] = MsgPackDoc.getNode(Val);
}
@@ -726,7 +723,7 @@ void AMDGPUPALMetadata::toLegacyBlob(std::string &Blob) {
if (Registers.getMap().empty())
return;
raw_string_ostream OS(Blob);
- support::endian::Writer EW(OS, support::endianness::little);
+ support::endian::Writer EW(OS, llvm::endianness::little);
for (auto I : Registers.getMap()) {
EW.write(uint32_t(I.first.getUInt()));
EW.write(uint32_t(I.second.getUInt()));
@@ -911,6 +908,7 @@ void AMDGPUPALMetadata::reset() {
MsgPackDoc.clear();
Registers = MsgPackDoc.getEmptyNode();
HwStages = MsgPackDoc.getEmptyNode();
+ ShaderFunctions = MsgPackDoc.getEmptyNode();
}
unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index e477904cb81f..158f766d0485 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -17,7 +17,6 @@
namespace llvm {
-class MachineFunction;
class Module;
class StringRef;
@@ -87,22 +86,22 @@ public:
void setScratchSize(unsigned CC, unsigned Val);
// Set the stack frame size of a function in the metadata.
- void setFunctionScratchSize(const MachineFunction &MF, unsigned Val);
+ void setFunctionScratchSize(StringRef FnName, unsigned Val);
// Set the amount of LDS used in bytes in the metadata. This is an optional
// advisory record for logging etc; wave dispatch actually uses the rsrc1
// register for the shader stage to determine the amount of LDS to allocate.
- void setFunctionLdsSize(const MachineFunction &MF, unsigned Val);
+ void setFunctionLdsSize(StringRef FnName, unsigned Val);
// Set the number of used vgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of vgprs to allocate.
- void setFunctionNumUsedVgprs(const MachineFunction &MF, unsigned Val);
+ void setFunctionNumUsedVgprs(StringRef FnName, unsigned Val);
// Set the number of used sgprs in the metadata. This is an optional advisory
// record for logging etc; wave dispatch actually uses the rsrc1 register for
// the shader stage to determine the number of sgprs to allocate.
- void setFunctionNumUsedSgprs(const MachineFunction &MF, unsigned Val);
+ void setFunctionNumUsedSgprs(StringRef FnName, unsigned Val);
// Set the hardware register bit in PAL metadata to enable wave32 on the
// shader of the given calling convention.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index 7d03150bf5b1..fc563b7493ad 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -10,7 +10,7 @@
// VINTERP encoding
//===----------------------------------------------------------------------===//
-class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 {
+class VINTERPe <VOPProfile P> : Enc64 {
bits<8> vdst;
bits<4> src0_modifiers;
bits<9> src0;
@@ -31,7 +31,6 @@ class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 {
let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2)
let Inst{14} = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel(3)
let Inst{15} = clamp;
- let Inst{22-16} = op;
let Inst{40-32} = src0;
let Inst{49-41} = src1;
let Inst{58-50} = src2;
@@ -40,6 +39,14 @@ class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 {
let Inst{63} = src2_modifiers{0}; // neg(2)
}
+class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : VINTERPe<P> {
+ let Inst{22-16} = op;
+}
+
+class VINTERPe_gfx12 <bits<7> op, VOPProfile P> : VINTERPe<P> {
+ let Inst{20-16} = op{4-0};
+}
+
//===----------------------------------------------------------------------===//
// VOP3 VINTERP
//===----------------------------------------------------------------------===//
@@ -171,17 +178,28 @@ defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_f16,
// VINTERP Real Instructions
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11" in {
- multiclass VINTERP_Real_gfx11 <bits<7> op> {
+multiclass VINTERP_Real_gfx11 <bits<7> op> {
+ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
def _gfx11 :
VINTERP_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX11>,
VINTERPe_gfx11<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
}
}
-defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11<0x000>;
-defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11<0x001>;
-defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11<0x002>;
-defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11<0x003>;
-defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x004>;
-defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11<0x005>;
+multiclass VINTERP_Real_gfx12 <bits<7> op> {
+ let AssemblerPredicate = isGFX12Only, DecoderNamespace = "GFX12" in {
+ def _gfx12 :
+ VINTERP_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.GFX12>,
+ VINTERPe_gfx12<op, !cast<VOP3_Pseudo>(NAME).Pfl>;
+ }
+}
+
+multiclass VINTERP_Real_gfx11_gfx12 <bits<7> op> :
+ VINTERP_Real_gfx11<op>, VINTERP_Real_gfx12<op>;
+
+defm V_INTERP_P10_F32_inreg : VINTERP_Real_gfx11_gfx12<0x000>;
+defm V_INTERP_P2_F32_inreg : VINTERP_Real_gfx11_gfx12<0x001>;
+defm V_INTERP_P10_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x002>;
+defm V_INTERP_P2_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x003>;
+defm V_INTERP_P10_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x004>;
+defm V_INTERP_P2_RTZ_F16_F32_inreg : VINTERP_Real_gfx11_gfx12<0x005>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 1a8efc6e3df2..27a7c29cb1ac 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -88,6 +88,12 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
let TRANS = ps.TRANS;
}
+class VOP1_Real_Gen <VOP1_Pseudo ps, GFXGen Gen, string real_name = ps.Mnemonic> :
+ VOP1_Real <ps, Gen.Subtarget, real_name> {
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ let DecoderNamespace = Gen.DecoderNamespace;
+}
+
class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
VOP_SDWA_Pseudo <OpName, P, pattern> {
let AsmMatchConverter = "cvtSdwaVOP1";
@@ -152,7 +158,7 @@ multiclass VOP1Inst_t16<string opName,
defm NAME : VOP1Inst<opName, P, node>;
}
let OtherPredicates = [HasTrue16BitInsts] in {
- defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_True16<P>, node>;
+ defm _t16 : VOP1Inst<opName#"_t16", VOPProfile_Fake16<P>, node>;
}
}
@@ -170,7 +176,7 @@ class VOPProfileI2F<ValueType dstVt, ValueType srcVt> :
}
class VOPProfileI2F_True16<ValueType dstVt, ValueType srcVt> :
- VOPProfile_True16<VOPProfile<[dstVt, srcVt, untyped, untyped]>> {
+ VOPProfile_Fake16<VOPProfile<[dstVt, srcVt, untyped, untyped]>> {
let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod);
let InsVOP3Base = (ins Src0VOP3DPP:$src0, clampmod:$clamp, omod:$omod);
@@ -199,7 +205,7 @@ class VOP_SPECIAL_OMOD_PROF<ValueType dstVt, ValueType srcVt> :
def VOP_I32_F32_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f32>;
def VOP_I32_F64_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i32, f64>;
def VOP_I16_F16_SPECIAL_OMOD : VOP_SPECIAL_OMOD_PROF<i16, f16>;
-def VOP_I16_F16_SPECIAL_OMOD_t16 : VOPProfile_True16<VOP_I16_F16> {
+def VOP_I16_F16_SPECIAL_OMOD_t16 : VOPProfile_Fake16<VOP_I16_F16> {
let HasOMod = 1;
}
@@ -221,7 +227,7 @@ def VOPProfile_MOV : VOPProfile <[i32, i32, untyped, untyped]> {
let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOPProfile_MOV, null_frag, 0x8>;
-let SubtargetPredicate = isGFX940Plus in
+let SubtargetPredicate = isGFX940Plus, SchedRW = [Write64Bit] in
defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
} // End isMoveImm = 1
@@ -292,13 +298,13 @@ let FPDPRounding = 1, isReMaterializable = 0 in {
let OtherPredicates = [NotHasTrue16BitInsts] in
defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, any_fpround>;
let OtherPredicates = [HasTrue16BitInsts] in
- defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_True16<VOP_F16_F32>, any_fpround>;
+ defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_Fake16<VOP_F16_F32>, any_fpround>;
} // End FPDPRounding = 1, isReMaterializable = 0
let OtherPredicates = [NotHasTrue16BitInsts] in
defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, any_fpextend>;
let OtherPredicates = [HasTrue16BitInsts] in
-defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_True16<VOP_F32_F16>, any_fpextend>;
+defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_Fake16<VOP_F32_F16>, any_fpextend>;
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
@@ -317,7 +323,7 @@ defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f3
defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
-defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
+defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, froundeven>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
let TRANS = 1, SchedRW = [WriteTrans32] in {
@@ -326,7 +332,7 @@ defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, AMDGPUlog>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
-defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>;
+defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, int_amdgcn_sqrt>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
let TRANS = 1, SchedRW = [WriteTrans64] in {
@@ -458,7 +464,7 @@ let SubtargetPredicate = isGFX7Plus in {
let SchedRW = [WriteDoubleAdd] in {
defm V_TRUNC_F64 : VOP1Inst<"v_trunc_f64", VOP_F64_F64, ftrunc>;
defm V_CEIL_F64 : VOP1Inst<"v_ceil_f64", VOP_F64_F64, fceil>;
- defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, frint>;
+ defm V_RNDNE_F64 : VOP1Inst<"v_rndne_f64", VOP_F64_F64, froundeven>;
defm V_FLOOR_F64 : VOP1Inst<"v_floor_f64", VOP_F64_F64, ffloor>;
} // End SchedRW = [WriteDoubleAdd]
} // End SubtargetPredicate = isGFX7Plus
@@ -502,7 +508,7 @@ defm V_FREXP_EXP_I16_F16_t16 : VOP1Inst <"v_frexp_exp_i16_f16_t16", VOP_I16_F16_
defm V_FLOOR_F16 : VOP1Inst_t16 <"v_floor_f16", VOP_F16_F16, ffloor>;
defm V_CEIL_F16 : VOP1Inst_t16 <"v_ceil_f16", VOP_F16_F16, fceil>;
defm V_TRUNC_F16 : VOP1Inst_t16 <"v_trunc_f16", VOP_F16_F16, ftrunc>;
-defm V_RNDNE_F16 : VOP1Inst_t16 <"v_rndne_f16", VOP_F16_F16, frint>;
+defm V_RNDNE_F16 : VOP1Inst_t16 <"v_rndne_f16", VOP_F16_F16, froundeven>;
let FPDPRounding = 1 in {
defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
} // End FPDPRounding = 1
@@ -584,18 +590,28 @@ let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
}
class Cvt_F32_F8_Pat<SDPatternOperator node, int index,
- VOP1_Pseudo inst_e32, VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
+ VOP1_SDWA_Pseudo inst_sdwa> : GCNPat<
(f32 (node i32:$src, index)),
- !if (index,
- (inst_sdwa 0, $src, 0, 0, index),
- (inst_e32 $src))
+ (inst_sdwa 0, $src, 0, 0, index)
>;
-foreach Index = [0, 1, 2, 3] in {
- def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index,
- V_CVT_F32_FP8_e32, V_CVT_F32_FP8_sdwa>;
- def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index,
- V_CVT_F32_BF8_e32, V_CVT_F32_BF8_sdwa>;
+let OtherPredicates = [HasCvtFP8VOP1Bug] in {
+ def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)),
+ (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>;
+ def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)),
+ (V_CVT_F32_BF8_sdwa 0, $src, 0, 0, 0)>;
+}
+
+let OtherPredicates = [HasNoCvtFP8VOP1Bug] in {
+ def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)),
+ (V_CVT_F32_FP8_e32 $src)>;
+ def : GCNPat<(f32 (int_amdgcn_cvt_f32_bf8 i32:$src, 0)),
+ (V_CVT_F32_BF8_e32 $src)>;
+}
+
+foreach Index = [1, 2, 3] in {
+ def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_fp8, Index, V_CVT_F32_FP8_sdwa>;
+ def : Cvt_F32_F8_Pat<int_amdgcn_cvt_f32_bf8, Index, V_CVT_F32_BF8_sdwa>;
}
class Cvt_PK_F32_F8_Pat<SDPatternOperator node, int index,
@@ -646,6 +662,7 @@ let SubtargetPredicate = isGFX11Plus in {
getVOP1Pat64<int_amdgcn_permlane64,
VOP_MOVRELS>.ret,
/*VOP1Only=*/ 1>;
+ defm V_MOV_B16_t16 : VOP1Inst<"v_mov_b16_t16", VOPProfile_True16<VOP_I16_I16>>;
defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>;
defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>;
defm V_CVT_U32_U16 : VOP1Inst_t16<"v_cvt_u32_u16", VOP_I32_I16>;
@@ -677,6 +694,13 @@ class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, int subtarget, VOPProfile p = p
let SubtargetPredicate = HasDPP16;
}
+class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> :
+ VOP1_DPP16 <op, ps, Gen.Subtarget, p> {
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+}
+
+
class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
VOP_DPP8<ps.OpName, p> {
let hasSideEffects = ps.hasSideEffects;
@@ -691,137 +715,173 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
let Inst{31-25} = 0x3f;
}
+class VOP1_DPP8_Gen<bits<8> op, VOP1_Pseudo ps, GFXGen Gen, VOPProfile p = ps.Pfl> :
+ VOP1_DPP8<op, ps, p> {
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+}
+
//===----------------------------------------------------------------------===//
-// GFX11.
+// GFX11, GFX12
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
- multiclass VOP1Only_Real_gfx11<bits<9> op> {
- let IsSingle = 1 in
- def _gfx11 :
- VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.GFX11>,
- VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
- }
- multiclass VOP1_Real_e32_gfx11<bits<9> op, string opName = NAME> {
- defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
- def _e32_gfx11 :
- VOP1_Real<ps, SIEncodingFamily.GFX11>,
- VOP1e<op{7-0}, ps.Pfl>;
- }
- multiclass VOP1_Real_e32_with_name_gfx11<bits<9> op, string opName,
- string asmName> {
- defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
- let AsmString = asmName # ps.AsmOperands in {
- defm NAME : VOP1_Real_e32_gfx11<op, opName>;
- }
- }
- multiclass VOP1_Real_e64_gfx11<bits<9> op> {
- def _e64_gfx11 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX11>,
- VOP3e_gfx11<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
- }
- multiclass VOP1_Real_dpp_gfx11<bits<9> op, string opName = NAME> {
- defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
- def _dpp_gfx11 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11> {
- let DecoderNamespace = "DPPGFX11";
- }
- }
- multiclass VOP1_Real_dpp_with_name_gfx11<bits<9> op, string opName,
- string asmName> {
- defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
- let AsmString = asmName # ps.Pfl.AsmDPP16, DecoderNamespace = "DPPGFX11" in {
- defm NAME : VOP1_Real_dpp_gfx11<op, opName>;
- }
+multiclass VOP1Only_Real<GFXGen Gen, bits<9> op> {
+ let IsSingle = 1 in
+ def Gen.Suffix :
+ VOP1_Real_Gen<!cast<VOP1_Pseudo>(NAME), Gen>,
+ VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP1_Real_e32<GFXGen Gen, bits<9> op, string opName = NAME> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ def _e32#Gen.Suffix :
+ VOP1_Real_Gen<ps, Gen>,
+ VOP1e<op{7-0}, ps.Pfl>;
+}
+
+multiclass VOP1_Real_e32_with_name<GFXGen Gen, bits<9> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.AsmOperands in {
+ defm NAME : VOP1_Real_e32<Gen, op, opName>;
}
- multiclass VOP1_Real_dpp8_gfx11<bits<9> op, string opName = NAME> {
- defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
- def _dpp8_gfx11 : VOP1_DPP8<op{7-0}, ps> {
- let DecoderNamespace = "DPP8GFX11";
- }
+}
+
+multiclass VOP1_Real_e64<GFXGen Gen, bits<9> op> {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<!cast<VOP3_Pseudo>(NAME#"_e64"), Gen>,
+ VOP3e_gfx11_gfx12<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
+multiclass VOP1_Real_dpp<GFXGen Gen, bits<9> op, string opName = NAME> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ def _dpp#Gen.Suffix : VOP1_DPP16_Gen<op{7-0}, !cast<VOP1_DPP_Pseudo>(opName#"_dpp"), Gen>;
+}
+
+multiclass VOP1_Real_dpp_with_name<GFXGen Gen, bits<9> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.Pfl.AsmDPP16 in {
+ defm NAME : VOP1_Real_dpp<Gen, op, opName>;
}
- multiclass VOP1_Real_dpp8_with_name_gfx11<bits<9> op, string opName,
- string asmName> {
- defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
- let AsmString = asmName # ps.Pfl.AsmDPP8, DecoderNamespace = "DPP8GFX11" in {
- defm NAME : VOP1_Real_dpp8_gfx11<op, opName>;
- }
+}
+
+multiclass VOP1_Real_dpp8<GFXGen Gen, bits<9> op, string opName = NAME> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ def _dpp8#Gen.Suffix : VOP1_DPP8_Gen<op{7-0}, ps, Gen>;
+}
+
+multiclass VOP1_Real_dpp8_with_name<GFXGen Gen, bits<9> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
+ let AsmString = asmName # ps.Pfl.AsmDPP8 in {
+ defm NAME : VOP1_Real_dpp8<Gen, op, opName>;
}
-} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11"
+}
-multiclass VOP1_Realtriple_e64_gfx11<bits<9> op> {
- defm NAME : VOP3_Realtriple_gfx11<{0, 1, 1, op{6-0}}, /*isSingle=*/ 0, NAME>;
+multiclass VOP1_Realtriple_e64<GFXGen Gen, bits<9> op> {
+ defm NAME : VOP3_Realtriple<Gen, {0, 1, 1, op{6-0}}, /*isSingle=*/ 0, NAME>;
}
-multiclass VOP1_Realtriple_e64_with_name_gfx11<bits<9> op, string opName,
+
+multiclass VOP1_Realtriple_e64_with_name<GFXGen Gen, bits<9> op, string opName,
string asmName> {
- defm NAME : VOP3_Realtriple_with_name_gfx11<{0, 1, 1, op{6-0}}, opName,
+ defm NAME : VOP3_Realtriple_with_name<Gen, {0, 1, 1, op{6-0}}, opName,
asmName>;
}
-multiclass VOP1_Real_FULL_gfx11<bits<9> op> :
- VOP1_Real_e32_gfx11<op>, VOP1_Realtriple_e64_gfx11<op>,
- VOP1_Real_dpp_gfx11<op>, VOP1_Real_dpp8_gfx11<op>;
+multiclass VOP1_Real_FULL<GFXGen Gen, bits<9> op> :
+ VOP1_Real_e32<Gen, op>, VOP1_Realtriple_e64<Gen, op>,
+ VOP1_Real_dpp<Gen, op>, VOP1_Real_dpp8<Gen, op>;
multiclass VOP1_Real_NO_VOP3_with_name_gfx11<bits<9> op, string opName,
- string asmName> {
- defm NAME : VOP1_Real_e32_with_name_gfx11<op, opName, asmName>,
- VOP1_Real_dpp_with_name_gfx11<op, opName, asmName>,
- VOP1_Real_dpp8_with_name_gfx11<op, opName, asmName>;
+ string asmName> {
+ defm NAME : VOP1_Real_e32_with_name<GFX11Gen, op, opName, asmName>,
+ VOP1_Real_dpp_with_name<GFX11Gen, op, opName, asmName>,
+ VOP1_Real_dpp8_with_name<GFX11Gen, op, opName, asmName>;
defvar ps = !cast<VOP1_Pseudo>(opName#"_e32");
def gfx11_alias : MnemonicAlias<ps.Mnemonic, asmName>,
Requires<[isGFX11Plus]>;
}
-multiclass VOP1_Real_FULL_with_name_gfx11<bits<9> op, string opName,
+multiclass VOP1_Real_NO_VOP3_with_name_gfx12<bits<9> op, string opName,
+ string asmName> {
+ defm NAME : VOP1_Real_e32_with_name<GFX12Gen, op, opName, asmName>,
+ VOP1_Real_dpp_with_name<GFX12Gen, op, opName, asmName>,
+ VOP1_Real_dpp8_with_name<GFX12Gen, op, opName, asmName>;
+}
+
+multiclass VOP1_Real_FULL_with_name<GFXGen Gen, bits<9> op, string opName,
string asmName> :
- VOP1_Real_NO_VOP3_with_name_gfx11<op, opName, asmName>,
- VOP1_Realtriple_e64_with_name_gfx11<op, opName, asmName>;
+ VOP1_Real_e32_with_name<Gen, op, opName, asmName>,
+ VOP1_Real_dpp_with_name<Gen, op, opName, asmName>,
+ VOP1_Real_dpp8_with_name<Gen, op, opName, asmName>,
+ VOP1_Realtriple_e64_with_name<Gen, op, opName, asmName>;
-multiclass VOP1_Real_FULL_t16_gfx11<bits<9> op, string asmName,
- string opName = NAME> :
- VOP1_Real_FULL_with_name_gfx11<op, opName, asmName>;
+multiclass VOP1_Real_NO_DPP<GFXGen Gen, bits<9> op> :
+ VOP1_Real_e32<Gen, op>, VOP1_Real_e64<Gen, op>;
-multiclass VOP1_Real_NO_DPP_gfx11<bits<9> op> :
- VOP1_Real_e32_gfx11<op>, VOP1_Real_e64_gfx11<op>;
+multiclass VOP1_Real_FULL_t16_gfx11_gfx12<bits<9> op, string asmName,
+ string opName = NAME> :
+ VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>,
+ VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
-defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11<0x00c,
+multiclass VOP1_Real_FULL_with_name_gfx11_gfx12<bits<9> op, string opName,
+ string asmName> :
+ VOP1_Real_FULL_with_name<GFX11Gen, op, opName, asmName>,
+ VOP1_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
+
+multiclass VOP1Only_Real_gfx11_gfx12<bits<9> op> :
+ VOP1Only_Real<GFX11Gen, op>, VOP1Only_Real<GFX12Gen, op>;
+
+multiclass VOP1_Real_FULL_gfx11_gfx12<bits<9> op> :
+ VOP1_Real_FULL<GFX11Gen, op>, VOP1_Real_FULL<GFX12Gen, op>;
+
+multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op,
+ string opName, string asmName> :
+ VOP1_Real_e32_with_name<Gen, op, opName, asmName>,
+ VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>;
+
+
+defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c,
"V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">;
-defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11<0x00d,
+defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d,
"V_CVT_FLR_I32_F32", "v_cvt_floor_i32_f32">;
-defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11<0x039,
+defm V_CLZ_I32_U32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x039,
"V_FFBH_U32", "v_clz_i32_u32">;
-defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11<0x03a,
+defm V_CTZ_I32_B32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03a,
"V_FFBL_B32", "v_ctz_i32_b32">;
-defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11<0x03b,
+defm V_CLS_I32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x03b,
"V_FFBH_I32", "v_cls_i32">;
-defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11<0x067>;
-defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11<0x069, "v_not_b16">;
-defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x06a, "v_cvt_i32_i16">;
-defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11<0x06b, "v_cvt_u32_u16">;
-
-defm V_CVT_F16_U16_t16 : VOP1_Real_FULL_t16_gfx11<0x050, "v_cvt_f16_u16">;
-defm V_CVT_F16_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x051, "v_cvt_f16_i16">;
-defm V_CVT_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x052, "v_cvt_u16_f16">;
-defm V_CVT_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x053, "v_cvt_i16_f16">;
-defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x054, "v_rcp_f16">;
-defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x055, "v_sqrt_f16">;
-defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x056, "v_rsq_f16">;
-defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x057, "v_log_f16">;
-defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x058, "v_exp_f16">;
-defm V_FREXP_MANT_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x059, "v_frexp_mant_f16">;
-defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05a, "v_frexp_exp_i16_f16">;
-defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05b, "v_floor_f16">;
-defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05c, "v_ceil_f16">;
-defm V_TRUNC_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05d, "v_trunc_f16">;
-defm V_RNDNE_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05e, "v_rndne_f16">;
-defm V_FRACT_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x05f, "v_fract_f16">;
-defm V_SIN_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x060, "v_sin_f16">;
-defm V_COS_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x061, "v_cos_f16">;
-defm V_SAT_PK_U8_I16_t16 : VOP1_Real_FULL_t16_gfx11<0x062, "v_sat_pk_u8_i16">;
-defm V_CVT_NORM_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x063, "v_cvt_norm_i16_f16">;
-defm V_CVT_NORM_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x064, "v_cvt_norm_u16_f16">;
-
-defm V_CVT_F16_F32_t16 : VOP1_Real_FULL_t16_gfx11<0x00a, "v_cvt_f16_f32">;
-defm V_CVT_F32_F16_t16 : VOP1_Real_FULL_t16_gfx11<0x00b, "v_cvt_f32_f16">;
+defm V_PERMLANE64_B32 : VOP1Only_Real_gfx11_gfx12<0x067>;
+defm V_MOV_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x01c, "v_mov_b16">;
+defm V_NOT_B16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x069, "v_not_b16">;
+defm V_CVT_I32_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06a, "v_cvt_i32_i16">;
+defm V_CVT_U32_U16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x06b, "v_cvt_u32_u16">;
+
+defm V_CVT_F16_U16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x050, "v_cvt_f16_u16">;
+defm V_CVT_F16_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x051, "v_cvt_f16_i16">;
+defm V_CVT_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x052, "v_cvt_u16_f16">;
+defm V_CVT_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x053, "v_cvt_i16_f16">;
+defm V_RCP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x054, "v_rcp_f16">;
+defm V_SQRT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x055, "v_sqrt_f16">;
+defm V_RSQ_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x056, "v_rsq_f16">;
+defm V_LOG_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16">;
+defm V_EXP_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
+defm V_FREXP_MANT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
+defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">;
+defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
+defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
+defm V_TRUNC_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05d, "v_trunc_f16">;
+defm V_RNDNE_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05e, "v_rndne_f16">;
+defm V_FRACT_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05f, "v_fract_f16">;
+defm V_SIN_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x060, "v_sin_f16">;
+defm V_COS_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x061, "v_cos_f16">;
+defm V_SAT_PK_U8_I16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x062, "v_sat_pk_u8_i16">;
+defm V_CVT_NORM_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x063, "v_cvt_norm_i16_f16">;
+defm V_CVT_NORM_U16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x064, "v_cvt_norm_u16_f16">;
+
+defm V_CVT_F16_F32_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x00a, "v_cvt_f16_f32">;
+defm V_CVT_F32_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x00b, "v_cvt_f32_f16">;
//===----------------------------------------------------------------------===//
// GFX10.
@@ -870,17 +930,23 @@ multiclass VOP1_Real_gfx10<bits<9> op> :
VOP1_Real_sdwa_gfx10<op>, VOP1_Real_dpp_gfx10<op>,
VOP1_Real_dpp8_gfx10<op>;
-multiclass VOP1_Real_gfx10_FULL_gfx11<bits<9> op> :
- VOP1_Real_gfx10<op>, VOP1_Real_FULL_gfx11<op>;
+multiclass VOP1_Real_gfx10_FULL_gfx11_gfx12<bits<9> op> :
+ VOP1_Real_gfx10<op>,
+ VOP1_Real_FULL<GFX11Gen, op>,
+ VOP1_Real_FULL<GFX12Gen, op>;
-multiclass VOP1_Real_gfx10_NO_DPP_gfx11<bits<9> op> :
- VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>;
+multiclass VOP1_Real_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> :
+ VOP1_Real_gfx10<op>,
+ VOP1_Real_NO_DPP<GFX11Gen, op>,
+ VOP1_Real_NO_DPP<GFX12Gen, op>;
-multiclass VOP1Only_Real_gfx10_gfx11<bits<9> op> :
- VOP1Only_Real_gfx10<op>, VOP1Only_Real_gfx11<op>;
+multiclass VOP1Only_Real_gfx10_gfx11_gfx12<bits<9> op> :
+ VOP1Only_Real_gfx10<op>,
+ VOP1Only_Real<GFX11Gen, op>,
+ VOP1Only_Real<GFX12Gen, op>;
-defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11<0x01b>;
-defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10_FULL_gfx11<0x048>;
+defm V_PIPEFLUSH : VOP1_Real_gfx10_NO_DPP_gfx11_gfx12<0x01b>;
+defm V_MOVRELSD_2_B32 : VOP1_Real_gfx10_FULL_gfx11_gfx12<0x048>;
defm V_CVT_F16_U16 : VOP1_Real_gfx10<0x050>;
defm V_CVT_F16_I16 : VOP1_Real_gfx10<0x051>;
defm V_CVT_U16_F16 : VOP1_Real_gfx10<0x052>;
@@ -903,11 +969,11 @@ defm V_SAT_PK_U8_I16 : VOP1_Real_gfx10<0x062>;
defm V_CVT_NORM_I16_F16 : VOP1_Real_gfx10<0x063>;
defm V_CVT_NORM_U16_F16 : VOP1_Real_gfx10<0x064>;
-defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11<0x065>;
-defm V_SWAPREL_B32 : VOP1Only_Real_gfx10_gfx11<0x068>;
+defm V_SWAP_B32 : VOP1Only_Real_gfx10_gfx11_gfx12<0x065>;
+defm V_SWAPREL_B32 : VOP1Only_Real_gfx10_gfx11_gfx12<0x068>;
//===----------------------------------------------------------------------===//
-// GFX7, GFX10.
+// GFX7, GFX10, GFX11, GFX12
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
@@ -926,22 +992,20 @@ let AssemblerPredicate = isGFX7Only, DecoderNamespace = "GFX7" in {
multiclass VOP1_Real_gfx7<bits<9> op> :
VOP1_Real_e32_gfx7<op>, VOP1_Real_e64_gfx7<op>;
-multiclass VOP1_Real_gfx7_gfx10<bits<9> op> :
- VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>;
-
-multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<bits<9> op> :
- VOP1_Real_gfx7_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>;
+multiclass VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> :
+ VOP1_Real_gfx7<op>, VOP1_Real_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>,
+ VOP1_Real_NO_DPP<GFX12Gen, op>;
defm V_LOG_LEGACY_F32 : VOP1_Real_gfx7<0x045>;
defm V_EXP_LEGACY_F32 : VOP1_Real_gfx7<0x046>;
-defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x017>;
-defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x018>;
-defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x019>;
-defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11<0x01a>;
+defm V_TRUNC_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x017>;
+defm V_CEIL_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x018>;
+defm V_RNDNE_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x019>;
+defm V_FLOOR_F64 : VOP1_Real_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x01a>;
//===----------------------------------------------------------------------===//
-// GFX6, GFX7, GFX10, GFX11.
+// GFX6, GFX7, GFX10, GFX11, GFX12
//===----------------------------------------------------------------------===//
let AssemblerPredicate = isGFX6GFX7, DecoderNamespace = "GFX6GFX7" in {
@@ -963,11 +1027,13 @@ multiclass VOP1_Real_gfx6_gfx7<bits<9> op> :
multiclass VOP1_Real_gfx6_gfx7_gfx10<bits<9> op> :
VOP1_Real_gfx6_gfx7<op>, VOP1_Real_gfx10<op>;
-multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<bits<9> op> :
- VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_FULL_gfx11<op>;
+multiclass VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<bits<9> op> :
+ VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_FULL<GFX11Gen, op>,
+ VOP1_Real_FULL<GFX12Gen, op>;
-multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<bits<9> op> :
- VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP_gfx11<op>;
+multiclass VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<bits<9> op> :
+ VOP1_Real_gfx6_gfx7_gfx10<op>, VOP1_Real_NO_DPP<GFX11Gen, op>,
+ VOP1_Real_NO_DPP<GFX12Gen, op>;
defm V_LOG_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x026>;
defm V_RCP_CLAMP_F32 : VOP1_Real_gfx6_gfx7<0x028>;
@@ -977,57 +1043,57 @@ defm V_RSQ_LEGACY_F32 : VOP1_Real_gfx6_gfx7<0x02d>;
defm V_RCP_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x030>;
defm V_RSQ_CLAMP_F64 : VOP1_Real_gfx6_gfx7<0x032>;
-defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x000>;
-defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x001>;
-defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x003>;
-defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x004>;
-defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x005>;
-defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x006>;
-defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x007>;
-defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x008>;
+defm V_NOP : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x000>;
+defm V_MOV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x001>;
+defm V_CVT_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x003>;
+defm V_CVT_F64_I32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x004>;
+defm V_CVT_F32_I32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x005>;
+defm V_CVT_F32_U32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x006>;
+defm V_CVT_U32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x007>;
+defm V_CVT_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x008>;
defm V_CVT_F16_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00a>;
defm V_CVT_F32_F16 : VOP1_Real_gfx6_gfx7_gfx10<0x00b>;
defm V_CVT_RPI_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00c>;
defm V_CVT_FLR_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10<0x00d>;
-defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x00e>;
-defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x00f>;
-defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x010>;
-defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x011>;
-defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x012>;
-defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x013>;
-defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x014>;
-defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x015>;
-defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x016>;
-defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x020>;
-defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x021>;
-defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x022>;
-defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x023>;
-defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x024>;
-defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x025>;
-defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x027>;
-defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02a>;
-defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02b>;
-defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x02e>;
-defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x02f>;
-defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x031>;
-defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x033>;
-defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x034>;
-defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x035>;
-defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x036>;
-defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x037>;
-defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x038>;
+defm V_CVT_OFF_F32_I4 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x00e>;
+defm V_CVT_F32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x00f>;
+defm V_CVT_F64_F32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x010>;
+defm V_CVT_F32_UBYTE0 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x011>;
+defm V_CVT_F32_UBYTE1 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x012>;
+defm V_CVT_F32_UBYTE2 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x013>;
+defm V_CVT_F32_UBYTE3 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x014>;
+defm V_CVT_U32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x015>;
+defm V_CVT_F64_U32 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x016>;
+defm V_FRACT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x020>;
+defm V_TRUNC_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x021>;
+defm V_CEIL_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x022>;
+defm V_RNDNE_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x023>;
+defm V_FLOOR_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x024>;
+defm V_EXP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x025>;
+defm V_LOG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x027>;
+defm V_RCP_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02a>;
+defm V_RCP_IFLAG_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02b>;
+defm V_RSQ_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x02e>;
+defm V_RCP_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x02f>;
+defm V_RSQ_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x031>;
+defm V_SQRT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x033>;
+defm V_SQRT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x034>;
+defm V_SIN_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x035>;
+defm V_COS_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x036>;
+defm V_NOT_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x037>;
+defm V_BFREV_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x038>;
defm V_FFBH_U32 : VOP1_Real_gfx6_gfx7_gfx10<0x039>;
defm V_FFBL_B32 : VOP1_Real_gfx6_gfx7_gfx10<0x03a>;
defm V_FFBH_I32 : VOP1_Real_gfx6_gfx7_gfx10<0x03b>;
-defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03c>;
-defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03d>;
-defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11<0x03e>;
-defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x03f>;
-defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x040>;
+defm V_FREXP_EXP_I32_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x03c>;
+defm V_FREXP_MANT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x03d>;
+defm V_FRACT_F64 : VOP1_Real_gfx6_gfx7_gfx10_NO_DPP_gfx11_gfx12<0x03e>;
+defm V_FREXP_EXP_I32_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x03f>;
+defm V_FREXP_MANT_F32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x040>;
defm V_CLREXCP : VOP1_Real_gfx6_gfx7_gfx10<0x041>;
-defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x042>;
-defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x043>;
-defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11<0x044>;
+defm V_MOVRELD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x042>;
+defm V_MOVRELS_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x043>;
+defm V_MOVRELSD_B32 : VOP1_Real_gfx6_gfx7_gfx10_FULL_gfx11_gfx12<0x044>;
//===----------------------------------------------------------------------===//
// GFX8, GFX9 (VI).
@@ -1163,7 +1229,7 @@ defm V_CVT_NORM_U16_F16 : VOP1_Real_vi<0x4e>;
defm V_ACCVGPR_MOV_B32 : VOP1Only_Real_vi<0x52>;
-let VOP1 = 1, SubtargetPredicate = isGFX8GFX9, Uses = [EXEC, M0] in {
+let VOP1 = 1, SubtargetPredicate = isGFX8GFX9, Uses = [EXEC, M0], Size = V_MOV_B32_e32.Size in {
// Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
// indexing mode. vdst can't be treated as a def for codegen purposes,
@@ -1193,8 +1259,8 @@ def : GCNPat <
(as_i1timm $bound_ctrl))
>;
-def : GCNPat <
- (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl,
+class UpdateDPPPat<ValueType vt> : GCNPat <
+ (vt (int_amdgcn_update_dpp vt:$old, vt:$src, timm:$dpp_ctrl,
timm:$row_mask, timm:$bank_mask,
timm:$bound_ctrl)),
(V_MOV_B32_dpp VGPR_32:$old, VGPR_32:$src, (as_i32timm $dpp_ctrl),
@@ -1202,6 +1268,11 @@ def : GCNPat <
(as_i1timm $bound_ctrl))
>;
+def : UpdateDPPPat<i32>;
+def : UpdateDPPPat<f32>;
+def : UpdateDPPPat<v2i16>;
+def : UpdateDPPPat<v2f16>;
+
} // End OtherPredicates = [isGFX8Plus]
let OtherPredicates = [isGFX8Plus] in {
@@ -1303,3 +1374,15 @@ def : GCNPat <
(as_i32timm $dpp8), (i32 DPP8Mode.FI_0))
>;
} // End OtherPredicates = [isGFX11Only]
+
+//===----------------------------------------------------------------------===//
+// GFX12
+//===----------------------------------------------------------------------===//
+
+let OtherPredicates = [isGFX12Only] in {
+def : GCNPat <
+ (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
+ (V_MOV_B32_dpp8_gfx12 VGPR_32:$src, VGPR_32:$src,
+ (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))
+>;
+} // End OtherPredicates = [isGFX12Only]
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 481a162748e6..0aa62ea77b11 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -109,6 +109,14 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily, string real_name = ps.Mnemo
let mayStore = ps.mayStore;
}
+class VOP2_Real_Gen <VOP2_Pseudo ps, GFXGen Gen, string real_name = ps.Mnemonic> :
+ VOP2_Real <ps, Gen.Subtarget, real_name> {
+ let AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts,
+ Gen.AssemblerPredicate);
+ let DecoderNamespace = Gen.DecoderNamespace#
+ !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
+}
+
class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
VOP_SDWA_Pseudo <OpName, P, pattern> {
let AsmMatchConverter = "cvtSdwaVOP2";
@@ -194,9 +202,12 @@ multiclass VOP2Inst_t16<string opName,
let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts] in {
defm NAME : VOP2Inst<opName, P, node, revOp, GFX9Renamed>;
}
- let SubtargetPredicate = HasTrue16BitInsts in {
+ let SubtargetPredicate = UseRealTrue16Insts in {
defm _t16 : VOP2Inst<opName#"_t16", VOPProfile_True16<P>, node, revOp#"_t16", GFX9Renamed>;
}
+ let SubtargetPredicate = UseFakeTrue16Insts in {
+ defm _fake16 : VOP2Inst<opName#"_fake16", VOPProfile_Fake16<P>, node, revOp#"_fake16", GFX9Renamed>;
+ }
}
// Creating a _t16_e32 pseudo when there is no corresponding real instruction on
@@ -212,7 +223,7 @@ multiclass VOP2Inst_e64_t16<string opName,
defm NAME : VOP2Inst<opName, P, node, revOp, GFX9Renamed>;
}
let SubtargetPredicate = HasTrue16BitInsts in {
- defm _t16 : VOP2Inst_e64<opName#"_t16", VOPProfile_True16<P>, node, revOp#"_t16", GFX9Renamed>;
+ defm _t16 : VOP2Inst_e64<opName#"_t16", VOPProfile_Fake16<P>, node, revOp#"_t16", GFX9Renamed>;
}
}
@@ -378,7 +389,7 @@ def VOP_MADAK_F16 : VOP_MADAK <f16>;
def VOP_MADAK_F16_t16 : VOP_MADAK <f16> {
let IsTrue16 = 1;
let DstRC = VOPDstOperand<VGPR_32_Lo128>;
- let Ins32 = (ins VSrcT_f16_Lo128_Deferred:$src0, VGPR_32_Lo128:$src1, ImmOpType:$imm);
+ let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, VGPR_32_Lo128:$src1, ImmOpType:$imm);
}
def VOP_MADAK_F32 : VOP_MADAK <f32>;
@@ -403,7 +414,7 @@ def VOP_MADMK_F16 : VOP_MADMK <f16>;
def VOP_MADMK_F16_t16 : VOP_MADMK <f16> {
let IsTrue16 = 1;
let DstRC = VOPDstOperand<VGPR_32_Lo128>;
- let Ins32 = (ins VSrcT_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPR_32_Lo128:$src1);
+ let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPR_32_Lo128:$src1);
}
def VOP_MADMK_F32 : VOP_MADMK <f32>;
@@ -859,6 +870,17 @@ def : divergent_i64_BinOp <and, V_AND_B32_e64>;
def : divergent_i64_BinOp <or, V_OR_B32_e64>;
def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
+// mul24 w/ 64 bit output.
+class mul24_64_Pat<SDPatternOperator Op, Instruction InstLo, Instruction InstHi> : GCNPat<
+ (i64 (Op i32:$src0, i32:$src1)),
+ (REG_SEQUENCE VReg_64,
+ (InstLo $src0, $src1), sub0,
+ (InstHi $src0, $src1), sub1)
+>;
+
+def : mul24_64_Pat<AMDGPUmul_i24, V_MUL_I32_I24_e64, V_MUL_HI_I32_I24_e64>;
+def : mul24_64_Pat<AMDGPUmul_u24, V_MUL_U32_U24_e64, V_MUL_HI_U32_U24_e64>;
+
//===----------------------------------------------------------------------===//
// 16-Bit Operand Instructions
//===----------------------------------------------------------------------===//
@@ -874,7 +896,7 @@ def LDEXP_F16_VOPProfile : VOPProfile <[f16, f16, f16, untyped]> {
let HasSrc1FloatMods = 0;
let Src1ModSDWA = Int16SDWAInputMods;
}
-def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_F16> {
+def LDEXP_F16_VOPProfile_True16 : VOPProfile_Fake16<VOP_F16_F16_F16> {
let Src1RC32 = RegisterOperand<VGPR_32_Lo128>;
let Src1DPP = VGPR_32_Lo128;
let Src1ModDPP = IntT16VRegInputMods;
@@ -925,13 +947,13 @@ def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_t16_e64>;
let SubtargetPredicate = isGFX11Plus in {
let isCommutable = 1 in {
- defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, and>;
- defm V_OR_B16_t16 : VOP2Inst_e64 <"v_or_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, or>;
- defm V_XOR_B16_t16 : VOP2Inst_e64 <"v_xor_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, xor>;
+ defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, and>;
+ defm V_OR_B16_t16 : VOP2Inst_e64 <"v_or_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, or>;
+ defm V_XOR_B16_t16 : VOP2Inst_e64 <"v_xor_b16_t16", VOPProfile_Fake16<VOP_I16_I16_I16>, xor>;
} // End isCommutable = 1
} // End SubtargetPredicate = isGFX11Plus
-let FPDPRounding = 1, isReMaterializable = 1 in {
+let FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 in {
let SubtargetPredicate = isGFX10Plus, OtherPredicates = [NotHasTrue16BitInsts] in {
def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
}
@@ -947,7 +969,7 @@ let SubtargetPredicate = HasTrue16BitInsts in {
def V_FMAAK_F16_t16 : VOP2_Pseudo <"v_fmaak_f16_t16", VOP_MADAK_F16_t16, [], "">;
}
} // End isCommutable = 1
-} // End FPDPRounding = 1, isReMaterializable = 1
+} // End FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1
let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
@@ -1089,12 +1111,12 @@ let AddedComplexity = 30 in {
}
} // End AddedComplexity = 30
-let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1 in {
+let SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1, FixedSize = 1 in {
def V_FMAMK_F32 : VOP2_Pseudo<"v_fmamk_f32", VOP_MADMK_F32, [], "">, VOPD_Component<0x2, "v_fmamk_f32">;
let isCommutable = 1 in
def V_FMAAK_F32 : VOP2_Pseudo<"v_fmaak_f32", VOP_MADAK_F32, [], "">, VOPD_Component<0x1, "v_fmaak_f32">;
-}
+} // End SubtargetPredicate = HasFmaakFmamkF32Insts, isReMaterializable = 1, FixedSize = 1
let SubtargetPredicate = HasPkFmacF16Inst in {
defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>;
@@ -1201,6 +1223,20 @@ def : VOPBinOpClampPat<uaddsat, V_ADD_U16_e64, i16>;
def : VOPBinOpClampPat<usubsat, V_SUB_U16_e64, i16>;
}
+let SubtargetPredicate = isGFX12Plus, isReMaterializable = 1 in {
+ let SchedRW = [WriteDoubleAdd], isCommutable = 1 in {
+ let FPDPRounding = 1 in {
+ defm V_ADD_F64_pseudo : VOP2Inst <"v_add_f64_pseudo", VOP_F64_F64_F64, any_fadd>;
+ defm V_MUL_F64_pseudo : VOP2Inst <"v_mul_f64_pseudo", VOP_F64_F64_F64, fmul>;
+ } // End FPDPRounding = 1
+ defm V_MIN_NUM_F64 : VOP2Inst <"v_min_num_f64", VOP_F64_F64_F64, fminnum_like>;
+ defm V_MAX_NUM_F64 : VOP2Inst <"v_max_num_f64", VOP_F64_F64_F64, fmaxnum_like>;
+ } // End SchedRW = [WriteDoubleAdd], isCommutable = 1
+ let SchedRW = [Write64Bit] in {
+ defm V_LSHLREV_B64_pseudo : VOP2Inst <"v_lshlrev_b64_pseudo", VOP_I64_I32_I64, clshl_rev_64>;
+ } // End SchedRW = [Write64Bit]
+} // End SubtargetPredicate = isGFX12Plus, isReMaterializable = 1
+
//===----------------------------------------------------------------------===//
// DPP Encodings
//===----------------------------------------------------------------------===//
@@ -1236,6 +1272,15 @@ class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps, int subtarget,
Base_VOP2_DPP16<op, ps, opName, p>,
SIMCInstr <ps.PseudoInstr, subtarget>;
+class VOP2_DPP16_Gen<bits<6> op, VOP2_DPP_Pseudo ps, GFXGen Gen,
+ string opName = ps.OpName, VOPProfile p = ps.Pfl> :
+ VOP2_DPP16<op, ps, Gen.Subtarget, opName, p> {
+ let AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts,
+ Gen.AssemblerPredicate);
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace#
+ !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
+}
+
class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
VOPProfile p = ps.Pfl> :
VOP_DPP8<ps.OpName, p> {
@@ -1255,230 +1300,362 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
let OtherPredicates = ps.OtherPredicates;
}
+
+class VOP2_DPP8_Gen<bits<6> op, VOP2_Pseudo ps, GFXGen Gen,
+ VOPProfile p = ps.Pfl> :
+ VOP2_DPP8<op, ps, p> {
+ let AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts,
+ Gen.AssemblerPredicate);
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace#
+ !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
+}
//===----------------------------------------------------------------------===//
-// GFX11.
+// GFX11, GFX12
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
- //===------------------------------- VOP2 -------------------------------===//
- multiclass VOP2Only_Real_MADK_gfx11<bits<6> op> {
- def _gfx11 :
- VOP2_Real<!cast<VOP2_Pseudo>(NAME), SIEncodingFamily.GFX11>,
- VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+//===------------------------------- VOP2 -------------------------------===//
+multiclass VOP2Only_Real_MADK<GFXGen Gen, bits<6> op> {
+ def Gen.Suffix :
+ VOP2_Real_Gen<!cast<VOP2_Pseudo>(NAME), Gen>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(NAME).Pfl>;
+}
+
+multiclass VOP2Only_Real_MADK_with_name<GFXGen Gen, bits<6> op, string asmName,
+ string opName = NAME> {
+ def Gen.Suffix :
+ VOP2_Real_Gen<!cast<VOP2_Pseudo>(opName), Gen>,
+ VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(opName).Pfl> {
+ VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName);
+ let AsmString = asmName # ps.AsmOperands;
}
- multiclass VOP2Only_Real_MADK_gfx11_with_name<bits<6> op, string asmName,
- string opName = NAME> {
- def _gfx11 :
- VOP2_Real<!cast<VOP2_Pseudo>(opName), SIEncodingFamily.GFX11>,
- VOP2_MADKe<op{5-0}, !cast<VOP2_Pseudo>(opName).Pfl> {
- VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName);
+}
+
+multiclass VOP2_Real_e32<GFXGen Gen, bits<6> op> {
+ def _e32#Gen.Suffix :
+ VOP2_Real_Gen<!cast<VOP2_Pseudo>(NAME#"_e32"), Gen>,
+ VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
+}
+
+multiclass VOP2Only_Real_e32<GFXGen Gen, bits<6> op> {
+ let IsSingle = 1 in
+ defm NAME: VOP2_Real_e32<Gen, op>;
+}
+
+multiclass VOP2_Real_e64<GFXGen Gen, bits<6> op> {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<!cast<VOP3_Pseudo>(NAME#"_e64"), Gen>,
+ VOP3e_gfx11_gfx12<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
+}
+
+multiclass VOP2_Real_dpp<GFXGen Gen, bits<6> op> {
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+ def _dpp#Gen.Suffix : VOP2_DPP16_Gen<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), Gen>;
+}
+
+multiclass VOP2_Real_dpp8<GFXGen Gen, bits<6> op> {
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+ def _dpp8#Gen.Suffix : VOP2_DPP8_Gen<op, !cast<VOP2_Pseudo>(NAME#"_e32"), Gen>;
+}
+
+//===------------------------- VOP2 (with name) -------------------------===//
+multiclass VOP2_Real_e32_with_name<GFXGen Gen, bits<6> op, string opName,
+ string asmName, bit single = 0> {
+ defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ def _e32#Gen.Suffix :
+ VOP2_Real_Gen<ps, Gen, asmName>,
+ VOP2e<op{5-0}, ps.Pfl> {
let AsmString = asmName # ps.AsmOperands;
+ let IsSingle = single;
}
- }
- multiclass VOP2_Real_e32_gfx11<bits<6> op> {
- def _e32_gfx11 :
- VOP2_Real<!cast<VOP2_Pseudo>(NAME#"_e32"), SIEncodingFamily.GFX11>,
- VOP2e<op{5-0}, !cast<VOP2_Pseudo>(NAME#"_e32").Pfl>;
- }
- multiclass VOP2Only_Real_e32_gfx11<bits<6> op> {
- let IsSingle = 1 in
- defm NAME: VOP2_Real_e32_gfx11<op>;
- }
- multiclass VOP2_Real_e64_gfx11<bits<6> op> {
- def _e64_gfx11 :
- VOP3_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.GFX11>,
- VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
- }
- multiclass VOP2_Real_dpp_gfx11<bits<6> op> {
- if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
- def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX11> {
- let DecoderNamespace = "DPPGFX11";
- }
- }
- multiclass VOP2_Real_dpp8_gfx11<bits<6> op> {
- if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
- def _dpp8_gfx11 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
- let DecoderNamespace = "DPP8GFX11";
+}
+multiclass VOP2_Real_e64_with_name<GFXGen Gen, bits<6> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3e_gfx11_gfx12<{0, 1, 0, 0, op{5-0}}, ps.Pfl> {
+ let AsmString = asmName # ps.AsmOperands;
}
- }
+}
- //===------------------------- VOP2 (with name) -------------------------===//
- multiclass VOP2_Real_e32_with_name_gfx11<bits<6> op, string opName,
- string asmName, bit single = 0> {
- defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
- def _e32_gfx11 :
- VOP2_Real<ps, SIEncodingFamily.GFX11, asmName>,
- VOP2e<op{5-0}, ps.Pfl> {
- let AsmString = asmName # ps.AsmOperands;
- let IsSingle = single;
- }
+multiclass VOP2_Real_dpp_with_name<GFXGen Gen, bits<6> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ if ps.Pfl.HasExtDPP then
+ def _dpp#Gen.Suffix : VOP2_DPP16_Gen<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), Gen> {
+ let AsmString = asmName # ps.Pfl.AsmDPP16;
}
- multiclass VOP2_Real_e64_with_name_gfx11<bits<6> op, string opName,
- string asmName> {
- defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
- def _e64_gfx11 :
- VOP3_Real<ps, SIEncodingFamily.GFX11>,
- VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, ps.Pfl> {
- let AsmString = asmName # ps.AsmOperands;
- }
+}
+multiclass VOP2_Real_dpp8_with_name<GFXGen Gen, bits<6> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ if ps.Pfl.HasExtDPP then
+ def _dpp8#Gen.Suffix : VOP2_DPP8_Gen<op, ps, Gen> {
+ let AsmString = asmName # ps.Pfl.AsmDPP8;
}
+}
- multiclass VOP2_Real_dpp_with_name_gfx11<bits<6> op, string opName,
- string asmName> {
- defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
- if ps.Pfl.HasExtDPP then
- def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"),
- SIEncodingFamily.GFX11> {
- let AsmString = asmName # ps.Pfl.AsmDPP16;
- let DecoderNamespace = "DPPGFX11";
+//===------------------------------ VOP2be ------------------------------===//
+multiclass VOP2be_Real_e32<GFXGen Gen, bits<6> op, string opName, string asmName> {
+ defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ def _e32#Gen.Suffix :
+ VOP2_Real_Gen<ps, Gen>,
+ VOP2e<op{5-0}, ps.Pfl> {
+ let AsmString = asmName # !subst(", vcc", "", ps.AsmOperands);
}
- }
- multiclass VOP2_Real_dpp8_with_name_gfx11<bits<6> op, string opName,
- string asmName> {
- defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
- if ps.Pfl.HasExtDPP then
- def _dpp8_gfx11 : VOP2_DPP8<op, ps> {
- let AsmString = asmName # ps.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8GFX11";
+}
+multiclass VOP2be_Real_dpp<GFXGen Gen, bits<6> op, string opName, string asmName> {
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
+ def _dpp#Gen.Suffix :
+ VOP2_DPP16_Gen<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), Gen, asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # !subst(", vcc", "", AsmDPP);
}
- }
-
- //===------------------------------ VOP2be ------------------------------===//
- multiclass VOP2be_Real_e32_gfx11<bits<6> op, string opName, string asmName> {
- defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
- def _e32_gfx11 :
- VOP2_Real<ps, SIEncodingFamily.GFX11>,
- VOP2e<op{5-0}, ps.Pfl> {
- let AsmString = asmName # !subst(", vcc", "", ps.AsmOperands);
- }
- }
- multiclass VOP2be_Real_dpp_gfx11<bits<6> op, string opName, string asmName> {
- if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
- def _dpp_gfx11 :
- VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11, asmName> {
- string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
- let AsmString = asmName # !subst(", vcc", "", AsmDPP);
- let DecoderNamespace = "DPPGFX11";
- }
- if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
- def _dpp_w32_gfx11 :
- Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
- string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
- let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP);
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
- def _dpp_w64_gfx11 :
- Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
- string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
- let AsmString = asmName # AsmDPP;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
- }
- multiclass VOP2be_Real_dpp8_gfx11<bits<6> op, string opName, string asmName> {
- if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
- def _dpp8_gfx11 :
- VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
- string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
- let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
- let DecoderNamespace = "DPP8GFX11";
- }
- if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
- def _dpp8_w32_gfx11 :
- VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
- string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
- let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8);
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave32;
- }
- if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
- def _dpp8_w64_gfx11 :
- VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
- string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
- let AsmString = asmName # AsmDPP8;
- let isAsmParserOnly = 1;
- let WaveSizePredicate = isWave64;
- }
- }
-
-} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11"
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
+ def _dpp_w32#Gen.Suffix :
+ Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP);
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ let DecoderNamespace = Gen.DecoderNamespace;
+ }
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
+ def _dpp_w64#Gen.Suffix :
+ Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ let DecoderNamespace = Gen.DecoderNamespace;
+ }
+}
+multiclass VOP2be_Real_dpp8<GFXGen Gen, bits<6> op, string opName, string asmName> {
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
+ def _dpp8#Gen.Suffix :
+ VOP2_DPP8_Gen<op, !cast<VOP2_Pseudo>(opName#"_e32"), Gen> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
+ }
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
+ def _dpp8_w32#Gen.Suffix :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8);
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ let DecoderNamespace = Gen.DecoderNamespace;
+ }
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
+ def _dpp8_w64#Gen.Suffix :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ let DecoderNamespace = Gen.DecoderNamespace;
+ }
+}
// We don't want to override separate decoderNamespaces within these
-multiclass VOP2_Realtriple_e64_gfx11<bits<6> op> {
- defm NAME : VOP3_Realtriple_gfx11<{0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, NAME> ;
+multiclass VOP2_Realtriple_e64<GFXGen Gen, bits<6> op> {
+ defm NAME : VOP3_Realtriple<Gen, {0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, NAME> ;
}
-multiclass VOP2_Realtriple_e64_with_name_gfx11<bits<6> op, string opName,
+
+multiclass VOP2_Realtriple_e64_with_name<GFXGen Gen, bits<6> op, string opName,
string asmName> {
- defm NAME : VOP3_Realtriple_with_name_gfx11<{0, 1, 0, 0, op{5-0}}, opName, asmName> ;
+ defm NAME : VOP3_Realtriple_with_name<Gen, {0, 1, 0, 0, op{5-0}}, opName, asmName> ;
}
-multiclass VOP2be_Real_gfx11<bits<6> op, string opName, string asmName> :
- VOP2be_Real_e32_gfx11<op, opName, asmName>,
- VOP3be_Realtriple_gfx11<{0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, opName, asmName>,
- VOP2be_Real_dpp_gfx11<op, opName, asmName>,
- VOP2be_Real_dpp8_gfx11<op, opName, asmName>;
+multiclass VOP2be_Real<GFXGen Gen, bits<6> op, string opName, string asmName> :
+ VOP2be_Real_e32<Gen, op, opName, asmName>,
+ VOP3be_Realtriple<Gen, {0, 1, 0, 0, op{5-0}}, /*isSingle=*/ 0, opName, asmName>,
+ VOP2be_Real_dpp<Gen, op, opName, asmName>,
+ VOP2be_Real_dpp8<Gen, op, opName, asmName>;
// Only for CNDMASK
-multiclass VOP2e_Real_gfx11<bits<6> op, string opName, string asmName> :
- VOP2_Real_e32_gfx11<op>,
- VOP2_Realtriple_e64_gfx11<op>,
- VOP2be_Real_dpp_gfx11<op, opName, asmName>,
- VOP2be_Real_dpp8_gfx11<op, opName, asmName>;
+multiclass VOP2e_Real<GFXGen Gen, bits<6> op, string opName, string asmName> :
+ VOP2_Real_e32<Gen, op>,
+ VOP2_Realtriple_e64<Gen, op>,
+ VOP2be_Real_dpp<Gen, op, opName, asmName>,
+ VOP2be_Real_dpp8<Gen, op, opName, asmName>;
+
+multiclass VOP2Only_Real<GFXGen Gen, bits<6> op> :
+ VOP2Only_Real_e32<Gen, op>,
+ VOP2_Real_dpp<Gen, op>,
+ VOP2_Real_dpp8<Gen, op>;
+
+multiclass VOP2_Real_FULL<GFXGen Gen, bits<6> op> :
+ VOP2_Realtriple_e64<Gen, op>,
+ VOP2_Real_e32<Gen, op>,
+ VOP2_Real_dpp<Gen, op>,
+ VOP2_Real_dpp8<Gen, op>;
+
+multiclass VOP2_Real_NO_VOP3_with_name<GFXGen Gen, bits<6> op, string opName,
+ string asmName, bit isSingle = 0> {
+ defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName, isSingle>,
+ VOP2_Real_dpp_with_name<Gen, op, opName, asmName>,
+ VOP2_Real_dpp8_with_name<Gen, op, opName, asmName>;
+ defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ def Gen.Suffix#"_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>;
+}
-multiclass VOP2Only_Real_gfx11<bits<6> op> :
- VOP2Only_Real_e32_gfx11<op>,
- VOP2_Real_dpp_gfx11<op>,
- VOP2_Real_dpp8_gfx11<op>;
+multiclass VOP2_Real_FULL_with_name<GFXGen Gen, bits<6> op, string opName,
+ string asmName> :
+ VOP2_Realtriple_e64_with_name<Gen, op, opName, asmName>,
+ VOP2_Real_NO_VOP3_with_name<Gen, op, opName, asmName>;
-multiclass VOP2_Real_NO_VOP3_gfx11<bits<6> op> :
- VOP2_Real_e32_gfx11<op>, VOP2_Real_dpp_gfx11<op>, VOP2_Real_dpp8_gfx11<op>;
+multiclass VOP2_Real_NO_DPP_with_name<GFXGen Gen, bits<6> op, string opName,
+ string asmName> {
+ defm NAME : VOP2_Real_e32_with_name<Gen, op, opName, asmName>,
+ VOP2_Real_e64_with_name<Gen, op, opName, asmName>;
+ defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
+ def Gen.Suffix#"_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>;
+}
-multiclass VOP2_Real_FULL_gfx11<bits<6> op> :
- VOP2_Realtriple_e64_gfx11<op>, VOP2_Real_NO_VOP3_gfx11<op>;
+multiclass VOP2_Real_NO_DPP_with_alias<GFXGen Gen, bits<6> op, string alias> {
+ defm NAME : VOP2_Real_e32<Gen, op>,
+ VOP2_Real_e64<Gen, op>;
+ def Gen.Suffix#"_alias" : MnemonicAlias<alias, NAME>, Requires<[Gen.AssemblerPredicate]>;
+}
-multiclass VOP2_Real_NO_VOP3_with_name_gfx11<bits<6> op, string opName,
- string asmName, bit isSingle = 0> {
+//===----------------------------------------------------------------------===//
+// GFX12.
+//===----------------------------------------------------------------------===//
- defm NAME : VOP2_Real_e32_with_name_gfx11<op, opName, asmName, isSingle>,
- VOP2_Real_dpp_with_name_gfx11<op, opName, asmName>,
- VOP2_Real_dpp8_with_name_gfx11<op, opName, asmName>;
- defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
- def _gfx11_alias : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>;
+multiclass VOP2be_Real_gfx12<bits<6> op, string opName, string asmName> :
+ VOP2be_Real<GFX12Gen, op, opName, asmName>;
+
+// Only for CNDMASK
+multiclass VOP2e_Real_gfx12<bits<6> op, string opName, string asmName> :
+ VOP2e_Real<GFX12Gen, op, opName, asmName>;
+
+multiclass VOP2_Real_FULL_with_name_gfx12<bits<6> op, string opName,
+ string asmName> :
+ VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
+
+multiclass VOP2_Real_FULL_t16_with_name_gfx12<bits<6> op, string opName,
+ string asmName, string alias> {
+ defm NAME : VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
+ def _gfx12_2nd_alias : MnemonicAlias<alias, asmName>, Requires<[isGFX12Only]>;
}
-multiclass VOP2_Real_FULL_with_name_gfx11<bits<6> op, string opName,
- string asmName> :
- VOP2_Realtriple_e64_with_name_gfx11<op, opName, asmName>,
- VOP2_Real_NO_VOP3_with_name_gfx11<op, opName, asmName>;
+multiclass VOP2_Real_NO_DPP_with_name_gfx12<bits<6> op, string opName,
+ string asmName> :
+ VOP2_Real_NO_DPP_with_name<GFX12Gen, op, opName, asmName>;
-multiclass VOP2_Real_FULL_t16_gfx11<bits<6> op, string asmName, string opName = NAME>
- : VOP2_Real_FULL_with_name_gfx11<op, opName, asmName>;
+multiclass VOP2_Real_NO_DPP_with_alias_gfx12<bits<6> op, string alias> :
+ VOP2_Real_NO_DPP_with_alias<GFX12Gen, op, alias>;
-multiclass VOP2_Real_NO_DPP_gfx11<bits<6> op> :
- VOP2_Real_e32_gfx11<op>, VOP2_Real_e64_gfx11<op>;
+defm V_ADD_F64 : VOP2_Real_NO_DPP_with_name_gfx12<0x002, "V_ADD_F64_pseudo", "v_add_f64">;
+defm V_MUL_F64 : VOP2_Real_NO_DPP_with_name_gfx12<0x006, "V_MUL_F64_pseudo", "v_mul_f64">;
+defm V_LSHLREV_B64 : VOP2_Real_NO_DPP_with_name_gfx12<0x01f, "V_LSHLREV_B64_pseudo", "v_lshlrev_b64">;
+defm V_MIN_NUM_F64 : VOP2_Real_NO_DPP_with_alias_gfx12<0x00d, "v_min_f64">;
+defm V_MAX_NUM_F64 : VOP2_Real_NO_DPP_with_alias_gfx12<0x00e, "v_max_f64">;
-multiclass VOP2_Real_NO_DPP_with_name_gfx11<bits<6> op, string opName,
- string asmName> {
- defm NAME : VOP2_Real_e32_with_name_gfx11<op, opName, asmName>,
- VOP2_Real_e64_with_name_gfx11<op, opName, asmName>;
+defm V_CNDMASK_B32 : VOP2e_Real_gfx12<0x001, "V_CNDMASK_B32", "v_cndmask_b32">;
+defm V_ADD_CO_CI_U32 :
+ VOP2be_Real_gfx12<0x020, "V_ADDC_U32", "v_add_co_ci_u32">;
+defm V_SUB_CO_CI_U32 :
+ VOP2be_Real_gfx12<0x021, "V_SUBB_U32", "v_sub_co_ci_u32">;
+defm V_SUBREV_CO_CI_U32 :
+ VOP2be_Real_gfx12<0x022, "V_SUBBREV_U32", "v_subrev_co_ci_u32">;
+
+defm V_MIN_NUM_F32 : VOP2_Real_FULL_with_name_gfx12<0x015, "V_MIN_F32", "v_min_num_f32">;
+defm V_MAX_NUM_F32 : VOP2_Real_FULL_with_name_gfx12<0x016, "V_MAX_F32", "v_max_num_f32">;
+defm V_MIN_NUM_F16 : VOP2_Real_FULL_t16_with_name_gfx12<0x030, "V_MIN_F16_t16", "v_min_num_f16", "v_min_f16">;
+defm V_MIN_NUM_F16_fake16 : VOP2_Real_FULL_t16_with_name_gfx12<0x030, "V_MIN_F16_fake16", "v_min_num_f16", "v_min_f16">;
+defm V_MAX_NUM_F16 : VOP2_Real_FULL_t16_with_name_gfx12<0x031, "V_MAX_F16_t16", "v_max_num_f16", "v_max_f16">;
+defm V_MAX_NUM_F16_fake16 : VOP2_Real_FULL_t16_with_name_gfx12<0x031, "V_MAX_F16_fake16", "v_max_num_f16", "v_max_f16">;
+
+let SubtargetPredicate = isGFX12Plus in {
+ defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx12>;
+
+ defm : VOP2bInstAliases<
+ V_ADDC_U32_e32, V_ADD_CO_CI_U32_e32_gfx12, "v_add_co_ci_u32">;
+ defm : VOP2bInstAliases<
+ V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx12, "v_sub_co_ci_u32">;
+ defm : VOP2bInstAliases<
+ V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx12, "v_subrev_co_ci_u32">;
+} // End SubtargetPredicate = isGFX12Plus
+
+//===----------------------------------------------------------------------===//
+// GFX11.
+//===----------------------------------------------------------------------===//
+
+multiclass VOP2be_Real_gfx11<bits<6> op, string opName, string asmName> :
+ VOP2be_Real<GFX11Gen, op, opName, asmName>;
+
+// Only for CNDMASK
+multiclass VOP2e_Real_gfx11<bits<6> op, string opName, string asmName> :
+ VOP2e_Real<GFX11Gen, op, opName, asmName>;
+
+multiclass VOP2_Real_NO_VOP3_with_name_gfx11<bits<6> op, string opName,
+ string asmName, bit isSingle = 0> {
+ defm NAME : VOP2_Real_e32_with_name<GFX11Gen, op, opName, asmName, isSingle>,
+ VOP2_Real_dpp_with_name<GFX11Gen, op, opName, asmName>,
+ VOP2_Real_dpp8_with_name<GFX11Gen, op, opName, asmName>;
defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
- def _gfx11_alias : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>;
+ def _gfx11_alias : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Only]>;
}
+multiclass VOP2_Real_NO_DPP_with_name_gfx11<bits<6> op, string opName,
+ string asmName> :
+ VOP2_Real_NO_DPP_with_name<GFX11Gen, op, opName, asmName>;
+
+multiclass VOP2_Real_FULL_gfx11_gfx12<bits<6> op> :
+ VOP2_Real_FULL<GFX11Gen, op>, VOP2_Real_FULL<GFX12Gen, op>;
+
+multiclass VOP2_Real_FULL_with_name_gfx11_gfx12<bits<6> op, string opName,
+ string asmName> :
+ VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>,
+ VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
+
+multiclass VOP2_Real_e32_gfx11_gfx12<bits<6> op> :
+ VOP2Only_Real<GFX11Gen, op>, VOP2Only_Real<GFX12Gen, op>;
+
+multiclass VOP3Only_Realtriple_gfx11_gfx12<bits<10> op> :
+ VOP3Only_Realtriple<GFX11Gen, op>, VOP3Only_Realtriple<GFX12Gen, op>;
+
+multiclass VOP3Only_Realtriple_t16_gfx11_gfx12<bits<10> op, string asmName> :
+ VOP3Only_Realtriple_t16<GFX11Gen, op, asmName>,
+ VOP3Only_Realtriple_t16<GFX12Gen, op, asmName>;
+
+multiclass VOP3beOnly_Realtriple_gfx11_gfx12<bits<10> op> :
+ VOP3beOnly_Realtriple<GFX11Gen, op>, VOP3beOnly_Realtriple<GFX12Gen, op>;
+
+multiclass VOP2Only_Real_MADK_with_name_gfx11_gfx12<bits<6> op, string asmName,
+ string opName = NAME> :
+ VOP2Only_Real_MADK_with_name<GFX11Gen, op, asmName, opName>,
+ VOP2Only_Real_MADK_with_name<GFX12Gen, op, asmName, opName>;
+
+multiclass VOP2_Real_FULL_t16_gfx11<bits<6> op, string asmName,
+ string opName = NAME> :
+ VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>;
+
+multiclass VOP2_Real_FULL_t16_gfx11_gfx12<bits<6> op, string asmName,
+ string opName = NAME> :
+ VOP2_Real_FULL_with_name_gfx11_gfx12<op, opName, asmName>;
+
+multiclass VOP2_Real_FULL_gfx11<bits<6> op> :
+ VOP2_Real_FULL<GFX11Gen, op>;
+
defm V_CNDMASK_B32 : VOP2e_Real_gfx11<0x001, "V_CNDMASK_B32",
"v_cndmask_b32">;
defm V_DOT2ACC_F32_F16 : VOP2_Real_NO_VOP3_with_name_gfx11<0x002,
"V_DOT2C_F32_F16", "v_dot2acc_f32_f16", 1>;
defm V_FMAC_DX9_ZERO_F32 : VOP2_Real_NO_DPP_with_name_gfx11<0x006,
"V_FMAC_LEGACY_F32", "v_fmac_dx9_zero_f32">;
-defm V_MUL_DX9_ZERO_F32 : VOP2_Real_FULL_with_name_gfx11<0x007,
+defm V_MUL_DX9_ZERO_F32 : VOP2_Real_FULL_with_name_gfx11_gfx12<0x007,
"V_MUL_LEGACY_F32", "v_mul_dx9_zero_f32">;
-defm V_LSHLREV_B32 : VOP2_Real_FULL_gfx11<0x018>;
-defm V_LSHRREV_B32 : VOP2_Real_FULL_gfx11<0x019>;
-defm V_ASHRREV_I32 : VOP2_Real_FULL_gfx11<0x01a>;
+defm V_LSHLREV_B32 : VOP2_Real_FULL_gfx11_gfx12<0x018>;
+defm V_LSHRREV_B32 : VOP2_Real_FULL_gfx11_gfx12<0x019>;
+defm V_ASHRREV_I32 : VOP2_Real_FULL_gfx11_gfx12<0x01a>;
defm V_ADD_CO_CI_U32 :
VOP2be_Real_gfx11<0x020, "V_ADDC_U32", "v_add_co_ci_u32">;
defm V_SUB_CO_CI_U32 :
@@ -1486,37 +1663,43 @@ defm V_SUB_CO_CI_U32 :
defm V_SUBREV_CO_CI_U32 :
VOP2be_Real_gfx11<0x022, "V_SUBBREV_U32", "v_subrev_co_ci_u32">;
-defm V_CVT_PK_RTZ_F16_F32 : VOP2_Real_FULL_with_name_gfx11<0x02f,
+defm V_CVT_PK_RTZ_F16_F32 : VOP2_Real_FULL_with_name_gfx11_gfx12<0x02f,
"V_CVT_PKRTZ_F16_F32", "v_cvt_pk_rtz_f16_f32">;
-defm V_PK_FMAC_F16 : VOP2Only_Real_gfx11<0x03c>;
-
-defm V_ADD_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x032, "v_add_f16">;
-defm V_SUB_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x033, "v_sub_f16">;
-defm V_SUBREV_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x034, "v_subrev_f16">;
-defm V_MUL_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x035, "v_mul_f16">;
-defm V_FMAC_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x036, "v_fmac_f16">;
-defm V_LDEXP_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x03b, "v_ldexp_f16">;
+defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx11_gfx12<0x03c>;
+
+defm V_ADD_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">;
+defm V_ADD_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x032, "v_add_f16">;
+defm V_SUB_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x033, "v_sub_f16">;
+defm V_SUB_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x033, "v_sub_f16">;
+defm V_SUBREV_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16">;
+defm V_SUBREV_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16">;
+defm V_MUL_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
+defm V_MUL_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
+defm V_FMAC_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">;
+defm V_LDEXP_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
defm V_MAX_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
+defm V_MAX_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
defm V_MIN_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
-defm V_FMAMK_F16_t16 : VOP2Only_Real_MADK_gfx11_with_name<0x037, "v_fmamk_f16">;
-defm V_FMAAK_F16_t16 : VOP2Only_Real_MADK_gfx11_with_name<0x038, "v_fmaak_f16">;
+defm V_MIN_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
+defm V_FMAMK_F16_t16 : VOP2Only_Real_MADK_with_name_gfx11_gfx12<0x037, "v_fmamk_f16">;
+defm V_FMAAK_F16_t16 : VOP2Only_Real_MADK_with_name_gfx11_gfx12<0x038, "v_fmaak_f16">;
// VOP3 only.
-defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11<0x25d>;
-defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11<0x31c>;
-defm V_BFM_B32 : VOP3Only_Realtriple_gfx11<0x31d>;
-defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11<0x31e>;
-defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11<0x31f>;
-defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11<0x320>;
-defm V_CVT_PK_NORM_I16_F32 : VOP3Only_Realtriple_with_name_gfx11<0x321, "V_CVT_PKNORM_I16_F32", "v_cvt_pk_norm_i16_f32">;
-defm V_CVT_PK_NORM_U16_F32 : VOP3Only_Realtriple_with_name_gfx11<0x322, "V_CVT_PKNORM_U16_F32", "v_cvt_pk_norm_u16_f32">;
-defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11<0x323>;
-defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11<0x324>;
-defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x300>;
-defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x301>;
-defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x302>;
-
-let SubtargetPredicate = isGFX11Plus in {
+defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11_gfx12<0x25d>;
+defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11_gfx12<0x31c>;
+defm V_BFM_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31d>;
+defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31e>;
+defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x31f>;
+defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11_gfx12<0x320>;
+defm V_CVT_PK_NORM_I16_F32 : VOP3Only_Realtriple_with_name_gfx11_gfx12<0x321, "V_CVT_PKNORM_I16_F32", "v_cvt_pk_norm_i16_f32">;
+defm V_CVT_PK_NORM_U16_F32 : VOP3Only_Realtriple_with_name_gfx11_gfx12<0x322, "V_CVT_PKNORM_U16_F32", "v_cvt_pk_norm_u16_f32">;
+defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11_gfx12<0x323>;
+defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11_gfx12<0x324>;
+defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12<0x300>;
+defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12<0x301>;
+defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11_gfx12<0x302>;
+
+let SubtargetPredicate = isGFX11Only in {
defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx11>;
defm : VOP2bInstAliases<
@@ -1525,7 +1708,7 @@ let SubtargetPredicate = isGFX11Plus in {
V_SUBB_U32_e32, V_SUB_CO_CI_U32_e32_gfx11, "v_sub_co_ci_u32">;
defm : VOP2bInstAliases<
V_SUBBREV_U32_e32, V_SUBREV_CO_CI_U32_e32_gfx11, "v_subrev_co_ci_u32">;
-} // End SubtargetPredicate = isGFX11Plus
+} // End SubtargetPredicate = isGFX11Only
//===----------------------------------------------------------------------===//
// GFX10.
@@ -1747,7 +1930,10 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10"
multiclass VOP2Only_Real_MADK_gfx10_gfx11<bits<6> op> :
- VOP2Only_Real_MADK_gfx10<op>, VOP2Only_Real_MADK_gfx11<op>;
+ VOP2Only_Real_MADK_gfx10<op>, VOP2Only_Real_MADK<GFX11Gen, op>;
+
+multiclass VOP2Only_Real_MADK_gfx10_gfx11_gfx12<bits<6> op> :
+ VOP2Only_Real_MADK_gfx10_gfx11<op>, VOP2Only_Real_MADK<GFX12Gen, op>;
multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> :
VOP2be_Real_e32_gfx10<op, opName, asmName>,
@@ -1768,7 +1954,10 @@ multiclass VOP2_Real_gfx10<bits<6> op> :
VOP2_Real_sdwa_gfx10<op>, VOP2_Real_dpp_gfx10<op>, VOP2_Real_dpp8_gfx10<op>;
multiclass VOP2_Real_gfx10_gfx11<bits<6> op> :
- VOP2_Real_gfx10<op>, VOP2_Real_FULL_gfx11<op>;
+ VOP2_Real_gfx10<op>, VOP2_Real_FULL<GFX11Gen, op>;
+
+multiclass VOP2_Real_gfx10_gfx11_gfx12<bits<6> op> :
+ VOP2_Real_gfx10_gfx11<op>, VOP2_Real_FULL<GFX12Gen, op>;
multiclass VOP2_Real_with_name_gfx10<bits<6> op, string opName,
string asmName> :
@@ -1778,19 +1967,20 @@ multiclass VOP2_Real_with_name_gfx10<bits<6> op, string opName,
VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>,
VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>;
-multiclass VOP2_Real_with_name_gfx10_gfx11<bits<6> op, string opName,
- string asmName> :
+multiclass VOP2_Real_with_name_gfx10_gfx11_gfx12<bits<6> op, string opName,
+ string asmName> :
VOP2_Real_with_name_gfx10<op, opName, asmName>,
- VOP2_Real_FULL_with_name_gfx11<op, opName, asmName>;
+ VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>,
+ VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;
// NB: Same opcode as v_mac_legacy_f32
let DecoderNamespace = "GFX10_B" in
defm V_FMAC_LEGACY_F32 : VOP2_Real_gfx10<0x006>;
-defm V_XNOR_B32 : VOP2_Real_gfx10_gfx11<0x01e>;
-defm V_FMAC_F32 : VOP2_Real_gfx10_gfx11<0x02b>;
-defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10_gfx11<0x02c>;
-defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10_gfx11<0x02d>;
+defm V_XNOR_B32 : VOP2_Real_gfx10_gfx11_gfx12<0x01e>;
+defm V_FMAC_F32 : VOP2_Real_gfx10_gfx11_gfx12<0x02b>;
+defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10_gfx11_gfx12<0x02c>;
+defm V_FMAAK_F32 : VOP2Only_Real_MADK_gfx10_gfx11_gfx12<0x02d>;
defm V_ADD_F16 : VOP2_Real_gfx10<0x032>;
defm V_SUB_F16 : VOP2_Real_gfx10<0x033>;
defm V_SUBREV_F16 : VOP2_Real_gfx10<0x034>;
@@ -1808,11 +1998,11 @@ let IsSingle = 1 in {
// VOP2 no carry-in, carry-out.
defm V_ADD_NC_U32 :
- VOP2_Real_with_name_gfx10_gfx11<0x025, "V_ADD_U32", "v_add_nc_u32">;
+ VOP2_Real_with_name_gfx10_gfx11_gfx12<0x025, "V_ADD_U32", "v_add_nc_u32">;
defm V_SUB_NC_U32 :
- VOP2_Real_with_name_gfx10_gfx11<0x026, "V_SUB_U32", "v_sub_nc_u32">;
+ VOP2_Real_with_name_gfx10_gfx11_gfx12<0x026, "V_SUB_U32", "v_sub_nc_u32">;
defm V_SUBREV_NC_U32 :
- VOP2_Real_with_name_gfx10_gfx11<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">;
+ VOP2_Real_with_name_gfx10_gfx11_gfx12<0x027, "V_SUBREV_U32", "v_subrev_nc_u32">;
// VOP2 carry-in, carry-out.
defm V_ADD_CO_CI_U32 :
@@ -1905,7 +2095,10 @@ multiclass VOP2_Real_gfx6_gfx7_gfx10<bits<6> op> :
VOP2_Real_gfx6_gfx7<op>, VOP2_Real_gfx10<op>;
multiclass VOP2_Real_gfx6_gfx7_gfx10_gfx11<bits<6> op> :
- VOP2_Real_gfx6_gfx7_gfx10<op>, VOP2_Real_FULL_gfx11<op>;
+ VOP2_Real_gfx6_gfx7_gfx10<op>, VOP2_Real_FULL<GFX11Gen, op>;
+
+multiclass VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<6> op> :
+ VOP2_Real_gfx6_gfx7_gfx10_gfx11<op>, VOP2_Real_FULL<GFX12Gen, op>;
multiclass VOP2be_Real_gfx6_gfx7<bits<6> op> :
VOP2_Real_e32_gfx6_gfx7<op>, VOP2be_Real_e64_gfx6_gfx7<op>;
@@ -1967,28 +2160,28 @@ let SubtargetPredicate = isGFX6GFX7 in {
def : VOP2e64InstAlias<V_SUBREV_CO_U32_e64, V_SUBREV_I32_e64_gfx6_gfx7>;
} // End SubtargetPredicate = isGFX6GFX7
-defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x003>;
-defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x004>;
-defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x005>;
+defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x003>;
+defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x004>;
+defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x005>;
defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>;
defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>;
-defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x008>;
-defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x009>;
-defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00a>;
-defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00b>;
-defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00c>;
+defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x008>;
+defm V_MUL_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x009>;
+defm V_MUL_HI_I32_I24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00a>;
+defm V_MUL_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00b>;
+defm V_MUL_HI_U32_U24 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x00c>;
defm V_MIN_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x00f>;
defm V_MAX_F32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x010>;
-defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x011>;
-defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x012>;
-defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x013>;
-defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x014>;
+defm V_MIN_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x011>;
+defm V_MAX_I32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x012>;
+defm V_MIN_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x013>;
+defm V_MAX_U32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x014>;
defm V_LSHRREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x016>;
defm V_ASHRREV_I32 : VOP2_Real_gfx6_gfx7_gfx10<0x018>;
defm V_LSHLREV_B32 : VOP2_Real_gfx6_gfx7_gfx10<0x01a>;
-defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01b>;
-defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01c>;
-defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11<0x01d>;
+defm V_AND_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x01b>;
+defm V_OR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x01c>;
+defm V_XOR_B32 : VOP2_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x01d>;
defm V_MAC_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x01f>;
defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x02f>;
defm V_MADMK_F32 : VOP2Only_Real_MADK_gfx6_gfx7_gfx10<0x020>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index c0e0ac1b4ec8..eebd323210f9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -144,11 +144,15 @@ defm V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_a
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
defm V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, any_fma>;
+let SubtargetPredicate = isNotGFX12Plus in {
defm V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd>;
defm V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fmul>;
+} // End SubtargetPredicate = isNotGFX12Plus
} // End FPDPRounding = 1
+let SubtargetPredicate = isNotGFX12Plus in {
defm V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like>;
defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaxnum_like>;
+} // End SubtargetPredicate = isNotGFX12Plus
} // End SchedRW = [WriteDoubleAdd]
let SchedRW = [WriteIntMul] in {
@@ -157,6 +161,19 @@ defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF<VOP_I32_I32_I32>, mulhu
defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF<VOP_I32_I32_I32>>;
defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF<VOP_I32_I32_I32>, mulhs>;
} // End SchedRW = [WriteIntMul]
+
+let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fminimum>>;
+defm V_MAXIMUM_F32 : VOP3Inst <"v_maximum_f32", VOP3_Profile<VOP_F32_F32_F32>, DivergentBinFrag<fmaximum>>;
+defm V_MINIMUM_F16 : VOP3Inst <"v_minimum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fminimum>>;
+defm V_MAXIMUM_F16 : VOP3Inst <"v_maximum_f16", VOP3_Profile<VOP_F16_F16_F16>, DivergentBinFrag<fmaximum>>;
+
+let SchedRW = [WriteDoubleAdd] in {
+defm V_MINIMUM_F64 : VOP3Inst <"v_minimum_f64", VOP3_Profile<VOP_F64_F64_F64>, fminimum>;
+defm V_MAXIMUM_F64 : VOP3Inst <"v_maximum_f64", VOP3_Profile<VOP_F64_F64_F64>, fmaximum>;
+} // End SchedRW = [WriteDoubleAdd]
+} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+
} // End isReMaterializable = 1
let Uses = [MODE, VCC, EXEC] in {
@@ -207,6 +224,11 @@ let mayRaiseFPException = 0 in {
defm V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
} // End mayRaiseFPException = 0
+let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+ defm V_MINIMUM3_F32 : VOP3Inst <"v_minimum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfminimum3>;
+ defm V_MAXIMUM3_F32 : VOP3Inst <"v_maximum3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmaximum3>;
+} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+
let isCommutable = 1 in {
defm V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
defm V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -254,10 +276,13 @@ let SchedRW = [Write64Bit] in {
} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = isGFX8Plus in {
- defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>;
defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshr_rev_64>;
defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, cashr_rev_64>;
} // End SubtargetPredicate = isGFX8Plus
+
+ let SubtargetPredicate = isGFX8GFX9GFX10GFX11 in {
+ defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, clshl_rev_64>;
+ } // End SubtargetPredicate = isGFX8GFX9GFX10GFX11
} // End SchedRW = [Write64Bit]
} // End isReMaterializable = 1
@@ -515,6 +540,16 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
let HasExtVOP3DPP = 0;
}
+def IsPow2Plus1: PatLeaf<(i32 imm), [{
+ uint32_t V = N->getZExtValue();
+ return isPowerOf2_32(V - 1);
+}]>;
+
+def Log2_32: SDNodeXForm<imm, [{
+ uint32_t V = N->getZExtValue();
+ return CurDAG->getTargetConstant(Log2_32(V - 1), SDLoc(N), MVT::i32);
+}]>;
+
let SubtargetPredicate = isGFX9Plus in {
let isCommutable = 1, isReMaterializable = 1 in {
defm V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
@@ -538,6 +573,11 @@ defm V_MAX3_F16 : VOP3Inst <"v_max3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3
defm V_MAX3_I16 : VOP3Inst <"v_max3_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUsmax3>;
defm V_MAX3_U16 : VOP3Inst <"v_max3_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_OPSEL>, AMDGPUumax3>;
+let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+ defm V_MINIMUM3_F16 : VOP3Inst <"v_minimum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfminimum3>;
+ defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUfmaximum3>;
+} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+
defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile<VOP_I16_I16_I16, VOP3_OPSEL>>;
@@ -612,6 +652,10 @@ def : ThreeOp_i32_Pats<and, or, V_AND_OR_B32_e64>;
def : ThreeOp_i32_Pats<or, or, V_OR3_B32_e64>;
def : ThreeOp_i32_Pats<xor, add, V_XAD_U32_e64>;
+def : GCNPat<
+ (DivergentBinFrag<mul> i32:$src0, IsPow2Plus1:$src1),
+ (V_LSHL_ADD_U32_e64 i32:$src0, (i32 (Log2_32 imm:$src1)), i32:$src0)>;
+
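The IsPow2Plus1 / Log2_32 pair added above matches a multiply by an immediate of the form 2^n + 1 and rewrites it as a shift-and-add through V_LSHL_ADD_U32, since x * (2^n + 1) == (x << n) + x. A minimal standalone sketch of that arithmetic (not part of the patch; isPow2Plus1 and log2u32 are hypothetical stand-ins for LLVM's isPowerOf2_32 and Log2_32):

// Standalone sketch, not part of the patch: checks the rewrite the
// IsPow2Plus1 + Log2_32 pattern performs. isPow2Plus1/log2u32 are local
// stand-ins for LLVM's isPowerOf2_32 and Log2_32 helpers.
#include <cassert>
#include <cstdint>
#include <initializer_list>

static bool isPow2Plus1(uint32_t V) { return V > 1 && ((V - 1) & (V - 2)) == 0; }
static uint32_t log2u32(uint32_t V) { uint32_t L = 0; while (V >>= 1) ++L; return L; }

int main() {
  for (uint32_t V : {3u, 5u, 9u, 17u, 1025u}) {
    assert(isPow2Plus1(V));
    uint32_t Sh = log2u32(V - 1);         // shift amount produced by Log2_32
    for (uint32_t X : {1u, 7u, 12345u})
      assert(X * V == (X << Sh) + X);     // what V_LSHL_ADD_U32 computes
  }
  return 0;
}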
let SubtargetPredicate = isGFX940Plus in
def : GCNPat<
(ThreeOpFrag<shl_0_to_4, add> i64:$src0, i32:$src1, i64:$src2),
@@ -664,11 +708,22 @@ multiclass IMAD32_Pats <VOP3_Pseudo inst> {
>;
}
+// Handle cases where amdgpu-codegenprepare-mul24 made a mul24 instead of a normal mul.
+// We need to separate this because otherwise OtherPredicates would be overridden.
+class IMAD32_Mul24_Pat<VOP3_Pseudo inst>: GCNPat <
+ (i64 (add (i64 (AMDGPUmul_u24 i32:$src0, i32:$src1)), i64:$src2)),
+ (inst $src0, $src1, $src2, 0 /* clamp */)
+ >;
+
// exclude pre-GFX9 where it was slow
-let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in
+let OtherPredicates = [HasNotMADIntraFwdBug], SubtargetPredicate = isGFX9Plus in {
defm : IMAD32_Pats<V_MAD_U64_U32_e64>;
-let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in
+ def : IMAD32_Mul24_Pat<V_MAD_U64_U32_e64>;
+}
+let OtherPredicates = [HasMADIntraFwdBug], SubtargetPredicate = isGFX11Only in {
defm : IMAD32_Pats<V_MAD_U64_U32_gfx11_e64>;
+ def : IMAD32_Mul24_Pat<V_MAD_U64_U32_gfx11_e64>;
+}
def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3_OPSEL> {
let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
@@ -680,6 +735,15 @@ def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3
let HasExtDPP = 0;
}
+def VOP3_PERMLANE_VAR_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, untyped]>, VOP3_OPSEL> {
+ let InsVOP3OpSel = (ins IntOpSelMods:$src0_modifiers, VRegSrc_32:$src0,
+ IntOpSelMods:$src1_modifiers, VRegSrc_32:$src1,
+ VGPR_32:$vdst_in, op_sel0:$op_sel);
+ let HasClamp = 0;
+ let HasExtVOP3DPP = 0;
+ let HasExtDPP = 0;
+}
+
def opsel_i1timm : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(
N->getZExtValue() ? SISrcMods::OP_SEL_0 : SISrcMods::NONE,
@@ -696,6 +760,13 @@ class PermlanePat<SDPatternOperator permlane,
SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
>;
+class PermlaneVarPat<SDPatternOperator permlane,
+ Instruction inst> : GCNPat<
+ (permlane i32:$vdst_in, i32:$src0, i32:$src1,
+ timm:$fi, timm:$bc),
+ (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc),
+ VGPR_32:$src1, VGPR_32:$vdst_in)
+>;
let SubtargetPredicate = isGFX10Plus in {
let isCommutable = 1, isReMaterializable = 1 in {
@@ -726,6 +797,17 @@ let SubtargetPredicate = isGFX10Plus in {
} // End SubtargetPredicate = isGFX10Plus
+let SubtargetPredicate = isGFX12Plus in {
+ let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+ defm V_PERMLANE16_VAR_B32 : VOP3Inst<"v_permlane16_var_b32", VOP3_PERMLANE_VAR_Profile>;
+ defm V_PERMLANEX16_VAR_B32 : VOP3Inst<"v_permlanex16_var_b32", VOP3_PERMLANE_VAR_Profile>;
+ } // End $vdst = $vdst_in, DisableEncoding $vdst_in
+
+ def : PermlaneVarPat<int_amdgcn_permlane16_var, V_PERMLANE16_VAR_B32_e64>;
+ def : PermlaneVarPat<int_amdgcn_permlanex16_var, V_PERMLANEX16_VAR_B32_e64>;
+
+} // End SubtargetPredicate = isGFX12Plus
+
class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
(AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
(vt (VOP3Mods vt:$src1, i32:$src1_modifiers)),
@@ -773,11 +855,61 @@ let SubtargetPredicate = isGFX11Plus in {
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
} // End SubtargetPredicate = isGFX11Plus
+let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+ defm V_MAXIMUMMINIMUM_F32 : VOP3Inst<"v_maximumminimum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+ defm V_MINIMUMMAXIMUM_F32 : VOP3Inst<"v_minimummaximum_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+ defm V_MAXIMUMMINIMUM_F16 : VOP3Inst<"v_maximumminimum_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+ defm V_MINIMUMMAXIMUM_F16 : VOP3Inst<"v_minimummaximum_f16", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
+} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
+
let SubtargetPredicate = HasDot9Insts, IsDOT=1 in {
defm V_DOT2_F16_F16 : VOP3Inst<"v_dot2_f16_f16", VOP3_DOT_Profile<VOP_F16_V2F16_V2F16_F16>, int_amdgcn_fdot2_f16_f16>;
defm V_DOT2_BF16_BF16 : VOP3Inst<"v_dot2_bf16_bf16", VOP3_DOT_Profile<VOP_I16_V2I16_V2I16_I16>, int_amdgcn_fdot2_bf16_bf16>;
}
+class VOP_Pseudo_Scalar<RegisterClass Dst, RegisterOperand SrcOp,
+ ValueType dstVt, ValueType srcVt = dstVt>
+ : VOPProfile<[dstVt, srcVt, untyped, untyped]> {
+ let DstRC = VOPDstOperand<Dst>;
+ let Src0RC64 = SrcOp;
+
+ let HasOMod = 1;
+ let HasModifiers = 1;
+}
+
+def VOP_Pseudo_Scalar_F32 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f32, f32>;
+def VOP_Pseudo_Scalar_F16 : VOP_Pseudo_Scalar<SReg_32_XEXEC, SSrc_f16, f32, f16>;
+
+let SubtargetPredicate = HasPseudoScalarTrans, TRANS = 1,
+ isReMaterializable = 1, SchedRW = [WritePseudoScalarTrans] in {
+ defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", VOP_Pseudo_Scalar_F32, AMDGPUexp>;
+ defm V_S_EXP_F16 : VOP3PseudoScalarInst<"v_s_exp_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_LOG_F32 : VOP3PseudoScalarInst<"v_s_log_f32", VOP_Pseudo_Scalar_F32, AMDGPUlog>;
+ defm V_S_LOG_F16 : VOP3PseudoScalarInst<"v_s_log_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_RCP_F32 : VOP3PseudoScalarInst<"v_s_rcp_f32", VOP_Pseudo_Scalar_F32, AMDGPUrcp>;
+ defm V_S_RCP_F16 : VOP3PseudoScalarInst<"v_s_rcp_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_RSQ_F32 : VOP3PseudoScalarInst<"v_s_rsq_f32", VOP_Pseudo_Scalar_F32, AMDGPUrsq>;
+ defm V_S_RSQ_F16 : VOP3PseudoScalarInst<"v_s_rsq_f16", VOP_Pseudo_Scalar_F16>;
+ defm V_S_SQRT_F32 : VOP3PseudoScalarInst<"v_s_sqrt_f32", VOP_Pseudo_Scalar_F32, any_amdgcn_sqrt>;
+ defm V_S_SQRT_F16 : VOP3PseudoScalarInst<"v_s_sqrt_f16", VOP_Pseudo_Scalar_F16>;
+}
+
+class PseudoScalarPatF16<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat <
+ (f16 (UniformUnaryFrag<node> (f16 (VOP3Mods0 f16:$src0, i32:$src0_modifiers,
+ i1:$clamp, i32:$omod)))),
+ (f16 (COPY_TO_REGCLASS (f32 (inst i32:$src0_modifiers, f16:$src0, i1:$clamp,
+ i32:$omod)),
+ SReg_32_XEXEC))
+>;
+
+let SubtargetPredicate = HasPseudoScalarTrans in {
+ def : PseudoScalarPatF16<AMDGPUexpf16, V_S_EXP_F16_e64>;
+ def : PseudoScalarPatF16<AMDGPUlogf16, V_S_LOG_F16_e64>;
+ def : PseudoScalarPatF16<AMDGPUrcp, V_S_RCP_F16_e64>;
+ def : PseudoScalarPatF16<AMDGPUrsq, V_S_RSQ_F16_e64>;
+ def : PseudoScalarPatF16<any_amdgcn_sqrt, V_S_SQRT_F16_e64>;
+}
+
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
@@ -823,125 +955,195 @@ def : IntClampPat<V_MQSAD_U32_U8_e64, int_amdgcn_mqsad_u32_u8>;
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// GFX11.
+// GFX12.
+//===----------------------------------------------------------------------===//
+
+defm V_MIN3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x229, "V_MIN3_F32", "v_min3_num_f32">;
+defm V_MAX3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x22a, "V_MAX3_F32", "v_max3_num_f32">;
+defm V_MIN3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22b, "V_MIN3_F16", "v_min3_num_f16">;
+defm V_MAX3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x22c, "V_MAX3_F16", "v_max3_num_f16">;
+defm V_MINIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22d>;
+defm V_MAXIMUM3_F32 : VOP3Only_Realtriple_gfx12<0x22e>;
+defm V_MINIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x22f>;
+defm V_MAXIMUM3_F16 : VOP3Only_Realtriple_t16_gfx12<0x230>;
+defm V_MED3_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x231, "V_MED3_F32", "v_med3_num_f32">;
+defm V_MED3_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x232, "V_MED3_F16", "v_med3_num_f16">;
+defm V_MINMAX_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x268, "V_MINMAX_F32", "v_minmax_num_f32">;
+defm V_MAXMIN_NUM_F32 : VOP3_Realtriple_with_name_gfx12<0x269, "V_MAXMIN_F32", "v_maxmin_num_f32">;
+defm V_MINMAX_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26a, "V_MINMAX_F16", "v_minmax_num_f16">;
+defm V_MAXMIN_NUM_F16 : VOP3_Realtriple_with_name_gfx12<0x26b, "V_MAXMIN_F16", "v_maxmin_num_f16">;
+defm V_MINIMUMMAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26c>;
+defm V_MAXIMUMMINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x26d>;
+defm V_MINIMUMMAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26e>;
+defm V_MAXIMUMMINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x26f>;
+defm V_S_EXP_F32 : VOP3Only_Real_Base_gfx12<0x280>;
+defm V_S_EXP_F16 : VOP3Only_Real_Base_gfx12<0x281>;
+defm V_S_LOG_F32 : VOP3Only_Real_Base_gfx12<0x282>;
+defm V_S_LOG_F16 : VOP3Only_Real_Base_gfx12<0x283>;
+defm V_S_RCP_F32 : VOP3Only_Real_Base_gfx12<0x284>;
+defm V_S_RCP_F16 : VOP3Only_Real_Base_gfx12<0x285>;
+defm V_S_RSQ_F32 : VOP3Only_Real_Base_gfx12<0x286>;
+defm V_S_RSQ_F16 : VOP3Only_Real_Base_gfx12<0x287>;
+defm V_S_SQRT_F32 : VOP3Only_Real_Base_gfx12<0x288>;
+defm V_S_SQRT_F16 : VOP3Only_Real_Base_gfx12<0x289>;
+defm V_MAD_CO_U64_U32 : VOP3be_Real_with_name_gfx12<0x2fe, "V_MAD_U64_U32", "v_mad_co_u64_u32">;
+defm V_MAD_CO_I64_I32 : VOP3be_Real_with_name_gfx12<0x2ff, "V_MAD_I64_I32", "v_mad_co_i64_i32">;
+defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12<0x341>;
+defm V_MAXIMUM_F64 : VOP3Only_Real_Base_gfx12<0x342>;
+defm V_MINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x365>;
+defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x366>;
+defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x367>;
+defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>;
+
+defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
+defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
+
+//===----------------------------------------------------------------------===//
+// GFX11, GFX12
//===----------------------------------------------------------------------===//
-defm V_FMA_DX9_ZERO_F32 : VOP3_Real_with_name_gfx11<0x209, "V_FMA_LEGACY_F32", "v_fma_dx9_zero_f32">;
-defm V_MAD_I32_I24 : VOP3_Realtriple_gfx11<0x20a>;
-defm V_MAD_U32_U24 : VOP3_Realtriple_gfx11<0x20b>;
-defm V_CUBEID_F32 : VOP3_Realtriple_gfx11<0x20c>;
-defm V_CUBESC_F32 : VOP3_Realtriple_gfx11<0x20d>;
-defm V_CUBETC_F32 : VOP3_Realtriple_gfx11<0x20e>;
-defm V_CUBEMA_F32 : VOP3_Realtriple_gfx11<0x20f>;
-defm V_BFE_U32 : VOP3_Realtriple_gfx11<0x210>;
-defm V_BFE_I32 : VOP3_Realtriple_gfx11<0x211>;
-defm V_BFI_B32 : VOP3_Realtriple_gfx11<0x212>;
-defm V_FMA_F32 : VOP3_Realtriple_gfx11<0x213>;
-defm V_FMA_F64 : VOP3_Real_Base_gfx11<0x214>;
-defm V_LERP_U8 : VOP3_Realtriple_gfx11<0x215>;
-defm V_ALIGNBIT_B32 : VOP3_Realtriple_gfx11<0x216>;
-defm V_ALIGNBYTE_B32 : VOP3_Realtriple_gfx11<0x217>;
-defm V_MULLIT_F32 : VOP3_Realtriple_gfx11<0x218>;
+multiclass VOP3_Real_with_name_gfx11_gfx12<bits<10> op, string opName,
+ string asmName> :
+ VOP3_Real_with_name<GFX11Gen, op, opName, asmName>,
+ VOP3_Real_with_name<GFX12Gen, op, opName, asmName>;
+
+multiclass VOP3_Realtriple_gfx11_gfx12<bits<10> op> :
+ VOP3_Realtriple<GFX11Gen, op>, VOP3_Realtriple<GFX12Gen, op>;
+
+multiclass VOP3_Real_Base_gfx11_gfx12<bits<10> op> :
+ VOP3_Real_Base<GFX11Gen, op>, VOP3_Real_Base<GFX12Gen, op>;
+
+multiclass VOP3_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
+ string asmName> :
+ VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
+ VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
+
+multiclass VOP3Dot_Realtriple_gfx11_gfx12<bits<10> op> :
+ VOP3Dot_Realtriple<GFX11Gen, op>, VOP3Dot_Realtriple<GFX12Gen, op>;
+
+multiclass VOP3be_Real_gfx11_gfx12<bits<10> op, string opName, string asmName> :
+ VOP3be_Real<GFX11Gen, op, opName, asmName>,
+ VOP3be_Real<GFX12Gen, op, opName, asmName>;
+
+multiclass VOP3_Real_No_Suffix_gfx11_gfx12<bits<10> op> :
+ VOP3_Real_No_Suffix<GFX11Gen, op>, VOP3_Real_No_Suffix<GFX12Gen, op>;
+
+defm V_FMA_DX9_ZERO_F32 : VOP3_Real_with_name_gfx11_gfx12<0x209, "V_FMA_LEGACY_F32", "v_fma_dx9_zero_f32">;
+defm V_MAD_I32_I24 : VOP3_Realtriple_gfx11_gfx12<0x20a>;
+defm V_MAD_U32_U24 : VOP3_Realtriple_gfx11_gfx12<0x20b>;
+defm V_CUBEID_F32 : VOP3_Realtriple_gfx11_gfx12<0x20c>;
+defm V_CUBESC_F32 : VOP3_Realtriple_gfx11_gfx12<0x20d>;
+defm V_CUBETC_F32 : VOP3_Realtriple_gfx11_gfx12<0x20e>;
+defm V_CUBEMA_F32 : VOP3_Realtriple_gfx11_gfx12<0x20f>;
+defm V_BFE_U32 : VOP3_Realtriple_gfx11_gfx12<0x210>;
+defm V_BFE_I32 : VOP3_Realtriple_gfx11_gfx12<0x211>;
+defm V_BFI_B32 : VOP3_Realtriple_gfx11_gfx12<0x212>;
+defm V_FMA_F32 : VOP3_Realtriple_gfx11_gfx12<0x213>;
+defm V_FMA_F64 : VOP3_Real_Base_gfx11_gfx12<0x214>;
+defm V_LERP_U8 : VOP3_Realtriple_gfx11_gfx12<0x215>;
+defm V_ALIGNBIT_B32 : VOP3_Realtriple_gfx11_gfx12<0x216>;
+defm V_ALIGNBYTE_B32 : VOP3_Realtriple_gfx11_gfx12<0x217>;
+defm V_MULLIT_F32 : VOP3_Realtriple_gfx11_gfx12<0x218>;
defm V_MIN3_F32 : VOP3_Realtriple_gfx11<0x219>;
-defm V_MIN3_I32 : VOP3_Realtriple_gfx11<0x21a>;
-defm V_MIN3_U32 : VOP3_Realtriple_gfx11<0x21b>;
+defm V_MIN3_I32 : VOP3_Realtriple_gfx11_gfx12<0x21a>;
+defm V_MIN3_U32 : VOP3_Realtriple_gfx11_gfx12<0x21b>;
defm V_MAX3_F32 : VOP3_Realtriple_gfx11<0x21c>;
-defm V_MAX3_I32 : VOP3_Realtriple_gfx11<0x21d>;
-defm V_MAX3_U32 : VOP3_Realtriple_gfx11<0x21e>;
+defm V_MAX3_I32 : VOP3_Realtriple_gfx11_gfx12<0x21d>;
+defm V_MAX3_U32 : VOP3_Realtriple_gfx11_gfx12<0x21e>;
defm V_MED3_F32 : VOP3_Realtriple_gfx11<0x21f>;
-defm V_MED3_I32 : VOP3_Realtriple_gfx11<0x220>;
-defm V_MED3_U32 : VOP3_Realtriple_gfx11<0x221>;
-defm V_SAD_U8 : VOP3_Realtriple_gfx11<0x222>;
-defm V_SAD_HI_U8 : VOP3_Realtriple_gfx11<0x223>;
-defm V_SAD_U16 : VOP3_Realtriple_gfx11<0x224>;
-defm V_SAD_U32 : VOP3_Realtriple_gfx11<0x225>;
-defm V_CVT_PK_U8_F32 : VOP3_Realtriple_gfx11<0x226>;
-defm V_DIV_FIXUP_F32 : VOP3_Real_Base_gfx11<0x227>;
-defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11<0x228>;
-defm V_DIV_FMAS_F32 : VOP3_Real_Base_gfx11<0x237>;
-defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11<0x238>;
-defm V_MSAD_U8 : VOP3_Realtriple_gfx11<0x239>;
-defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11<0x23a>;
-defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11<0x23b>;
-defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11<0x23d>;
-defm V_XOR3_B32 : VOP3_Realtriple_gfx11<0x240>;
-defm V_MAD_U16 : VOP3_Realtriple_with_name_gfx11<0x241, "V_MAD_U16_gfx9", "v_mad_u16">;
-defm V_PERM_B32 : VOP3_Realtriple_gfx11<0x244>;
-defm V_XAD_U32 : VOP3_Realtriple_gfx11<0x245>;
-defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11<0x246>;
-defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11<0x247>;
-defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11<0x248, "V_FMA_F16_gfx9", "v_fma_f16">;
+defm V_MED3_I32 : VOP3_Realtriple_gfx11_gfx12<0x220>;
+defm V_MED3_U32 : VOP3_Realtriple_gfx11_gfx12<0x221>;
+defm V_SAD_U8 : VOP3_Realtriple_gfx11_gfx12<0x222>;
+defm V_SAD_HI_U8 : VOP3_Realtriple_gfx11_gfx12<0x223>;
+defm V_SAD_U16 : VOP3_Realtriple_gfx11_gfx12<0x224>;
+defm V_SAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x225>;
+defm V_CVT_PK_U8_F32 : VOP3_Realtriple_gfx11_gfx12<0x226>;
+defm V_DIV_FIXUP_F32 : VOP3_Real_Base_gfx11_gfx12<0x227>;
+defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11_gfx12<0x228>;
+defm V_DIV_FMAS_F32 : VOP3_Real_Base_gfx11_gfx12<0x237>;
+defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11_gfx12<0x238>;
+defm V_MSAD_U8 : VOP3_Realtriple_gfx11_gfx12<0x239>;
+defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23a>;
+defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23b>;
+defm V_MQSAD_U32_U8 : VOP3_Real_Base_gfx11_gfx12<0x23d>;
+defm V_XOR3_B32 : VOP3_Realtriple_gfx11_gfx12<0x240>;
+defm V_MAD_U16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x241, "V_MAD_U16_gfx9", "v_mad_u16">;
+defm V_PERM_B32 : VOP3_Realtriple_gfx11_gfx12<0x244>;
+defm V_XAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x245>;
+defm V_LSHL_ADD_U32 : VOP3_Realtriple_gfx11_gfx12<0x246>;
+defm V_ADD_LSHL_U32 : VOP3_Realtriple_gfx11_gfx12<0x247>;
+defm V_FMA_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x248, "V_FMA_F16_gfx9", "v_fma_f16">;
defm V_MIN3_F16 : VOP3_Realtriple_gfx11<0x249>;
-defm V_MIN3_I16 : VOP3_Realtriple_gfx11<0x24a>;
-defm V_MIN3_U16 : VOP3_Realtriple_gfx11<0x24b>;
+defm V_MIN3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24a>;
+defm V_MIN3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24b>;
defm V_MAX3_F16 : VOP3_Realtriple_gfx11<0x24c>;
-defm V_MAX3_I16 : VOP3_Realtriple_gfx11<0x24d>;
-defm V_MAX3_U16 : VOP3_Realtriple_gfx11<0x24e>;
+defm V_MAX3_I16 : VOP3_Realtriple_gfx11_gfx12<0x24d>;
+defm V_MAX3_U16 : VOP3_Realtriple_gfx11_gfx12<0x24e>;
defm V_MED3_F16 : VOP3_Realtriple_gfx11<0x24f>;
-defm V_MED3_I16 : VOP3_Realtriple_gfx11<0x250>;
-defm V_MED3_U16 : VOP3_Realtriple_gfx11<0x251>;
-defm V_MAD_I16 : VOP3_Realtriple_with_name_gfx11<0x253, "V_MAD_I16_gfx9", "v_mad_i16">;
-defm V_DIV_FIXUP_F16 : VOP3_Realtriple_with_name_gfx11<0x254, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
-defm V_ADD3_U32 : VOP3_Realtriple_gfx11<0x255>;
-defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11<0x256>;
-defm V_AND_OR_B32 : VOP3_Realtriple_gfx11<0x257>;
-defm V_OR3_B32 : VOP3_Realtriple_gfx11<0x258>;
-defm V_MAD_U32_U16 : VOP3_Realtriple_gfx11<0x259>;
-defm V_MAD_I32_I16 : VOP3_Realtriple_gfx11<0x25a>;
-defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11<0x25b>;
-defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11<0x25c>;
+defm V_MED3_I16 : VOP3_Realtriple_gfx11_gfx12<0x250>;
+defm V_MED3_U16 : VOP3_Realtriple_gfx11_gfx12<0x251>;
+defm V_MAD_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x253, "V_MAD_I16_gfx9", "v_mad_i16">;
+defm V_DIV_FIXUP_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x254, "V_DIV_FIXUP_F16_gfx9", "v_div_fixup_f16">;
+defm V_ADD3_U32 : VOP3_Realtriple_gfx11_gfx12<0x255>;
+defm V_LSHL_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x256>;
+defm V_AND_OR_B32 : VOP3_Realtriple_gfx11_gfx12<0x257>;
+defm V_OR3_B32 : VOP3_Realtriple_gfx11_gfx12<0x258>;
+defm V_MAD_U32_U16 : VOP3_Realtriple_gfx11_gfx12<0x259>;
+defm V_MAD_I32_I16 : VOP3_Realtriple_gfx11_gfx12<0x25a>;
+defm V_PERMLANE16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25b>;
+defm V_PERMLANEX16_B32 : VOP3_Real_Base_gfx11_gfx12<0x25c>;
defm V_MAXMIN_F32 : VOP3_Realtriple_gfx11<0x25e>;
defm V_MINMAX_F32 : VOP3_Realtriple_gfx11<0x25f>;
defm V_MAXMIN_F16 : VOP3_Realtriple_gfx11<0x260>;
defm V_MINMAX_F16 : VOP3_Realtriple_gfx11<0x261>;
-defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11<0x262>;
-defm V_MINMAX_U32 : VOP3_Realtriple_gfx11<0x263>;
-defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11<0x264>;
-defm V_MINMAX_I32 : VOP3_Realtriple_gfx11<0x265>;
-defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_gfx11<0x266>;
-defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11<0x267>;
-defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">;
-defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
+defm V_MAXMIN_U32 : VOP3_Realtriple_gfx11_gfx12<0x262>;
+defm V_MINMAX_U32 : VOP3_Realtriple_gfx11_gfx12<0x263>;
+defm V_MAXMIN_I32 : VOP3_Realtriple_gfx11_gfx12<0x264>;
+defm V_MINMAX_I32 : VOP3_Realtriple_gfx11_gfx12<0x265>;
+defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_gfx11_gfx12<0x266>;
+defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_gfx11_gfx12<0x267>;
+defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11_gfx12<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">;
+defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11_gfx12<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">;
defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">;
-defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11<0x303>;
-defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11<0x304>;
-defm V_MUL_LO_U16_t16 : VOP3Only_Realtriple_t16_gfx11<0x305, "v_mul_lo_u16">;
-defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11<0x306>;
-defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11<0x307>;
-defm V_MAX_U16_t16 : VOP3Only_Realtriple_t16_gfx11<0x309, "v_max_u16">;
-defm V_MAX_I16_t16 : VOP3Only_Realtriple_t16_gfx11<0x30a, "v_max_i16">;
-defm V_MIN_U16_t16 : VOP3Only_Realtriple_t16_gfx11<0x30b, "v_min_u16">;
-defm V_MIN_I16_t16 : VOP3Only_Realtriple_t16_gfx11<0x30c, "v_min_i16">;
-defm V_ADD_NC_I16 : VOP3_Realtriple_with_name_gfx11<0x30d, "V_ADD_I16", "v_add_nc_i16">;
-defm V_SUB_NC_I16 : VOP3_Realtriple_with_name_gfx11<0x30e, "V_SUB_I16", "v_sub_nc_i16">;
-defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11<0x311>;
-defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >;
-defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >;
-defm V_SUB_NC_I32 : VOP3_Realtriple_with_name_gfx11<0x325, "V_SUB_I32", "v_sub_nc_i32">;
-defm V_ADD_NC_I32 : VOP3_Realtriple_with_name_gfx11<0x326, "V_ADD_I32", "v_add_nc_i32">;
+defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11_gfx12<0x303>;
+defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11_gfx12<0x304>;
+defm V_MUL_LO_U16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x305, "v_mul_lo_u16">;
+defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11_gfx12<0x306>;
+defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11_gfx12<0x307>;
+defm V_MAX_U16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x309, "v_max_u16">;
+defm V_MAX_I16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x30a, "v_max_i16">;
+defm V_MIN_U16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x30b, "v_min_u16">;
+defm V_MIN_I16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x30c, "v_min_i16">;
+defm V_ADD_NC_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x30d, "V_ADD_I16", "v_add_nc_i16">;
+defm V_SUB_NC_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x30e, "V_SUB_I16", "v_sub_nc_i16">;
+defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11_gfx12<0x311>;
+defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >;
+defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >;
+defm V_SUB_NC_I32 : VOP3_Realtriple_with_name_gfx11_gfx12<0x325, "V_SUB_I32", "v_sub_nc_i32">;
+defm V_ADD_NC_I32 : VOP3_Realtriple_with_name_gfx11_gfx12<0x326, "V_ADD_I32", "v_add_nc_i32">;
defm V_ADD_F64 : VOP3_Real_Base_gfx11<0x327>;
defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>;
defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>;
defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>;
-defm V_LDEXP_F64 : VOP3_Real_Base_gfx11<0x32b>;
-defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11<0x32c>;
-defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11<0x32d>;
-defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11<0x32e>;
-defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11<0x32f>;
-defm V_LSHLREV_B16_t16 : VOP3Only_Realtriple_t16_gfx11<0x338, "v_lshlrev_b16">;
-defm V_LSHRREV_B16_t16 : VOP3Only_Realtriple_t16_gfx11<0x339, "v_lshrrev_b16">;
-defm V_ASHRREV_I16_t16 : VOP3Only_Realtriple_t16_gfx11<0x33a, "v_ashrrev_i16">;
+defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32b>;
+defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12<0x32c>;
+defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12<0x32d>;
+defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12<0x32e>;
+defm V_TRIG_PREOP_F64 : VOP3_Real_Base_gfx11_gfx12<0x32f>;
+defm V_LSHLREV_B16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x338, "v_lshlrev_b16">;
+defm V_LSHRREV_B16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x339, "v_lshrrev_b16">;
+defm V_ASHRREV_I16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x33a, "v_ashrrev_i16">;
defm V_LSHLREV_B64 : VOP3_Real_Base_gfx11<0x33c>;
-defm V_LSHRREV_B64 : VOP3_Real_Base_gfx11<0x33d>;
-defm V_ASHRREV_I64 : VOP3_Real_Base_gfx11<0x33e>;
-defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx11<0x360>; // Pseudo in VOP2
+defm V_LSHRREV_B64 : VOP3_Real_Base_gfx11_gfx12<0x33d>;
+defm V_ASHRREV_I64 : VOP3_Real_Base_gfx11_gfx12<0x33e>;
+defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx11_gfx12<0x360>; // Pseudo in VOP2
let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
- defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx11<0x361>; // Pseudo in VOP2
+ defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx11_gfx12<0x361>; // Pseudo in VOP2
} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
-defm V_AND_B16_t16 : VOP3Only_Realtriple_t16_gfx11<0x362, "v_and_b16">;
-defm V_OR_B16_t16 : VOP3Only_Realtriple_t16_gfx11<0x363, "v_or_b16">;
-defm V_XOR_B16_t16 : VOP3Only_Realtriple_t16_gfx11<0x364, "v_xor_b16">;
+defm V_AND_B16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x362, "v_and_b16">;
+defm V_OR_B16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x363, "v_or_b16">;
+defm V_XOR_B16_t16 : VOP3Only_Realtriple_t16_gfx11_gfx12<0x364, "v_xor_b16">;
//===----------------------------------------------------------------------===//
// GFX10.
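
Note on the pattern used throughout the hunks above: the per-generation real-encoding multiclasses are rewritten to take a generation record, so one defm fans out into both GFX11 and GFX12 encodings. Below is a minimal standalone TableGen sketch of that shape; every record and field name in it is invented for illustration (the real GFXGen, GFX11Gen and GFX12Gen records, and the encoding classes they feed, live elsewhere in the backend).

// Sketch only: a generation record carrying the per-target suffix.
class ExampleGen<string suffix> { string Suffix = suffix; }
def ExampleGFX11 : ExampleGen<"_gfx11">;
def ExampleGFX12 : ExampleGen<"_gfx12">;

// Stand-in for the real encoding classes.
class ExampleReal<int op> { int Opcode = op; }

// One multiclass parameterized by the generation record; inside a
// multiclass the defm name is prepended, so this emits <defm>_gfx11 etc.
multiclass ExampleReal_Base<ExampleGen Gen, int op> {
  def Gen.Suffix : ExampleReal<op>;
}

// Thin wrapper instantiating it for both generations at once.
multiclass ExampleReal_gfx11_gfx12<int op> :
  ExampleReal_Base<ExampleGFX11, op>, ExampleReal_Base<ExampleGFX12, op>;

// Produces the records V_EXAMPLE_gfx11 and V_EXAMPLE_gfx12.
defm V_EXAMPLE : ExampleReal_gfx11_gfx12<0x123>;

Fed to llvm-tblgen on its own, this sketch yields one record per generation from a single defm, which is the naming scheme the *_gfx11_gfx12 wrapper multiclasses above rely on.
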
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 71e09611e74e..d3cefb339d9e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -108,6 +108,11 @@ defm V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I1
defm V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
defm V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
defm V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
+
+let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in {
+defm V_PK_MAXIMUM_F16 : VOP3PInst<"v_pk_maximum_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fmaximum>;
+defm V_PK_MINIMUM_F16 : VOP3PInst<"v_pk_minimum_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16>, fminimum>;
+} // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0
}
defm V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16>>;
@@ -353,56 +358,51 @@ foreach Type = ["I", "U"] in
(!cast<Extract>(Type#Index#"_4bit") node:$src1))>;
}
-class UDot2Pat<Instruction Inst> : GCNPat <
+class UDot2Pat<VOP_Pseudo Inst> : GCNPat <
(add (add_oneuse (AMDGPUmul_u24_oneuse (srl i32:$src0, (i32 16)),
(srl i32:$src1, (i32 16))), i32:$src2),
(AMDGPUmul_u24_oneuse (and i32:$src0, (i32 65535)),
(and i32:$src1, (i32 65535)))
),
(Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
- let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
+ let Predicates = Inst.Predicates;
}
-class SDot2Pat<Instruction Inst> : GCNPat <
+class SDot2Pat<VOP_Pseudo Inst> : GCNPat <
(add (add_oneuse (AMDGPUmul_i24_oneuse (sra i32:$src0, (i32 16)),
(sra i32:$src1, (i32 16))), i32:$src2),
(AMDGPUmul_i24_oneuse (sext_inreg i32:$src0, i16),
(sext_inreg i32:$src1, i16))),
(Inst (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))> {
- let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
+ let Predicates = Inst.Predicates;
}
let IsDOT = 1 in {
-let SubtargetPredicate = HasDot2Insts in {
-
+let OtherPredicates = [HasDot2Insts] in {
defm V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
VOP3P_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
+} // End OtherPredicates = [HasDot2Insts]
-} // End SubtargetPredicate = HasDot2Insts
-
-let SubtargetPredicate = HasDot10Insts in
+let OtherPredicates = [HasDot10Insts] in
defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
AMDGPUfdot2, 1/*ExplicitClamp*/>;
-let SubtargetPredicate = HasDot7Insts in {
+let OtherPredicates = [HasDot7Insts] in {
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
+} // End OtherPredicates = [HasDot7Insts]
-} // End SubtargetPredicate = HasDot7Insts
-
-let SubtargetPredicate = HasDot1Insts in {
-
+let OtherPredicates = [HasDot1Insts] in {
defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
defm V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
-
-} // End SubtargetPredicate = HasDot1Insts
+} // End OtherPredicates = [HasDot1Insts]
def DOT2_BF16_Profile
: VOP3P_Profile<VOP_F32_V2I16_V2I16_F32, VOP3_REGULAR, /*HasDPP*/ 1> {
@@ -436,20 +436,34 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
let SubtargetPredicate = HasDot8Insts in {
defm V_DOT4_I32_IU8 : VOP3PDOTIUInst<"v_dot4_i32_iu8", int_amdgcn_sudot4>;
defm V_DOT8_I32_IU4 : VOP3PDOTIUInst<"v_dot8_i32_iu4", int_amdgcn_sudot8>;
+
+def : GCNPat < (int_amdgcn_sdot8 i32:$src0,
+ i32:$src1,
+ i32:$src2, (i1 timm:$clamp)),
+ (V_DOT8_I32_IU4 (i32 9), i32:$src0,
+ (i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp)
+>;
+
+def : GCNPat < (int_amdgcn_sdot4 i32:$src0,
+ i32:$src1,
+ i32:$src2, (i1 timm:$clamp)),
+ (V_DOT4_I32_IU8 (i32 9), i32:$src0,
+ (i32 9), i32:$src1, (i32 8), i32:$src2, i1:$clamp)
+>;
} // End SubtargetPredicate = HasDot8Insts
def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
foreach Type = ["U", "I"] in
- let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).SubtargetPredicate in
+ let Predicates = !cast<VOP_Pseudo>("V_DOT4_"#Type#"32_"#Type#8).Predicates in
def : GCNPat <
!cast<dag>(!foldl((i32 i32:$src2), [0, 1, 2, 3], lhs, y,
(add_oneuse lhs, (!cast<PatFrag>("Mul"#Type#"_Elt"#y) i32:$src0, i32:$src1)))),
(!cast<VOP3P_Pseudo>("V_DOT4_"#Type#"32_"#Type#8) (i32 8), $src0, (i32 8), $src1, (i32 8), $src2, (i1 0))>;
foreach Type = ["U", "I"] in
- let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
+ let Predicates = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).Predicates in
def : GCNPat <
!cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
[1, 2, 3, 4, 5, 6, 7], lhs, y,
@@ -459,7 +473,7 @@ foreach Type = ["U", "I"] in
// Different variants of dot8 code-gen dag patterns are not generated through table-gen due to a huge increase
// in the compile time. Directly handle the pattern generated by the FE here.
foreach Type = ["U", "I"] in
- let SubtargetPredicate = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).SubtargetPredicate in
+ let Predicates = !cast<VOP_Pseudo>("V_DOT8_"#Type#"32_"#Type#4).Predicates in
def : GCNPat <
!cast<dag>(!foldl((add_oneuse i32:$src2, (!cast<PatFrag>("Mul"#Type#"0_4bit") i32:$src0, i32:$src1)),
[7, 1, 2, 3, 4, 5, 6], lhs, y,
@@ -596,7 +610,7 @@ let GISelPredicateCode = [{ return !MF.getInfo<SIMachineFunctionInfo>()->mayNeed
class VgprMAIFrag<SDPatternOperator Op> :
MAIFrag<Op, [{ return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }]>;
-let Predicates = [HasMAIInsts] in {
+let SubtargetPredicate = HasMAIInsts in {
let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
defm V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
@@ -687,7 +701,7 @@ let Predicates = [isGFX90APlus] in {
}
} // End Predicates = [isGFX90APlus]
-let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in {
+let SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1 in {
defm V_MFMA_I32_32X32X16I8 : MAIInst<"v_mfma_i32_32x32x16i8", "I32_I64_X32", int_amdgcn_mfma_i32_32x32x16_i8>;
defm V_MFMA_I32_16X16X32I8 : MAIInst<"v_mfma_i32_16x16x32i8", "I32_I64_X16", int_amdgcn_mfma_i32_16x16x32_i8>;
defm V_MFMA_F32_16X16X8XF32 : MAIInst<"v_mfma_f32_16x16x8xf32", "F32_V2F32_X16", int_amdgcn_mfma_f32_16x16x8_xf32>;
@@ -700,7 +714,7 @@ let Predicates = [isGFX940Plus], is_gfx940_xdl = 1 in {
defm V_MFMA_F32_32X32X16_BF8_FP8 : MAIInst<"v_mfma_f32_32x32x16_bf8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_bf8_fp8>;
defm V_MFMA_F32_32X32X16_FP8_BF8 : MAIInst<"v_mfma_f32_32x32x16_fp8_bf8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_bf8>;
defm V_MFMA_F32_32X32X16_FP8_FP8 : MAIInst<"v_mfma_f32_32x32x16_fp8_fp8", "F32_I64_X16", int_amdgcn_mfma_f32_32x32x16_fp8_fp8>;
-} // End Predicates = [isGFX940Plus], is_gfx940_xdl = 1
+} // End SubtargetPredicate = isGFX940Plus, is_gfx940_xdl = 1
multiclass SMFMACInst<string OpName, string P, SDPatternOperator node> {
let Constraints = "$vdst = $src2", DisableEncoding = "$src2",
@@ -737,12 +751,16 @@ def MAIInstInfoTable : GenericTable {
let PrimaryKeyName = "getMAIInstInfoHelper";
}
-let SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1, isReMaterializable = 1 in {
- defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
- defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
- defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
+let isCommutable = 1, isReMaterializable = 1 in {
+ let SubtargetPredicate = HasPackedFP32Ops in {
+ defm V_PK_FMA_F32 : VOP3PInst<"v_pk_fma_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fma>;
+ defm V_PK_MUL_F32 : VOP3PInst<"v_pk_mul_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fmul>;
+ defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>;
+ } // End SubtargetPredicate = HasPackedFP32Ops
+
+ let SubtargetPredicate = HasPkMovB32 in
defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>;
-} // End SubtargetPredicate = HasPackedFP32Ops, isCommutable = 1
+} // End isCommutable = 1, isReMaterializable = 1
def : MnemonicAlias<"v_accvgpr_read", "v_accvgpr_read_b32">;
def : MnemonicAlias<"v_accvgpr_write", "v_accvgpr_write_b32">;
@@ -847,34 +865,25 @@ def WMMAOpcode3AddrMappingTable : WMMAMappingTable {
// it converts the default pseudo to the pseudo where src2 is not the same as vdst.
// 3) @earlyclobber on the destination satisfies the constraint during RA.
-multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type> {
+multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator node = null_frag, RegisterOperand _Src01RC64 = VRegSrc_256, WMMAType Type, bit convertibleTo3Addr> {
defvar WMMAConstraints2Addr = "@earlyclobber $vdst,$vdst = $src2";
defvar WMMAConstraints3Addr = "@earlyclobber $vdst";
defvar WMMAProfile = VOPProfileWMMA<P, Suffix, _Src01RC64, Type.hasClamp, Type.hasOpsel>;
- if !eq(Suffix, "_w32") then {
- let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
- let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
- def _twoaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
- }
- let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
- def _threeaddr_w32 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
- }
+ let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+ let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = convertibleTo3Addr in {
+ def _twoaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
}
- def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w32),
- !cast<Instruction>(NAME # _threeaddr_w32)>;
- } else if !eq(Suffix, "_w64") then {
+ }
+ if convertibleTo3Addr then {
let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
- let Constraints = WMMAConstraints2Addr, isConvertibleToThreeAddress = 1 in {
- def _twoaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
- }
let Constraints = WMMAConstraints3Addr, SchedRW = [Write32Bit, Write32Bit] in {
- def _threeaddr_w64 : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
+ def _threeaddr # Suffix : VOP3P_Pseudo<Instr # Suffix, WMMAProfile>;
}
}
- def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr_w64),
- !cast<Instruction>(NAME # _threeaddr_w64)>;
+ def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr # Suffix),
+ !cast<Instruction>(NAME # _threeaddr # Suffix)>;
}
if !eq(Type, WMMAOpSel) then {
@@ -888,21 +897,25 @@ multiclass WMMAInst<string Suffix, string Instr, VOPProfile P, SDPatternOperator
let WaveSizePredicate = isWave32 in {
- defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
- defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
- defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
- defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
- defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
- defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+ defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
+ defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
+ defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
+ defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
+ defm V_WMMA_F16_16X16X16_F16_TIED : WMMAInst<"_w32", "v_wmma_f16_16x16x16_f16", VOP_V16F16_V16F16_V16F16_V16F16, int_amdgcn_wmma_f16_16x16x16_f16_tied, VRegSrc_256, WMMAOpSel, 0>;
+ defm V_WMMA_BF16_16X16X16_BF16_TIED : WMMAInst<"_w32", "v_wmma_bf16_16x16x16_bf16", VOP_V16I16_V16I16_V16I16_V16I16, int_amdgcn_wmma_bf16_16x16x16_bf16_tied, VRegSrc_256, WMMAOpSel, 0>;
+ defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu8", VOP_V8I32_V4I32_V4I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
+ defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w32", "v_wmma_i32_16x16x16_iu4", VOP_V8I32_V2I32_V2I32_V8I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>;
}
let WaveSizePredicate = isWave64 in {
- defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular>;
- defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular>;
- defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel>;
- defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel>;
- defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp>;
- defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp>;
+ defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_f16", VOP_V4F32_V16F16_V16F16_V4F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>;
+ defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_f32_16x16x16_bf16", VOP_V4F32_V16I16_V16I16_V4F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>;
+ defm V_WMMA_F16_16X16X16_F16 : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16, VRegSrc_256, WMMAOpSel, 1>;
+ defm V_WMMA_BF16_16X16X16_BF16 : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16, VRegSrc_256, WMMAOpSel, 1>;
+ defm V_WMMA_F16_16X16X16_F16_TIED : WMMAInst<"_w64", "v_wmma_f16_16x16x16_f16", VOP_V8F16_V16F16_V16F16_V8F16, int_amdgcn_wmma_f16_16x16x16_f16_tied, VRegSrc_256, WMMAOpSel, 0>;
+ defm V_WMMA_BF16_16X16X16_BF16_TIED : WMMAInst<"_w64", "v_wmma_bf16_16x16x16_bf16", VOP_V8I16_V16I16_V16I16_V8I16, int_amdgcn_wmma_bf16_16x16x16_bf16_tied, VRegSrc_256, WMMAOpSel, 0>;
+ defm V_WMMA_I32_16X16X16_IU8 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu8", VOP_V4I32_V4I32_V4I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu8, VRegSrc_128, WMMAUIClamp, 1>;
+ defm V_WMMA_I32_16X16X16_IU4 : WMMAInst<"_w64", "v_wmma_i32_16x16x16_iu4", VOP_V4I32_V2I32_V2I32_V4I32, int_amdgcn_wmma_i32_16x16x16_iu4, VRegSrc_64, WMMAUIClamp, 1>;
}
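
The reworked WMMAInst multiclass above collapses the duplicated _w32/_w64 branches into suffix-pasted def names, with a new convertibleTo3Addr bit deciding whether the three-address twin and its opcode-mapping record are emitted (the *_TIED variants pass 0). A minimal sketch of just that control flow, with invented class names standing in for VOP3P_Pseudo and WMMAOpcodeMapping:

// Sketch only: always emit the two-address form; emit the three-address
// form and a mapping record only when convertibleTo3Addr is set.
class ExampleInst<string mnemonic> { string Mnemonic = mnemonic; }
class ExampleMapping<ExampleInst from, ExampleInst to> {
  ExampleInst TwoAddr = from;
  ExampleInst ThreeAddr = to;
}

multiclass ExampleWMMAInst<string Suffix, string Instr, bit convertibleTo3Addr> {
  def _twoaddr # Suffix : ExampleInst<Instr # Suffix>;
  if convertibleTo3Addr then {
    def _threeaddr # Suffix : ExampleInst<Instr # Suffix>;
    def : ExampleMapping<!cast<ExampleInst>(NAME # _twoaddr # Suffix),
                         !cast<ExampleInst>(NAME # _threeaddr # Suffix)>;
  }
}

// Gets both forms plus the mapping record.
defm V_EXAMPLE_WMMA      : ExampleWMMAInst<"_w32", "v_example_wmma", 1>;
// A tied-style variant only gets the two-address form.
defm V_EXAMPLE_WMMA_TIED : ExampleWMMAInst<"_w32", "v_example_wmma", 0>;

Only the defm that passes 1 gets the mapping record that two-address to three-address conversion relies on, which matches how the tied WMMA variants above are kept out of that conversion.
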
@@ -932,56 +945,89 @@ class VOP3P_DPP8_Base<bits<7> op, VOP_Pseudo ps, string opName = ps.OpName>
}
//===----------------------------------------------------------------------===//
-// GFX11.
+// GFX11, GFX12
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX11Plus,
- DecoderNamespace = "GFX11" in {
+multiclass VOP3P_Real_Base<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ def Gen.Suffix :
+ VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>,
+ VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
+}
- multiclass VOP3P_Real_gfx11<bits<7> op, string backing_ps_name = NAME,
- string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
- def _gfx11 : VOP3P_Real<!cast<VOP3P_Pseudo>(backing_ps_name),
- SIEncodingFamily.GFX11, asmName>,
- VOP3Pe_gfx11<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
- }
+multiclass VOP3P_Real_with_name<GFXGen Gen, bits<7> op,
+ string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
+ let AsmString = asmName # ps.AsmOperands in
+ def Gen.Suffix :
+ VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>,
+ VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>,
+ MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>;
+}
- multiclass VOP3P_Real_dpp_gfx11<bits<7> op, string backing_ps_name = NAME,
- string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
- defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
- def _dpp_gfx11
- : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"),
- SIEncodingFamily.GFX11> {
- let AsmString = asmName #ps.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPPGFX11";
- }
+multiclass VOP3P_Real_dpp<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
+ def _dpp#Gen.Suffix
+ : VOP3P_DPP16<op, !cast<VOP_DPP_Pseudo>(backing_ps_name #"_dpp"),
+ Gen.Subtarget> {
+ let AsmString = asmName #ps.Pfl.AsmVOP3DPP16;
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
}
+}
- multiclass VOP3P_Real_dpp8_gfx11<bits<7> op, string backing_ps_name = NAME,
- string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
- defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
- def _dpp8_gfx11 : VOP3P_DPP8_Base<op, ps> {
- let AsmString = asmName #ps.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8GFX11";
- }
+multiclass VOP3P_Real_dpp8<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ defvar ps = !cast<VOP3P_Pseudo>(backing_ps_name);
+ def _dpp8#Gen.Suffix : VOP3P_DPP8_Base<op, ps> {
+ let AsmString = asmName #ps.Pfl.AsmVOP3DPP8;
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
}
+}
- multiclass VOP3P_Realtriple_gfx11<bits<7> op, string backing_ps_name = NAME,
- string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic>
- : VOP3P_Real_gfx11<op, backing_ps_name, asmName>,
- VOP3P_Real_dpp_gfx11<op, backing_ps_name, asmName>,
- VOP3P_Real_dpp8_gfx11<op, backing_ps_name, asmName>;
-} // End AssemblerPredicate = isGFX11Plus, DecoderNamespace = "GFX11"
+multiclass VOP3P_Realtriple<GFXGen Gen, bits<7> op, string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic>
+ : VOP3P_Real_Base<Gen, op, backing_ps_name, asmName>,
+ VOP3P_Real_dpp<Gen, op, backing_ps_name, asmName>,
+ VOP3P_Real_dpp8<Gen, op, backing_ps_name, asmName>;
-defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11 <0x16>;
-defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11 <0x18>;
-defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11 <0x1a>;
+//===----------------------------------------------------------------------===//
+// GFX12
+//===----------------------------------------------------------------------===//
+
+multiclass VOP3P_Real_gfx12<bits<7> op> : VOP3P_Real_Base<GFX12Gen, op>;
+
+multiclass VOP3P_Real_with_name_gfx12<bits<7> op,
+ string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> :
+ VOP3P_Real_with_name<GFX12Gen, op, backing_ps_name, asmName>;
+
+defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
+defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;
+
+defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
+defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
+
+//===----------------------------------------------------------------------===//
+// GFX11
+//===----------------------------------------------------------------------===//
+
+multiclass VOP3P_Real_gfx11_gfx12<bits<7> op> :
+ VOP3P_Real_Base<GFX11Gen, op>, VOP3P_Real_Base<GFX12Gen, op>;
+
+defm V_DOT4_I32_IU8 : VOP3P_Real_gfx11_gfx12<0x16>;
+defm V_DOT8_I32_IU4 : VOP3P_Real_gfx11_gfx12<0x18>;
+defm V_DOT2_F32_BF16 : VOP3P_Real_gfx11_gfx12<0x1a>;
multiclass VOP3P_Real_WMMA <bits<7> op> {
let WaveSizePredicate = isWave32, DecoderNamespace = "GFX11" in {
- defm _twoaddr_w32 : VOP3P_Real_gfx11 <op>;
+ defm _twoaddr_w32 : VOP3P_Real_Base <GFX11Gen, op>;
}
let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX11" in {
- defm _twoaddr_w64 : VOP3P_Real_gfx11 <op>;
+ defm _twoaddr_w64 : VOP3P_Real_Base <GFX11Gen, op>;
}
}
@@ -1034,25 +1080,23 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string
VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(Op # "_vgprcd" # "_e64"),
VOPProfile Pfl_ACD = PS_ACD.Pfl,
VOPProfile Pfl_VCD = PS_VCD.Pfl> {
- let Predicates = [isGFX940Plus] in {
- if !ne(NameFrom, NameTo) then {
- def : InstAlias <NameTo # " " # PS_ACD.AsmOperands,
- (!cast<VOP3P_Real>(Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst,
- Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2,
- cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl;
- def : InstAlias <NameTo # " " # PS_VCD.AsmOperands,
- (!cast<VOP3P_Real>(Op # "_gfx940_vcd") Pfl_VCD.DstRC:$vdst,
- Pfl_VCD.Src0RC64:$src0, Pfl_VCD.Src1RC64:$src1, Pfl_VCD.Src2RC64:$src2,
- cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl;
- }
- } // End Predicates = [isGFX940Plus]
+ if !ne(NameFrom, NameTo) then {
+ def : InstAlias <NameTo # " " # PS_ACD.AsmOperands,
+ (!cast<VOP3P_Real>(Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst,
+ Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2,
+ cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl;
+ def : InstAlias <NameTo # " " # PS_VCD.AsmOperands,
+ (!cast<VOP3P_Real>(Op # "_gfx940_vcd") Pfl_VCD.DstRC:$vdst,
+ Pfl_VCD.Src0RC64:$src0, Pfl_VCD.Src1RC64:$src1, Pfl_VCD.Src2RC64:$src2,
+ cbsz:$cbsz, abid:$abid, blgp:$blgp)>, PredicateControl;
+ }
}
multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic,
VOP3_Pseudo PS_ACD = !cast<VOP3_Pseudo>(NAME # "_e64"),
VOP3_Pseudo PS_VCD = !cast<VOP3_Pseudo>(NAME # "_vgprcd" # "_e64")> {
let SubtargetPredicate = isGFX940Plus,
- AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940",
+ DecoderNamespace = "GFX940",
AsmString = Name # PS_ACD.AsmOperands, Constraints = "" in {
def _gfx940_acd : VOP3P_Real<PS_ACD, SIEncodingFamily.GFX940>,
VOP3Pe_MAI <op, PS_ACD.Pfl, 1>;
@@ -1061,23 +1105,32 @@ multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(N
VOP3Pe_MAI <op, PS_VCD.Pfl, 0>;
} // End AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX940"
- defm : VOP3P_Real_MFMA_gfx940_aliases<Name, PS_ACD.Mnemonic, NAME>;
+ let SubtargetPredicate = isGFX940Plus in {
+ defm : VOP3P_Real_MFMA_gfx940_aliases<Name, PS_ACD.Mnemonic, NAME>;
- if !ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic) then
- defm : VOP3P_Real_MFMA_gfx940_aliases<Name, !subst("_1k", "", PS_ACD.Mnemonic), NAME>;
+ if !ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic) then
+ defm : VOP3P_Real_MFMA_gfx940_aliases<Name, !subst("_1k", "", PS_ACD.Mnemonic), NAME>;
+ }
}
-multiclass VOP3P_Real_MFMA<bits<7> op, string GFX940Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> :
- VOP3P_Real_MFMA_gfx90a <op>,
- VOP3P_Real_MFMA_gfx940 <op, GFX940Name> {
+multiclass VOP3P_Real_MFMA_vi<bits<7> op> {
def _vi : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3Pe_MAI <op, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl, ?> {
+ let SubtargetPredicate = isGFX8GFX9NotGFX90A;
let AssemblerPredicate = HasMAIInsts;
let DecoderNamespace = "GFX8";
let Constraints = "";
}
}
+multiclass VOP3P_Real_MFMA_vi_gfx90a<bits<7> op> :
+ VOP3P_Real_MFMA_gfx90a <op>,
+ VOP3P_Real_MFMA_vi <op>;
+
+multiclass VOP3P_Real_MFMA<bits<7> op, string GFX940Name = !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic> :
+ VOP3P_Real_MFMA_vi_gfx90a <op>,
+ VOP3P_Real_MFMA_gfx940 <op, GFX940Name>;
+
multiclass VOP3P_Real_SMFMAC<bits<7> op, string alias> {
def _gfx940 : VOP3P_Real<!cast<VOP3_Pseudo>(NAME#"_e64"), SIEncodingFamily.VI>,
VOP3Pe_SMFMAC <op> {
@@ -1087,6 +1140,7 @@ multiclass VOP3P_Real_SMFMAC<bits<7> op, string alias> {
def : MnemonicAlias<alias, !cast<VOP3_Pseudo>(NAME#"_e64").Mnemonic>;
}
+let SubtargetPredicate = isGFX8GFX9 in {
defm V_PK_MAD_I16 : VOP3P_Real_vi <0x00>;
defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x01>;
defm V_PK_ADD_I16 : VOP3P_Real_vi <0x02>;
@@ -1108,15 +1162,14 @@ defm V_PK_MUL_F16 : VOP3P_Real_vi <0x10>;
defm V_PK_MIN_F16 : VOP3P_Real_vi <0x11>;
defm V_PK_MAX_F16 : VOP3P_Real_vi <0x12>;
-
-let SubtargetPredicate = HasMadMixInsts in {
+let OtherPredicates = [HasMadMixInsts] in {
defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x20>;
defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x21>;
defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x22>;
}
-let SubtargetPredicate = HasFmaMixInsts in {
-let DecoderNamespace = "GFX9_DL" in {
+let OtherPredicates = [HasFmaMixInsts],
+ DecoderNamespace = "GFX9_DL" in {
// The mad_mix instructions were renamed and their behaviors changed,
// but the opcode stayed the same so we need to put these in a
// different DecoderNamespace to avoid the ambiguity.
@@ -1124,8 +1177,6 @@ defm V_FMA_MIX_F32 : VOP3P_Real_vi <0x20>;
defm V_FMA_MIXLO_F16 : VOP3P_Real_vi <0x21>;
defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x22>;
}
-}
-
defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>;
defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>;
@@ -1136,8 +1187,9 @@ defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>;
defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x28>;
defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x2a>;
+} // End SubtargetPredicate = isGFX8GFX9
-let SubtargetPredicate = HasMAIInsts in {
+let OtherPredicates = [HasMAIInsts] in {
defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x58>;
defm V_ACCVGPR_WRITE_B32 : VOP3P_Real_MAI <0x59>;
@@ -1155,17 +1207,15 @@ defm V_MFMA_I32_32X32X4I8 : VOP3P_Real_MFMA <0x50, "v_mfma_i32_32x32x4_2b_i8">
defm V_MFMA_I32_16X16X4I8 : VOP3P_Real_MFMA <0x51, "v_mfma_i32_16x16x4_4b_i8">;
defm V_MFMA_I32_4X4X4I8 : VOP3P_Real_MFMA <0x52, "v_mfma_i32_4x4x4_16b_i8">;
-let SubtargetPredicate = isGFX908orGFX90A in {
-defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MFMA <0x55>;
-defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MFMA <0x54>;
-defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MFMA <0x68>;
-defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MFMA <0x69>;
-defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MFMA <0x6b>;
-defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA <0x6c>;
-defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA <0x6d>;
-}
+defm V_MFMA_I32_16X16X16I8 : VOP3P_Real_MFMA_vi_gfx90a <0x55>;
+defm V_MFMA_I32_32X32X8I8 : VOP3P_Real_MFMA_vi_gfx90a <0x54>;
+defm V_MFMA_F32_32X32X2BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x68>;
+defm V_MFMA_F32_16X16X2BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x69>;
+defm V_MFMA_F32_4X4X2BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6b>;
+defm V_MFMA_F32_32X32X4BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6c>;
+defm V_MFMA_F32_16X16X8BF16 : VOP3P_Real_MFMA_vi_gfx90a <0x6d>;
-} // End SubtargetPredicate = HasMAIInsts
+} // End OtherPredicates = [HasMAIInsts]
defm V_MFMA_F32_32X32X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x63>;
defm V_MFMA_F32_16X16X4BF16_1K : VOP3P_Real_MFMA_gfx90a <0x64>;
@@ -1212,12 +1262,10 @@ defm V_SMFMAC_F32_32X32X32_BF8_FP8 : VOP3P_Real_SMFMAC <0x7d, "v_smfmac_f32_32x3
defm V_SMFMAC_F32_32X32X32_FP8_BF8 : VOP3P_Real_SMFMAC <0x7e, "v_smfmac_f32_32x32x32fp8bf8">;
defm V_SMFMAC_F32_32X32X32_FP8_FP8 : VOP3P_Real_SMFMAC <0x7f, "v_smfmac_f32_32x32x32fp8fp8">;
-let SubtargetPredicate = HasPackedFP32Ops in {
- defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
- defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
- defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>;
- defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>;
-} // End SubtargetPredicate = HasPackedFP32Ops
+defm V_PK_FMA_F32 : VOP3P_Real_vi <0x30>;
+defm V_PK_MUL_F32 : VOP3P_Real_vi <0x31>;
+defm V_PK_ADD_F32 : VOP3P_Real_vi <0x32>;
+defm V_PK_MOV_B32 : VOP3P_Real_vi <0x33>;
//===----------------------------------------------------------------------===//
// GFX10.
@@ -1230,41 +1278,45 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1 in {
}
} // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10", VOP3P = 1
-multiclass VOP3P_Real_gfx10_gfx11<bits<7> op>
- : VOP3P_Real_gfx10<op>, VOP3P_Real_gfx11<op>;
-
-multiclass VOP3P_Real_gfx10_gfx11_Triple<bits<7> op>
- : VOP3P_Real_gfx10<op>, VOP3P_Realtriple_gfx11<op>;
-
-defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11<0x00>;
-defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11<0x01>;
-defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11<0x02>;
-defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11<0x03>;
-defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11<0x04>;
-defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11<0x05>;
-defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11<0x06>;
-defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11<0x07>;
-defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11<0x08>;
-defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11<0x09>;
-defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11<0x0a>;
-defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11<0x0b>;
-defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11<0x0c>;
-defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11<0x0d>;
-defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11<0x0e>;
-defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11<0x0f>;
-defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11<0x10>;
+multiclass VOP3P_Real_gfx10_gfx11<bits<7> op> :
+ VOP3P_Real_gfx10<op>, VOP3P_Real_Base<GFX11Gen, op>;
+
+multiclass VOP3P_Real_gfx10_gfx11_gfx12<bits<7> op> :
+ VOP3P_Real_gfx10_gfx11<op>, VOP3P_Real_Base<GFX12Gen, op>;
+
+multiclass VOP3P_Real_gfx10_gfx11_gfx12_Triple<bits<7> op> :
+ VOP3P_Real_gfx10<op>, VOP3P_Realtriple<GFX11Gen, op>,
+ VOP3P_Realtriple<GFX12Gen, op>;
+
+defm V_PK_MAD_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x00>;
+defm V_PK_MUL_LO_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x01>;
+defm V_PK_ADD_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x02>;
+defm V_PK_SUB_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x03>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_gfx10_gfx11_gfx12<0x04>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_gfx10_gfx11_gfx12<0x05>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x06>;
+defm V_PK_MAX_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x07>;
+defm V_PK_MIN_I16 : VOP3P_Real_gfx10_gfx11_gfx12<0x08>;
+defm V_PK_MAD_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x09>;
+defm V_PK_ADD_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0a>;
+defm V_PK_SUB_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0b>;
+defm V_PK_MAX_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0c>;
+defm V_PK_MIN_U16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0d>;
+defm V_PK_FMA_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0e>;
+defm V_PK_ADD_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x0f>;
+defm V_PK_MUL_F16 : VOP3P_Real_gfx10_gfx11_gfx12<0x10>;
defm V_PK_MIN_F16 : VOP3P_Real_gfx10_gfx11<0x11>;
defm V_PK_MAX_F16 : VOP3P_Real_gfx10_gfx11<0x12>;
-defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_Triple <0x20>;
-defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x21>;
-defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x22>;
+defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x20>;
+defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x21>;
+defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x22>;
defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>;
defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;
-defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x13>;
-defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11 <0x17>;
-defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11 <0x19>;
+defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_gfx12_Triple<0x13>;
+defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11_gfx12<0x17>;
+defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11_gfx12<0x19>;
defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x16>;
defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x18>;
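
Entries such as V_PK_MIN_NUM_F16 and V_PK_MAX_NUM_F16 above reuse an existing pseudo under a new GFX12 assembly name through the *_with_name real multiclasses, which also register a MnemonicAlias so the old spelling keeps assembling. A minimal sketch of that renaming shape, with invented records standing in for the real pseudo, encoding and MnemonicAlias classes:

// Sketch only: a real encoding that renames an existing pseudo and records
// an alias from the old mnemonic to the new one.
class ExamplePseudo<string mnemonic> { string Mnemonic = mnemonic; }
def V_EXAMPLE_OLD : ExamplePseudo<"v_example_old">;

class ExampleAlias<string from, string to> {
  string FromMnemonic = from;
  string ToMnemonic   = to;
}

multiclass ExampleReal_with_name_gfx12<string backing_ps_name, string asmName> {
  defvar ps = !cast<ExamplePseudo>(backing_ps_name);
  // The real encoding record would be emitted here as well; only the
  // alias bookkeeping is sketched.
  def _gfx12 : ExampleAlias<ps.Mnemonic, asmName>;
}

// The old pseudo is emitted under the new name, and the old spelling is
// kept as an alias record.
defm V_EXAMPLE_NEW : ExampleReal_with_name_gfx12<"V_EXAMPLE_OLD", "v_example_new">;
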
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 6fc3d0957dce..e5b801048e6d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1081,6 +1081,8 @@ multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
}
}
+defm : FCMP_Pattern <COND_O, V_CMP_O_F32_e64, f32>;
+defm : FCMP_Pattern <COND_UO, V_CMP_U_F32_e64, f32>;
defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F32_e64, f32>;
defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F32_e64, f32>;
@@ -1088,6 +1090,8 @@ defm : FCMP_Pattern <COND_OGE, V_CMP_GE_F32_e64, f32>;
defm : FCMP_Pattern <COND_OLT, V_CMP_LT_F32_e64, f32>;
defm : FCMP_Pattern <COND_OLE, V_CMP_LE_F32_e64, f32>;
+defm : FCMP_Pattern <COND_O, V_CMP_O_F64_e64, f64>;
+defm : FCMP_Pattern <COND_UO, V_CMP_U_F64_e64, f64>;
defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F64_e64, f64>;
defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F64_e64, f64>;
defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F64_e64, f64>;
@@ -1110,6 +1114,8 @@ defm : FCMP_Pattern <COND_ULT, V_CMP_NGE_F64_e64, f64>;
defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F64_e64, f64>;
let OtherPredicates = [HasTrue16BitInsts] in {
+defm : FCMP_Pattern <COND_O, V_CMP_O_F16_t16_e64, f16>;
+defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_t16_e64, f16>;
defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_t16_e64, f16>;
defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_t16_e64, f16>;
defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_t16_e64, f16>;
@@ -1126,6 +1132,8 @@ defm : FCMP_Pattern <COND_ULE, V_CMP_NGT_F16_t16_e64, f16>;
} // End OtherPredicates = [HasTrue16BitInsts]
let OtherPredicates = [NotHasTrue16BitInsts] in {
+defm : FCMP_Pattern <COND_O, V_CMP_O_F16_e64, f16>;
+defm : FCMP_Pattern <COND_UO, V_CMP_U_F16_e64, f16>;
defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F16_e64, f16>;
defm : FCMP_Pattern <COND_ONE, V_CMP_NEQ_F16_e64, f16>;
defm : FCMP_Pattern <COND_OGT, V_CMP_GT_F16_e64, f16>;
@@ -1315,53 +1323,52 @@ class VOPC64_DPP8_NoDst<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// GFX11.
+// GFX11, GFX12
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX11Only in {
- multiclass VOPC_Real_gfx11<bits<9> op> {
+multiclass VOPC_Real_Base<GFXGen Gen, bits<9> op> {
+ let AssemblerPredicate = Gen.AssemblerPredicate in {
defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_e32");
defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_e64");
- let DecoderNamespace = "GFX11" in {
- def _e32_gfx11 : VOPC_Real<ps32, SIEncodingFamily.GFX11>,
- VOPCe<op{7-0}>;
- def _e64_gfx11 : VOP3_Real<ps64, SIEncodingFamily.GFX11>,
- VOP3a_gfx11<{0, op}, ps64.Pfl> {
+ let DecoderNamespace = Gen.DecoderNamespace in {
+ def _e32#Gen.Suffix : VOPC_Real<ps32, Gen.Subtarget>,
+ VOPCe<op{7-0}>;
+ def _e64#Gen.Suffix : VOP3_Real<ps64, Gen.Subtarget>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
// Encoding used for VOPC instructions encoded as VOP3 differs from
// VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
bits<8> sdst;
let Inst{7-0} = sdst;
}
- } // End DecoderNamespace = "GFX11"
+ } // End DecoderNamespace = Gen.DecoderNamespace
- defm : VOPCInstAliases<NAME, "gfx11">;
+ defm : VOPCInstAliases<NAME, !substr(Gen.Suffix,1)>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
- let DecoderNamespace = "DPPGFX11" in {
- def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP,
- SIEncodingFamily.GFX11>;
- def _e32_dpp_w32_gfx11 : VOPC_DPP16<op{7-0}, psDPP> {
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
+ def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget>;
+ def _e32_dpp_w32#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- def _e32_dpp_w64_gfx11 : VOPC_DPP16<op{7-0}, psDPP> {
+ def _e32_dpp_w64#Gen.Suffix : VOPC_DPP16<op{7-0}, psDPP> {
let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave64;
}
}
defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8GFX11" in {
- def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32>;
- def _e32_dpp8_w32_gfx11 : VOPC_DPP8<op{7-0}, ps32> {
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32>;
+ def _e32_dpp8_w32#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- def _e32_dpp8_w64_gfx11 : VOPC_DPP8<op{7-0}, ps32> {
+ def _e32_dpp8_w64#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave64;
@@ -1371,83 +1378,84 @@ let AssemblerPredicate = isGFX11Only in {
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPPGFX11" in {
- def _e64_dpp_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP>,
- SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11>;
- def _e64_dpp_w32_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP> {
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
+ def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP>,
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
+ def _e64_dpp_w32#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
let AsmString = psDPP.OpName # " vcc_lo, " # AsmDPP;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- def _e64_dpp_w64_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP> {
+ def _e64_dpp_w64#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP> {
let AsmString = psDPP.OpName # " vcc, " # AsmDPP;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave64;
}
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8GFX11" in {
- def _e64_dpp8_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64>;
- def _e64_dpp8_w32_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64> {
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64>;
+ def _e64_dpp8_w32#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
let AsmString = ps32.OpName # " vcc_lo, " # AsmDPP8;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- def _e64_dpp8_w64_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64> {
+ def _e64_dpp8_w64#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64> {
let AsmString = ps32.OpName # " vcc, " # AsmDPP8;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave64;
}
}
}
+ } // AssemblerPredicate = Gen.AssemblerPredicate
+}
- }
-
- multiclass VOPC_Real_with_name_gfx11<bits<9> op, string OpName,
- string asm_name, string pseudo_mnemonic = ""> {
+multiclass VOPC_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
+ string asm_name, string pseudo_mnemonic = ""> {
+ let AssemblerPredicate = Gen.AssemblerPredicate in {
defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_e32");
defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_e64");
- let DecoderNamespace = "GFX11" in {
- def _e32_gfx11 :
+ let DecoderNamespace = Gen.DecoderNamespace in {
+ def _e32#Gen.Suffix :
// 32 and 64 bit forms of the instruction have _e32 and _e64
// respectively appended to their assembly mnemonic.
// _e64 is printed as part of the VOPDstS64orS32 operand, whereas
// the destination-less 32bit forms add it to the asmString here.
- VOPC_Real<ps32, SIEncodingFamily.GFX11, asm_name#"_e32">,
+ VOPC_Real<ps32, Gen.Subtarget, asm_name#"_e32">,
VOPCe<op{7-0}>,
MnemonicAlias<!if(!empty(pseudo_mnemonic), ps32.Mnemonic,
pseudo_mnemonic),
asm_name, ps32.AsmVariantName>,
- Requires<[isGFX11Plus]>;
- def _e64_gfx11 :
- VOP3_Real<ps64, SIEncodingFamily.GFX11, asm_name>,
- VOP3a_gfx11<{0, op}, ps64.Pfl>,
+ Requires<[Gen.AssemblerPredicate]>;
+ def _e64#Gen.Suffix :
+ VOP3_Real<ps64, Gen.Subtarget, asm_name>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl>,
MnemonicAlias<!if(!empty(pseudo_mnemonic), ps64.Mnemonic,
pseudo_mnemonic),
asm_name, ps64.AsmVariantName>,
- Requires<[isGFX11Plus]> {
+ Requires<[Gen.AssemblerPredicate]> {
// Encoding used for VOPC instructions encoded as VOP3 differs from
// VOP3e by destination name (sdst) as VOPC doesn't have vector dst.
bits<8> sdst;
let Inst{7-0} = sdst;
}
- } // End DecoderNamespace = "GFX11"
+ } // End DecoderNamespace = Gen.DecoderNamespace
- defm : VOPCInstAliases<OpName, "gfx11", NAME, asm_name>;
+ defm : VOPCInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
- let DecoderNamespace = "DPPGFX11" in {
- def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP,
- SIEncodingFamily.GFX11, asm_name>;
- def _e32_dpp_w32_gfx11
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
+ def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+ Gen.Subtarget, asm_name>;
+ def _e32_dpp_w32#Gen.Suffix
: VOPC_DPP16<op{7-0}, psDPP, asm_name> {
let AsmString = asm_name # " vcc_lo, " # AsmDPP;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- def _e32_dpp_w64_gfx11
+ def _e32_dpp_w64#Gen.Suffix
: VOPC_DPP16<op{7-0}, psDPP, asm_name> {
let AsmString = asm_name # " vcc, " # AsmDPP;
let isAsmParserOnly = 1;
@@ -1455,15 +1463,15 @@ let AssemblerPredicate = isGFX11Only in {
}
}
defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8GFX11" in {
- def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32, asm_name>;
- def _e32_dpp8_w32_gfx11
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
+ def _e32_dpp8_w32#Gen.Suffix
: VOPC_DPP8<op{7-0}, ps32, asm_name> {
let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- def _e32_dpp8_w64_gfx11
+ def _e32_dpp8_w64#Gen.Suffix
: VOPC_DPP8<op{7-0}, ps32, asm_name> {
let AsmString = asm_name # " vcc, " # AsmDPP8;
let isAsmParserOnly = 1;
@@ -1475,16 +1483,16 @@ let AssemblerPredicate = isGFX11Only in {
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPPGFX11" in {
- def _e64_dpp_gfx11 : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
- SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11>;
- def _e64_dpp_w32_gfx11
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
+ def _e64_dpp#Gen.Suffix : VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name>,
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget>;
+ def _e64_dpp_w32#Gen.Suffix
: VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
let AsmString = asm_name # " vcc_lo, " # AsmDPP;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- def _e64_dpp_w64_gfx11
+ def _e64_dpp_w64#Gen.Suffix
: VOPC64_DPP16_Dst<{0, op}, psDPP, asm_name> {
let AsmString = asm_name # " vcc, " # AsmDPP;
let isAsmParserOnly = 1;
@@ -1492,15 +1500,15 @@ let AssemblerPredicate = isGFX11Only in {
}
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8GFX11" in {
- def _e64_dpp8_gfx11 : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
- def _e64_dpp8_w32_gfx11
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_Dst<{0, op}, ps64, asm_name>;
+ def _e64_dpp8_w32#Gen.Suffix
: VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
let AsmString = asm_name # " vcc_lo, " # AsmDPP8;
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- def _e64_dpp8_w64_gfx11
+ def _e64_dpp8_w64#Gen.Suffix
: VOPC64_DPP8_Dst<{0, op}, ps64, asm_name> {
let AsmString = asm_name # " vcc, " # AsmDPP8;
let isAsmParserOnly = 1;
@@ -1508,44 +1516,47 @@ let AssemblerPredicate = isGFX11Only in {
}
}
}
- }
+ } // AssemblerPredicate = Gen.AssemblerPredicate
+}
- multiclass VOPC_Real_t16_gfx11<bits<9> op, string asm_name,
- string OpName = NAME> : VOPC_Real_with_name_gfx11<op, OpName, asm_name>;
+multiclass VOPC_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
+ string OpName = NAME, string pseudo_mnemonic = ""> :
+ VOPC_Real_with_name<Gen, op, OpName, asm_name, pseudo_mnemonic>;
- multiclass VOPCX_Real_gfx11<bits<9> op> {
+multiclass VOPCX_Real<GFXGen Gen, bits<9> op> {
+ let AssemblerPredicate = Gen.AssemblerPredicate in {
defvar ps32 = !cast<VOPC_Pseudo>(NAME#"_nosdst_e32");
defvar ps64 = !cast<VOP3_Pseudo>(NAME#"_nosdst_e64");
- let DecoderNamespace = "GFX11" in {
- def _e32_gfx11 :
- VOPC_Real<ps32, SIEncodingFamily.GFX11>,
+ let DecoderNamespace = Gen.DecoderNamespace in {
+ def _e32#Gen.Suffix :
+ VOPC_Real<ps32, Gen.Subtarget>,
VOPCe<op{7-0}> {
let AsmString = !subst("_nosdst", "", ps32.PseudoInstr)
# " " # ps32.AsmOperands;
}
- def _e64_gfx11 :
- VOP3_Real<ps64, SIEncodingFamily.GFX11>,
- VOP3a_gfx11<{0, op}, ps64.Pfl> {
+ def _e64#Gen.Suffix :
+ VOP3_Real<ps64, Gen.Subtarget>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
let Inst{7-0} = ?; // sdst
let AsmString = !subst("_nosdst", "", ps64.Mnemonic)
# "{_e64} " # ps64.AsmOperands;
}
- } // End DecoderNamespace = "GFX11"
+ } // End DecoderNamespace = Gen.DecoderNamespace
- defm : VOPCXInstAliases<NAME, "gfx11">;
+ defm : VOPCXInstAliases<NAME, !substr(Gen.Suffix, 1)>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
- let DecoderNamespace = "DPPGFX11" in {
- def _e32_dpp_gfx11
- : VOPC_DPP16_SIMC<op{7-0}, psDPP, SIEncodingFamily.GFX11> {
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
+ def _e32_dpp#Gen.Suffix
+ : VOPC_DPP16_SIMC<op{7-0}, psDPP, Gen.Subtarget> {
let AsmString = !subst("_nosdst", "", psDPP.OpName) # " " # AsmDPP;
}
}
defvar AsmDPP8 = ps32.Pfl.AsmDPP8;
- let DecoderNamespace = "DPP8GFX11" in {
- def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32> {
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32> {
let AsmString = !subst("_nosdst", "", ps32.OpName) # " " # AsmDPP8;
}
}
@@ -1554,268 +1565,305 @@ let AssemblerPredicate = isGFX11Only in {
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPPGFX11" in {
- def _e64_dpp_gfx11
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
+ def _e64_dpp#Gen.Suffix
: VOPC64_DPP16_NoDst<{0, op}, psDPP>,
- SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11> {
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
let AsmString = !subst("_nosdst", "", psDPP.OpName)
# "{_e64_dpp} " # AsmDPP;
}
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8GFX11" in {
- def _e64_dpp8_gfx11 : VOPC64_DPP8_NoDst<{0, op}, ps64> {
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64> {
let AsmString = !subst("_nosdst", "", ps64.OpName)
# "{_e64_dpp} " # AsmDPP8;
}
}
}
- }
+ } // AssemblerPredicate = Gen.AssemblerPredicate
+}
- multiclass VOPCX_Real_with_name_gfx11<bits<9> op, string OpName,
- string asm_name, string pseudo_mnemonic = ""> {
+multiclass VOPCX_Real_with_name<GFXGen Gen, bits<9> op, string OpName,
+ string asm_name, string pseudo_mnemonic = ""> {
+ let AssemblerPredicate = Gen.AssemblerPredicate in {
defvar ps32 = !cast<VOPC_Pseudo>(OpName#"_nosdst_e32");
defvar ps64 = !cast<VOP3_Pseudo>(OpName#"_nosdst_e64");
- let DecoderNamespace = "GFX11" in {
- def _e32_gfx11
- : VOPC_Real<ps32, SIEncodingFamily.GFX11, asm_name>,
+ let DecoderNamespace = Gen.DecoderNamespace in {
+ def _e32#Gen.Suffix
+ : VOPC_Real<ps32, Gen.Subtarget, asm_name>,
MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps32.Mnemonic),
pseudo_mnemonic),
asm_name, ps32.AsmVariantName>,
- Requires<[isGFX11Plus]>,
+ Requires<[Gen.AssemblerPredicate]>,
VOPCe<op{7-0}> {
let AsmString = asm_name # "{_e32} " # ps32.AsmOperands;
}
- def _e64_gfx11
- : VOP3_Real<ps64, SIEncodingFamily.GFX11, asm_name>,
+ def _e64#Gen.Suffix
+ : VOP3_Real<ps64, Gen.Subtarget, asm_name>,
MnemonicAlias<!if(!empty(pseudo_mnemonic), !subst("_nosdst", "", ps64.Mnemonic),
pseudo_mnemonic),
asm_name, ps64.AsmVariantName>,
- Requires<[isGFX11Plus]>,
- VOP3a_gfx11<{0, op}, ps64.Pfl> {
+ Requires<[Gen.AssemblerPredicate]>,
+ VOP3a_gfx11_gfx12<{0, op}, ps64.Pfl> {
let Inst{7-0} = ? ; // sdst
let AsmString = asm_name # "{_e64} " # ps64.AsmOperands;
}
- } // End DecoderNamespace = "GFX11"
+ } // End DecoderNamespace = Gen.DecoderNamespace
- defm : VOPCXInstAliases<OpName, "gfx11", NAME, asm_name>;
+ defm : VOPCXInstAliases<OpName, !substr(Gen.Suffix, 1), NAME, asm_name>;
if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp");
- let DecoderNamespace = "DPPGFX11" in {
- def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP,
- SIEncodingFamily.GFX11, asm_name>;
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
+ def _e32_dpp#Gen.Suffix : VOPC_DPP16_SIMC<op{7-0}, psDPP,
+ Gen.Subtarget, asm_name>;
}
- let DecoderNamespace = "DPP8GFX11" in {
- def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32, asm_name>;
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
+ def _e32_dpp8#Gen.Suffix : VOPC_DPP8<op{7-0}, ps32, asm_name>;
}
}
if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
- let DecoderNamespace = "DPPGFX11" in {
- def _e64_dpp_gfx11
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace in {
+ def _e64_dpp#Gen.Suffix
: VOPC64_DPP16_NoDst<{0, op}, psDPP, asm_name>,
- SIMCInstr<psDPP.PseudoInstr, SIEncodingFamily.GFX11> {
+ SIMCInstr<psDPP.PseudoInstr, Gen.Subtarget> {
let AsmString = asm_name # "{_e64_dpp} " # AsmDPP;
}
}
defvar AsmDPP8 = ps64.Pfl.AsmVOP3DPP8;
- let DecoderNamespace = "DPP8GFX11" in {
- def _e64_dpp8_gfx11 : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> {
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace in {
+ def _e64_dpp8#Gen.Suffix : VOPC64_DPP8_NoDst<{0, op}, ps64, asm_name> {
let AsmString = asm_name # "{_e64_dpp} " # AsmDPP8;
}
}
}
- }
+ } // AssemblerPredicate = Gen.AssemblerPredicate
+}
- multiclass VOPCX_Real_t16_gfx11<bits<9> op, string asm_name,
- string OpName = NAME> : VOPCX_Real_with_name_gfx11<op, OpName, asm_name>;
+multiclass VOPCX_Real_t16<GFXGen Gen, bits<9> op, string asm_name,
+ string OpName = NAME, string pseudo_mnemonic = ""> :
+ VOPCX_Real_with_name<Gen, op, OpName, asm_name, pseudo_mnemonic>;
+multiclass VOPC_Real_gfx11<bits<9> op> : VOPC_Real_Base<GFX11Gen, op>;
-} // End AssemblerPredicate = isGFX11Only
+multiclass VOPC_Real_with_name_gfx11<bits<9> op, string OpName, string asm_name,
+ string pseudo_mnemonic = "">
+ : VOPC_Real_with_name<GFX11Gen, op, OpName, asm_name, pseudo_mnemonic>;
+
+multiclass VOPCX_Real_gfx11<bits<9> op> : VOPCX_Real<GFX11Gen, op>;
+
+multiclass VOPCX_Real_with_name_gfx11<bits<9> op, string OpName,
+ string asm_name, string pseudo_mnemonic = ""> :
+ VOPCX_Real_with_name<GFX11Gen, op, OpName, asm_name, pseudo_mnemonic>;
+
+multiclass VOPC_Real_gfx11_gfx12<bits<9> op> :
+ VOPC_Real_Base<GFX11Gen, op>, VOPC_Real_Base<GFX12Gen, op>;
+
+multiclass VOPCX_Real_gfx11_gfx12<bits<9> op> :
+ VOPCX_Real<GFX11Gen, op>, VOPCX_Real<GFX12Gen, op>;
+
+multiclass VOPC_Real_t16_gfx11<bits <9> op, string asm_name,
+ string OpName = NAME, string pseudo_mnemonic = ""> :
+ VOPC_Real_t16<GFX11Gen, op, asm_name, OpName, pseudo_mnemonic>;
+
+multiclass VOPC_Real_t16_gfx11_gfx12<bits <9> op, string asm_name,
+ string OpName = NAME, string pseudo_mnemonic = ""> :
+ VOPC_Real_t16<GFX11Gen, op, asm_name, OpName, pseudo_mnemonic>,
+ VOPC_Real_t16<GFX12Gen, op, asm_name, OpName, pseudo_mnemonic>;
+
+multiclass VOPCX_Real_t16_gfx11<bits<9> op, string asm_name,
+ string OpName = NAME, string pseudo_mnemonic = ""> :
+ VOPCX_Real_t16<GFX11Gen, op, asm_name, OpName, pseudo_mnemonic>;
+
+multiclass VOPCX_Real_t16_gfx11_gfx12<bits<9> op, string asm_name,
+ string OpName = NAME, string pseudo_mnemonic = ""> :
+ VOPCX_Real_t16<GFX11Gen, op, asm_name, OpName, pseudo_mnemonic>,
+ VOPCX_Real_t16<GFX12Gen, op, asm_name, OpName, pseudo_mnemonic>;
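// A rough sketch of how the wrappers above compose (record names inferred from
// Gen.Suffix rather than spelled out in the patch): a single
//   defm V_CMP_LT_F32 : VOPC_Real_gfx11_gfx12<0x011>;
// instantiates VOPC_Real_Base for both GFX11Gen and GFX12Gen, so one defm now
// emits the _e32/_e64 (and, where the profile allows, DPP/DPP8) real records
// for both generations, e.g. V_CMP_LT_F32_e32_gfx11 and V_CMP_LT_F32_e32_gfx12.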
defm V_CMP_F_F16_t16 : VOPC_Real_t16_gfx11<0x000, "v_cmp_f_f16">;
-defm V_CMP_LT_F16_t16 : VOPC_Real_t16_gfx11<0x001, "v_cmp_lt_f16">;
-defm V_CMP_EQ_F16_t16 : VOPC_Real_t16_gfx11<0x002, "v_cmp_eq_f16">;
-defm V_CMP_LE_F16_t16 : VOPC_Real_t16_gfx11<0x003, "v_cmp_le_f16">;
-defm V_CMP_GT_F16_t16 : VOPC_Real_t16_gfx11<0x004, "v_cmp_gt_f16">;
-defm V_CMP_LG_F16_t16 : VOPC_Real_t16_gfx11<0x005, "v_cmp_lg_f16">;
-defm V_CMP_GE_F16_t16 : VOPC_Real_t16_gfx11<0x006, "v_cmp_ge_f16">;
-defm V_CMP_O_F16_t16 : VOPC_Real_t16_gfx11<0x007, "v_cmp_o_f16">;
-defm V_CMP_U_F16_t16 : VOPC_Real_t16_gfx11<0x008, "v_cmp_u_f16">;
-defm V_CMP_NGE_F16_t16 : VOPC_Real_t16_gfx11<0x009, "v_cmp_nge_f16">;
-defm V_CMP_NLG_F16_t16 : VOPC_Real_t16_gfx11<0x00a, "v_cmp_nlg_f16">;
-defm V_CMP_NGT_F16_t16 : VOPC_Real_t16_gfx11<0x00b, "v_cmp_ngt_f16">;
-defm V_CMP_NLE_F16_t16 : VOPC_Real_t16_gfx11<0x00c, "v_cmp_nle_f16">;
-defm V_CMP_NEQ_F16_t16 : VOPC_Real_t16_gfx11<0x00d, "v_cmp_neq_f16">;
-defm V_CMP_NLT_F16_t16 : VOPC_Real_t16_gfx11<0x00e, "v_cmp_nlt_f16">;
+defm V_CMP_LT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x001, "v_cmp_lt_f16">;
+defm V_CMP_EQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x002, "v_cmp_eq_f16">;
+defm V_CMP_LE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x003, "v_cmp_le_f16">;
+defm V_CMP_GT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x004, "v_cmp_gt_f16">;
+defm V_CMP_LG_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x005, "v_cmp_lg_f16">;
+defm V_CMP_GE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x006, "v_cmp_ge_f16">;
+defm V_CMP_O_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x007, "v_cmp_o_f16">;
+defm V_CMP_U_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x008, "v_cmp_u_f16">;
+defm V_CMP_NGE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x009, "v_cmp_nge_f16">;
+defm V_CMP_NLG_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00a, "v_cmp_nlg_f16">;
+defm V_CMP_NGT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00b, "v_cmp_ngt_f16">;
+defm V_CMP_NLE_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00c, "v_cmp_nle_f16">;
+defm V_CMP_NEQ_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00d, "v_cmp_neq_f16">;
+defm V_CMP_NLT_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x00e, "v_cmp_nlt_f16">;
defm V_CMP_T_F16_t16 : VOPC_Real_with_name_gfx11<0x00f, "V_CMP_TRU_F16_t16", "v_cmp_t_f16", "v_cmp_tru_f16">;
defm V_CMP_F_F32 : VOPC_Real_gfx11<0x010>;
-defm V_CMP_LT_F32 : VOPC_Real_gfx11<0x011>;
-defm V_CMP_EQ_F32 : VOPC_Real_gfx11<0x012>;
-defm V_CMP_LE_F32 : VOPC_Real_gfx11<0x013>;
-defm V_CMP_GT_F32 : VOPC_Real_gfx11<0x014>;
-defm V_CMP_LG_F32 : VOPC_Real_gfx11<0x015>;
-defm V_CMP_GE_F32 : VOPC_Real_gfx11<0x016>;
-defm V_CMP_O_F32 : VOPC_Real_gfx11<0x017>;
-defm V_CMP_U_F32 : VOPC_Real_gfx11<0x018>;
-defm V_CMP_NGE_F32 : VOPC_Real_gfx11<0x019>;
-defm V_CMP_NLG_F32 : VOPC_Real_gfx11<0x01a>;
-defm V_CMP_NGT_F32 : VOPC_Real_gfx11<0x01b>;
-defm V_CMP_NLE_F32 : VOPC_Real_gfx11<0x01c>;
-defm V_CMP_NEQ_F32 : VOPC_Real_gfx11<0x01d>;
-defm V_CMP_NLT_F32 : VOPC_Real_gfx11<0x01e>;
+defm V_CMP_LT_F32 : VOPC_Real_gfx11_gfx12<0x011>;
+defm V_CMP_EQ_F32 : VOPC_Real_gfx11_gfx12<0x012>;
+defm V_CMP_LE_F32 : VOPC_Real_gfx11_gfx12<0x013>;
+defm V_CMP_GT_F32 : VOPC_Real_gfx11_gfx12<0x014>;
+defm V_CMP_LG_F32 : VOPC_Real_gfx11_gfx12<0x015>;
+defm V_CMP_GE_F32 : VOPC_Real_gfx11_gfx12<0x016>;
+defm V_CMP_O_F32 : VOPC_Real_gfx11_gfx12<0x017>;
+defm V_CMP_U_F32 : VOPC_Real_gfx11_gfx12<0x018>;
+defm V_CMP_NGE_F32 : VOPC_Real_gfx11_gfx12<0x019>;
+defm V_CMP_NLG_F32 : VOPC_Real_gfx11_gfx12<0x01a>;
+defm V_CMP_NGT_F32 : VOPC_Real_gfx11_gfx12<0x01b>;
+defm V_CMP_NLE_F32 : VOPC_Real_gfx11_gfx12<0x01c>;
+defm V_CMP_NEQ_F32 : VOPC_Real_gfx11_gfx12<0x01d>;
+defm V_CMP_NLT_F32 : VOPC_Real_gfx11_gfx12<0x01e>;
defm V_CMP_T_F32 : VOPC_Real_with_name_gfx11<0x01f, "V_CMP_TRU_F32", "v_cmp_t_f32">;
defm V_CMP_T_F64 : VOPC_Real_with_name_gfx11<0x02f, "V_CMP_TRU_F64", "v_cmp_t_f64">;
-defm V_CMP_LT_I16_t16 : VOPC_Real_t16_gfx11<0x031, "v_cmp_lt_i16">;
-defm V_CMP_EQ_I16_t16 : VOPC_Real_t16_gfx11<0x032, "v_cmp_eq_i16">;
-defm V_CMP_LE_I16_t16 : VOPC_Real_t16_gfx11<0x033, "v_cmp_le_i16">;
-defm V_CMP_GT_I16_t16 : VOPC_Real_t16_gfx11<0x034, "v_cmp_gt_i16">;
-defm V_CMP_NE_I16_t16 : VOPC_Real_t16_gfx11<0x035, "v_cmp_ne_i16">;
-defm V_CMP_GE_I16_t16 : VOPC_Real_t16_gfx11<0x036, "v_cmp_ge_i16">;
-defm V_CMP_LT_U16_t16 : VOPC_Real_t16_gfx11<0x039, "v_cmp_lt_u16">;
-defm V_CMP_EQ_U16_t16 : VOPC_Real_t16_gfx11<0x03a, "v_cmp_eq_u16">;
-defm V_CMP_LE_U16_t16 : VOPC_Real_t16_gfx11<0x03b, "v_cmp_le_u16">;
-defm V_CMP_GT_U16_t16 : VOPC_Real_t16_gfx11<0x03c, "v_cmp_gt_u16">;
-defm V_CMP_NE_U16_t16 : VOPC_Real_t16_gfx11<0x03d, "v_cmp_ne_u16">;
-defm V_CMP_GE_U16_t16 : VOPC_Real_t16_gfx11<0x03e, "v_cmp_ge_u16">;
+defm V_CMP_LT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x031, "v_cmp_lt_i16">;
+defm V_CMP_EQ_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x032, "v_cmp_eq_i16">;
+defm V_CMP_LE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x033, "v_cmp_le_i16">;
+defm V_CMP_GT_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x034, "v_cmp_gt_i16">;
+defm V_CMP_NE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x035, "v_cmp_ne_i16">;
+defm V_CMP_GE_I16_t16 : VOPC_Real_t16_gfx11_gfx12<0x036, "v_cmp_ge_i16">;
+defm V_CMP_LT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x039, "v_cmp_lt_u16">;
+defm V_CMP_EQ_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03a, "v_cmp_eq_u16">;
+defm V_CMP_LE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03b, "v_cmp_le_u16">;
+defm V_CMP_GT_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03c, "v_cmp_gt_u16">;
+defm V_CMP_NE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03d, "v_cmp_ne_u16">;
+defm V_CMP_GE_U16_t16 : VOPC_Real_t16_gfx11_gfx12<0x03e, "v_cmp_ge_u16">;
defm V_CMP_F_I32 : VOPC_Real_gfx11<0x040>;
-defm V_CMP_LT_I32 : VOPC_Real_gfx11<0x041>;
-defm V_CMP_EQ_I32 : VOPC_Real_gfx11<0x042>;
-defm V_CMP_LE_I32 : VOPC_Real_gfx11<0x043>;
-defm V_CMP_GT_I32 : VOPC_Real_gfx11<0x044>;
-defm V_CMP_NE_I32 : VOPC_Real_gfx11<0x045>;
-defm V_CMP_GE_I32 : VOPC_Real_gfx11<0x046>;
+defm V_CMP_LT_I32 : VOPC_Real_gfx11_gfx12<0x041>;
+defm V_CMP_EQ_I32 : VOPC_Real_gfx11_gfx12<0x042>;
+defm V_CMP_LE_I32 : VOPC_Real_gfx11_gfx12<0x043>;
+defm V_CMP_GT_I32 : VOPC_Real_gfx11_gfx12<0x044>;
+defm V_CMP_NE_I32 : VOPC_Real_gfx11_gfx12<0x045>;
+defm V_CMP_GE_I32 : VOPC_Real_gfx11_gfx12<0x046>;
defm V_CMP_T_I32 : VOPC_Real_gfx11<0x047>;
defm V_CMP_F_U32 : VOPC_Real_gfx11<0x048>;
-defm V_CMP_LT_U32 : VOPC_Real_gfx11<0x049>;
-defm V_CMP_EQ_U32 : VOPC_Real_gfx11<0x04a>;
-defm V_CMP_LE_U32 : VOPC_Real_gfx11<0x04b>;
-defm V_CMP_GT_U32 : VOPC_Real_gfx11<0x04c>;
-defm V_CMP_NE_U32 : VOPC_Real_gfx11<0x04d>;
-defm V_CMP_GE_U32 : VOPC_Real_gfx11<0x04e>;
+defm V_CMP_LT_U32 : VOPC_Real_gfx11_gfx12<0x049>;
+defm V_CMP_EQ_U32 : VOPC_Real_gfx11_gfx12<0x04a>;
+defm V_CMP_LE_U32 : VOPC_Real_gfx11_gfx12<0x04b>;
+defm V_CMP_GT_U32 : VOPC_Real_gfx11_gfx12<0x04c>;
+defm V_CMP_NE_U32 : VOPC_Real_gfx11_gfx12<0x04d>;
+defm V_CMP_GE_U32 : VOPC_Real_gfx11_gfx12<0x04e>;
defm V_CMP_T_U32 : VOPC_Real_gfx11<0x04f>;
defm V_CMP_F_I64 : VOPC_Real_gfx11<0x050>;
-defm V_CMP_LT_I64 : VOPC_Real_gfx11<0x051>;
-defm V_CMP_EQ_I64 : VOPC_Real_gfx11<0x052>;
-defm V_CMP_LE_I64 : VOPC_Real_gfx11<0x053>;
-defm V_CMP_GT_I64 : VOPC_Real_gfx11<0x054>;
-defm V_CMP_NE_I64 : VOPC_Real_gfx11<0x055>;
-defm V_CMP_GE_I64 : VOPC_Real_gfx11<0x056>;
+defm V_CMP_LT_I64 : VOPC_Real_gfx11_gfx12<0x051>;
+defm V_CMP_EQ_I64 : VOPC_Real_gfx11_gfx12<0x052>;
+defm V_CMP_LE_I64 : VOPC_Real_gfx11_gfx12<0x053>;
+defm V_CMP_GT_I64 : VOPC_Real_gfx11_gfx12<0x054>;
+defm V_CMP_NE_I64 : VOPC_Real_gfx11_gfx12<0x055>;
+defm V_CMP_GE_I64 : VOPC_Real_gfx11_gfx12<0x056>;
defm V_CMP_T_I64 : VOPC_Real_gfx11<0x057>;
defm V_CMP_F_U64 : VOPC_Real_gfx11<0x058>;
-defm V_CMP_LT_U64 : VOPC_Real_gfx11<0x059>;
-defm V_CMP_EQ_U64 : VOPC_Real_gfx11<0x05a>;
-defm V_CMP_LE_U64 : VOPC_Real_gfx11<0x05b>;
-defm V_CMP_GT_U64 : VOPC_Real_gfx11<0x05c>;
-defm V_CMP_NE_U64 : VOPC_Real_gfx11<0x05d>;
-defm V_CMP_GE_U64 : VOPC_Real_gfx11<0x05e>;
+defm V_CMP_LT_U64 : VOPC_Real_gfx11_gfx12<0x059>;
+defm V_CMP_EQ_U64 : VOPC_Real_gfx11_gfx12<0x05a>;
+defm V_CMP_LE_U64 : VOPC_Real_gfx11_gfx12<0x05b>;
+defm V_CMP_GT_U64 : VOPC_Real_gfx11_gfx12<0x05c>;
+defm V_CMP_NE_U64 : VOPC_Real_gfx11_gfx12<0x05d>;
+defm V_CMP_GE_U64 : VOPC_Real_gfx11_gfx12<0x05e>;
defm V_CMP_T_U64 : VOPC_Real_gfx11<0x05f>;
-defm V_CMP_CLASS_F16_t16 : VOPC_Real_t16_gfx11<0x07d, "v_cmp_class_f16">;
-defm V_CMP_CLASS_F32 : VOPC_Real_gfx11<0x07e>;
-defm V_CMP_CLASS_F64 : VOPC_Real_gfx11<0x07f>;
+defm V_CMP_CLASS_F16_t16 : VOPC_Real_t16_gfx11_gfx12<0x07d, "v_cmp_class_f16">;
+defm V_CMP_CLASS_F32 : VOPC_Real_gfx11_gfx12<0x07e>;
+defm V_CMP_CLASS_F64 : VOPC_Real_gfx11_gfx12<0x07f>;
defm V_CMPX_F_F16_t16 : VOPCX_Real_t16_gfx11<0x080, "v_cmpx_f_f16">;
-defm V_CMPX_LT_F16_t16 : VOPCX_Real_t16_gfx11<0x081, "v_cmpx_lt_f16">;
-defm V_CMPX_EQ_F16_t16 : VOPCX_Real_t16_gfx11<0x082, "v_cmpx_eq_f16">;
-defm V_CMPX_LE_F16_t16 : VOPCX_Real_t16_gfx11<0x083, "v_cmpx_le_f16">;
-defm V_CMPX_GT_F16_t16 : VOPCX_Real_t16_gfx11<0x084, "v_cmpx_gt_f16">;
-defm V_CMPX_LG_F16_t16 : VOPCX_Real_t16_gfx11<0x085, "v_cmpx_lg_f16">;
-defm V_CMPX_GE_F16_t16 : VOPCX_Real_t16_gfx11<0x086, "v_cmpx_ge_f16">;
-defm V_CMPX_O_F16_t16 : VOPCX_Real_t16_gfx11<0x087, "v_cmpx_o_f16">;
-defm V_CMPX_U_F16_t16 : VOPCX_Real_t16_gfx11<0x088, "v_cmpx_u_f16">;
-defm V_CMPX_NGE_F16_t16 : VOPCX_Real_t16_gfx11<0x089, "v_cmpx_nge_f16">;
-defm V_CMPX_NLG_F16_t16 : VOPCX_Real_t16_gfx11<0x08a, "v_cmpx_nlg_f16">;
-defm V_CMPX_NGT_F16_t16 : VOPCX_Real_t16_gfx11<0x08b, "v_cmpx_ngt_f16">;
-defm V_CMPX_NLE_F16_t16 : VOPCX_Real_t16_gfx11<0x08c, "v_cmpx_nle_f16">;
-defm V_CMPX_NEQ_F16_t16 : VOPCX_Real_t16_gfx11<0x08d, "v_cmpx_neq_f16">;
-defm V_CMPX_NLT_F16_t16 : VOPCX_Real_t16_gfx11<0x08e, "v_cmpx_nlt_f16">;
+defm V_CMPX_LT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x081, "v_cmpx_lt_f16">;
+defm V_CMPX_EQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x082, "v_cmpx_eq_f16">;
+defm V_CMPX_LE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x083, "v_cmpx_le_f16">;
+defm V_CMPX_GT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x084, "v_cmpx_gt_f16">;
+defm V_CMPX_LG_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x085, "v_cmpx_lg_f16">;
+defm V_CMPX_GE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x086, "v_cmpx_ge_f16">;
+defm V_CMPX_O_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x087, "v_cmpx_o_f16">;
+defm V_CMPX_U_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x088, "v_cmpx_u_f16">;
+defm V_CMPX_NGE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x089, "v_cmpx_nge_f16">;
+defm V_CMPX_NLG_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08a, "v_cmpx_nlg_f16">;
+defm V_CMPX_NGT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08b, "v_cmpx_ngt_f16">;
+defm V_CMPX_NLE_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08c, "v_cmpx_nle_f16">;
+defm V_CMPX_NEQ_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08d, "v_cmpx_neq_f16">;
+defm V_CMPX_NLT_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x08e, "v_cmpx_nlt_f16">;
defm V_CMPX_T_F16_t16 : VOPCX_Real_with_name_gfx11<0x08f, "V_CMPX_TRU_F16_t16", "v_cmpx_t_f16", "v_cmpx_tru_f16">;
defm V_CMPX_F_F32 : VOPCX_Real_gfx11<0x090>;
-defm V_CMPX_LT_F32 : VOPCX_Real_gfx11<0x091>;
-defm V_CMPX_EQ_F32 : VOPCX_Real_gfx11<0x092>;
-defm V_CMPX_LE_F32 : VOPCX_Real_gfx11<0x093>;
-defm V_CMPX_GT_F32 : VOPCX_Real_gfx11<0x094>;
-defm V_CMPX_LG_F32 : VOPCX_Real_gfx11<0x095>;
-defm V_CMPX_GE_F32 : VOPCX_Real_gfx11<0x096>;
-defm V_CMPX_O_F32 : VOPCX_Real_gfx11<0x097>;
-defm V_CMPX_U_F32 : VOPCX_Real_gfx11<0x098>;
-defm V_CMPX_NGE_F32 : VOPCX_Real_gfx11<0x099>;
-defm V_CMPX_NLG_F32 : VOPCX_Real_gfx11<0x09a>;
-defm V_CMPX_NGT_F32 : VOPCX_Real_gfx11<0x09b>;
-defm V_CMPX_NLE_F32 : VOPCX_Real_gfx11<0x09c>;
-defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx11<0x09d>;
-defm V_CMPX_NLT_F32 : VOPCX_Real_gfx11<0x09e>;
+defm V_CMPX_LT_F32 : VOPCX_Real_gfx11_gfx12<0x091>;
+defm V_CMPX_EQ_F32 : VOPCX_Real_gfx11_gfx12<0x092>;
+defm V_CMPX_LE_F32 : VOPCX_Real_gfx11_gfx12<0x093>;
+defm V_CMPX_GT_F32 : VOPCX_Real_gfx11_gfx12<0x094>;
+defm V_CMPX_LG_F32 : VOPCX_Real_gfx11_gfx12<0x095>;
+defm V_CMPX_GE_F32 : VOPCX_Real_gfx11_gfx12<0x096>;
+defm V_CMPX_O_F32 : VOPCX_Real_gfx11_gfx12<0x097>;
+defm V_CMPX_U_F32 : VOPCX_Real_gfx11_gfx12<0x098>;
+defm V_CMPX_NGE_F32 : VOPCX_Real_gfx11_gfx12<0x099>;
+defm V_CMPX_NLG_F32 : VOPCX_Real_gfx11_gfx12<0x09a>;
+defm V_CMPX_NGT_F32 : VOPCX_Real_gfx11_gfx12<0x09b>;
+defm V_CMPX_NLE_F32 : VOPCX_Real_gfx11_gfx12<0x09c>;
+defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx11_gfx12<0x09d>;
+defm V_CMPX_NLT_F32 : VOPCX_Real_gfx11_gfx12<0x09e>;
defm V_CMPX_T_F32 : VOPCX_Real_with_name_gfx11<0x09f, "V_CMPX_TRU_F32", "v_cmpx_t_f32">;
defm V_CMPX_F_F64 : VOPCX_Real_gfx11<0x0a0>;
-defm V_CMPX_LT_F64 : VOPCX_Real_gfx11<0x0a1>;
-defm V_CMPX_EQ_F64 : VOPCX_Real_gfx11<0x0a2>;
-defm V_CMPX_LE_F64 : VOPCX_Real_gfx11<0x0a3>;
-defm V_CMPX_GT_F64 : VOPCX_Real_gfx11<0x0a4>;
-defm V_CMPX_LG_F64 : VOPCX_Real_gfx11<0x0a5>;
-defm V_CMPX_GE_F64 : VOPCX_Real_gfx11<0x0a6>;
-defm V_CMPX_O_F64 : VOPCX_Real_gfx11<0x0a7>;
-defm V_CMPX_U_F64 : VOPCX_Real_gfx11<0x0a8>;
-defm V_CMPX_NGE_F64 : VOPCX_Real_gfx11<0x0a9>;
-defm V_CMPX_NLG_F64 : VOPCX_Real_gfx11<0x0aa>;
-defm V_CMPX_NGT_F64 : VOPCX_Real_gfx11<0x0ab>;
-defm V_CMPX_NLE_F64 : VOPCX_Real_gfx11<0x0ac>;
-defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11<0x0ad>;
-defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11<0x0ae>;
+defm V_CMPX_LT_F64 : VOPCX_Real_gfx11_gfx12<0x0a1>;
+defm V_CMPX_EQ_F64 : VOPCX_Real_gfx11_gfx12<0x0a2>;
+defm V_CMPX_LE_F64 : VOPCX_Real_gfx11_gfx12<0x0a3>;
+defm V_CMPX_GT_F64 : VOPCX_Real_gfx11_gfx12<0x0a4>;
+defm V_CMPX_LG_F64 : VOPCX_Real_gfx11_gfx12<0x0a5>;
+defm V_CMPX_GE_F64 : VOPCX_Real_gfx11_gfx12<0x0a6>;
+defm V_CMPX_O_F64 : VOPCX_Real_gfx11_gfx12<0x0a7>;
+defm V_CMPX_U_F64 : VOPCX_Real_gfx11_gfx12<0x0a8>;
+defm V_CMPX_NGE_F64 : VOPCX_Real_gfx11_gfx12<0x0a9>;
+defm V_CMPX_NLG_F64 : VOPCX_Real_gfx11_gfx12<0x0aa>;
+defm V_CMPX_NGT_F64 : VOPCX_Real_gfx11_gfx12<0x0ab>;
+defm V_CMPX_NLE_F64 : VOPCX_Real_gfx11_gfx12<0x0ac>;
+defm V_CMPX_NEQ_F64 : VOPCX_Real_gfx11_gfx12<0x0ad>;
+defm V_CMPX_NLT_F64 : VOPCX_Real_gfx11_gfx12<0x0ae>;
defm V_CMPX_T_F64 : VOPCX_Real_with_name_gfx11<0x0af, "V_CMPX_TRU_F64", "v_cmpx_t_f64">;
-defm V_CMPX_LT_I16_t16 : VOPCX_Real_t16_gfx11<0x0b1, "v_cmpx_lt_i16">;
-defm V_CMPX_EQ_I16_t16 : VOPCX_Real_t16_gfx11<0x0b2, "v_cmpx_eq_i16">;
-defm V_CMPX_LE_I16_t16 : VOPCX_Real_t16_gfx11<0x0b3, "v_cmpx_le_i16">;
-defm V_CMPX_GT_I16_t16 : VOPCX_Real_t16_gfx11<0x0b4, "v_cmpx_gt_i16">;
-defm V_CMPX_NE_I16_t16 : VOPCX_Real_t16_gfx11<0x0b5, "v_cmpx_ne_i16">;
-defm V_CMPX_GE_I16_t16 : VOPCX_Real_t16_gfx11<0x0b6, "v_cmpx_ge_i16">;
-defm V_CMPX_LT_U16_t16 : VOPCX_Real_t16_gfx11<0x0b9, "v_cmpx_lt_u16">;
-defm V_CMPX_EQ_U16_t16 : VOPCX_Real_t16_gfx11<0x0ba, "v_cmpx_eq_u16">;
-defm V_CMPX_LE_U16_t16 : VOPCX_Real_t16_gfx11<0x0bb, "v_cmpx_le_u16">;
-defm V_CMPX_GT_U16_t16 : VOPCX_Real_t16_gfx11<0x0bc, "v_cmpx_gt_u16">;
-defm V_CMPX_NE_U16_t16 : VOPCX_Real_t16_gfx11<0x0bd, "v_cmpx_ne_u16">;
-defm V_CMPX_GE_U16_t16 : VOPCX_Real_t16_gfx11<0x0be, "v_cmpx_ge_u16">;
+defm V_CMPX_LT_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b1, "v_cmpx_lt_i16">;
+defm V_CMPX_EQ_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b2, "v_cmpx_eq_i16">;
+defm V_CMPX_LE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b3, "v_cmpx_le_i16">;
+defm V_CMPX_GT_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b4, "v_cmpx_gt_i16">;
+defm V_CMPX_NE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b5, "v_cmpx_ne_i16">;
+defm V_CMPX_GE_I16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b6, "v_cmpx_ge_i16">;
+defm V_CMPX_LT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0b9, "v_cmpx_lt_u16">;
+defm V_CMPX_EQ_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0ba, "v_cmpx_eq_u16">;
+defm V_CMPX_LE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bb, "v_cmpx_le_u16">;
+defm V_CMPX_GT_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bc, "v_cmpx_gt_u16">;
+defm V_CMPX_NE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0bd, "v_cmpx_ne_u16">;
+defm V_CMPX_GE_U16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0be, "v_cmpx_ge_u16">;
defm V_CMPX_F_I32 : VOPCX_Real_gfx11<0x0c0>;
-defm V_CMPX_LT_I32 : VOPCX_Real_gfx11<0x0c1>;
-defm V_CMPX_EQ_I32 : VOPCX_Real_gfx11<0x0c2>;
-defm V_CMPX_LE_I32 : VOPCX_Real_gfx11<0x0c3>;
-defm V_CMPX_GT_I32 : VOPCX_Real_gfx11<0x0c4>;
-defm V_CMPX_NE_I32 : VOPCX_Real_gfx11<0x0c5>;
-defm V_CMPX_GE_I32 : VOPCX_Real_gfx11<0x0c6>;
+defm V_CMPX_LT_I32 : VOPCX_Real_gfx11_gfx12<0x0c1>;
+defm V_CMPX_EQ_I32 : VOPCX_Real_gfx11_gfx12<0x0c2>;
+defm V_CMPX_LE_I32 : VOPCX_Real_gfx11_gfx12<0x0c3>;
+defm V_CMPX_GT_I32 : VOPCX_Real_gfx11_gfx12<0x0c4>;
+defm V_CMPX_NE_I32 : VOPCX_Real_gfx11_gfx12<0x0c5>;
+defm V_CMPX_GE_I32 : VOPCX_Real_gfx11_gfx12<0x0c6>;
defm V_CMPX_T_I32 : VOPCX_Real_gfx11<0x0c7>;
defm V_CMPX_F_U32 : VOPCX_Real_gfx11<0x0c8>;
-defm V_CMPX_LT_U32 : VOPCX_Real_gfx11<0x0c9>;
-defm V_CMPX_EQ_U32 : VOPCX_Real_gfx11<0x0ca>;
-defm V_CMPX_LE_U32 : VOPCX_Real_gfx11<0x0cb>;
-defm V_CMPX_GT_U32 : VOPCX_Real_gfx11<0x0cc>;
-defm V_CMPX_NE_U32 : VOPCX_Real_gfx11<0x0cd>;
-defm V_CMPX_GE_U32 : VOPCX_Real_gfx11<0x0ce>;
+defm V_CMPX_LT_U32 : VOPCX_Real_gfx11_gfx12<0x0c9>;
+defm V_CMPX_EQ_U32 : VOPCX_Real_gfx11_gfx12<0x0ca>;
+defm V_CMPX_LE_U32 : VOPCX_Real_gfx11_gfx12<0x0cb>;
+defm V_CMPX_GT_U32 : VOPCX_Real_gfx11_gfx12<0x0cc>;
+defm V_CMPX_NE_U32 : VOPCX_Real_gfx11_gfx12<0x0cd>;
+defm V_CMPX_GE_U32 : VOPCX_Real_gfx11_gfx12<0x0ce>;
defm V_CMPX_T_U32 : VOPCX_Real_gfx11<0x0cf>;
defm V_CMPX_F_I64 : VOPCX_Real_gfx11<0x0d0>;
-defm V_CMPX_LT_I64 : VOPCX_Real_gfx11<0x0d1>;
-defm V_CMPX_EQ_I64 : VOPCX_Real_gfx11<0x0d2>;
-defm V_CMPX_LE_I64 : VOPCX_Real_gfx11<0x0d3>;
-defm V_CMPX_GT_I64 : VOPCX_Real_gfx11<0x0d4>;
-defm V_CMPX_NE_I64 : VOPCX_Real_gfx11<0x0d5>;
-defm V_CMPX_GE_I64 : VOPCX_Real_gfx11<0x0d6>;
+defm V_CMPX_LT_I64 : VOPCX_Real_gfx11_gfx12<0x0d1>;
+defm V_CMPX_EQ_I64 : VOPCX_Real_gfx11_gfx12<0x0d2>;
+defm V_CMPX_LE_I64 : VOPCX_Real_gfx11_gfx12<0x0d3>;
+defm V_CMPX_GT_I64 : VOPCX_Real_gfx11_gfx12<0x0d4>;
+defm V_CMPX_NE_I64 : VOPCX_Real_gfx11_gfx12<0x0d5>;
+defm V_CMPX_GE_I64 : VOPCX_Real_gfx11_gfx12<0x0d6>;
defm V_CMPX_T_I64 : VOPCX_Real_gfx11<0x0d7>;
defm V_CMPX_F_U64 : VOPCX_Real_gfx11<0x0d8>;
-defm V_CMPX_LT_U64 : VOPCX_Real_gfx11<0x0d9>;
-defm V_CMPX_EQ_U64 : VOPCX_Real_gfx11<0x0da>;
-defm V_CMPX_LE_U64 : VOPCX_Real_gfx11<0x0db>;
-defm V_CMPX_GT_U64 : VOPCX_Real_gfx11<0x0dc>;
-defm V_CMPX_NE_U64 : VOPCX_Real_gfx11<0x0dd>;
-defm V_CMPX_GE_U64 : VOPCX_Real_gfx11<0x0de>;
+defm V_CMPX_LT_U64 : VOPCX_Real_gfx11_gfx12<0x0d9>;
+defm V_CMPX_EQ_U64 : VOPCX_Real_gfx11_gfx12<0x0da>;
+defm V_CMPX_LE_U64 : VOPCX_Real_gfx11_gfx12<0x0db>;
+defm V_CMPX_GT_U64 : VOPCX_Real_gfx11_gfx12<0x0dc>;
+defm V_CMPX_NE_U64 : VOPCX_Real_gfx11_gfx12<0x0dd>;
+defm V_CMPX_GE_U64 : VOPCX_Real_gfx11_gfx12<0x0de>;
defm V_CMPX_T_U64 : VOPCX_Real_gfx11<0x0df>;
-defm V_CMPX_CLASS_F16_t16 : VOPCX_Real_t16_gfx11<0x0fd, "v_cmpx_class_f16">;
-defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11<0x0fe>;
-defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11<0x0ff>;
+defm V_CMPX_CLASS_F16_t16 : VOPCX_Real_t16_gfx11_gfx12<0x0fd, "v_cmpx_class_f16">;
+defm V_CMPX_CLASS_F32 : VOPCX_Real_gfx11_gfx12<0x0fe>;
+defm V_CMPX_CLASS_F64 : VOPCX_Real_gfx11_gfx12<0x0ff>;
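// Note the split in the defm lists above: compares that survive on both
// generations go through the *_gfx11_gfx12 wrappers, while the always-false /
// always-true forms (V_CMP_F_*, V_CMP_T_*, V_CMPX_F_*, V_CMPX_T_* and the
// *_TRU_* renames) stay on the GFX11-only multiclasses, presumably because
// those opcodes are not encodable on GFX12.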
//===----------------------------------------------------------------------===//
// GFX10.
@@ -1968,10 +2016,13 @@ multiclass VOPCX_Real_gfx6_gfx7_gfx10 <bits<9> op> :
VOPC_Real_gfx6_gfx7<op>, VOPCX_Real_gfx10<op>;
multiclass VOPC_Real_gfx6_gfx7_gfx10_gfx11<bits<9> op> :
- VOPC_Real_gfx6_gfx7_gfx10<op>, VOPC_Real_gfx11<op>;
+ VOPC_Real_gfx6_gfx7_gfx10<op>, VOPC_Real_Base<GFX11Gen, op>;
multiclass VOPCX_Real_gfx6_gfx7_gfx10_gfx11<bits<9> op> :
- VOPCX_Real_gfx6_gfx7_gfx10<op>, VOPCX_Real_gfx11<op>;
+ VOPCX_Real_gfx6_gfx7_gfx10<op>, VOPCX_Real<GFX11Gen, op>;
+
+multiclass VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<9> op> :
+ VOPC_Real_gfx6_gfx7_gfx10_gfx11<op>, VOPC_Real_Base<GFX12Gen, op>;
defm V_CMP_F_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x000>;
defm V_CMP_LT_F32 : VOPC_Real_gfx6_gfx7_gfx10<0x001>;
@@ -2006,20 +2057,20 @@ defm V_CMPX_NEQ_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01d>;
defm V_CMPX_NLT_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01e>;
defm V_CMPX_TRU_F32 : VOPCX_Real_gfx6_gfx7_gfx10<0x01f>;
defm V_CMP_F_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x020>;
-defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x021>;
-defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x022>;
-defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x023>;
-defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x024>;
-defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x025>;
-defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x026>;
-defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x027>;
-defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x028>;
-defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x029>;
-defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02a>;
-defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02b>;
-defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02c>;
-defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02d>;
-defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11<0x02e>;
+defm V_CMP_LT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x021>;
+defm V_CMP_EQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x022>;
+defm V_CMP_LE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x023>;
+defm V_CMP_GT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x024>;
+defm V_CMP_LG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x025>;
+defm V_CMP_GE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x026>;
+defm V_CMP_O_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x027>;
+defm V_CMP_U_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x028>;
+defm V_CMP_NGE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x029>;
+defm V_CMP_NLG_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02a>;
+defm V_CMP_NGT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02b>;
+defm V_CMP_NLE_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02c>;
+defm V_CMP_NEQ_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02d>;
+defm V_CMP_NLT_F64 : VOPC_Real_gfx6_gfx7_gfx10_gfx11_gfx12<0x02e>;
defm V_CMP_TRU_F64 : VOPC_Real_gfx6_gfx7_gfx10<0x02f>;
defm V_CMPX_F_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x030>;
defm V_CMPX_LT_F64 : VOPCX_Real_gfx6_gfx7_gfx10<0x031>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPDInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPDInstructions.td
index eb2e9f04022e..c6af3d67c560 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPDInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPDInstructions.td
@@ -54,23 +54,34 @@ class VOPD_MADKe<bits<4> opX, bits<5> opY> : Enc96 {
// VOPD classes
//===----------------------------------------------------------------------===//
+
+class GFXGenD<GFXGen Gen, list<string> DXPseudos, list<string> DYPseudos,
+ Predicate subtargetPred = Gen.AssemblerPredicate> :
+ GFXGen<Gen.AssemblerPredicate, Gen.DecoderNamespace, Gen.Suffix,
+ Gen.Subtarget> {
+ list<string> VOPDXPseudos = DXPseudos;
+ list<string> VOPDYPseudos = DYPseudos;
+ Predicate SubtargetPredicate = subtargetPred;
+}
+
class VOPD_Base<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY,
- VOPD_Component XasVC, VOPD_Component YasVC>
+ VOPD_Component XasVC, VOPD_Component YasVC, GFXGenD Gen>
: VOPAnyCommon<outs, ins, asm, []>,
VOP<NAME>,
- SIMCInstr<NAME, SIEncodingFamily.GFX11> {
+ SIMCInstr<NAME, Gen.Subtarget> {
// Fields for table indexing
Instruction Opcode = !cast<Instruction>(NAME);
bits<5> OpX = XasVC.VOPDOp;
bits<5> OpY = YasVC.VOPDOp;
+ bits<4> SubTgt = Gen.Subtarget;
let VALU = 1;
- let DecoderNamespace = "GFX11";
- let AssemblerPredicate = isGFX11Plus;
+ let DecoderNamespace = Gen.DecoderNamespace;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
let WaveSizePredicate = isWave32;
let isCodeGenOnly = 0;
- let SubtargetPredicate = isGFX11Plus;
+ let SubtargetPredicate = Gen.SubtargetPredicate;
let AsmMatchConverter = "cvtVOPD";
let Size = 8;
let ReadsModeReg = !or(VDX.ReadsModeReg, VDY.ReadsModeReg);
@@ -97,77 +108,103 @@ class VOPD_Base<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY,
}
class VOPD<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY,
- VOPD_Component XasVC, VOPD_Component YasVC>
- : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC>,
+ VOPD_Component XasVC, VOPD_Component YasVC, GFXGenD Gen>
+ : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC, Gen>,
VOPDe<XasVC.VOPDOp{3-0}, YasVC.VOPDOp> {
let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X);
let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y);
}
class VOPD_MADK<dag outs, dag ins, string asm, VOP_Pseudo VDX, VOP_Pseudo VDY,
- VOPD_Component XasVC, VOPD_Component YasVC>
- : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC>,
+ VOPD_Component XasVC, VOPD_Component YasVC, GFXGenD Gen>
+ : VOPD_Base<outs, ins, asm, VDX, VDY, XasVC, YasVC, Gen>,
VOPD_MADKe<XasVC.VOPDOp{3-0}, YasVC.VOPDOp> {
let Inst{16-9} = !if (!eq(VDX.Mnemonic, "v_mov_b32"), 0x0, vsrc1X);
let Inst{48-41} = !if (!eq(VDY.Mnemonic, "v_mov_b32"), 0x0, vsrc1Y);
let Size = 12;
+ let FixedSize = 1;
}
// V_DUAL_DOT2ACC_F32_BF16 is a legal instruction, but V_DOT2ACC_F32_BF16 is
-// not. Since we generate the DUAL form by converting from the normal form we
-// will never generate it.
-defvar VOPDYPseudos = [
+// not. V_DUAL_DOT2C_F32_BF16 is a legal instruction on GFX12, but
+// V_DOT2C_F32_F16_e32 is not. Since we generate the DUAL form by converting
+// from the normal form we will never generate them.
+defvar VOPDPseudosCommon = [
"V_FMAC_F32_e32", "V_FMAAK_F32", "V_FMAMK_F32", "V_MUL_F32_e32",
"V_ADD_F32_e32", "V_SUB_F32_e32", "V_SUBREV_F32_e32", "V_MUL_LEGACY_F32_e32",
- "V_MOV_B32_e32", "V_CNDMASK_B32_e32", "V_MAX_F32_e32", "V_MIN_F32_e32",
- "V_DOT2C_F32_F16_e32", "V_ADD_U32_e32", "V_LSHLREV_B32_e32", "V_AND_B32_e32"
+ "V_MOV_B32_e32", "V_CNDMASK_B32_e32", "V_MAX_F32_e32", "V_MIN_F32_e32"
];
-defvar VOPDXPseudos = VOPDYPseudos[0...VOPDX_Max_Index];
+defvar VOPDPseudosGFX11 = ["V_DOT2C_F32_F16_e32"];
+defvar VOPDYOnlyPseudosCommon = ["V_ADD_U32_e32", "V_LSHLREV_B32_e32",
+ "V_AND_B32_e32"];
+
+defvar VOPDXPseudosGFX11 = !listconcat(VOPDPseudosCommon, VOPDPseudosGFX11);
+defvar VOPDXPseudosGFX12 = VOPDPseudosCommon;
+defvar VOPDYPseudosGFX11 = !listconcat(VOPDXPseudosGFX11, VOPDYOnlyPseudosCommon);
+defvar VOPDYPseudosGFX12 = !listconcat(VOPDXPseudosGFX12, VOPDYOnlyPseudosCommon);
+
+def GFX11GenD : GFXGenD<GFX11Gen, VOPDXPseudosGFX11, VOPDYPseudosGFX11>;
+def GFX12GenD : GFXGenD<GFX12Gen, VOPDXPseudosGFX12, VOPDYPseudosGFX12>;
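// Reading the lists above: V_DOT2C_F32_F16_e32 remains a legal VOPD component
// on GFX11 only, while V_ADD_U32_e32, V_LSHLREV_B32_e32 and V_AND_B32_e32 stay
// Y-slot-only on both generations; each Y list is simply the matching X list
// plus those three Y-only pseudos.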
+
def VOPDDstYOperand : RegisterOperand<VGPR_32, "printRegularOperand"> {
let DecoderMethod = "decodeOperandVOPDDstY";
}
-foreach x = VOPDXPseudos in {
- foreach y = VOPDYPseudos in {
- defvar xInst = !cast<VOP_Pseudo>(x);
- defvar yInst = !cast<VOP_Pseudo>(y);
- defvar XasVC = !cast<VOPD_Component>(x);
- defvar YasVC = !cast<VOPD_Component>(y);
- defvar isMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"),
- !eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32"));
- // If X or Y is MADK (have a mandatory immediate), all src operands which
- // may contain an optional literal must use the VSrc_*_Deferred operand
- // type. Optional literal operands in MADK VOPD components always use this
- // operand form. If Both X and Y are MADK, the mandatory literal of X
- // additionally must use an alternate operand format which defers to the
- // 'real' Y literal
- defvar isOpXMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"));
- defvar isOpYMADK = !or(!eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32"));
- defvar OpName = "V_DUAL_" # !substr(x,2) # "_X_" # !substr(y,2);
- defvar outs = (outs VGPRSrc_32:$vdstX, VOPDDstYOperand:$vdstY);
- if !or(isOpXMADK, isOpYMADK) then {
- if !and(isOpXMADK, isOpYMADK) then {
- defvar X_MADK_Pfl = !cast<VOP_MADK_Base>(xInst.Pfl);
- defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY);
- defvar asm = XasVC.VOPDName #" "# X_MADK_Pfl.AsmVOPDXDeferred #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY;
- def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
- } else {
- defvar asm = XasVC.VOPDName #" "# xInst.Pfl.AsmVOPDX #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY;
- if isOpXMADK then {
- assert !not(isOpYMADK), "Expected only OpX as MADK";
- defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDYDeferred);
- def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
- } else {
- assert !not(isOpXMADK), "Expected only OpY as MADK";
+class getRenamed<string VOPDName, GFXGen Gen> {
+ string ret = !if(!eq(Gen.Subtarget, GFX12Gen.Subtarget),
+ !if(!eq(VOPDName, "v_dual_max_f32"),
+ "v_dual_max_num_f32",
+ !if(!eq(VOPDName, "v_dual_min_f32"),
+ "v_dual_min_num_f32",
+ VOPDName)),
+ VOPDName);
+}
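// For example (values read straight off the class body above):
//   getRenamed<"v_dual_max_f32", GFX12Gen>.ret == "v_dual_max_num_f32"
//   getRenamed<"v_dual_max_f32", GFX11Gen>.ret == "v_dual_max_f32"
// i.e. only the GFX12 dual components pick up the renamed *_num_f32 mnemonics;
// every other VOPD component name passes through unchanged.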
+
+foreach Gen = [GFX11GenD, GFX12GenD] in {
+ foreach x = Gen.VOPDXPseudos in {
+ foreach y = Gen.VOPDYPseudos in {
+ defvar xInst = !cast<VOP_Pseudo>(x);
+ defvar yInst = !cast<VOP_Pseudo>(y);
+ defvar XasVC = !cast<VOPD_Component>(x);
+ defvar YasVC = !cast<VOPD_Component>(y);
+ defvar xAsmName = getRenamed<XasVC.VOPDName, Gen>.ret;
+ defvar yAsmName = getRenamed<YasVC.VOPDName, Gen>.ret;
+ defvar isMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"),
+ !eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32"));
+ // If X or Y is MADK (have a mandatory immediate), all src operands which
+ // may contain an optional literal must use the VSrc_*_Deferred operand
+ // type. Optional literal operands in MADK VOPD components always use this
+ // operand form. If Both X and Y are MADK, the mandatory literal of X
+ // additionally must use an alternate operand format which defers to the
+ // 'real' Y literal
+ defvar isOpXMADK = !or(!eq(x, "V_FMAAK_F32"), !eq(x, "V_FMAMK_F32"));
+ defvar isOpYMADK = !or(!eq(y, "V_FMAAK_F32"), !eq(y, "V_FMAMK_F32"));
+ defvar OpName = "V_DUAL_" # !substr(x,2) # "_X_" # !substr(y,2) # Gen.Suffix;
+ defvar outs = (outs VGPRSrc_32:$vdstX, VOPDDstYOperand:$vdstY);
+ if !or(isOpXMADK, isOpYMADK) then {
+ if !and(isOpXMADK, isOpYMADK) then {
+ defvar X_MADK_Pfl = !cast<VOP_MADK_Base>(xInst.Pfl);
defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY);
- def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
+ defvar asm = xAsmName #" "# X_MADK_Pfl.AsmVOPDXDeferred #" :: "# yAsmName #" "# yInst.Pfl.AsmVOPDY;
+ def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC, Gen>;
+ } else {
+ defvar asm = xAsmName #" "# xInst.Pfl.AsmVOPDX #" :: "# yAsmName #" "# yInst.Pfl.AsmVOPDY;
+ if isOpXMADK then {
+ assert !not(isOpYMADK), "Expected only OpX as MADK";
+ defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDYDeferred);
+ def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC, Gen>;
+ } else {
+ assert !not(isOpXMADK), "Expected only OpY as MADK";
+ defvar ins = !con(xInst.Pfl.InsVOPDXDeferred, yInst.Pfl.InsVOPDY);
+ def OpName : VOPD_MADK<outs, ins, asm, xInst, yInst, XasVC, YasVC, Gen>;
+ }
}
+ } else {
+ defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDY);
+ defvar asm = xAsmName #" "# xInst.Pfl.AsmVOPDX #" :: "# yAsmName #" "# yInst.Pfl.AsmVOPDY;
+ def OpName : VOPD<outs, ins, asm, xInst, yInst, XasVC, YasVC, Gen>;
}
- } else {
- defvar ins = !con(xInst.Pfl.InsVOPDX, yInst.Pfl.InsVOPDY);
- defvar asm = XasVC.VOPDName #" "# xInst.Pfl.AsmVOPDX #" :: "# YasVC.VOPDName #" "# yInst.Pfl.AsmVOPDY;
- def OpName : VOPD<outs, ins, asm, xInst, yInst, XasVC, YasVC>;
}
}
}
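// Summary of the MADK plumbing in the loop above (restating its branches, not
// adding behaviour): if only X is FMAAK/FMAMK, Y's optional-literal sources use
// the deferred forms (InsVOPDYDeferred); if only Y is, X's do (InsVOPDXDeferred);
// and if both are, X additionally switches to AsmVOPDXDeferred so its mandatory
// literal defers to Y's, as the comment inside the loop describes.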
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 3755daf4f9b1..fd4626d902ac 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -29,6 +29,22 @@ class LetDummies {
string DecoderNamespace;
}
+//===----------------------------------------------------------------------===//
+// VOP Subtarget info
+//===----------------------------------------------------------------------===//
+
+class GFXGen<Predicate pred, string dn, string suffix, int sub> {
+ Predicate AssemblerPredicate = pred;
+ string DecoderNamespace = dn;
+ string Suffix = suffix;
+ int Subtarget = sub;
+}
+
+def GFX12Gen : GFXGen<isGFX12Only, "GFX12", "_gfx12", SIEncodingFamily.GFX12>;
+def GFX11Gen : GFXGen<isGFX11Only, "GFX11", "_gfx11", SIEncodingFamily.GFX11>;
+
+//===----------------------------------------------------------------------===//
+
class VOP <string opName> {
string OpName = opName;
}
@@ -190,6 +206,14 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
VOPProfile Pfl = ps.Pfl;
}
+class VOP3_Real_Gen <VOP_Pseudo ps, GFXGen Gen, string asm_name = ps.Mnemonic> :
+ VOP3_Real <ps, Gen.Subtarget, asm_name> {
+ let AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts,
+ Gen.AssemblerPredicate);
+ let DecoderNamespace = Gen.DecoderNamespace#
+ !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
+}
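// Sketch of the !if()s above: a profile with IsRealTrue16 set gates on
// UseRealTrue16Insts and decodes in the plain Gen.DecoderNamespace, while a
// non-true16 profile keeps Gen.AssemblerPredicate and gets the "_FAKE16"
// decoder-namespace suffix.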
+
// XXX - Is there any reason to distinguish this from regular VOP3
// here?
class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemonic> :
@@ -199,6 +223,12 @@ class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily, string asm_name = ps.Mnemoni
let Constraints = !if(!eq(!substr(ps.Mnemonic,0,6), "v_wmma"), "", ps.Constraints);
}
+class VOP3P_Real_Gen<VOP_Pseudo ps, GFXGen Gen, string asm_name = ps.Mnemonic> :
+ VOP3P_Real<ps, Gen.Subtarget, asm_name> {
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ let DecoderNamespace = Gen.DecoderNamespace;
+}
+
class VOP3a<VOPProfile P> : Enc64 {
bits<4> src0_modifiers;
bits<9> src0;
@@ -234,7 +264,7 @@ class VOP3a_gfx10<bits<10> op, VOPProfile p> : VOP3a<p> {
let Inst{31-26} = 0x35;
}
-class VOP3a_gfx11<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p>;
+class VOP3a_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p>;
class VOP3a_vi <bits<10> op, VOPProfile P> : VOP3a<P> {
let Inst{25-16} = op;
@@ -251,7 +281,7 @@ class VOP3e_gfx10<bits<10> op, VOPProfile p> : VOP3a_gfx10<op, p> {
let Inst{7-0} = !if(p.EmitDst, vdst{7-0}, 0);
}
-class VOP3e_gfx11<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p>;
+class VOP3e_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p>;
class VOP3e_vi <bits<10> op, VOPProfile P> : VOP3a_vi <op, P> {
bits<8> vdst;
@@ -272,9 +302,9 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
let Inst{14} = !if(p.HasDst, src0_modifiers{3}, 0);
}
-class VOP3OpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
+class VOP3OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;
-class VOP3DotOpSel_gfx11<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11<op, p>{
+class VOP3DotOpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx11_gfx12<op, p>{
let Inst{11} = ?;
let Inst{12} = ?;
}
@@ -435,7 +465,7 @@ class VOP3Pe_gfx10 <bits<7> op, VOPProfile P> : VOP3Pe<op, P> {
let Inst{31-23} = 0x198; //encoding
}
-class VOP3Pe_gfx11<bits<7> op, VOPProfile P> : VOP3Pe_gfx10<op, P>;
+class VOP3Pe_gfx11_gfx12<bits<7> op, VOPProfile P> : VOP3Pe_gfx10<op, P>;
class VOP3be_gfx6_gfx7<bits<9> op, VOPProfile p> : VOP3be<p> {
let Inst{25-17} = op;
@@ -448,7 +478,7 @@ class VOP3be_gfx10<bits<10> op, VOPProfile p> : VOP3be<p> {
let Inst{31-26} = 0x35;
}
-class VOP3be_gfx11<bits<10> op, VOPProfile p> : VOP3be_gfx10<op, p>;
+class VOP3be_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3be_gfx10<op, p>;
class VOP3be_vi <bits<10> op, VOPProfile P> : VOP3be<P> {
bits<1> clamp;
@@ -791,8 +821,8 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
string AsmOperands = asmOps;
let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
- let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
- let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
+ let SubtargetPredicate = !if(P.HasExt64BitDPP, HasDPALU_DPP, HasDPP);
+ let AssemblerPredicate = !if(P.HasExt64BitDPP, HasDPALU_DPP, HasDPP);
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -862,8 +892,8 @@ class VOP_DPP_Base <string OpName, VOPProfile P,
let Size = 8;
let AsmMatchConverter = !if(P.HasModifiers, "cvtDPP", "");
- let SubtargetPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
- let AssemblerPredicate = !if(P.HasExt64BitDPP, Has64BitDPP, HasDPP);
+ let SubtargetPredicate = !if(P.HasExt64BitDPP, HasDPALU_DPP, HasDPP);
+ let AssemblerPredicate = !if(P.HasExt64BitDPP, HasDPALU_DPP, HasDPP);
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -1273,6 +1303,19 @@ multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_f
} // end SubtargetPredicate = isGFX11Plus
}
+class UniformUnaryFragOrOp<SDPatternOperator Op> {
+ SDPatternOperator ret = !if(!or(!isa<SDNode>(Op), !isa<PatFrags>(Op)),
+ UniformUnaryFrag<Op>, Op);
+}
+
+multiclass VOP3PseudoScalarInst<string OpName, VOPProfile P,
+ SDPatternOperator node = null_frag> {
+ def _e64 : VOP3_Pseudo<OpName, P, [(set P.DstVT:$vdst,
+ (UniformUnaryFragOrOp<node>.ret
+ (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp,
+ i32:$omod))))]>;
+}
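// UniformUnaryFrag is assumed here to be the existing AMDGPU PatFrag that only
// matches non-divergent nodes, so a hypothetical user such as
//   defm V_S_EXP_F32 : VOP3PseudoScalarInst<"v_s_exp_f32", SomeScalarF32Profile, fexp2>;
// would get an _e64 pseudo whose pattern fires only for uniform sources, with
// src0 modifiers, clamp and omod folded in through VOP3Mods0 (profile and
// instruction names here are illustrative).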
+
//===----------------------------------------------------------------------===//
// VOP3 DPP
//===----------------------------------------------------------------------===//
@@ -1294,6 +1337,15 @@ class VOP3_DPP16<bits<10> op, VOP_DPP_Pseudo ps, int subtarget,
string opName = ps.OpName>
: Base_VOP3_DPP16<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>;
+class VOP3_DPP16_Gen<bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
+ string opName = ps.OpName> :
+ VOP3_DPP16 <op, ps, Gen.Subtarget, opName> {
+ let AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts,
+ Gen.AssemblerPredicate);
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace#
+ !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
+}
+
class Base_VOP3_DPP8<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
: VOP3_DPP8<op, opName, ps.Pfl> {
let VOP3_OPSEL = ps.Pfl.HasOpSel;
@@ -1320,164 +1372,240 @@ class VOP3b_DPP8_Base<bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
}
//===----------------------------------------------------------------------===//
-// VOP3 GFX11
+// VOP3 GFX11, GFX12
//===----------------------------------------------------------------------===//
-let AssemblerPredicate = isGFX11Only,
- DecoderNamespace = "GFX11" in {
- multiclass VOP3_Real_Base_gfx11<bits<10> op, string opName = NAME,
- bit isSingle = 0> {
- defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
- let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
- if ps.Pfl.HasOpSel then
- def _e64_gfx11 :
- VOP3_Real<ps, SIEncodingFamily.GFX11>,
- VOP3OpSel_gfx11<op, ps.Pfl>;
- if !not(ps.Pfl.HasOpSel) then
- def _e64_gfx11 :
- VOP3_Real<ps, SIEncodingFamily.GFX11>,
- VOP3e_gfx11<op, ps.Pfl>;
- }
- }
- multiclass VOP3Dot_Real_Base_gfx11<bits<10> op, string opName = NAME,
- bit isSingle = 0> {
- defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
- let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
- def _e64_gfx11 :
- VOP3_Real<ps, SIEncodingFamily.GFX11>,
- VOP3DotOpSel_gfx11<op, ps.Pfl>;
- }
- }
- multiclass VOP3_Real_with_name_gfx11<bits<10> op, string opName,
- string asmName, bit isSingle = 0> {
- defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
- let AsmString = asmName # ps.AsmOperands,
- IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
- if ps.Pfl.HasOpSel then
- def _e64_gfx11 :
- VOP3_Real<ps, SIEncodingFamily.GFX11>,
- VOP3OpSel_gfx11<op, ps.Pfl>;
- if !not(ps.Pfl.HasOpSel) then
- def _e64_gfx11 :
- VOP3_Real<ps, SIEncodingFamily.GFX11>,
- VOP3e_gfx11<op, ps.Pfl>;
- }
- def _gfx11_VOP3_alias : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX11Plus]>, LetDummies;
- }
- // for READLANE/WRITELANE
- multiclass VOP3_Real_No_Suffix_gfx11<bits<10> op, string opName = NAME> {
- defvar ps = !cast<VOP_Pseudo>(opName);
- def _e64_gfx11 :
- VOP3_Real<ps, SIEncodingFamily.GFX11>,
- VOP3e_gfx11<op, ps.Pfl>;
- }
- multiclass VOP3_Real_dpp_Base_gfx11<bits<10> op, string opName = NAME> {
- def _e64_dpp_gfx11 : VOP3_DPP16<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> {
- let DecoderNamespace = "DPPGFX11";
- }
+multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
+ bit isSingle = 0> {
+ defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
+ let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+ if ps.Pfl.HasOpSel then
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
+ if !not(ps.Pfl.HasOpSel) then
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3e_gfx11_gfx12<op, ps.Pfl>;
}
+}
- multiclass VOP3Dot_Real_dpp_Base_gfx11<bits<10> op, string opName = NAME> {
- def _e64_dpp_gfx11 : VOP3_DPP16<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), SIEncodingFamily.GFX11> {
- let Inst{11} = ?;
- let Inst{12} = ?;
- let DecoderNamespace = "DPPGFX11";
- }
+multiclass VOP3Dot_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
+ bit isSingle = 0> {
+ defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
+ let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3DotOpSel_gfx11_gfx12<op, ps.Pfl>;
}
+}
- multiclass VOP3_Real_dpp_with_name_gfx11<bits<10> op, string opName,
- string asmName> {
- defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
- let AsmString = asmName # ps.Pfl.AsmVOP3DPP16, DecoderNamespace = "DPPGFX11" in {
- defm NAME : VOP3_Real_dpp_Base_gfx11<op, opName>;
- }
- }
- multiclass VOP3_Real_dpp8_Base_gfx11<bits<10> op, string opName = NAME> {
- defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
- def _e64_dpp8_gfx11 : Base_VOP3_DPP8<op, ps> {
- let DecoderNamespace = "DPP8GFX11";
- }
+multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
+ string asmName, bit isSingle = 0> {
+ defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
+ let AsmString = asmName # ps.AsmOperands,
+ IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+ if ps.Pfl.HasOpSel then
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3OpSel_gfx11_gfx12<op, ps.Pfl>;
+ if !not(ps.Pfl.HasOpSel) then
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3e_gfx11_gfx12<op, ps.Pfl>;
}
+ def Gen.Suffix#"_VOP3_alias" : MnemonicAlias<ps.Mnemonic, asmName>, Requires<[Gen.AssemblerPredicate]>, LetDummies;
+}
+
+// for READLANE/WRITELANE
+multiclass VOP3_Real_No_Suffix<GFXGen Gen, bits<10> op, string opName = NAME> {
+ defvar ps = !cast<VOP_Pseudo>(opName);
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen>,
+ VOP3e_gfx11_gfx12<op, ps.Pfl>;
+}
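// A minimal usage sketch for the multiclass above (hypothetical opcode value;
// the actual realizations live with the rest of the opcode tables):
//   defm V_READLANE_B32 : VOP3_Real_No_Suffix<GFX11Gen, 0x360>;
// This casts the pseudo by its bare name, since the READLANE/WRITELANE
// pseudos are not named with the usual "_e64" suffix.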
+
+multiclass VOP3_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
+ def _e64_dpp#Gen.Suffix :
+ VOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), Gen>;
+}
- multiclass VOP3Dot_Real_dpp8_Base_gfx11<bits<10> op, string opName = NAME> {
- defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
- def _e64_dpp8_gfx11 : Base_VOP3_DPP8<op, ps> {
+multiclass VOP3Dot_Real_dpp_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
+ def _e64_dpp#Gen.Suffix :
+ VOP3_DPP16_Gen<op, !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp"), Gen> {
let Inst{11} = ?;
let Inst{12} = ?;
- let DecoderNamespace = "DPP8GFX11";
}
+}
+
+multiclass VOP3_Real_dpp_with_name<GFXGen Gen, bits<10> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # ps.Pfl.AsmVOP3DPP16 in {
+ defm NAME : VOP3_Real_dpp_Base<Gen, op, opName>;
}
+}
- multiclass VOP3_Real_dpp8_with_name_gfx11<bits<10> op, string opName,
- string asmName> {
- defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
- let AsmString = asmName # ps.Pfl.AsmVOP3DPP8, DecoderNamespace = "DPP8GFX11" in {
- defm NAME : VOP3_Real_dpp8_Base_gfx11<op, opName>;
- }
+multiclass VOP3_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> {
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
}
- multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName,
- bit isSingle = 0> {
- defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
- let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in
- def _e64_gfx11 :
- VOP3_Real<ps, SIEncodingFamily.GFX11, asmName>,
- VOP3be_gfx11<op, ps.Pfl> ;
+}
+
+multiclass VOP3Dot_Real_dpp8_Base<GFXGen Gen, bits<10> op, string opName = NAME> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ def _e64_dpp8#Gen.Suffix : Base_VOP3_DPP8<op, ps> {
+ let Inst{11} = ?;
+ let Inst{12} = ?;
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
}
- multiclass VOP3be_Real_dpp_gfx11<bits<10> op, string opName, string asmName> {
- defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
- defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp");
- def _e64_dpp_gfx11 : Base_VOP3b_DPP16<op, dpp_ps, asmName>,
- SIMCInstr<dpp_ps.PseudoInstr, SIEncodingFamily.GFX11> {
- let DecoderNamespace = "DPPGFX11";
- }
+}
+
+multiclass VOP3_Real_dpp8_with_name<GFXGen Gen, bits<10> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # ps.Pfl.AsmVOP3DPP8,
+ DecoderNamespace = "DPP8"#Gen.DecoderNamespace#
+ !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"),
+ AssemblerPredicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts,
+ Gen.AssemblerPredicate) in {
+
+ defm NAME : VOP3_Real_dpp8_Base<Gen, op, opName>;
}
- multiclass VOP3be_Real_dpp8_gfx11<bits<10> op, string opName, string asmName> {
- defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
- def _e64_dpp8_gfx11 : VOP3b_DPP8_Base<op, ps, asmName> {
- let DecoderNamespace = "DPP8GFX11";
- }
+}
+
+multiclass VOP3be_Real<GFXGen Gen, bits<10> op, string opName, string asmName,
+ bit isSingle = 0> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in
+ def _e64#Gen.Suffix :
+ VOP3_Real_Gen<ps, Gen, asmName>,
+ VOP3be_gfx11_gfx12<op, ps.Pfl> ;
+}
+
+multiclass VOP3be_Real_dpp<GFXGen Gen, bits<10> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
+ defvar dpp_ps = !cast<VOP_DPP_Pseudo>(opName #"_e64" #"_dpp");
+ def _e64_dpp#Gen.Suffix : Base_VOP3b_DPP16<op, dpp_ps, asmName>,
+ SIMCInstr<dpp_ps.PseudoInstr, Gen.Subtarget> {
+ let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
+ }
+}
+
+multiclass VOP3be_Real_dpp8<GFXGen Gen, bits<10> op, string opName,
+ string asmName> {
+ defvar ps = !cast<VOP3_Pseudo>(opName #"_e64");
+ def _e64_dpp8#Gen.Suffix : VOP3b_DPP8_Base<op, ps, asmName> {
+ let DecoderNamespace = "DPP8"#Gen.DecoderNamespace;
+ let AssemblerPredicate = Gen.AssemblerPredicate;
}
-} // End AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11"
+}
// VOP1 and VOP2 depend on these triple defs
-multiclass VOP3_Realtriple_gfx11<bits<10> op,
- bit isSingle = 0, string opName = NAME> :
- VOP3_Real_Base_gfx11<op, opName, isSingle>,
- VOP3_Real_dpp_Base_gfx11<op, opName>,
- VOP3_Real_dpp8_Base_gfx11<op, opName>;
-
-multiclass VOP3Dot_Realtriple_gfx11<bits<10> op,
- bit isSingle = 0, string opName = NAME> :
- VOP3Dot_Real_Base_gfx11<op, opName, isSingle>,
- VOP3Dot_Real_dpp_Base_gfx11<op, opName>,
- VOP3Dot_Real_dpp8_Base_gfx11<op, opName>;
-
-multiclass VOP3Only_Realtriple_gfx11<bits<10> op> :
- VOP3_Realtriple_gfx11<op, 1>;
-
-multiclass VOP3_Realtriple_with_name_gfx11<bits<10> op, string opName,
- string asmName, bit isSingle = 0> :
- VOP3_Real_with_name_gfx11<op, opName, asmName, isSingle>,
- VOP3_Real_dpp_with_name_gfx11<op, opName, asmName>,
- VOP3_Real_dpp8_with_name_gfx11<op, opName, asmName>;
+multiclass VOP3_Realtriple<GFXGen Gen, bits<10> op, bit isSingle = 0,
+ string opName = NAME> :
+ VOP3_Real_Base<Gen, op, opName, isSingle>,
+ VOP3_Real_dpp_Base<Gen, op, opName>,
+ VOP3_Real_dpp8_Base<Gen, op, opName>;
+
+multiclass VOP3Dot_Realtriple<GFXGen Gen, bits<10> op, bit isSingle = 0,
+ string opName = NAME> :
+ VOP3Dot_Real_Base<Gen, op, opName, isSingle>,
+ VOP3Dot_Real_dpp_Base<Gen, op, opName>,
+ VOP3Dot_Real_dpp8_Base<Gen, op, opName>;
+
+multiclass VOP3Only_Realtriple<GFXGen Gen, bits<10> op> :
+ VOP3_Realtriple<Gen, op, 1>;
+
+multiclass VOP3_Realtriple_with_name<GFXGen Gen, bits<10> op, string opName,
+ string asmName, bit isSingle = 0> :
+ VOP3_Real_with_name<Gen, op, opName, asmName, isSingle>,
+ VOP3_Real_dpp_with_name<Gen, op, opName, asmName>,
+ VOP3_Real_dpp8_with_name<Gen, op, opName, asmName>;
+
+multiclass VOP3Only_Realtriple_with_name<GFXGen Gen, bits<10> op, string opName,
+ string asmName> :
+ VOP3_Realtriple_with_name<Gen, op, opName, asmName, 1>;
+
+multiclass VOP3Only_Realtriple_t16<GFXGen Gen, bits<10> op, string asmName,
+ string opName = NAME>
+ : VOP3Only_Realtriple_with_name<Gen, op, opName, asmName>;
+
+multiclass VOP3be_Realtriple<
+ GFXGen Gen, bits<10> op, bit isSingle = 0, string opName = NAME,
+ string asmName = !cast<VOP_Pseudo>(opName#"_e64").Mnemonic> :
+ VOP3be_Real<Gen, op, opName, asmName, isSingle>,
+ VOP3be_Real_dpp<Gen, op, opName, asmName>,
+ VOP3be_Real_dpp8<Gen, op, opName, asmName>;
-multiclass VOP3Only_Realtriple_with_name_gfx11<bits<10> op, string opName,
- string asmName> :
- VOP3_Realtriple_with_name_gfx11<op, opName, asmName, 1>;
+multiclass VOP3beOnly_Realtriple<GFXGen Gen, bits<10> op> :
+ VOP3be_Realtriple<Gen, op, 1>;
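// A minimal usage sketch (hypothetical name and opcode): a single
//   defm V_FOO_F32 : VOP3_Realtriple<GFX12Gen, 0x123>;
// expands into all three encodings of the opcode for that generation, i.e.
// V_FOO_F32_e64_gfx12, V_FOO_F32_e64_dpp_gfx12 and V_FOO_F32_e64_dpp8_gfx12,
// assuming GFX12Gen.Suffix is "_gfx12" and the corresponding pseudos exist.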
+
+//===----------------------------------------------------------------------===//
+// VOP3 GFX11
+//===----------------------------------------------------------------------===//
+
+multiclass VOP3be_Real_gfx11<bits<10> op, string opName, string asmName,
+ bit isSingle = 0> :
+ VOP3be_Real<GFX11Gen, op, opName, asmName, isSingle>;
+
+multiclass VOP3_Real_Base_gfx11<bits<10> op, string opName = NAME,
+ bit isSingle = 0> :
+ VOP3_Real_Base<GFX11Gen, op, opName, isSingle>;
+
+multiclass VOP3_Realtriple_gfx11<bits<10> op, bit isSingle = 0,
+ string opName = NAME> :
+ VOP3_Realtriple<GFX11Gen, op, isSingle, opName>;
multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
string opName = NAME>
- : VOP3Only_Realtriple_with_name_gfx11<op, opName, asmName>;
+ : VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>;
-multiclass VOP3be_Realtriple_gfx11<
- bits<10> op, bit isSingle = 0, string opName = NAME,
- string asmName = !cast<VOP_Pseudo>(opName#"_e64").Mnemonic> :
- VOP3be_Real_gfx11<op, opName, asmName, isSingle>,
- VOP3be_Real_dpp_gfx11<op, opName, asmName>,
- VOP3be_Real_dpp8_gfx11<op, opName, asmName>;
+//===----------------------------------------------------------------------===//
+// VOP3 GFX12
+//===----------------------------------------------------------------------===//
+
+multiclass VOP3Only_Realtriple_gfx12<bits<10> op, bit isSingle = 0> :
+ VOP3_Realtriple<GFX12Gen, op, isSingle>;
+
+// IsSingle is normally captured from the VOPProfile for these instructions,
+// but the following alternative makes it explicit.
+multiclass VOP3Only_Real_Base_gfx12<bits<10> op> :
+ VOP3_Real_Base<GFX12Gen, op, NAME, 1/*IsSingle*/>;
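// A minimal usage sketch (hypothetical opcode): for an opcode with no VOP1 or
// VOP2 form,
//   defm V_FOO_U64 : VOP3Only_Real_Base_gfx12<0x2bd>;
// forces IsSingle = 1 on the realized _e64_gfx12 record regardless of the
// value carried by the pseudo's profile.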
-multiclass VOP3beOnly_Realtriple_gfx11<bits<10> op> :
- VOP3be_Realtriple_gfx11<op, 1>;
+multiclass VOP3Only_Realtriple_t16_gfx12<bits<10> op> :
+ VOP3Only_Realtriple<GFX12Gen, op>;
+
+multiclass VOP3be_Real_with_name_gfx12<bits<10> op, string opName,
+ string asmName, bit isSingle = 0> {
+ defvar ps = !cast<VOP3_Pseudo>(opName#"_e64");
+ let AsmString = asmName # ps.AsmOperands,
+ IsSingle = !or(isSingle, ps.Pfl.IsSingle) in
+ def _e64_gfx12 :
+ VOP3_Real_Gen<ps, GFX12Gen, asmName>,
+ VOP3be_gfx11_gfx12<op, ps.Pfl>,
+ MnemonicAlias<ps.Mnemonic, asmName>, Requires<[isGFX12Only]>;
+}
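// A minimal usage sketch (hypothetical names and opcode):
//   defm V_OLD_OP : VOP3be_Real_with_name_gfx12<0x2ff, "V_OLD_OP", "v_new_op">;
// emits V_OLD_OP_e64_gfx12 printed with the new mnemonic, while the attached
// MnemonicAlias, gated on isGFX12Only, still lets the assembler accept the
// old spelling on GFX12.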
+
+multiclass VOP3_Realtriple_with_name_gfx12<bits<10> op, string opName,
+ string asmName, bit isSingle = 0> :
+ VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, isSingle>;
+
+multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12<bits<10> op, string opName,
+ string asmName> :
+ VOP3Only_Realtriple_with_name<GFX11Gen, op, opName, asmName>,
+ VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
+
+multiclass VOP3Only_Realtriple_with_name_t16_gfx12<bits<10> op, string asmName,
+ string opName = NAME>
+ : VOP3Only_Realtriple_with_name<GFX12Gen, op, opName, asmName>;
+
+//===----------------------------------------------------------------------===//
include "VOPCInstructions.td"
include "VOP1Instructions.td"