aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2023-07-26 19:03:47 +0000
committerDimitry Andric <dim@FreeBSD.org>2023-07-26 19:04:23 +0000
commit7fa27ce4a07f19b07799a767fc29416f3b625afb (patch)
tree27825c83636c4de341eb09a74f49f5d38a15d165 /llvm/lib/Target/AMDGPU
parente3b557809604d036af6e00c60f012c2025b59a5e (diff)
Diffstat (limited to 'llvm/lib/Target/AMDGPU')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h200
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td489
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp31
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp231
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp347
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp343
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp34
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp1102
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCombine.td108
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp53
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp94
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp28
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp767
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp195
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h17
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp1105
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h60
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp28
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp369
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td41
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp359
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h25
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td77
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp28
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp1409
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h32
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp177
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp981
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp23
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp211
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h18
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp38
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp216
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp120
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp71
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp1103
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp426
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp222
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp77
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.h29
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp129
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp156
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp186
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp648
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp32
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp21
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td54
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp39
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h15
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp173
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp215
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h51
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp85
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h36
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp1399
-rw-r--r--llvm/lib/Target/AMDGPU/BUFInstructions.td208
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td49
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp835
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h91
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td151
-rw-r--r--llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp33
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp48
-rw-r--r--llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp139
-rw-r--r--llvm/lib/Target/AMDGPU/GCNProcessors.td16
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.cpp43
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp502
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp50
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSubtarget.h61
-rw-r--r--llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/LDSDIRInstructions.td2
-rw-r--r--llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp180
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h48
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp582
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h68
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h4
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp41
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h18
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp38
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp594
-rw-r--r--llvm/lib/Target/AMDGPU/MIMGInstructions.td167
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.cpp25
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.h3
-rw-r--r--llvm/lib/Target/AMDGPU/R600InstrInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/R600Instructions.td4
-rw-r--r--llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h2
-rw-r--r--llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h69
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp17
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp306
-rw-r--r--llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/SIFrameLowering.cpp113
-rw-r--r--llvm/lib/Target/AMDGPU/SIFrameLowering.h4
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp2214
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h51
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp133
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrFormats.td5
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp1401
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h132
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td347
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td239
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp57
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp23
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp141
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp141
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp99
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h133
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp34
-rw-r--r--llvm/lib/Target/AMDGPU/SIModeRegister.cpp16
-rw-r--r--llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp38
-rw-r--r--llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h90
-rw-r--r--llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp57
-rw-r--r--llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp15
-rw-r--r--llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/SIPostRABundler.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.cpp20
-rw-r--r--llvm/lib/Target/AMDGPU/SIProgramInfo.h20
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp416
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.h49
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td331
-rw-r--r--llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp21
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp60
-rw-r--r--llvm/lib/Target/AMDGPU/SMInstructions.td729
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td57
-rw-r--r--llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h7
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp363
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h198
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp82
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h3
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp123
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h44
-rw-r--r--llvm/lib/Target/AMDGPU/VINTERPInstructions.td1
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td67
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td165
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td71
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td90
-rw-r--r--llvm/lib/Target/AMDGPU/VOPCInstructions.td76
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td12
167 files changed, 16807 insertions, 10788 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index eaf72686c166..b82db82de84e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -26,6 +26,8 @@ FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone);
FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone);
void initializeAMDGPURegBankCombinerPass(PassRegistry &);
+void initializeAMDGPURegBankSelectPass(PassRegistry &);
+
// SI Passes
FunctionPass *createGCNDPPCombinePass();
FunctionPass *createSIAnnotateControlFlowPass();
@@ -39,6 +41,7 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass();
FunctionPass *createSIOptimizeExecMaskingPreRAPass();
FunctionPass *createSIOptimizeVGPRLiveRangePass();
FunctionPass *createSIFixSGPRCopiesPass();
+FunctionPass *createLowerWWMCopiesPass();
FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIPreAllocateWWMRegsPass();
@@ -47,13 +50,11 @@ FunctionPass *createSIFormMemoryClausesPass();
FunctionPass *createSIPostRABundlerPass();
FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
+ModulePass *createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *);
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPULateCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
-FunctionPass *createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *);
-ModulePass *createAMDGPUPropagateAttributesLatePass(const TargetMachine *);
FunctionPass *createAMDGPURewriteOutArgumentsPass();
-ModulePass *createAMDGPUReplaceLDSUseWithPointerPass();
ModulePass *createAMDGPULowerModuleLDSPass();
FunctionPass *createSIModeRegisterPass();
FunctionPass *createGCNPreRAOptimizationsPass();
@@ -83,14 +84,13 @@ void initializeAMDGPUAttributorPass(PassRegistry &);
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
-FunctionPass *createAMDGPUAtomicOptimizerPass();
+// DPP/Iterative option enables the atomic optimizer with given strategy
+// whereas None disables the atomic optimizer.
+enum class ScanOptions { DPP, Iterative, None };
+FunctionPass *createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy);
void initializeAMDGPUAtomicOptimizerPass(PassRegistry &);
extern char &AMDGPUAtomicOptimizerID;
-ModulePass *createAMDGPULowerIntrinsicsPass();
-void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
-extern char &AMDGPULowerIntrinsicsID;
-
ModulePass *createAMDGPUCtorDtorLoweringLegacyPass();
void initializeAMDGPUCtorDtorLoweringLegacyPass(PassRegistry &);
extern char &AMDGPUCtorDtorLoweringLegacyPassID;
@@ -117,38 +117,6 @@ struct AMDGPULowerKernelAttributesPass
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
};
-void initializeAMDGPUPropagateAttributesEarlyPass(PassRegistry &);
-extern char &AMDGPUPropagateAttributesEarlyID;
-
-struct AMDGPUPropagateAttributesEarlyPass
- : PassInfoMixin<AMDGPUPropagateAttributesEarlyPass> {
- AMDGPUPropagateAttributesEarlyPass(TargetMachine &TM) : TM(TM) {}
- PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
-
-private:
- TargetMachine &TM;
-};
-
-void initializeAMDGPUPropagateAttributesLatePass(PassRegistry &);
-extern char &AMDGPUPropagateAttributesLateID;
-
-struct AMDGPUPropagateAttributesLatePass
- : PassInfoMixin<AMDGPUPropagateAttributesLatePass> {
- AMDGPUPropagateAttributesLatePass(TargetMachine &TM) : TM(TM) {}
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-
-private:
- TargetMachine &TM;
-};
-
-void initializeAMDGPUReplaceLDSUseWithPointerPass(PassRegistry &);
-extern char &AMDGPUReplaceLDSUseWithPointerID;
-
-struct AMDGPUReplaceLDSUseWithPointerPass
- : PassInfoMixin<AMDGPUReplaceLDSUseWithPointerPass> {
- PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
-};
-
void initializeAMDGPULowerModuleLDSPass(PassRegistry &);
extern char &AMDGPULowerModuleLDSID;
@@ -177,6 +145,9 @@ extern char &SIFixSGPRCopiesID;
void initializeSIFixVGPRCopiesPass(PassRegistry &);
extern char &SIFixVGPRCopiesID;
+void initializeSILowerWWMCopiesPass(PassRegistry &);
+extern char &SILowerWWMCopiesID;
+
void initializeSILowerI1CopiesPass(PassRegistry &);
extern char &SILowerI1CopiesID;
@@ -239,6 +210,16 @@ private:
TargetMachine &TM;
};
+struct AMDGPUAtomicOptimizerPass : PassInfoMixin<AMDGPUAtomicOptimizerPass> {
+ AMDGPUAtomicOptimizerPass(TargetMachine &TM, ScanOptions ScanImpl)
+ : TM(TM), ScanImpl(ScanImpl) {}
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+
+private:
+ TargetMachine &TM;
+ ScanOptions ScanImpl;
+};
+
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
CodeGenOpt::Level OptLevel);
@@ -252,6 +233,16 @@ private:
bool GlobalOpt;
};
+class AMDGPUCodeGenPreparePass
+ : public PassInfoMixin<AMDGPUCodeGenPreparePass> {
+private:
+ TargetMachine &TM;
+
+public:
+ AMDGPUCodeGenPreparePass(TargetMachine &TM) : TM(TM){};
+ PreservedAnalyses run(Function &, FunctionAnalysisManager &);
+};
+
FunctionPass *createAMDGPUAnnotateUniformValues();
ModulePass *createAMDGPUPrintfRuntimeBinding();
@@ -286,6 +277,9 @@ extern char &AMDGPUAnnotateUniformValuesPassID;
void initializeAMDGPUCodeGenPreparePass(PassRegistry&);
extern char &AMDGPUCodeGenPrepareID;
+void initializeAMDGPURemoveIncompatibleFunctionsPass(PassRegistry &);
+extern char &AMDGPURemoveIncompatibleFunctionsID;
+
void initializeAMDGPULateCodeGenPreparePass(PassRegistry &);
extern char &AMDGPULateCodeGenPrepareID;
@@ -302,9 +296,6 @@ extern char &SIMemoryLegalizerID;
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;
-void initializeAMDGPUReleaseVGPRsPass(PassRegistry &);
-extern char &AMDGPUReleaseVGPRsID;
-
void initializeAMDGPUInsertDelayAluPass(PassRegistry &);
extern char &AMDGPUInsertDelayAluID;
@@ -340,12 +331,18 @@ extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
void initializeGCNNSAReassignPass(PassRegistry &);
extern char &GCNNSAReassignID;
+void initializeGCNPreRALongBranchRegPass(PassRegistry &);
+extern char &GCNPreRALongBranchRegID;
+
void initializeGCNPreRAOptimizationsPass(PassRegistry &);
extern char &GCNPreRAOptimizationsID;
FunctionPass *createAMDGPUSetWavePriorityPass();
void initializeAMDGPUSetWavePriorityPass(PassRegistry &);
+void initializeGCNRewritePartialRegUsesPass(llvm::PassRegistry &);
+extern char &GCNRewritePartialRegUsesID;
+
namespace AMDGPU {
enum TargetIndex {
TI_CONSTDATA_START,
@@ -363,53 +360,60 @@ enum TargetIndex {
/// a separate piece of memory that is unique from other
/// memory locations.
namespace AMDGPUAS {
- enum : unsigned {
- // The maximum value for flat, generic, local, private, constant and region.
- MAX_AMDGPU_ADDRESS = 7,
+enum : unsigned {
+ // The maximum value for flat, generic, local, private, constant and region.
+ MAX_AMDGPU_ADDRESS = 8,
- FLAT_ADDRESS = 0, ///< Address space for flat memory.
- GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
- REGION_ADDRESS = 2, ///< Address space for region memory. (GDS)
+ FLAT_ADDRESS = 0, ///< Address space for flat memory.
+ GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0).
+ REGION_ADDRESS = 2, ///< Address space for region memory. (GDS)
- CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2).
- LOCAL_ADDRESS = 3, ///< Address space for local memory.
- PRIVATE_ADDRESS = 5, ///< Address space for private memory.
+ CONSTANT_ADDRESS = 4, ///< Address space for constant memory (VTX2).
+ LOCAL_ADDRESS = 3, ///< Address space for local memory.
+ PRIVATE_ADDRESS = 5, ///< Address space for private memory.
- CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory.
+ CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory.
- BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers.
+ BUFFER_FAT_POINTER = 7, ///< Address space for 160-bit buffer fat pointers.
+ ///< Not used in backend.
- /// Address space for direct addressable parameter memory (CONST0).
- PARAM_D_ADDRESS = 6,
- /// Address space for indirect addressable parameter memory (VTX1).
- PARAM_I_ADDRESS = 7,
+ BUFFER_RESOURCE = 8, ///< Address space for 128-bit buffer resources.
- // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on
- // this order to be able to dynamically index a constant buffer, for
- // example:
- //
- // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
+ /// Internal address spaces. Can be freely renumbered.
+ STREAMOUT_REGISTER = 128, ///< Address space for GS NGG Streamout registers.
+ /// end Internal address spaces.
- CONSTANT_BUFFER_0 = 8,
- CONSTANT_BUFFER_1 = 9,
- CONSTANT_BUFFER_2 = 10,
- CONSTANT_BUFFER_3 = 11,
- CONSTANT_BUFFER_4 = 12,
- CONSTANT_BUFFER_5 = 13,
- CONSTANT_BUFFER_6 = 14,
- CONSTANT_BUFFER_7 = 15,
- CONSTANT_BUFFER_8 = 16,
- CONSTANT_BUFFER_9 = 17,
- CONSTANT_BUFFER_10 = 18,
- CONSTANT_BUFFER_11 = 19,
- CONSTANT_BUFFER_12 = 20,
- CONSTANT_BUFFER_13 = 21,
- CONSTANT_BUFFER_14 = 22,
- CONSTANT_BUFFER_15 = 23,
+ /// Address space for direct addressable parameter memory (CONST0).
+ PARAM_D_ADDRESS = 6,
+ /// Address space for indirect addressable parameter memory (VTX1).
+ PARAM_I_ADDRESS = 7,
- // Some places use this if the address space can't be determined.
- UNKNOWN_ADDRESS_SPACE = ~0u,
- };
+ // Do not re-order the CONSTANT_BUFFER_* enums. Several places depend on
+ // this order to be able to dynamically index a constant buffer, for
+ // example:
+ //
+ // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
+
+ CONSTANT_BUFFER_0 = 8,
+ CONSTANT_BUFFER_1 = 9,
+ CONSTANT_BUFFER_2 = 10,
+ CONSTANT_BUFFER_3 = 11,
+ CONSTANT_BUFFER_4 = 12,
+ CONSTANT_BUFFER_5 = 13,
+ CONSTANT_BUFFER_6 = 14,
+ CONSTANT_BUFFER_7 = 15,
+ CONSTANT_BUFFER_8 = 16,
+ CONSTANT_BUFFER_9 = 17,
+ CONSTANT_BUFFER_10 = 18,
+ CONSTANT_BUFFER_11 = 19,
+ CONSTANT_BUFFER_12 = 20,
+ CONSTANT_BUFFER_13 = 21,
+ CONSTANT_BUFFER_14 = 22,
+ CONSTANT_BUFFER_15 = 23,
+
+ // Some places use this if the address space can't be determined.
+ UNKNOWN_ADDRESS_SPACE = ~0u,
+};
}
namespace AMDGPU {
@@ -421,6 +425,38 @@ inline bool isFlatGlobalAddrSpace(unsigned AS) {
AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
}
+
+inline bool isExtendedGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+ AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
+}
+
+static inline bool addrspacesMayAlias(unsigned AS1, unsigned AS2) {
+ static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 8, "Addr space out of range");
+
+ if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
+ return true;
+
+ // This array is indexed by address space value enum elements 0 ... to 8
+ // clang-format off
+ static const bool ASAliasRules[9][9] = {
+ /* Flat Global Region Group Constant Private Const32 BufFatPtr BufRsrc */
+ /* Flat */ {true, true, false, true, true, true, true, true, true},
+ /* Global */ {true, true, false, false, true, false, true, true, true},
+ /* Region */ {false, false, true, false, false, false, false, false, false},
+ /* Group */ {true, false, false, true, false, false, false, false, false},
+ /* Constant */ {true, true, false, false, false, false, true, true, true},
+ /* Private */ {true, false, false, false, false, true, false, false, false},
+ /* Constant 32-bit */ {true, true, false, false, true, false, false, true, true},
+ /* Buffer Fat Ptr */ {true, true, false, false, true, false, true, true, true},
+ /* Buffer Resource */ {true, true, false, false, true, false, true, true, true},
+ };
+ // clang-format on
+
+ return ASAliasRules[AS1][AS2];
+}
+
}
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index ddc32988881a..b178623a319d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -18,10 +18,6 @@ def p4 : PtrValueType<i64, 4>;
def p5 : PtrValueType<i32, 5>;
def p6 : PtrValueType<i32, 6>;
-class BoolToList<bit Value> {
- list<int> ret = !if(Value, [1]<int>, []<int>);
-}
-
//===------------------------------------------------------------===//
// Subtarget Features (device properties)
//===------------------------------------------------------------===//
@@ -494,6 +490,12 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
"Support NSA encoding for image instructions"
>;
+def FeaturePartialNSAEncoding : SubtargetFeature<"partial-nsa-encoding",
+ "HasPartialNSAEncoding",
+ "true",
+ "Support partial NSA encoding for image instructions"
+>;
+
def FeatureImageInsts : SubtargetFeature<"image-insts",
"HasImageInsts",
"true",
@@ -581,7 +583,7 @@ def FeatureDot6Insts : SubtargetFeature<"dot6-insts",
def FeatureDot7Insts : SubtargetFeature<"dot7-insts",
"HasDot7Insts",
"true",
- "Has v_dot2_f32_f16, v_dot4_u32_u8, v_dot8_u32_u4 instructions"
+ "Has v_dot4_u32_u8, v_dot8_u32_u4 instructions"
>;
def FeatureDot8Insts : SubtargetFeature<"dot8-insts",
@@ -596,6 +598,12 @@ def FeatureDot9Insts : SubtargetFeature<"dot9-insts",
"Has v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16 instructions"
>;
+def FeatureDot10Insts : SubtargetFeature<"dot10-insts",
+ "HasDot10Insts",
+ "true",
+ "Has v_dot2_f32_f16 instruction"
+>;
+
def FeatureMAIInsts : SubtargetFeature<"mai-insts",
"HasMAIInsts",
"true",
@@ -614,6 +622,19 @@ def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
"Has v_pk_fmac_f16 instruction"
>;
+def FeatureAtomicDsPkAdd16Insts : SubtargetFeature<"atomic-ds-pk-add-16-insts",
+ "HasAtomicDsPkAdd16Insts",
+ "true",
+ "Has ds_pk_add_bf16, ds_pk_add_f16, ds_pk_add_rtn_bf16, "
+ "ds_pk_add_rtn_f16 instructions"
+>;
+
+def FeatureAtomicFlatPkAdd16Insts : SubtargetFeature<"atomic-flat-pk-add-16-insts",
+ "HasAtomicFlatPkAdd16Insts",
+ "true",
+ "Has flat_atomic_pk_add_f16 and flat_atomic_pk_add_bf16 instructions"
+>;
+
def FeatureAtomicFaddRtnInsts : SubtargetFeature<"atomic-fadd-rtn-insts",
"HasAtomicFaddRtnInsts",
"true",
@@ -630,15 +651,30 @@ def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts",
[FeatureFlatGlobalInsts]
>;
-def FeatureAtomicPkFaddNoRtnInsts
- : SubtargetFeature<"atomic-pk-fadd-no-rtn-insts",
- "HasAtomicPkFaddNoRtnInsts",
+def FeatureAtomicBufferGlobalPkAddF16NoRtnInsts
+ : SubtargetFeature<"atomic-buffer-global-pk-add-f16-no-rtn-insts",
+ "HasAtomicBufferGlobalPkAddF16NoRtnInsts",
"true",
"Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that "
"don't return original value",
[FeatureFlatGlobalInsts]
>;
+def FeatureAtomicBufferGlobalPkAddF16Insts : SubtargetFeature<"atomic-buffer-global-pk-add-f16-insts",
+ "HasAtomicBufferGlobalPkAddF16Insts",
+ "true",
+ "Has buffer_atomic_pk_add_f16 and global_atomic_pk_add_f16 instructions that "
+ "can return original value",
+ [FeatureFlatGlobalInsts]
+>;
+
+def FeatureAtomicGlobalPkAddBF16Inst : SubtargetFeature<"atomic-global-pk-add-bf16-inst",
+ "HasAtomicGlobalPkAddBF16Inst",
+ "true",
+ "Has global_atomic_pk_add_bf16 instruction",
+ [FeatureFlatGlobalInsts]
+>;
+
def FeatureFlatAtomicFaddF32Inst
: SubtargetFeature<"flat-atomic-fadd-f32-inst",
"HasFlatAtomicFaddF32Inst",
@@ -718,15 +754,6 @@ def FeatureGFX11FullVGPRs : SubtargetFeature<"gfx11-full-vgprs",
"GFX11 with 50% more physical VGPRs and 50% larger allocation granule than GFX10"
>;
-class SubtargetFeatureNSAMaxSize <int Value> : SubtargetFeature <
- "nsa-max-size-"#Value,
- "NSAMaxSize",
- !cast<string>(Value),
- "The maximum non-sequential address size in VGPRs."
->;
-
-def FeatureNSAMaxSize5 : SubtargetFeatureNSAMaxSize<5>;
-def FeatureNSAMaxSize13 : SubtargetFeatureNSAMaxSize<13>;
def FeatureVOPD : SubtargetFeature<"vopd",
"HasVOPDInsts",
@@ -740,6 +767,12 @@ def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard",
"Hazard when TRANS instructions are closely followed by a use of the result"
>;
+def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1",
+ "HasForceStoreSC0SC1",
+ "true",
+ "Has SC0 and SC1 on stores"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -860,12 +893,20 @@ def FeatureArchitectedFlatScratch : SubtargetFeature<"architected-flat-scratch",
"Flat Scratch register is a readonly SPI initialized architected register"
>;
+def FeatureArchitectedSGPRs : SubtargetFeature<"architected-sgprs",
+ "HasArchitectedSGPRs",
+ "true",
+ "Enable the architected SGPRs"
+>;
+
// Dummy feature used to disable assembler instructions.
def FeatureDisable : SubtargetFeature<"",
"FeatureDisable","true",
"Dummy feature to disable assembler instructions"
>;
+//===----------------------------------------------------------------------===//
+
class GCNSubtargetFeatureGeneration <string Value,
string FeatureName,
list<SubtargetFeature> Implies> :
@@ -962,6 +1003,8 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
]
>;
+//===----------------------------------------------------------------------===//
+
class FeatureSet<list<SubtargetFeature> Features_> {
list<SubtargetFeature> Features = Features_;
}
@@ -1006,30 +1049,28 @@ def FeatureISAVersion7_0_5 : FeatureSet<
[FeatureSeaIslands,
FeatureLDSBankCount16]>;
-def FeatureISAVersion8_0_1 : FeatureSet<
+def FeatureISAVersion8_0_Common : FeatureSet<
[FeatureVolcanicIslands,
- FeatureFastFMAF32,
- HalfRate64Ops,
FeatureLDSBankCount32,
- FeatureSupportsXNACK,
FeatureUnpackedD16VMem]>;
+def FeatureISAVersion8_0_1 : FeatureSet<
+ !listconcat(FeatureISAVersion8_0_Common.Features,
+ [FeatureFastFMAF32,
+ HalfRate64Ops,
+ FeatureSupportsXNACK])>;
+
def FeatureISAVersion8_0_2 : FeatureSet<
- [FeatureVolcanicIslands,
- FeatureLDSBankCount32,
- FeatureSGPRInitBug,
- FeatureUnpackedD16VMem]>;
+ !listconcat(FeatureISAVersion8_0_Common.Features,
+ [FeatureSGPRInitBug])>;
def FeatureISAVersion8_0_3 : FeatureSet<
- [FeatureVolcanicIslands,
- FeatureLDSBankCount32,
- FeatureUnpackedD16VMem]>;
+ !listconcat(FeatureISAVersion8_0_Common.Features,
+ [])>;
def FeatureISAVersion8_0_5 : FeatureSet<
- [FeatureVolcanicIslands,
- FeatureLDSBankCount32,
- FeatureSGPRInitBug,
- FeatureUnpackedD16VMem]>;
+ !listconcat(FeatureISAVersion8_0_Common.Features,
+ [FeatureSGPRInitBug])>;
def FeatureISAVersion8_1_0 : FeatureSet<
[FeatureVolcanicIslands,
@@ -1038,126 +1079,101 @@ def FeatureISAVersion8_1_0 : FeatureSet<
FeatureImageStoreD16Bug,
FeatureImageGather4D16Bug]>;
-def FeatureISAVersion9_0_0 : FeatureSet<
+def FeatureISAVersion9_0_Common : FeatureSet<
[FeatureGFX9,
- FeatureMadMixInsts,
FeatureLDSBankCount32,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
FeatureImageInsts,
- FeatureMadMacF32Insts,
- FeatureImageGather4D16Bug]>;
+ FeatureMadMacF32Insts]>;
+
+def FeatureISAVersion9_0_MI_Common : FeatureSet<
+ !listconcat(FeatureISAVersion9_0_Common.Features,
+ [FeatureFmaMixInsts,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot3Insts,
+ FeatureDot4Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureDot7Insts,
+ FeatureDot10Insts,
+ FeatureMAIInsts,
+ FeaturePkFmacF16Inst,
+ FeatureAtomicFaddNoRtnInsts,
+ FeatureSupportsSRAMECC])>;
+
+def FeatureISAVersion9_0_0 : FeatureSet<
+ !listconcat(FeatureISAVersion9_0_Common.Features,
+ [FeatureMadMixInsts,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureImageGather4D16Bug])>;
def FeatureISAVersion9_0_2 : FeatureSet<
- [FeatureGFX9,
- FeatureMadMixInsts,
- FeatureLDSBankCount32,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureImageInsts,
- FeatureMadMacF32Insts,
- FeatureImageGather4D16Bug]>;
+ !listconcat(FeatureISAVersion9_0_Common.Features,
+ [FeatureMadMixInsts,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureImageGather4D16Bug])>;
def FeatureISAVersion9_0_4 : FeatureSet<
- [FeatureGFX9,
- FeatureLDSBankCount32,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureImageInsts,
- FeatureMadMacF32Insts,
- FeatureFmaMixInsts,
- FeatureImageGather4D16Bug]>;
+ !listconcat(FeatureISAVersion9_0_Common.Features,
+ [FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureFmaMixInsts,
+ FeatureImageGather4D16Bug])>;
def FeatureISAVersion9_0_6 : FeatureSet<
- [FeatureGFX9,
- HalfRate64Ops,
- FeatureFmaMixInsts,
- FeatureLDSBankCount32,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureImageInsts,
- FeatureMadMacF32Insts,
- FeatureDLInsts,
- FeatureDot1Insts,
- FeatureDot2Insts,
- FeatureDot7Insts,
- FeatureSupportsSRAMECC,
- FeatureImageGather4D16Bug]>;
+ !listconcat(FeatureISAVersion9_0_Common.Features,
+ [HalfRate64Ops,
+ FeatureFmaMixInsts,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot7Insts,
+ FeatureDot10Insts,
+ FeatureSupportsSRAMECC,
+ FeatureImageGather4D16Bug])>;
def FeatureISAVersion9_0_8 : FeatureSet<
- [FeatureGFX9,
- HalfRate64Ops,
- FeatureFmaMixInsts,
- FeatureLDSBankCount32,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureImageInsts,
- FeatureMadMacF32Insts,
- FeatureDLInsts,
- FeatureDot1Insts,
- FeatureDot2Insts,
- FeatureDot3Insts,
- FeatureDot4Insts,
- FeatureDot5Insts,
- FeatureDot6Insts,
- FeatureDot7Insts,
- FeatureMAIInsts,
- FeaturePkFmacF16Inst,
- FeatureAtomicFaddNoRtnInsts,
- FeatureAtomicPkFaddNoRtnInsts,
- FeatureSupportsSRAMECC,
- FeatureMFMAInlineLiteralBug,
- FeatureImageGather4D16Bug]>;
+ !listconcat(FeatureISAVersion9_0_MI_Common.Features,
+ [HalfRate64Ops,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureAtomicBufferGlobalPkAddF16NoRtnInsts,
+ FeatureMFMAInlineLiteralBug,
+ FeatureImageGather4D16Bug])>;
def FeatureISAVersion9_0_9 : FeatureSet<
- [FeatureGFX9,
- FeatureMadMixInsts,
- FeatureLDSBankCount32,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureImageInsts,
- FeatureMadMacF32Insts,
- FeatureImageGather4D16Bug]>;
+ !listconcat(FeatureISAVersion9_0_Common.Features,
+ [FeatureMadMixInsts,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureImageInsts,
+ FeatureImageGather4D16Bug])>;
def FeatureISAVersion9_0_A : FeatureSet<
- [FeatureGFX9,
- FeatureGFX90AInsts,
- FeatureFmaMixInsts,
- FeatureLDSBankCount32,
- FeatureDLInsts,
- FeatureFmacF64Inst,
- FeatureDot1Insts,
- FeatureDot2Insts,
- FeatureDot3Insts,
- FeatureDot4Insts,
- FeatureDot5Insts,
- FeatureDot6Insts,
- FeatureDot7Insts,
- Feature64BitDPP,
- FeaturePackedFP32Ops,
- FeatureMAIInsts,
- FeaturePkFmacF16Inst,
- FeatureAtomicFaddRtnInsts,
- FeatureAtomicFaddNoRtnInsts,
- FeatureAtomicPkFaddNoRtnInsts,
- FeatureImageInsts,
- FeatureMadMacF32Insts,
- FeatureSupportsSRAMECC,
- FeaturePackedTID,
- FullRate64Ops,
- FeatureBackOffBarrier]>;
+ !listconcat(FeatureISAVersion9_0_MI_Common.Features,
+ [FeatureGFX90AInsts,
+ FeatureFmacF64Inst,
+ Feature64BitDPP,
+ FeaturePackedFP32Ops,
+ FeatureAtomicFaddRtnInsts,
+ FeatureAtomicBufferGlobalPkAddF16Insts,
+ FeaturePackedTID,
+ FullRate64Ops,
+ FeatureBackOffBarrier])>;
def FeatureISAVersion9_0_C : FeatureSet<
- [FeatureGFX9,
- FeatureMadMixInsts,
- FeatureLDSBankCount32,
- FeatureDsSrc2Insts,
- FeatureExtendedImageInsts,
- FeatureImageInsts,
- FeatureMadMacF32Insts,
- FeatureImageGather4D16Bug]>;
+ !listconcat(FeatureISAVersion9_0_Common.Features,
+ [FeatureMadMixInsts,
+ FeatureDsSrc2Insts,
+ FeatureExtendedImageInsts,
+ FeatureImageGather4D16Bug])>;
-def FeatureISAVersion9_4_0 : FeatureSet<
+def FeatureISAVersion9_4_Common : FeatureSet<
[FeatureGFX9,
FeatureGFX90AInsts,
FeatureGFX940Insts,
@@ -1172,6 +1188,9 @@ def FeatureISAVersion9_4_0 : FeatureSet<
FeatureDot5Insts,
FeatureDot6Insts,
FeatureDot7Insts,
+ FeatureDot10Insts,
+ FeatureAtomicDsPkAdd16Insts,
+ FeatureAtomicFlatPkAdd16Insts,
Feature64BitDPP,
FeaturePackedFP32Ops,
FeatureMAIInsts,
@@ -1179,7 +1198,8 @@ def FeatureISAVersion9_4_0 : FeatureSet<
FeaturePkFmacF16Inst,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
- FeatureAtomicPkFaddNoRtnInsts,
+ FeatureAtomicBufferGlobalPkAddF16Insts,
+ FeatureAtomicGlobalPkAddBF16Inst,
FeatureFlatAtomicFaddF32Inst,
FeatureSupportsSRAMECC,
FeaturePackedTID,
@@ -1187,33 +1207,29 @@ def FeatureISAVersion9_4_0 : FeatureSet<
FullRate64Ops,
FeatureBackOffBarrier]>;
-// TODO: Organize more features into groups.
-def FeatureGroup {
- // Bugs present on gfx10.1.
- list<SubtargetFeature> GFX10_1_Bugs = [
- FeatureVcmpxPermlaneHazard,
- FeatureVMEMtoScalarWriteHazard,
- FeatureSMEMtoVectorWriteHazard,
- FeatureInstFwdPrefetchBug,
- FeatureVcmpxExecWARHazard,
- FeatureLdsBranchVmemWARHazard,
- FeatureNSAtoVMEMBug,
- FeatureNSAClauseBug,
- FeatureOffset3fBug,
- FeatureFlatSegmentOffsetBug,
- FeatureNegativeUnalignedScratchOffsetBug
- ];
-}
+def FeatureISAVersion9_4_0 : FeatureSet<
+ !listconcat(FeatureISAVersion9_4_Common.Features,
+ [FeatureForceStoreSC0SC1])>;
-def FeatureISAVersion10_1_0 : FeatureSet<
- !listconcat(FeatureGroup.GFX10_1_Bugs,
- [FeatureGFX10,
- FeatureLDSBankCount32,
- FeatureDLInsts,
- FeatureNSAEncoding,
- FeatureNSAMaxSize5,
- FeatureWavefrontSize32,
- FeatureScalarStores,
+def FeatureISAVersion9_4_1 : FeatureSet<
+ !listconcat(FeatureISAVersion9_4_Common.Features,
+ [FeatureForceStoreSC0SC1])>;
+
+def FeatureISAVersion9_4_2 : FeatureSet<
+ !listconcat(FeatureISAVersion9_4_Common.Features,
+ [])>;
+
+def FeatureISAVersion10_Common : FeatureSet<
+ [FeatureGFX10,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureNSAEncoding,
+ FeatureWavefrontSize32,
+ FeatureBackOffBarrier]>;
+
+def FeatureISAVersion10_1_Common : FeatureSet<
+ !listconcat(FeatureISAVersion10_Common.Features,
+ [FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
FeatureGetWaveIdInst,
@@ -1221,90 +1237,57 @@ def FeatureISAVersion10_1_0 : FeatureSet<
FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
FeatureSupportsXNACK,
- FeatureBackOffBarrier])>;
+ // gfx101x bugs
+ FeatureVcmpxPermlaneHazard,
+ FeatureVMEMtoScalarWriteHazard,
+ FeatureSMEMtoVectorWriteHazard,
+ FeatureInstFwdPrefetchBug,
+ FeatureVcmpxExecWARHazard,
+ FeatureLdsBranchVmemWARHazard,
+ FeatureNSAtoVMEMBug,
+ FeatureNSAClauseBug,
+ FeatureOffset3fBug,
+ FeatureFlatSegmentOffsetBug,
+ FeatureNegativeUnalignedScratchOffsetBug])>;
+
+def FeatureISAVersion10_1_0 : FeatureSet<
+ !listconcat(FeatureISAVersion10_1_Common.Features,
+ [])>;
def FeatureISAVersion10_1_1 : FeatureSet<
- !listconcat(FeatureGroup.GFX10_1_Bugs,
- [FeatureGFX10,
- FeatureLDSBankCount32,
- FeatureDLInsts,
- FeatureDot1Insts,
+ !listconcat(FeatureISAVersion10_1_Common.Features,
+ [FeatureDot1Insts,
FeatureDot2Insts,
FeatureDot5Insts,
FeatureDot6Insts,
FeatureDot7Insts,
- FeatureNSAEncoding,
- FeatureNSAMaxSize5,
- FeatureWavefrontSize32,
- FeatureScalarStores,
- FeatureScalarAtomics,
- FeatureScalarFlatScratchInsts,
- FeatureGetWaveIdInst,
- FeatureMadMacF32Insts,
- FeatureDsSrc2Insts,
- FeatureLdsMisalignedBug,
- FeatureSupportsXNACK,
- FeatureBackOffBarrier])>;
+ FeatureDot10Insts])>;
def FeatureISAVersion10_1_2 : FeatureSet<
- !listconcat(FeatureGroup.GFX10_1_Bugs,
- [FeatureGFX10,
- FeatureLDSBankCount32,
- FeatureDLInsts,
- FeatureDot1Insts,
+ !listconcat(FeatureISAVersion10_1_Common.Features,
+ [FeatureDot1Insts,
FeatureDot2Insts,
FeatureDot5Insts,
FeatureDot6Insts,
FeatureDot7Insts,
- FeatureNSAEncoding,
- FeatureNSAMaxSize5,
- FeatureWavefrontSize32,
- FeatureScalarStores,
- FeatureScalarAtomics,
- FeatureScalarFlatScratchInsts,
- FeatureGetWaveIdInst,
- FeatureMadMacF32Insts,
- FeatureDsSrc2Insts,
- FeatureLdsMisalignedBug,
- FeatureSupportsXNACK,
- FeatureBackOffBarrier])>;
+ FeatureDot10Insts])>;
def FeatureISAVersion10_1_3 : FeatureSet<
- !listconcat(FeatureGroup.GFX10_1_Bugs,
- [FeatureGFX10,
- FeatureGFX10_AEncoding,
- FeatureLDSBankCount32,
- FeatureDLInsts,
- FeatureNSAEncoding,
- FeatureNSAMaxSize5,
- FeatureWavefrontSize32,
- FeatureScalarStores,
- FeatureScalarAtomics,
- FeatureScalarFlatScratchInsts,
- FeatureGetWaveIdInst,
- FeatureMadMacF32Insts,
- FeatureDsSrc2Insts,
- FeatureLdsMisalignedBug,
- FeatureSupportsXNACK,
- FeatureBackOffBarrier])>;
+ !listconcat(FeatureISAVersion10_1_Common.Features,
+ [FeatureGFX10_AEncoding])>;
def FeatureISAVersion10_3_0 : FeatureSet<
- [FeatureGFX10,
- FeatureGFX10_AEncoding,
- FeatureGFX10_BEncoding,
- FeatureGFX10_3Insts,
- FeatureLDSBankCount32,
- FeatureDLInsts,
- FeatureDot1Insts,
- FeatureDot2Insts,
- FeatureDot5Insts,
- FeatureDot6Insts,
- FeatureDot7Insts,
- FeatureNSAEncoding,
- FeatureNSAMaxSize13,
- FeatureWavefrontSize32,
- FeatureShaderCyclesRegister,
- FeatureBackOffBarrier]>;
+ !listconcat(FeatureISAVersion10_Common.Features,
+ [FeatureGFX10_AEncoding,
+ FeatureGFX10_BEncoding,
+ FeatureGFX10_3Insts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureDot7Insts,
+ FeatureDot10Insts,
+ FeatureShaderCyclesRegister])>;
def FeatureISAVersion11_Common : FeatureSet<
[FeatureGFX11,
@@ -1314,8 +1297,9 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureDot7Insts,
FeatureDot8Insts,
FeatureDot9Insts,
+ FeatureDot10Insts,
FeatureNSAEncoding,
- FeatureNSAMaxSize5,
+ FeaturePartialNSAEncoding,
FeatureWavefrontSize32,
FeatureShaderCyclesRegister,
FeatureArchitectedFlatScratch,
@@ -1325,26 +1309,37 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureImageInsts,
FeaturePackedTID,
FeatureVcmpxPermlaneHazard,
- FeatureVALUTransUseHazard,
FeatureMADIntraFwdBug]>;
-def FeatureISAVersion11_0_0 : FeatureSet<
+def FeatureISAVersion11_0_Common : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
+ [FeatureVALUTransUseHazard])>;
+
+def FeatureISAVersion11_0_0 : FeatureSet<
+ !listconcat(FeatureISAVersion11_0_Common.Features,
[FeatureGFX11FullVGPRs,
FeatureUserSGPRInit16Bug])>;
def FeatureISAVersion11_0_1 : FeatureSet<
- !listconcat(FeatureISAVersion11_Common.Features,
+ !listconcat(FeatureISAVersion11_0_Common.Features,
[FeatureGFX11FullVGPRs])>;
def FeatureISAVersion11_0_2 : FeatureSet<
- !listconcat(FeatureISAVersion11_Common.Features,
+ !listconcat(FeatureISAVersion11_0_Common.Features,
[FeatureUserSGPRInit16Bug])>;
def FeatureISAVersion11_0_3 : FeatureSet<
+ !listconcat(FeatureISAVersion11_0_Common.Features,
+ [])>;
+
+def FeatureISAVersion11_5_0 : FeatureSet<
!listconcat(FeatureISAVersion11_Common.Features,
[])>;
+def FeatureISAVersion11_5_1 : FeatureSet<
+ !listconcat(FeatureISAVersion11_Common.Features,
+ [FeatureGFX11FullVGPRs])>;
+
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
@@ -1522,6 +1517,9 @@ def isGFX9Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
AssemblerPredicate<(all_of FeatureGFX9Insts)>;
+def isNotGFX9Plus :
+ Predicate<"Subtarget->getGeneration() < AMDGPUSubtarget::GFX9">;
+
def isGFX9Only : Predicate <
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts)>;
@@ -1655,6 +1653,8 @@ def NotHasTrue16BitInsts : Predicate<"!Subtarget->hasTrue16BitInsts()">;
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
AssemblerPredicate<(all_of FeatureVOP3P)>;
+def NotHasMed3_16 : Predicate<"!Subtarget->hasMed3_16()">;
+
def HasMinMaxDenormModes : Predicate<"Subtarget->supportsMinMaxDenormModes()">;
def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes()">;
@@ -1766,6 +1766,9 @@ def HasDot8Insts : Predicate<"Subtarget->hasDot8Insts()">,
def HasDot9Insts : Predicate<"Subtarget->hasDot9Insts()">,
AssemblerPredicate<(all_of FeatureDot9Insts)>;
+def HasDot10Insts : Predicate<"Subtarget->hasDot10Insts()">,
+ AssemblerPredicate<(all_of FeatureDot10Insts)>;
+
def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
@@ -1793,13 +1796,25 @@ def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">,
def HasFmaLegacy32 : Predicate<"Subtarget->hasGFX10_3Insts()">,
AssemblerPredicate<(any_of FeatureGFX10_3Insts)>;
+def HasAtomicDsPkAdd16Insts : Predicate<"Subtarget->hasAtomicDsPkAdd16Insts()">,
+ AssemblerPredicate<(any_of FeatureAtomicDsPkAdd16Insts)>;
+
+def HasAtomicFlatPkAdd16Insts : Predicate<"Subtarget->hasAtomicFlatPkAdd16Insts()">,
+ AssemblerPredicate<(any_of FeatureAtomicFlatPkAdd16Insts)>;
+
def HasAtomicFaddRtnInsts : Predicate<"Subtarget->hasAtomicFaddRtnInsts()">,
AssemblerPredicate<(all_of FeatureAtomicFaddRtnInsts)>;
def HasAtomicFaddNoRtnInsts : Predicate<"Subtarget->hasAtomicFaddNoRtnInsts()">,
AssemblerPredicate<(all_of FeatureAtomicFaddNoRtnInsts)>;
-def HasAtomicPkFaddNoRtnInsts
- : Predicate<"Subtarget->hasAtomicPkFaddNoRtnInsts()">,
- AssemblerPredicate<(all_of FeatureAtomicPkFaddNoRtnInsts)>;
+def HasAtomicBufferGlobalPkAddF16NoRtnInsts
+ : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() || Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">,
+ AssemblerPredicate<(any_of FeatureAtomicBufferGlobalPkAddF16NoRtnInsts, FeatureAtomicBufferGlobalPkAddF16Insts)>;
+def HasAtomicBufferGlobalPkAddF16Insts
+ : Predicate<"Subtarget->hasAtomicBufferGlobalPkAddF16Insts()">,
+ AssemblerPredicate<(all_of FeatureAtomicBufferGlobalPkAddF16Insts)>;
+def HasAtomicGlobalPkAddBF16Inst
+ : Predicate<"Subtarget->hasAtomicGlobalPkAddBF16Inst()">,
+ AssemblerPredicate<(all_of FeatureAtomicGlobalPkAddBF16Inst)>;
def HasFlatAtomicFaddF32Inst
: Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">,
AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index 8155c895e366..63942414bf3c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -46,41 +46,14 @@ void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
}
-static AliasResult getAliasResult(unsigned AS1, unsigned AS2) {
- static_assert(AMDGPUAS::MAX_AMDGPU_ADDRESS <= 7, "Addr space out of range");
-
- if (AS1 > AMDGPUAS::MAX_AMDGPU_ADDRESS || AS2 > AMDGPUAS::MAX_AMDGPU_ADDRESS)
- return AliasResult::MayAlias;
-
-#define ASMay AliasResult::MayAlias
-#define ASNo AliasResult::NoAlias
- // This array is indexed by address space value enum elements 0 ... to 7
- static const AliasResult ASAliasRules[8][8] = {
- /* Flat Global Region Group Constant Private Const32 Buf Fat Ptr */
- /* Flat */ {ASMay, ASMay, ASNo, ASMay, ASMay, ASMay, ASMay, ASMay},
- /* Global */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASMay, ASMay},
- /* Region */ {ASNo, ASNo, ASMay, ASNo, ASNo, ASNo, ASNo, ASNo},
- /* Group */ {ASMay, ASNo, ASNo, ASMay, ASNo, ASNo, ASNo, ASNo},
- /* Constant */ {ASMay, ASMay, ASNo, ASNo, ASNo, ASNo, ASMay, ASMay},
- /* Private */ {ASMay, ASNo, ASNo, ASNo, ASNo, ASMay, ASNo, ASNo},
- /* Constant 32-bit */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASNo, ASMay},
- /* Buffer Fat Ptr */ {ASMay, ASMay, ASNo, ASNo, ASMay, ASNo, ASMay, ASMay}
- };
-#undef ASMay
-#undef ASNo
-
- return ASAliasRules[AS1][AS2];
-}
-
AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
const MemoryLocation &LocB, AAQueryInfo &AAQI,
const Instruction *) {
unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace();
unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
- AliasResult Result = getAliasResult(asA, asB);
- if (Result == AliasResult::NoAlias)
- return Result;
+ if (!AMDGPU::addrspacesMayAlias(asA, asB))
+ return AliasResult::NoAlias;
// In general, FLAT (generic) pointers could be aliased to LOCAL or PRIVATE
// pointers. However, as LOCAL or PRIVATE pointers point to local objects, in
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 2e24e9f929d2..b53def912ab6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -127,7 +127,7 @@ static bool alwaysInlineImpl(Module &M, bool GlobalOpt) {
unsigned AS = GV.getAddressSpace();
if ((AS == AMDGPUAS::REGION_ADDRESS) ||
(AS == AMDGPUAS::LOCAL_ADDRESS &&
- (!AMDGPUTargetMachine::EnableLowerModuleLDS || !GV.hasInitializer())))
+ (!AMDGPUTargetMachine::EnableLowerModuleLDS)))
recursivelyVisitUsers(GV, FuncsToAlwaysInline);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 74be0336851c..6a409f0dcbe7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -16,8 +16,8 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "Utils/AMDGPUMemoryUtils.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
@@ -29,7 +29,7 @@ namespace {
class AMDGPUAnnotateUniformValues : public FunctionPass,
public InstVisitor<AMDGPUAnnotateUniformValues> {
- LegacyDivergenceAnalysis *DA;
+ UniformityInfo *UA;
MemorySSA *MSSA;
AliasAnalysis *AA;
bool isEntryFunc;
@@ -55,7 +55,7 @@ public:
return "AMDGPU Annotate Uniform Values";
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addRequired<UniformityInfoWrapperPass>();
AU.addRequired<MemorySSAWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.setPreservesAll();
@@ -69,7 +69,7 @@ public:
INITIALIZE_PASS_BEGIN(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
"Add AMDGPU uniform metadata", false, false)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
@@ -78,13 +78,13 @@ INITIALIZE_PASS_END(AMDGPUAnnotateUniformValues, DEBUG_TYPE,
char AMDGPUAnnotateUniformValues::ID = 0;
void AMDGPUAnnotateUniformValues::visitBranchInst(BranchInst &I) {
- if (DA->isUniform(&I))
+ if (UA->isUniform(&I))
setUniformMetadata(&I);
}
void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
Value *Ptr = I.getPointerOperand();
- if (!DA->isUniform(Ptr))
+ if (!UA->isUniform(Ptr))
return;
Instruction *PtrI = dyn_cast<Instruction>(Ptr);
if (PtrI)
@@ -108,7 +108,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
- DA = &getAnalysis<LegacyDivergenceAnalysis>();
+ UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c916d5d547c4..7cd8e53e6521 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -38,9 +38,9 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
-#include "llvm/Support/TargetParser.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -65,7 +65,7 @@ using namespace llvm::AMDGPU;
// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
-static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) {
+static uint32_t getFPMode(SIModeRegisterDefaults Mode) {
return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
@@ -78,8 +78,8 @@ createAMDGPUAsmPrinterPass(TargetMachine &tm,
return new AMDGPUAsmPrinter(tm, std::move(Streamer));
}
-extern "C" void LLVM_EXTERNAL_VISIBILITY LLVMInitializeAMDGPUAsmPrinter() {
- TargetRegistry::RegisterAsmPrinter(getTheAMDGPUTarget(),
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmPrinter() {
+ TargetRegistry::RegisterAsmPrinter(getTheR600Target(),
llvm::createR600AsmPrinterPass);
TargetRegistry::RegisterAsmPrinter(getTheGCNTarget(),
createAMDGPUAsmPrinterPass);
@@ -89,18 +89,6 @@ AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
: AsmPrinter(TM, std::move(Streamer)) {
assert(OutStreamer && "AsmPrinter constructed without streamer");
-
- if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
- if (isHsaAbiVersion2(getGlobalSTI())) {
- HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2());
- } else if (isHsaAbiVersion3(getGlobalSTI())) {
- HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3());
- } else if (isHsaAbiVersion5(getGlobalSTI())) {
- HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
- } else {
- HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
- }
- }
}
StringRef AMDGPUAsmPrinter::getPassName() const {
@@ -133,7 +121,7 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
TM.getTargetTriple().getOS() != Triple::AMDPAL)
return;
- if (isHsaAbiVersion3AndAbove(getGlobalSTI()))
+ if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3)
getTargetStreamer()->EmitDirectiveAMDGCNTarget();
if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
@@ -142,7 +130,7 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
getTargetStreamer()->getPALMetadata()->readFromIR(M);
- if (isHsaAbiVersion3AndAbove(getGlobalSTI()))
+ if (CodeObjectVersion >= AMDGPU::AMDHSA_COV3)
return;
// HSA emits NT_AMD_HSA_CODE_OBJECT_VERSION for code objects v2.
@@ -161,7 +149,7 @@ void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
initTargetStreamer(M);
if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
- isHsaAbiVersion2(getGlobalSTI()))
+ CodeObjectVersion == AMDGPU::AMDHSA_COV2)
getTargetStreamer()->EmitISAVersion();
// Emit HSA Metadata (NT_AMD_AMDGPU_HSA_METADATA).
@@ -221,7 +209,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyStart() {
if (!MFI.isEntryFunction())
return;
- if ((STM.isMesaKernel(F) || isHsaAbiVersion2(getGlobalSTI())) &&
+ if ((STM.isMesaKernel(F) || CodeObjectVersion == AMDGPU::AMDHSA_COV2) &&
(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL)) {
amd_kernel_code_t KernelCode;
@@ -239,7 +227,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
return;
if (TM.getTargetTriple().getOS() != Triple::AMDHSA ||
- isHsaAbiVersion2(getGlobalSTI()))
+ CodeObjectVersion == AMDGPU::AMDHSA_COV2)
return;
auto &Streamer = getTargetStreamer()->getStreamer();
@@ -263,17 +251,18 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
STM, KernelName, getAmdhsaKernelDescriptor(*MF, CurrentProgramInfo),
CurrentProgramInfo.NumVGPRsForWavesPerEU,
CurrentProgramInfo.NumSGPRsForWavesPerEU -
- IsaInfo::getNumExtraSGPRs(&STM,
- CurrentProgramInfo.VCCUsed,
- CurrentProgramInfo.FlatUsed),
- CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
+ IsaInfo::getNumExtraSGPRs(
+ &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+ getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
+ CodeObjectVersion);
Streamer.popSection();
}
void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
if (TM.getTargetTriple().getOS() == Triple::AMDHSA &&
- isHsaAbiVersion3AndAbove(getGlobalSTI())) {
+ CodeObjectVersion >= AMDGPU::AMDHSA_COV3) {
AsmPrinter::emitFunctionEntryLabel();
return;
}
@@ -343,6 +332,30 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
AsmPrinter::emitGlobalVariable(GV);
}
+bool AMDGPUAsmPrinter::doInitialization(Module &M) {
+ CodeObjectVersion = AMDGPU::getCodeObjectVersion(M);
+
+ if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
+ switch (CodeObjectVersion) {
+ case AMDGPU::AMDHSA_COV2:
+ HSAMetadataStream.reset(new HSAMD::MetadataStreamerYamlV2());
+ break;
+ case AMDGPU::AMDHSA_COV3:
+ HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV3());
+ break;
+ case AMDGPU::AMDHSA_COV4:
+ HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV4());
+ break;
+ case AMDGPU::AMDHSA_COV5:
+ HSAMetadataStream.reset(new HSAMD::MetadataStreamerMsgPackV5());
+ break;
+ default:
+ report_fatal_error("Unexpected code object version");
+ }
+ }
+ return AsmPrinter::doInitialization(M);
+}
+
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Pad with s_code_end to help tools and guard against instruction prefetch
// causing stale data in caches. Arguably this should be done by the linker,
@@ -389,7 +402,7 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
}
- if (MFI.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
+ if (MFI.hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5) {
KernelCodeProperties |=
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
}
@@ -411,9 +424,8 @@ uint16_t AMDGPUAsmPrinter::getAmdhsaKernelCodeProperties(
}
if (CurrentProgramInfo.DynamicCallStack &&
- AMDGPU::getAmdhsaCodeObjectVersion() >= 5) {
+ CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
KernelCodeProperties |= amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK;
- }
return KernelCodeProperties;
}
@@ -429,7 +441,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
assert(isUInt<32>(PI.ScratchSize));
assert(isUInt<32>(PI.getComputePGMRSrc1()));
- assert(isUInt<32>(PI.ComputePGMRSrc2));
+ assert(isUInt<32>(PI.getComputePGMRSrc2()));
KernelDescriptor.group_segment_fixed_size = PI.LDSSize;
KernelDescriptor.private_segment_fixed_size = PI.ScratchSize;
@@ -438,7 +450,7 @@ amdhsa::kernel_descriptor_t AMDGPUAsmPrinter::getAmdhsaKernelDescriptor(
KernelDescriptor.kernarg_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
KernelDescriptor.compute_pgm_rsrc1 = PI.getComputePGMRSrc1();
- KernelDescriptor.compute_pgm_rsrc2 = PI.ComputePGMRSrc2;
+ KernelDescriptor.compute_pgm_rsrc2 = PI.getComputePGMRSrc2();
KernelDescriptor.kernel_code_properties = getAmdhsaKernelCodeProperties(MF);
assert(STM.hasGFX90AInsts() || CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
@@ -567,28 +579,27 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
- Twine(G_00B84C_SCRATCH_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:USER_SGPR: " +
- Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
- Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
- Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
- Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
- Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
- OutStreamer->emitRawComment(
- " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
- Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
- false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:SCRATCH_EN: " +
+ Twine(CurrentProgramInfo.ScratchEnable),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
+ Twine(CurrentProgramInfo.UserSGPR),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+ Twine(CurrentProgramInfo.TrapHandlerEnable),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+ Twine(CurrentProgramInfo.TGIdXEnable),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+ Twine(CurrentProgramInfo.TGIdYEnable),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+ Twine(CurrentProgramInfo.TGIdZEnable),
+ false);
+ OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+ Twine(CurrentProgramInfo.TIdIGCompCount),
+ false);
assert(STM.hasGFX90AInsts() ||
CurrentProgramInfo.ComputePGMRSrc3GFX90A == 0);
@@ -631,7 +642,7 @@ void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
// In the beginning all features are either 'Any' or 'NotSupported',
// depending on global target features. This will cover empty modules.
getTargetStreamer()->initializeTargetID(
- *getGlobalSTI(), getGlobalSTI()->getFeatureString());
+ *getGlobalSTI(), getGlobalSTI()->getFeatureString(), CodeObjectVersion);
// If module is empty, we are done.
if (M.empty())
@@ -709,7 +720,8 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// duplicated in part in AMDGPUAsmParser::calculateGPRBlocks, and could be
// unified.
unsigned ExtraSGPRs = IsaInfo::getNumExtraSGPRs(
- &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed);
+ &STM, ProgInfo.VCCUsed, ProgInfo.FlatUsed,
+ getTargetStreamer()->getTargetID()->isXnackOnOrAny());
// Check the addressable register limit before we add ExtraSGPRs.
if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
@@ -761,7 +773,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// There are some rare circumstances where InputAddr is non-zero and
// InputEna can be set to 0. In this case we default to setting LastEna
// to 1.
- LastEna = InputEna ? findLastSet(InputEna) + 1 : 1;
+ LastEna = InputEna ? llvm::Log2_32(InputEna) + 1 : 1;
}
// FIXME: We should be using the number of registers determined during
@@ -909,22 +921,21 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// anything to disable it if we know the stack isn't used here. We may still
// have emitted code reading it to initialize scratch, but if that's unused
// reading garbage should be OK.
- const bool EnablePrivateSegment =
+ ProgInfo.ScratchEnable =
ProgInfo.ScratchBlocks > 0 || ProgInfo.DynamicCallStack;
- ProgInfo.ComputePGMRSrc2 =
- S_00B84C_SCRATCH_EN(EnablePrivateSegment) |
- S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
- // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
- S_00B84C_TRAP_HANDLER(STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled()) |
- S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
- S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
- S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
- S_00B84C_TG_SIZE_EN(MFI->hasWorkGroupInfo()) |
- S_00B84C_TIDIG_COMP_CNT(TIDIGCompCnt) |
- S_00B84C_EXCP_EN_MSB(0) |
- // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
- S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
- S_00B84C_EXCP_EN(0);
+ ProgInfo.UserSGPR = MFI->getNumUserSGPRs();
+ // For AMDHSA, TRAP_HANDLER must be zero, as it is populated by the CP.
+ ProgInfo.TrapHandlerEnable =
+ STM.isAmdHsaOS() ? 0 : STM.isTrapHandlerEnabled();
+ ProgInfo.TGIdXEnable = MFI->hasWorkGroupIDX();
+ ProgInfo.TGIdYEnable = MFI->hasWorkGroupIDY();
+ ProgInfo.TGIdZEnable = MFI->hasWorkGroupIDZ();
+ ProgInfo.TGSizeEnable = MFI->hasWorkGroupInfo();
+ ProgInfo.TIdIGCompCount = TIDIGCompCnt;
+ ProgInfo.EXCPEnMSB = 0;
+ // For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
+ ProgInfo.LdsSize = STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks;
+ ProgInfo.EXCPEnable = 0;
if (STM.hasGFX90AInsts()) {
AMDHSA_BITS_SET(ProgInfo.ComputePGMRSrc3GFX90A,
@@ -965,7 +976,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc1());
OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
- OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
+ OutStreamer->emitInt32(CurrentProgramInfo.getComputePGMRSrc2());
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
OutStreamer->emitInt32(
@@ -1025,25 +1036,77 @@ void AMDGPUAsmPrinter::EmitPALMetadata(const MachineFunction &MF,
}
MD->setNumUsedSgprs(CC, CurrentProgramInfo.NumSGPRsForWavesPerEU);
- MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
- if (AMDGPU::isCompute(CC)) {
- MD->setRsrc2(CC, CurrentProgramInfo.ComputePGMRSrc2);
+ if (MD->getPALMajorVersion() < 3) {
+ MD->setRsrc1(CC, CurrentProgramInfo.getPGMRSrc1(CC));
+ if (AMDGPU::isCompute(CC)) {
+ MD->setRsrc2(CC, CurrentProgramInfo.getComputePGMRSrc2());
+ } else {
+ if (CurrentProgramInfo.ScratchBlocks > 0)
+ MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
+ }
} else {
- if (CurrentProgramInfo.ScratchBlocks > 0)
- MD->setRsrc2(CC, S_00B84C_SCRATCH_EN(1));
+ MD->setHwStage(CC, ".debug_mode", (bool)CurrentProgramInfo.DebugMode);
+ MD->setHwStage(CC, ".ieee_mode", (bool)CurrentProgramInfo.IEEEMode);
+ MD->setHwStage(CC, ".wgp_mode", (bool)CurrentProgramInfo.WgpMode);
+ MD->setHwStage(CC, ".mem_ordered", (bool)CurrentProgramInfo.MemOrdered);
+
+ if (AMDGPU::isCompute(CC)) {
+ MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+ MD->setHwStage(CC, ".trap_present",
+ (bool)CurrentProgramInfo.TrapHandlerEnable);
+
+ // EXCPEnMSB?
+ const unsigned LdsDwGranularity = 128;
+ MD->setHwStage(CC, ".lds_size",
+ (unsigned)(CurrentProgramInfo.LdsSize * LdsDwGranularity *
+ sizeof(uint32_t)));
+ MD->setHwStage(CC, ".excp_en", CurrentProgramInfo.EXCPEnable);
+ } else {
+ MD->setHwStage(CC, ".scratch_en", (bool)CurrentProgramInfo.ScratchEnable);
+ }
}
+
// ScratchSize is in bytes, 16 aligned.
MD->setScratchSize(CC, alignTo(CurrentProgramInfo.ScratchSize, 16));
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
unsigned ExtraLDSSize = STM.getGeneration() >= AMDGPUSubtarget::GFX11
? divideCeil(CurrentProgramInfo.LDSBlocks, 2)
: CurrentProgramInfo.LDSBlocks;
- MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
- MD->setSpiPsInputEna(MFI->getPSInputEnable());
- MD->setSpiPsInputAddr(MFI->getPSInputAddr());
+ if (MD->getPALMajorVersion() < 3) {
+ MD->setRsrc2(CC, S_00B02C_EXTRA_LDS_SIZE(ExtraLDSSize));
+ MD->setSpiPsInputEna(MFI->getPSInputEnable());
+ MD->setSpiPsInputAddr(MFI->getPSInputAddr());
+ } else {
+ // Graphics registers
+ const unsigned ExtraLdsDwGranularity =
+ STM.getGeneration() >= AMDGPUSubtarget::GFX11 ? 256 : 128;
+ MD->setGraphicsRegisters(
+ ".ps_extra_lds_size",
+ (unsigned)(ExtraLDSSize * ExtraLdsDwGranularity * sizeof(uint32_t)));
+
+ // Set PsInputEna and PsInputAddr .spi_ps_input_ena and .spi_ps_input_addr
+ static StringLiteral const PsInputFields[] = {
+ ".persp_sample_ena", ".persp_center_ena",
+ ".persp_centroid_ena", ".persp_pull_model_ena",
+ ".linear_sample_ena", ".linear_center_ena",
+ ".linear_centroid_ena", ".line_stipple_tex_ena",
+ ".pos_x_float_ena", ".pos_y_float_ena",
+ ".pos_z_float_ena", ".pos_w_float_ena",
+ ".front_face_ena", ".ancillary_ena",
+ ".sample_coverage_ena", ".pos_fixed_pt_ena"};
+ unsigned PSInputEna = MFI->getPSInputEnable();
+ unsigned PSInputAddr = MFI->getPSInputAddr();
+ for (auto [Idx, Field] : enumerate(PsInputFields)) {
+ MD->setGraphicsRegisters(".spi_ps_input_ena", Field,
+ (bool)((PSInputEna >> Idx) & 1));
+ MD->setGraphicsRegisters(".spi_ps_input_addr", Field,
+ (bool)((PSInputAddr >> Idx) & 1));
+ }
+ }
}
- if (STM.isWave32())
+ // For version 3 and above the wave front size is already set in the metadata
+ if (MD->getPALMajorVersion() < 3 && STM.isWave32())
MD->setWave32(MF.getFunction().getCallingConv());
}
@@ -1055,7 +1118,7 @@ void AMDGPUAsmPrinter::emitPALFunctionMetadata(const MachineFunction &MF) {
// Set compute registers
MD->setRsrc1(CallingConv::AMDGPU_CS,
CurrentProgramInfo.getPGMRSrc1(CallingConv::AMDGPU_CS));
- MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.ComputePGMRSrc2);
+ MD->setRsrc2(CallingConv::AMDGPU_CS, CurrentProgramInfo.getComputePGMRSrc2());
// Set optional info
MD->setFunctionLdsSize(MF, CurrentProgramInfo.LDSSize);
@@ -1091,7 +1154,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
Out.compute_pgm_resource_registers =
CurrentProgramInfo.getComputePGMRSrc1() |
- (CurrentProgramInfo.ComputePGMRSrc2 << 32);
+ (CurrentProgramInfo.getComputePGMRSrc2() << 32);
Out.code_properties |= AMD_CODE_PROPERTY_IS_PTR64;
if (CurrentProgramInfo.DynamicCallStack)
@@ -1109,7 +1172,7 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (MFI->hasDispatchPtr())
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
- if (MFI->hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
+ if (MFI->hasQueuePtr() && CodeObjectVersion < AMDGPU::AMDHSA_COV5)
Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
if (MFI->hasKernargSegmentPtr())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index ea12086751a4..d490209ce35e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -39,6 +39,7 @@ struct kernel_descriptor_t;
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
+ unsigned CodeObjectVersion;
void initializeTargetID(const Module &M);
AMDGPUResourceUsageAnalysis *ResourceUsage;
@@ -90,6 +91,7 @@ public:
AMDGPUTargetStreamer* getTargetStreamer() const;
+ bool doInitialization(Module &M) override;
bool doFinalization(Module &M) override;
bool runOnMachineFunction(MachineFunction &MF) override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 28967bb8e5b1..9795928094f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -10,12 +10,21 @@
/// This pass optimizes atomic operations by using a single lane of a wavefront
/// to perform the atomic operation, thus reducing contention on that memory
/// location.
-//
+/// Atomic optimizer uses following strategies to compute scan and reduced
+/// values
+/// 1. DPP -
+/// This is the most efficient implementation for scan. DPP uses Whole Wave
+/// Mode (WWM)
+/// 2. Iterative -
+// An alternative implementation iterates over all active lanes
+/// of Wavefront using llvm.cttz and performs scan using readlane & writelane
+/// intrinsics
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -38,36 +47,57 @@ struct ReplacementInfo {
bool ValDivergent;
};
-class AMDGPUAtomicOptimizer : public FunctionPass,
- public InstVisitor<AMDGPUAtomicOptimizer> {
+class AMDGPUAtomicOptimizer : public FunctionPass {
+public:
+ static char ID;
+ ScanOptions ScanImpl;
+ AMDGPUAtomicOptimizer(ScanOptions ScanImpl)
+ : FunctionPass(ID), ScanImpl(ScanImpl) {}
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<UniformityInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ }
+};
+
+class AMDGPUAtomicOptimizerImpl
+ : public InstVisitor<AMDGPUAtomicOptimizerImpl> {
private:
SmallVector<ReplacementInfo, 8> ToReplace;
- const LegacyDivergenceAnalysis *DA;
+ const UniformityInfo *UA;
const DataLayout *DL;
- DominatorTree *DT;
+ DomTreeUpdater &DTU;
const GCNSubtarget *ST;
bool IsPixelShader;
+ ScanOptions ScanImpl;
Value *buildReduction(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
Value *const Identity) const;
Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
Value *const Identity) const;
Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
+
+ std::pair<Value *, Value *>
+ buildScanIteratively(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+ Value *const Identity, Value *V, Instruction &I,
+ BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const;
+
void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
bool ValDivergent) const;
public:
- static char ID;
+ AMDGPUAtomicOptimizerImpl() = delete;
- AMDGPUAtomicOptimizer() : FunctionPass(ID) {}
-
- bool runOnFunction(Function &F) override;
+ AMDGPUAtomicOptimizerImpl(const UniformityInfo *UA, const DataLayout *DL,
+ DomTreeUpdater &DTU, const GCNSubtarget *ST,
+ bool IsPixelShader, ScanOptions ScanImpl)
+ : UA(UA), DL(DL), DTU(DTU), ST(ST), IsPixelShader(IsPixelShader),
+ ScanImpl(ScanImpl) {}
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LegacyDivergenceAnalysis>();
- AU.addRequired<TargetPassConfig>();
- }
+ bool run(Function &F);
void visitAtomicRMWInst(AtomicRMWInst &I);
void visitIntrinsicInst(IntrinsicInst &I);
@@ -84,15 +114,56 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
return false;
}
- DA = &getAnalysis<LegacyDivergenceAnalysis>();
- DL = &F.getParent()->getDataLayout();
+ const UniformityInfo *UA =
+ &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+ const DataLayout *DL = &F.getParent()->getDataLayout();
+
DominatorTreeWrapperPass *const DTW =
getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DT = DTW ? &DTW->getDomTree() : nullptr;
+ DomTreeUpdater DTU(DTW ? &DTW->getDomTree() : nullptr,
+ DomTreeUpdater::UpdateStrategy::Lazy);
+
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
- ST = &TM.getSubtarget<GCNSubtarget>(F);
- IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+ const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+
+ bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+
+ return AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
+ .run(F);
+}
+
+PreservedAnalyses AMDGPUAtomicOptimizerPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ const auto *UA = &AM.getResult<UniformityInfoAnalysis>(F);
+ const DataLayout *DL = &F.getParent()->getDataLayout();
+
+ DomTreeUpdater DTU(&AM.getResult<DominatorTreeAnalysis>(F),
+ DomTreeUpdater::UpdateStrategy::Lazy);
+ const GCNSubtarget *ST = &TM.getSubtarget<GCNSubtarget>(F);
+
+ bool IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
+
+ bool IsChanged =
+ AMDGPUAtomicOptimizerImpl(UA, DL, DTU, ST, IsPixelShader, ScanImpl)
+ .run(F);
+
+ if (!IsChanged) {
+ return PreservedAnalyses::all();
+ }
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+bool AMDGPUAtomicOptimizerImpl::run(Function &F) {
+
+ // Scan option None disables the Pass
+ if (ScanImpl == ScanOptions::None) {
+ return false;
+ }
visit(F);
@@ -107,7 +178,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
return Changed;
}
-void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
+void AMDGPUAtomicOptimizerImpl::visitAtomicRMWInst(AtomicRMWInst &I) {
// Early exit for unhandled address space atomic instructions.
switch (I.getPointerAddressSpace()) {
default:
@@ -139,11 +210,11 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
// If the pointer operand is divergent, then each lane is doing an atomic
// operation on a different address, and we cannot optimize that.
- if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
+ if (UA->isDivergentUse(I.getOperandUse(PtrIdx))) {
return;
}
- const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));
+ const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
@@ -162,7 +233,7 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
ToReplace.push_back(Info);
}
-void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
+void AMDGPUAtomicOptimizerImpl::visitIntrinsicInst(IntrinsicInst &I) {
AtomicRMWInst::BinOp Op;
switch (I.getIntrinsicID()) {
@@ -170,54 +241,72 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
return;
case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
Op = AtomicRMWInst::Add;
break;
case Intrinsic::amdgcn_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
Op = AtomicRMWInst::Sub;
break;
case Intrinsic::amdgcn_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
Op = AtomicRMWInst::And;
break;
case Intrinsic::amdgcn_buffer_atomic_or:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
Op = AtomicRMWInst::Or;
break;
case Intrinsic::amdgcn_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
Op = AtomicRMWInst::Xor;
break;
case Intrinsic::amdgcn_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
Op = AtomicRMWInst::Min;
break;
case Intrinsic::amdgcn_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
Op = AtomicRMWInst::UMin;
break;
case Intrinsic::amdgcn_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
Op = AtomicRMWInst::Max;
break;
case Intrinsic::amdgcn_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
Op = AtomicRMWInst::UMax;
break;
}
const unsigned ValIdx = 0;
- const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));
+ const bool ValDivergent = UA->isDivergentUse(I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
@@ -231,7 +320,7 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
// If any of the other arguments to the intrinsic are divergent, we can't
// optimize the operation.
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
- if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
+ if (UA->isDivergentUse(I.getOperandUse(Idx))) {
return;
}
}
@@ -283,9 +372,10 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
// Use the builder to create a reduction of V across the wavefront, with all
// lanes active, returning the same result in all lanes.
-Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
- AtomicRMWInst::BinOp Op, Value *V,
- Value *const Identity) const {
+Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
+ AtomicRMWInst::BinOp Op,
+ Value *V,
+ Value *const Identity) const {
Type *const Ty = V->getType();
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
@@ -328,8 +418,9 @@ Value *AMDGPUAtomicOptimizer::buildReduction(IRBuilder<> &B,
// Use the builder to create an inclusive scan of V across the wavefront, with
// all lanes active.
-Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
- Value *V, Value *const Identity) const {
+Value *AMDGPUAtomicOptimizerImpl::buildScan(IRBuilder<> &B,
+ AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const {
Type *const Ty = V->getType();
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
@@ -385,8 +476,8 @@ Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
// Use the builder to create a shift right of V across the wavefront, with all
// lanes active, to turn an inclusive scan into an exclusive scan.
-Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
- Value *const Identity) const {
+Value *AMDGPUAtomicOptimizerImpl::buildShiftRight(IRBuilder<> &B, Value *V,
+ Value *const Identity) const {
Type *const Ty = V->getType();
Module *M = B.GetInsertBlock()->getModule();
Function *UpdateDPP =
@@ -430,6 +521,75 @@ Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
return V;
}
+// Use the builder to create an exclusive scan and compute the final reduced
+// value using an iterative approach. This provides an alternative
+// implementation to DPP which uses WMM for scan computations. This API iterate
+// over active lanes to read, compute and update the value using
+// readlane and writelane intrinsics.
+std::pair<Value *, Value *> AMDGPUAtomicOptimizerImpl::buildScanIteratively(
+ IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *const Identity, Value *V,
+ Instruction &I, BasicBlock *ComputeLoop, BasicBlock *ComputeEnd) const {
+
+ auto *Ty = I.getType();
+ auto *WaveTy = B.getIntNTy(ST->getWavefrontSize());
+ auto *EntryBB = I.getParent();
+ auto NeedResult = !I.use_empty();
+
+ auto *Ballot =
+ B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
+
+ // Start inserting instructions for ComputeLoop block
+ B.SetInsertPoint(ComputeLoop);
+ // Phi nodes for Accumulator, Scan results destination, and Active Lanes
+ auto *Accumulator = B.CreatePHI(Ty, 2, "Accumulator");
+ Accumulator->addIncoming(Identity, EntryBB);
+ PHINode *OldValuePhi = nullptr;
+ if (NeedResult) {
+ OldValuePhi = B.CreatePHI(Ty, 2, "OldValuePhi");
+ OldValuePhi->addIncoming(PoisonValue::get(Ty), EntryBB);
+ }
+ auto *ActiveBits = B.CreatePHI(WaveTy, 2, "ActiveBits");
+ ActiveBits->addIncoming(Ballot, EntryBB);
+
+ // Use llvm.cttz instrinsic to find the lowest remaining active lane.
+ auto *FF1 =
+ B.CreateIntrinsic(Intrinsic::cttz, WaveTy, {ActiveBits, B.getTrue()});
+ auto *LaneIdxInt = B.CreateTrunc(FF1, Ty);
+
+ // Get the value required for atomic operation
+ auto *LaneValue =
+ B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {}, {V, LaneIdxInt});
+
+ // Perform writelane if intermediate scan results are required later in the
+ // kernel computations
+ Value *OldValue = nullptr;
+ if (NeedResult) {
+ OldValue = B.CreateIntrinsic(Intrinsic::amdgcn_writelane, {},
+ {Accumulator, LaneIdxInt, OldValuePhi});
+ OldValuePhi->addIncoming(OldValue, ComputeLoop);
+ }
+
+ // Accumulate the results
+ auto *NewAccumulator = buildNonAtomicBinOp(B, Op, Accumulator, LaneValue);
+ Accumulator->addIncoming(NewAccumulator, ComputeLoop);
+
+ // Set bit to zero of current active lane so that for next iteration llvm.cttz
+ // return the next active lane
+ auto *Mask = B.CreateShl(ConstantInt::get(WaveTy, 1), FF1);
+
+ auto *InverseMask = B.CreateXor(Mask, ConstantInt::get(WaveTy, -1));
+ auto *NewActiveBits = B.CreateAnd(ActiveBits, InverseMask);
+ ActiveBits->addIncoming(NewActiveBits, ComputeLoop);
+
+ // Branch out of the loop when all lanes are processed.
+ auto *IsEnd = B.CreateICmpEQ(NewActiveBits, ConstantInt::get(WaveTy, 0));
+ B.CreateCondBr(IsEnd, ComputeEnd, ComputeLoop);
+
+ B.SetInsertPoint(ComputeEnd);
+
+ return {OldValue, NewAccumulator};
+}
+
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
unsigned BitWidth) {
switch (Op) {
@@ -456,10 +616,10 @@ static Value *buildMul(IRBuilder<> &B, Value *LHS, Value *RHS) {
return (CI && CI->isOne()) ? RHS : B.CreateMul(LHS, RHS);
}
-void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
- AtomicRMWInst::BinOp Op,
- unsigned ValIdx,
- bool ValDivergent) const {
+void AMDGPUAtomicOptimizerImpl::optimizeAtomic(Instruction &I,
+ AtomicRMWInst::BinOp Op,
+ unsigned ValIdx,
+ bool ValDivergent) const {
// Start building just before the instruction.
IRBuilder<> B(&I);
@@ -479,7 +639,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Value *const Cond = B.CreateIntrinsic(Intrinsic::amdgcn_ps_live, {}, {});
Instruction *const NonHelperTerminator =
- SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
+ SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
// Record I's new position as the exit block.
PixelExitBB = I.getParent();
@@ -528,36 +688,50 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
const bool NeedResult = !I.use_empty();
+ Function *F = I.getFunction();
+ LLVMContext &C = F->getContext();
+ BasicBlock *ComputeLoop = nullptr;
+ BasicBlock *ComputeEnd = nullptr;
// If we have a divergent value in each lane, we need to combine the value
// using DPP.
if (ValDivergent) {
- // First we need to set all inactive invocations to the identity value, so
- // that they can correctly contribute to the final result.
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-
const AtomicRMWInst::BinOp ScanOp =
Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
- if (!NeedResult && ST->hasPermLaneX16()) {
- // On GFX10 the permlanex16 instruction helps us build a reduction without
- // too many readlanes and writelanes, which are generally bad for
- // performance.
- NewV = buildReduction(B, ScanOp, NewV, Identity);
+ if (ScanImpl == ScanOptions::DPP) {
+ // First we need to set all inactive invocations to the identity value, so
+ // that they can correctly contribute to the final result.
+ NewV =
+ B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
+ const AtomicRMWInst::BinOp ScanOp =
+ Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+ if (!NeedResult && ST->hasPermLaneX16()) {
+ // On GFX10 the permlanex16 instruction helps us build a reduction
+ // without too many readlanes and writelanes, which are generally bad
+ // for performance.
+ NewV = buildReduction(B, ScanOp, NewV, Identity);
+ } else {
+ NewV = buildScan(B, ScanOp, NewV, Identity);
+ if (NeedResult)
+ ExclScan = buildShiftRight(B, NewV, Identity);
+ // Read the value from the last lane, which has accumulated the values
+ // of each active lane in the wavefront. This will be our new value
+ // which we will provide to the atomic operation.
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
+ assert(TyBitWidth == 32);
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
+ {NewV, LastLaneIdx});
+ }
+ // Finally mark the readlanes in the WWM section.
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
+ } else if (ScanImpl == ScanOptions::Iterative) {
+ // Alternative implementation for scan
+ ComputeLoop = BasicBlock::Create(C, "ComputeLoop", F);
+ ComputeEnd = BasicBlock::Create(C, "ComputeEnd", F);
+ std::tie(ExclScan, NewV) = buildScanIteratively(B, ScanOp, Identity, V, I,
+ ComputeLoop, ComputeEnd);
} else {
- NewV = buildScan(B, ScanOp, NewV, Identity);
- if (NeedResult)
- ExclScan = buildShiftRight(B, NewV, Identity);
-
- // Read the value from the last lane, which has accumulated the values of
- // each active lane in the wavefront. This will be our new value which we
- // will provide to the atomic operation.
- Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
- assert(TyBitWidth == 32);
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
- {NewV, LastLaneIdx});
+ llvm_unreachable("Atomic Optimzer is disabled for None strategy");
}
-
- // Finally mark the readlanes in the WWM section.
- NewV = B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, NewV);
} else {
switch (Op) {
default:
@@ -608,8 +782,39 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// entry --> single_lane -\
// \------------------> exit
Instruction *const SingleLaneTerminator =
- SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, DT, nullptr);
+ SplitBlockAndInsertIfThen(Cond, &I, false, nullptr, &DTU, nullptr);
+
+ // At this point, we have split the I's block to allow one lane in wavefront
+ // to update the precomputed reduced value. Also, completed the codegen for
+ // new control flow i.e. iterative loop which perform reduction and scan using
+ // ComputeLoop and ComputeEnd.
+ // For the new control flow, we need to move branch instruction i.e.
+ // terminator created during SplitBlockAndInsertIfThen from I's block to
+ // ComputeEnd block. We also need to set up predecessor to next block when
+ // single lane done updating the final reduced value.
+ BasicBlock *Predecessor = nullptr;
+ if (ValDivergent && ScanImpl == ScanOptions::Iterative) {
+ // Move terminator from I's block to ComputeEnd block.
+ Instruction *Terminator = EntryBB->getTerminator();
+ B.SetInsertPoint(ComputeEnd);
+ Terminator->removeFromParent();
+ B.Insert(Terminator);
+
+ // Branch to ComputeLoop Block unconditionally from the I's block for
+ // iterative approach.
+ B.SetInsertPoint(EntryBB);
+ B.CreateBr(ComputeLoop);
+ // Update the dominator tree for new control flow.
+ DTU.applyUpdates(
+ {{DominatorTree::Insert, EntryBB, ComputeLoop},
+ {DominatorTree::Insert, ComputeLoop, ComputeEnd},
+ {DominatorTree::Delete, EntryBB, SingleLaneTerminator->getParent()}});
+
+ Predecessor = ComputeEnd;
+ } else {
+ Predecessor = EntryBB;
+ }
// Move the IR builder into single_lane next.
B.SetInsertPoint(SingleLaneTerminator);
@@ -626,7 +831,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
if (NeedResult) {
// Create a PHI node to get our new atomic result into the exit block.
PHINode *const PHI = B.CreatePHI(Ty, 2);
- PHI->addIncoming(PoisonValue::get(Ty), EntryBB);
+ PHI->addIncoming(PoisonValue::get(Ty), Predecessor);
PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
// We need to broadcast the value who was the lowest active lane (the first
@@ -660,8 +865,14 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// from the first lane, to get our lane's index into the atomic result.
Value *LaneOffset = nullptr;
if (ValDivergent) {
- LaneOffset =
- B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+ if (ScanImpl == ScanOptions::DPP) {
+ LaneOffset =
+ B.CreateIntrinsic(Intrinsic::amdgcn_strict_wwm, Ty, ExclScan);
+ } else if (ScanImpl == ScanOptions::Iterative) {
+ LaneOffset = ExclScan;
+ } else {
+ llvm_unreachable("Atomic Optimzer is disabled for None strategy");
+ }
} else {
switch (Op) {
default:
@@ -705,11 +916,11 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
INITIALIZE_PASS_BEGIN(AMDGPUAtomicOptimizer, DEBUG_TYPE,
"AMDGPU atomic optimizations", false, false)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(AMDGPUAtomicOptimizer, DEBUG_TYPE,
"AMDGPU atomic optimizations", false, false)
-FunctionPass *llvm::createAMDGPUAtomicOptimizerPass() {
- return new AMDGPUAtomicOptimizer();
+FunctionPass *llvm::createAMDGPUAtomicOptimizerPass(ScanOptions ScanStrategy) {
+ return new AMDGPUAtomicOptimizer(ScanStrategy);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index f7298b59f0b9..57c873f00a4a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -56,8 +56,8 @@ static constexpr std::pair<ImplicitArgumentMask,
// size is 1 for y/z.
static ImplicitArgumentMask
intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
- bool HasApertureRegs, bool SupportsGetDoorBellID) {
- unsigned CodeObjectVersion = AMDGPU::getAmdhsaCodeObjectVersion();
+ bool HasApertureRegs, bool SupportsGetDoorBellID,
+ unsigned CodeObjectVersion) {
switch (ID) {
case Intrinsic::amdgcn_workitem_id_x:
NonKernelOnly = true;
@@ -88,7 +88,7 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
// Need queue_ptr anyway. But under V5, we also need implicitarg_ptr to access
// queue_ptr.
case Intrinsic::amdgcn_queue_ptr:
- NeedsImplicit = (CodeObjectVersion == 5);
+ NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
return QUEUE_PTR;
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private:
@@ -97,11 +97,13 @@ intrinsicToAttrMask(Intrinsic::ID ID, bool &NonKernelOnly, bool &NeedsImplicit,
// Under V5, we need implicitarg_ptr + offsets to access private_base or
// shared_base. For pre-V5, however, need to access them through queue_ptr +
// offsets.
- return CodeObjectVersion == 5 ? IMPLICIT_ARG_PTR : QUEUE_PTR;
+ return CodeObjectVersion >= AMDGPU::AMDHSA_COV5 ? IMPLICIT_ARG_PTR :
+ QUEUE_PTR;
case Intrinsic::trap:
if (SupportsGetDoorBellID) // GetDoorbellID support implemented since V4.
- return CodeObjectVersion >= 4 ? NOT_IMPLICIT_INPUT : QUEUE_PTR;
- NeedsImplicit = (CodeObjectVersion == 5); // Need impicitarg_ptr under V5.
+ return CodeObjectVersion >= AMDGPU::AMDHSA_COV4 ? NOT_IMPLICIT_INPUT :
+ QUEUE_PTR;
+ NeedsImplicit = (CodeObjectVersion >= AMDGPU::AMDHSA_COV5);
return QUEUE_PTR;
default:
return NOT_IMPLICIT_INPUT;
@@ -137,7 +139,9 @@ public:
AMDGPUInformationCache(const Module &M, AnalysisGetter &AG,
BumpPtrAllocator &Allocator,
SetVector<Function *> *CGSCC, TargetMachine &TM)
- : InformationCache(M, AG, Allocator, CGSCC), TM(TM) {}
+ : InformationCache(M, AG, Allocator, CGSCC), TM(TM),
+ CodeObjectVersion(AMDGPU::getCodeObjectVersion(M)) {}
+
TargetMachine &TM;
enum ConstantStatus { DS_GLOBAL = 1 << 0, ADDR_SPACE_CAST = 1 << 1 };
@@ -165,6 +169,34 @@ public:
return {ST.getMinFlatWorkGroupSize(), ST.getMaxFlatWorkGroupSize()};
}
+ /// Get code object version.
+ unsigned getCodeObjectVersion() const {
+ return CodeObjectVersion;
+ }
+
+ /// Get the effective value of "amdgpu-waves-per-eu" for the function,
+ /// accounting for the interaction with the passed value to use for
+ /// "amdgpu-flat-work-group-size".
+ std::pair<unsigned, unsigned>
+ getWavesPerEU(const Function &F,
+ std::pair<unsigned, unsigned> FlatWorkGroupSize) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return ST.getWavesPerEU(F, FlatWorkGroupSize);
+ }
+
+ std::pair<unsigned, unsigned>
+ getEffectiveWavesPerEU(const Function &F,
+ std::pair<unsigned, unsigned> WavesPerEU,
+ std::pair<unsigned, unsigned> FlatWorkGroupSize) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
+ }
+
+ unsigned getMaxWavesPerEU(const Function &F) {
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ return ST.getMaxWavesPerEU();
+ }
+
private:
/// Check if the ConstantExpr \p CE requires the queue pointer.
static bool visitConstExpr(const ConstantExpr *CE) {
@@ -176,7 +208,8 @@ private:
}
/// Get the constant access bitmap for \p C.
- uint8_t getConstantAccess(const Constant *C) {
+ uint8_t getConstantAccess(const Constant *C,
+ SmallPtrSetImpl<const Constant *> &Visited) {
auto It = ConstantStatus.find(C);
if (It != ConstantStatus.end())
return It->second;
@@ -191,10 +224,10 @@ private:
for (const Use &U : C->operands()) {
const auto *OpC = dyn_cast<Constant>(U);
- if (!OpC)
+ if (!OpC || !Visited.insert(OpC).second)
continue;
- Result |= getConstantAccess(OpC);
+ Result |= getConstantAccess(OpC, Visited);
}
return Result;
}
@@ -209,7 +242,8 @@ public:
if (!IsNonEntryFunc && HasAperture)
return false;
- uint8_t Access = getConstantAccess(C);
+ SmallPtrSet<const Constant *, 8> Visited;
+ uint8_t Access = getConstantAccess(C, Visited);
// We need to trap on DS globals in non-entry functions.
if (IsNonEntryFunc && (Access & DS_GLOBAL))
@@ -221,6 +255,7 @@ public:
private:
/// Used to determine if the Constant needs the queue pointer.
DenseMap<const Constant *, uint8_t> ConstantStatus;
+ const unsigned CodeObjectVersion;
};
struct AAAMDAttributes
@@ -311,11 +346,13 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
LLVM_DEBUG(dbgs() << "[AAUniformWorkGroupSize] Call " << Caller->getName()
<< "->" << getAssociatedFunction()->getName() << "\n");
- const auto &CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
+ const auto *CallerInfo = A.getAAFor<AAUniformWorkGroupSize>(
*this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+ if (!CallerInfo)
+ return false;
Change = Change | clampStateAndIndicateChange(this->getState(),
- CallerInfo.getState());
+ CallerInfo->getState());
return true;
};
@@ -333,8 +370,8 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
AttrList.push_back(Attribute::get(Ctx, "uniform-work-group-size",
getAssumed() ? "true" : "false"));
- return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
- /* ForceReplace */ true);
+ return A.manifestAttrs(getIRPosition(), AttrList,
+ /* ForceReplace */ true);
}
bool isValidState() const override {
@@ -342,7 +379,7 @@ struct AAUniformWorkGroupSizeFunction : public AAUniformWorkGroupSize {
return true;
}
- const std::string getAsStr() const override {
+ const std::string getAsStr(Attributor *) const override {
return "AMDWorkGroupSize[" + std::to_string(getAssumed()) + "]";
}
@@ -400,9 +437,9 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
auto OrigAssumed = getAssumed();
// Check for Intrinsics and propagate attributes.
- const AACallEdges &AAEdges = A.getAAFor<AACallEdges>(
+ const AACallEdges *AAEdges = A.getAAFor<AACallEdges>(
*this, this->getIRPosition(), DepClassTy::REQUIRED);
- if (AAEdges.hasNonAsmUnknownCallee())
+ if (!AAEdges || AAEdges->hasNonAsmUnknownCallee())
return indicatePessimisticFixpoint();
bool IsNonEntryFunc = !AMDGPU::isEntryFunctionCC(F->getCallingConv());
@@ -411,20 +448,23 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
bool HasApertureRegs = InfoCache.hasApertureRegs(*F);
bool SupportsGetDoorbellID = InfoCache.supportsGetDoorbellID(*F);
+ unsigned COV = InfoCache.getCodeObjectVersion();
- for (Function *Callee : AAEdges.getOptimisticEdges()) {
+ for (Function *Callee : AAEdges->getOptimisticEdges()) {
Intrinsic::ID IID = Callee->getIntrinsicID();
if (IID == Intrinsic::not_intrinsic) {
- const AAAMDAttributes &AAAMD = A.getAAFor<AAAMDAttributes>(
- *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
- *this &= AAAMD;
+ const AAAMDAttributes *AAAMD = A.getAAFor<AAAMDAttributes>(
+ *this, IRPosition::function(*Callee), DepClassTy::REQUIRED);
+ if (!AAAMD)
+ return indicatePessimisticFixpoint();
+ *this &= *AAAMD;
continue;
}
bool NonKernelOnly = false;
ImplicitArgumentMask AttrMask =
intrinsicToAttrMask(IID, NonKernelOnly, NeedsImplicit,
- HasApertureRegs, SupportsGetDoorbellID);
+ HasApertureRegs, SupportsGetDoorbellID, COV);
if (AttrMask != NOT_IMPLICIT_INPUT) {
if ((IsNonEntryFunc || !NonKernelOnly))
removeAssumedBits(AttrMask);
@@ -438,29 +478,29 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
if (isAssumed(QUEUE_PTR) && checkForQueuePtr(A)) {
// Under V5, we need implicitarg_ptr + offsets to access private_base or
// shared_base. We do not actually need queue_ptr.
- if (AMDGPU::getAmdhsaCodeObjectVersion() == 5)
+ if (COV >= 5)
removeAssumedBits(IMPLICIT_ARG_PTR);
else
removeAssumedBits(QUEUE_PTR);
}
- if (funcRetrievesMultigridSyncArg(A)) {
+ if (funcRetrievesMultigridSyncArg(A, COV)) {
assert(!isAssumed(IMPLICIT_ARG_PTR) &&
"multigrid_sync_arg needs implicitarg_ptr");
removeAssumedBits(MULTIGRID_SYNC_ARG);
}
- if (funcRetrievesHostcallPtr(A)) {
+ if (funcRetrievesHostcallPtr(A, COV)) {
assert(!isAssumed(IMPLICIT_ARG_PTR) && "hostcall needs implicitarg_ptr");
removeAssumedBits(HOSTCALL_PTR);
}
- if (funcRetrievesHeapPtr(A)) {
+ if (funcRetrievesHeapPtr(A, COV)) {
assert(!isAssumed(IMPLICIT_ARG_PTR) && "heap_ptr needs implicitarg_ptr");
removeAssumedBits(HEAP_PTR);
}
- if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A)) {
+ if (isAssumed(QUEUE_PTR) && funcRetrievesQueuePtr(A, COV)) {
assert(!isAssumed(IMPLICIT_ARG_PTR) && "queue_ptr needs implicitarg_ptr");
removeAssumedBits(QUEUE_PTR);
}
@@ -469,10 +509,10 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
removeAssumedBits(LDS_KERNEL_ID);
}
- if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A))
+ if (isAssumed(DEFAULT_QUEUE) && funcRetrievesDefaultQueue(A, COV))
removeAssumedBits(DEFAULT_QUEUE);
- if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A))
+ if (isAssumed(COMPLETION_ACTION) && funcRetrievesCompletionAction(A, COV))
removeAssumedBits(COMPLETION_ACTION);
return getAssumed() != OrigAssumed ? ChangeStatus::CHANGED
@@ -488,16 +528,17 @@ struct AAAMDAttributesFunction : public AAAMDAttributes {
AttrList.push_back(Attribute::get(Ctx, Attr.second));
}
- return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
- /* ForceReplace */ true);
+ return A.manifestAttrs(getIRPosition(), AttrList,
+ /* ForceReplace */ true);
}
- const std::string getAsStr() const override {
+ const std::string getAsStr(Attributor *) const override {
std::string Str;
raw_string_ostream OS(Str);
OS << "AMDInfo[";
for (auto Attr : ImplicitAttrs)
- OS << ' ' << Attr.second;
+ if (isAssumed(Attr.first))
+ OS << ' ' << Attr.second;
OS << " ]";
return OS.str();
}
@@ -557,39 +598,39 @@ private:
return false;
}
- bool funcRetrievesMultigridSyncArg(Attributor &A) {
- auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition();
+ bool funcRetrievesMultigridSyncArg(Attributor &A, unsigned COV) {
+ auto Pos = llvm::AMDGPU::getMultigridSyncArgImplicitArgPosition(COV);
AA::RangeTy Range(Pos, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
- bool funcRetrievesHostcallPtr(Attributor &A) {
- auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition();
+ bool funcRetrievesHostcallPtr(Attributor &A, unsigned COV) {
+ auto Pos = llvm::AMDGPU::getHostcallImplicitArgPosition(COV);
AA::RangeTy Range(Pos, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
- bool funcRetrievesDefaultQueue(Attributor &A) {
- auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition();
+ bool funcRetrievesDefaultQueue(Attributor &A, unsigned COV) {
+ auto Pos = llvm::AMDGPU::getDefaultQueueImplicitArgPosition(COV);
AA::RangeTy Range(Pos, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
- bool funcRetrievesCompletionAction(Attributor &A) {
- auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition();
+ bool funcRetrievesCompletionAction(Attributor &A, unsigned COV) {
+ auto Pos = llvm::AMDGPU::getCompletionActionImplicitArgPosition(COV);
AA::RangeTy Range(Pos, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
- bool funcRetrievesHeapPtr(Attributor &A) {
- if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
+ bool funcRetrievesHeapPtr(Attributor &A, unsigned COV) {
+ if (COV < 5)
return false;
AA::RangeTy Range(AMDGPU::ImplicitArg::HEAP_PTR_OFFSET, 8);
return funcRetrievesImplicitKernelArg(A, Range);
}
- bool funcRetrievesQueuePtr(Attributor &A) {
- if (AMDGPU::getAmdhsaCodeObjectVersion() != 5)
+ bool funcRetrievesQueuePtr(Attributor &A, unsigned COV) {
+ if (COV < 5)
return false;
AA::RangeTy Range(AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET, 8);
return funcRetrievesImplicitKernelArg(A, Range);
@@ -607,10 +648,12 @@ private:
if (Call.getIntrinsicID() != Intrinsic::amdgcn_implicitarg_ptr)
return true;
- const auto &PointerInfoAA = A.getAAFor<AAPointerInfo>(
+ const auto *PointerInfoAA = A.getAAFor<AAPointerInfo>(
*this, IRPosition::callsite_returned(Call), DepClassTy::REQUIRED);
+ if (!PointerInfoAA)
+ return false;
- return PointerInfoAA.forallInterferingAccesses(
+ return PointerInfoAA->forallInterferingAccesses(
Range, [](const AAPointerInfo::Access &Acc, bool IsExact) {
return Acc.getRemoteInst()->isDroppable();
});
@@ -639,42 +682,36 @@ AAAMDAttributes &AAAMDAttributes::createForPosition(const IRPosition &IRP,
llvm_unreachable("AAAMDAttributes is only valid for function position");
}
-/// Propagate amdgpu-flat-work-group-size attribute.
-struct AAAMDFlatWorkGroupSize
+/// Base class to derive different size ranges.
+struct AAAMDSizeRangeAttribute
: public StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t> {
using Base = StateWrapper<IntegerRangeState, AbstractAttribute, uint32_t>;
- AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
- : Base(IRP, 32) {}
- /// See AbstractAttribute::getState(...).
- IntegerRangeState &getState() override { return *this; }
- const IntegerRangeState &getState() const override { return *this; }
+ StringRef AttrName;
- void initialize(Attributor &A) override {
- Function *F = getAssociatedFunction();
- auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
- unsigned MinGroupSize, MaxGroupSize;
- std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
- intersectKnown(
- ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));
+ AAAMDSizeRangeAttribute(const IRPosition &IRP, Attributor &A,
+ StringRef AttrName)
+ : Base(IRP, 32), AttrName(AttrName) {}
- if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
- indicatePessimisticFixpoint();
- }
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
- ChangeStatus updateImpl(Attributor &A) override {
+ template <class AttributeImpl>
+ ChangeStatus updateImplImpl(Attributor &A) {
ChangeStatus Change = ChangeStatus::UNCHANGED;
auto CheckCallSite = [&](AbstractCallSite CS) {
Function *Caller = CS.getInstruction()->getFunction();
- LLVM_DEBUG(dbgs() << "[AAAMDFlatWorkGroupSize] Call " << Caller->getName()
+ LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
<< "->" << getAssociatedFunction()->getName() << '\n');
- const auto &CallerInfo = A.getAAFor<AAAMDFlatWorkGroupSize>(
+ const auto *CallerInfo = A.getAAFor<AttributeImpl>(
*this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+ if (!CallerInfo)
+ return false;
Change |=
- clampStateAndIndicateChange(this->getState(), CallerInfo.getState());
+ clampStateAndIndicateChange(this->getState(), CallerInfo->getState());
return true;
};
@@ -686,45 +723,65 @@ struct AAAMDFlatWorkGroupSize
return Change;
}
- ChangeStatus manifest(Attributor &A) override {
- SmallVector<Attribute, 8> AttrList;
- Function *F = getAssociatedFunction();
- LLVMContext &Ctx = F->getContext();
-
- auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
- unsigned Min, Max;
- std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
-
+ ChangeStatus emitAttributeIfNotDefault(Attributor &A, unsigned Min,
+ unsigned Max) {
// Don't add the attribute if it's the implied default.
if (getAssumed().getLower() == Min && getAssumed().getUpper() - 1 == Max)
return ChangeStatus::UNCHANGED;
+ Function *F = getAssociatedFunction();
+ LLVMContext &Ctx = F->getContext();
SmallString<10> Buffer;
raw_svector_ostream OS(Buffer);
OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
-
- AttrList.push_back(
- Attribute::get(Ctx, "amdgpu-flat-work-group-size", OS.str()));
- return IRAttributeManifest::manifestAttrs(A, getIRPosition(), AttrList,
- /* ForceReplace */ true);
+ return A.manifestAttrs(getIRPosition(),
+ {Attribute::get(Ctx, AttrName, OS.str())},
+ /* ForceReplace */ true);
}
- const std::string getAsStr() const override {
+ const std::string getAsStr(Attributor *) const override {
std::string Str;
raw_string_ostream OS(Str);
- OS << "AMDFlatWorkGroupSize[";
+ OS << getName() << '[';
OS << getAssumed().getLower() << ',' << getAssumed().getUpper() - 1;
OS << ']';
return OS.str();
}
+};
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
+/// Propagate amdgpu-flat-work-group-size attribute.
+struct AAAMDFlatWorkGroupSize : public AAAMDSizeRangeAttribute {
+ AAAMDFlatWorkGroupSize(const IRPosition &IRP, Attributor &A)
+ : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-flat-work-group-size") {}
+
+ void initialize(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+ unsigned MinGroupSize, MaxGroupSize;
+ std::tie(MinGroupSize, MaxGroupSize) = InfoCache.getFlatWorkGroupSizes(*F);
+ intersectKnown(
+ ConstantRange(APInt(32, MinGroupSize), APInt(32, MaxGroupSize + 1)));
+
+ if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
+ indicatePessimisticFixpoint();
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ return updateImplImpl<AAAMDFlatWorkGroupSize>(A);
+ }
/// Create an abstract attribute view for the position \p IRP.
static AAAMDFlatWorkGroupSize &createForPosition(const IRPosition &IRP,
Attributor &A);
+ ChangeStatus manifest(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+ unsigned Min, Max;
+ std::tie(Min, Max) = InfoCache.getMaximumFlatWorkGroupRange(*F);
+ return emitAttributeIfNotDefault(A, Min, Max);
+ }
+
/// See AbstractAttribute::getName()
const std::string getName() const override {
return "AAAMDFlatWorkGroupSize";
@@ -754,6 +811,109 @@ AAAMDFlatWorkGroupSize::createForPosition(const IRPosition &IRP,
"AAAMDFlatWorkGroupSize is only valid for function position");
}
+/// Propagate amdgpu-waves-per-eu attribute.
+struct AAAMDWavesPerEU : public AAAMDSizeRangeAttribute {
+ AAAMDWavesPerEU(const IRPosition &IRP, Attributor &A)
+ : AAAMDSizeRangeAttribute(IRP, A, "amdgpu-waves-per-eu") {}
+
+ bool isValidState() const override {
+ return !Assumed.isEmptySet() && IntegerRangeState::isValidState();
+ }
+
+ void initialize(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+
+ if (const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
+ *this, IRPosition::function(*F), DepClassTy::REQUIRED)) {
+
+ unsigned Min, Max;
+ std::tie(Min, Max) = InfoCache.getWavesPerEU(
+ *F, {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
+ AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
+
+ ConstantRange Range(APInt(32, Min), APInt(32, Max + 1));
+ intersectKnown(Range);
+ }
+
+ if (AMDGPU::isEntryFunctionCC(F->getCallingConv()))
+ indicatePessimisticFixpoint();
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+ auto CheckCallSite = [&](AbstractCallSite CS) {
+ Function *Caller = CS.getInstruction()->getFunction();
+ Function *Func = getAssociatedFunction();
+ LLVM_DEBUG(dbgs() << '[' << getName() << "] Call " << Caller->getName()
+ << "->" << Func->getName() << '\n');
+
+ const auto *CallerInfo = A.getAAFor<AAAMDWavesPerEU>(
+ *this, IRPosition::function(*Caller), DepClassTy::REQUIRED);
+ const auto *AssumedGroupSize = A.getAAFor<AAAMDFlatWorkGroupSize>(
+ *this, IRPosition::function(*Func), DepClassTy::REQUIRED);
+ if (!CallerInfo || !AssumedGroupSize)
+ return false;
+
+ unsigned Min, Max;
+ std::tie(Min, Max) = InfoCache.getEffectiveWavesPerEU(
+ *Caller,
+ {CallerInfo->getAssumed().getLower().getZExtValue(),
+ CallerInfo->getAssumed().getUpper().getZExtValue() - 1},
+ {AssumedGroupSize->getAssumed().getLower().getZExtValue(),
+ AssumedGroupSize->getAssumed().getUpper().getZExtValue() - 1});
+ ConstantRange CallerRange(APInt(32, Min), APInt(32, Max + 1));
+ IntegerRangeState CallerRangeState(CallerRange);
+ Change |= clampStateAndIndicateChange(this->getState(), CallerRangeState);
+
+ return true;
+ };
+
+ bool AllCallSitesKnown = true;
+ if (!A.checkForAllCallSites(CheckCallSite, *this, true, AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return Change;
+ }
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAAMDWavesPerEU &createForPosition(const IRPosition &IRP,
+ Attributor &A);
+
+ ChangeStatus manifest(Attributor &A) override {
+ Function *F = getAssociatedFunction();
+ auto &InfoCache = static_cast<AMDGPUInformationCache &>(A.getInfoCache());
+ unsigned Max = InfoCache.getMaxWavesPerEU(*F);
+ return emitAttributeIfNotDefault(A, 1, Max);
+ }
+
+ /// See AbstractAttribute::getName()
+ const std::string getName() const override { return "AAAMDWavesPerEU"; }
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is
+ /// AAAMDWavesPerEU
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ /// Unique ID (due to the unique address)
+ static const char ID;
+};
+
+const char AAAMDWavesPerEU::ID = 0;
+
+AAAMDWavesPerEU &AAAMDWavesPerEU::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ if (IRP.getPositionKind() == IRPosition::IRP_FUNCTION)
+ return *new (A.Allocator) AAAMDWavesPerEU(IRP, A);
+ llvm_unreachable("AAAMDWavesPerEU is only valid for function position");
+}
+
class AMDGPUAttributor : public ModulePass {
public:
AMDGPUAttributor() : ModulePass(ID) {}
@@ -782,13 +942,17 @@ public:
AMDGPUInformationCache InfoCache(M, AG, Allocator, nullptr, *TM);
DenseSet<const char *> Allowed(
{&AAAMDAttributes::ID, &AAUniformWorkGroupSize::ID,
- &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID, &AACallEdges::ID,
- &AAPointerInfo::ID, &AAPotentialConstantValues::ID});
+ &AAPotentialValues::ID, &AAAMDFlatWorkGroupSize::ID,
+ &AAAMDWavesPerEU::ID, &AACallEdges::ID, &AAPointerInfo::ID,
+ &AAPotentialConstantValues::ID, &AAUnderlyingObjects::ID});
AttributorConfig AC(CGUpdater);
AC.Allowed = &Allowed;
AC.IsModulePass = true;
AC.DefaultInitializeLiveInternals = false;
+ AC.IPOAmendableCB = [](const Function &F) {
+ return F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+ };
Attributor A(Functions, InfoCache, AC);
@@ -798,6 +962,7 @@ public:
A.getOrCreateAAFor<AAUniformWorkGroupSize>(IRPosition::function(F));
if (!AMDGPU::isEntryFunctionCC(F.getCallingConv())) {
A.getOrCreateAAFor<AAAMDFlatWorkGroupSize>(IRPosition::function(F));
+ A.getOrCreateAAFor<AAAMDWavesPerEU>(IRPosition::function(F));
}
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index da819b6d4a23..9ba5ea8fb73f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -466,7 +466,9 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
+ const Module *M = MF.getFunction().getParent();
+ if (Info.hasQueuePtr() &&
+ AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
@@ -510,8 +512,6 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
const DataLayout &DL = F.getParent()->getDataLayout();
- Info->allocateKnownAddressLDSGlobal(F);
-
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -519,7 +519,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
unsigned i = 0;
const Align KernArgBaseAlign(16);
- const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
+ const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset();
uint64_t ExplicitArgOffset = 0;
// TODO: Align down to dword alignment and extract bits for extending loads.
@@ -594,8 +594,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
- Info->allocateKnownAddressLDSGlobal(F);
-
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
@@ -701,7 +699,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if ((PsInputBits & 0x7F) == 0 ||
((PsInputBits & 0xF) == 0 &&
(PsInputBits >> 11 & 1)))
- Info->markPSInputEnabled(countTrailingZeros(Info->getPSInputAddr()));
+ Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
}
}
@@ -724,7 +722,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!handleAssignments(Handler, SplitArgs, CCInfo, ArgLocs, B))
return false;
- uint64_t StackOffset = Assigner.StackOffset;
+ uint64_t StackSize = Assigner.StackSize;
// Start adding system SGPRs.
if (IsEntryFunc) {
@@ -739,7 +737,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(
// the caller's stack. So, whenever we lower formal arguments, we should keep
// track of this information, since we might lower a tail call in this
// function later.
- Info->setBytesInStackArgArea(StackOffset);
+ Info->setBytesInStackArgArea(StackSize);
// Move back to the end of the basic block.
B.setMBB(MBB);
@@ -956,10 +954,14 @@ getAssignFnsForCC(CallingConv::ID CC, const SITargetLowering &TLI) {
}
static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
- bool IsTailCall) {
+ bool IsTailCall, CallingConv::ID CC) {
assert(!(IsIndirect && IsTailCall) && "Indirect calls can't be tail calls, "
"because the address can be divergent");
- return IsTailCall ? AMDGPU::SI_TCRETURN : AMDGPU::G_SI_CALL;
+ if (!IsTailCall)
+ return AMDGPU::G_SI_CALL;
+
+ return CC == CallingConv::AMDGPU_Gfx ? AMDGPU::SI_TCRETURN_GFX :
+ AMDGPU::SI_TCRETURN;
}
// Add operands to call instruction to track the callee.
@@ -1053,7 +1055,7 @@ bool AMDGPUCallLowering::areCalleeOutgoingArgsTailCallable(
// Make sure that they can fit on the caller's stack.
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- if (OutInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea()) {
+ if (OutInfo.getStackSize() > FuncInfo->getBytesInStackArgArea()) {
LLVM_DEBUG(dbgs() << "... Cannot fit call operands on caller's stack.\n");
return false;
}
@@ -1184,7 +1186,7 @@ bool AMDGPUCallLowering::lowerTailCall(
if (!IsSibCall)
CallSeqStart = MIRBuilder.buildInstr(AMDGPU::ADJCALLSTACKUP);
- unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true, CalleeCC);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
if (!addCallTargetOperands(MIB, MIRBuilder, Info))
return false;
@@ -1224,7 +1226,7 @@ bool AMDGPUCallLowering::lowerTailCall(
// The callee will pop the argument stack as a tail call. Thus, we must
// keep it 16-byte aligned.
- NumBytes = alignTo(OutInfo.getNextStackOffset(), ST.getStackAlignment());
+ NumBytes = alignTo(OutInfo.getStackSize(), ST.getStackAlignment());
// FPDiff will be negative if this tail call requires more space than we
// would automatically have in our incoming argument space. Positive if we
@@ -1348,7 +1350,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
- unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false, Info.CallConv);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.addDef(TRI->getReturnAddressReg(MF));
@@ -1390,7 +1392,7 @@ bool AMDGPUCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
handleImplicitCallArguments(MIRBuilder, MIB, ST, *MFI, ImplicitArgRegs);
// Get a count of how many bytes are to be pushed on the stack.
- unsigned NumBytes = CCInfo.getNextStackOffset();
+ unsigned NumBytes = CCInfo.getStackSize();
// If Callee is a reg, since it is used by a target specific
// instruction, it must have a register class matching the
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 08b29641d14a..4ec85f3c5588 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,23 +14,28 @@
#include "AMDGPU.h"
#include "AMDGPUTargetMachine.h"
+#include "SIModeRegisterDefaults.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/IntegerDivision.h"
+#include "llvm/Transforms/Utils/Local.h"
#define DEBUG_TYPE "amdgpu-codegenprepare"
using namespace llvm;
+using namespace llvm::PatternMatch;
namespace {
@@ -46,6 +51,22 @@ static cl::opt<bool> Widen16BitOps(
cl::ReallyHidden,
cl::init(true));
+static cl::opt<bool>
+ ScalarizeLargePHIs("amdgpu-codegenprepare-break-large-phis",
+ cl::desc("Break large PHI nodes for DAGISel"),
+ cl::ReallyHidden, cl::init(true));
+
+static cl::opt<bool>
+ ForceScalarizeLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
+ cl::desc("For testing purposes, always break large "
+ "PHIs even if it isn't profitable."),
+ cl::ReallyHidden, cl::init(false));
+
+static cl::opt<unsigned> ScalarizeLargePHIsThreshold(
+ "amdgpu-codegenprepare-break-large-phis-threshold",
+ cl::desc("Minimum type size in bits for breaking large PHI nodes"),
+ cl::ReallyHidden, cl::init(32));
+
static cl::opt<bool> UseMul24Intrin(
"amdgpu-codegenprepare-mul24",
cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
@@ -67,16 +88,30 @@ static cl::opt<bool> DisableIDivExpand(
cl::ReallyHidden,
cl::init(false));
-class AMDGPUCodeGenPrepare : public FunctionPass,
- public InstVisitor<AMDGPUCodeGenPrepare, bool> {
+// Disable processing of fdiv so we can better test the backend implementations.
+static cl::opt<bool> DisableFDivExpand(
+ "amdgpu-codegenprepare-disable-fdiv-expansion",
+ cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(false));
+
+class AMDGPUCodeGenPrepareImpl
+ : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
+public:
const GCNSubtarget *ST = nullptr;
+ const TargetLibraryInfo *TLInfo = nullptr;
AssumptionCache *AC = nullptr;
DominatorTree *DT = nullptr;
- LegacyDivergenceAnalysis *DA = nullptr;
+ UniformityInfo *UA = nullptr;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
bool HasUnsafeFPMath = false;
- bool HasFP32Denormals = false;
+ bool HasFP32DenormalFlush = false;
+ bool FlowChanged = false;
+
+ DenseMap<const PHINode *, bool> BreakPhiNodesCache;
+
+ bool canBreakPHINode(const PHINode &I);
/// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.
@@ -102,6 +137,21 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// false otherwise.
bool needsPromotionToI32(const Type *T) const;
+ /// Return true if \p T is a legal scalar floating point type.
+ bool isLegalFloatingTy(const Type *T) const;
+
+ /// Wrapper to pass all the arguments to computeKnownFPClass
+ KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
+ const Instruction *CtxI) const {
+ return llvm::computeKnownFPClass(V, *DL, Interested, 0, TLInfo, AC, CtxI,
+ DT);
+ }
+
+ bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
+ return HasFP32DenormalFlush ||
+ computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
+ }
+
/// Promotes uniform binary operation \p I to equivalent 32 bit binary
/// operation.
///
@@ -199,41 +249,104 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
bool canWidenScalarExtLoad(LoadInst &I) const;
-public:
- static char ID;
+ Value *matchFractPat(IntrinsicInst &I);
+ Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
+
+ bool canOptimizeWithRsq(const FPMathOperator *SqrtOp, FastMathFlags DivFMF,
+ FastMathFlags SqrtFMF) const;
- AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
+ Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
+ FastMathFlags DivFMF, FastMathFlags SqrtFMF,
+ const Instruction *CtxI) const;
+ Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
+ FastMathFlags FMF, const Instruction *CtxI) const;
+ Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
+ float ReqdAccuracy) const;
+
+ Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
+ FastMathFlags DivFMF, FastMathFlags SqrtFMF,
+ Value *RsqOp, const Instruction *FDiv,
+ float ReqdAccuracy) const;
+
+ std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
+ Value *Src) const;
+
+ Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
+ bool IsNegative) const;
+ Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+ FastMathFlags FMF) const;
+
+public:
bool visitFDiv(BinaryOperator &I);
- bool visitXor(BinaryOperator &I);
bool visitInstruction(Instruction &I) { return false; }
bool visitBinaryOperator(BinaryOperator &I);
bool visitLoadInst(LoadInst &I);
bool visitICmpInst(ICmpInst &I);
bool visitSelectInst(SelectInst &I);
+ bool visitPHINode(PHINode &I);
bool visitIntrinsicInst(IntrinsicInst &I);
bool visitBitreverseIntrinsicInst(IntrinsicInst &I);
+ bool visitMinNum(IntrinsicInst &I);
+ bool run(Function &F);
+};
- bool doInitialization(Module &M) override;
- bool runOnFunction(Function &F) override;
-
- StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
+class AMDGPUCodeGenPrepare : public FunctionPass {
+private:
+ AMDGPUCodeGenPrepareImpl Impl;
+public:
+ static char ID;
+ AMDGPUCodeGenPrepare() : FunctionPass(ID) {
+ initializeAMDGPUCodeGenPreparePass(*PassRegistry::getPassRegistry());
+ }
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addRequired<UniformityInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
// FIXME: Division expansion needs to preserve the dominator tree.
if (!ExpandDiv64InIR)
AU.setPreservesAll();
- }
+ }
+ bool runOnFunction(Function &F) override;
+ bool doInitialization(Module &M) override;
+ StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};
} // end anonymous namespace
-unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
+bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
+ bool MadeChange = false;
+
+ Function::iterator NextBB;
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
+ BasicBlock *BB = &*FI;
+ NextBB = std::next(FI);
+
+ BasicBlock::iterator Next;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+ I = Next) {
+ Next = std::next(I);
+
+ MadeChange |= visit(*I);
+
+ if (Next != E) { // Control flow changed
+ BasicBlock *NextInstBB = Next->getParent();
+ if (NextInstBB != BB) {
+ BB = NextInstBB;
+ E = BB->end();
+ FE = F.end();
+ }
+ }
+ }
+ }
+ return MadeChange;
+}
+
+unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
assert(needsPromotionToI32(T) && "T does not need promotion to i32");
if (T->isIntegerTy())
@@ -241,7 +354,7 @@ unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
return cast<VectorType>(T)->getElementType()->getIntegerBitWidth();
}
-Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
+Type *AMDGPUCodeGenPrepareImpl::getI32Ty(IRBuilder<> &B, const Type *T) const {
assert(needsPromotionToI32(T) && "T does not need promotion to i32");
if (T->isIntegerTy())
@@ -249,17 +362,17 @@ Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
}
-bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
+bool AMDGPUCodeGenPrepareImpl::isSigned(const BinaryOperator &I) const {
return I.getOpcode() == Instruction::AShr ||
I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem;
}
-bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
+bool AMDGPUCodeGenPrepareImpl::isSigned(const SelectInst &I) const {
return isa<ICmpInst>(I.getOperand(0)) ?
cast<ICmpInst>(I.getOperand(0))->isSigned() : false;
}
-bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
+bool AMDGPUCodeGenPrepareImpl::needsPromotionToI32(const Type *T) const {
if (!Widen16BitOps)
return false;
@@ -279,6 +392,11 @@ bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
return false;
}
+bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
+ return Ty->isFloatTy() || Ty->isDoubleTy() ||
+ (Ty->isHalfTy() && ST->has16BitInsts());
+}
+
// Return true if the op promoted to i32 should have nsw set.
static bool promotedOpIsNSW(const Instruction &I) {
switch (I.getOpcode()) {
@@ -307,16 +425,16 @@ static bool promotedOpIsNUW(const Instruction &I) {
}
}
-bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
+bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
Type *Ty = I.getType();
const DataLayout &DL = Mod->getDataLayout();
int TySize = DL.getTypeSizeInBits(Ty);
Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
- return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
+ return I.isSimple() && TySize < 32 && Alignment >= 4 && UA->isUniform(&I);
}
-bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
+bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(BinaryOperator &I) const {
assert(needsPromotionToI32(I.getType()) &&
"I does not need promotion to i32");
@@ -363,7 +481,7 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
return true;
}
-bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
+bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(ICmpInst &I) const {
assert(needsPromotionToI32(I.getOperand(0)->getType()) &&
"I does not need promotion to i32");
@@ -390,7 +508,7 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(ICmpInst &I) const {
return true;
}
-bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
+bool AMDGPUCodeGenPrepareImpl::promoteUniformOpToI32(SelectInst &I) const {
assert(needsPromotionToI32(I.getType()) &&
"I does not need promotion to i32");
@@ -419,7 +537,7 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(SelectInst &I) const {
return true;
}
-bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
+bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
IntrinsicInst &I) const {
assert(I.getIntrinsicID() == Intrinsic::bitreverse &&
"I must be bitreverse intrinsic");
@@ -445,11 +563,11 @@ bool AMDGPUCodeGenPrepare::promoteUniformBitreverseToI32(
return true;
}
-unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op) const {
+unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
return computeKnownBits(Op, *DL, 0, AC).countMaxActiveBits();
}
-unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op) const {
+unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
return ComputeMaxSignificantBits(Op, *DL, 0, AC);
}
@@ -508,7 +626,7 @@ static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
}
-bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
+bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
if (I.getOpcode() != Instruction::Mul)
return false;
@@ -518,7 +636,7 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
return false;
// Prefer scalar if this could be s_mul_i32
- if (DA->isUniform(&I))
+ if (UA->isUniform(&I))
return false;
Value *LHS = I.getOperand(0);
@@ -592,7 +710,7 @@ static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
return nullptr;
}
-bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
+bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
// Don't do this unless the old select is going away. We want to eliminate the
// binary operator, not replace a binop with a select.
int SelOpNo = 0;
@@ -653,30 +771,191 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
return true;
}
+std::pair<Value *, Value *>
+AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
+ Value *Src) const {
+ Type *Ty = Src->getType();
+ Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
+ {Ty, Builder.getInt32Ty()}, Src);
+ Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
+
+ // Bypass the bug workaround for the exponent result since it doesn't matter.
+ // TODO: Does the bug workaround even really need to consider the exponent
+ // result? It's unspecified by the spec.
+
+ Value *FrexpExp =
+ ST->hasFractBug()
+ ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
+ {Builder.getInt32Ty(), Ty}, Src)
+ : Builder.CreateExtractValue(Frexp, {1});
+ return {FrexpMant, FrexpExp};
+}
+
+/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
+Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
+ Value *Src,
+ bool IsNegative) const {
+ // Same as for 1.0, but expand the sign out of the constant.
+ // -1.0 / x -> rcp (fneg x)
+ if (IsNegative)
+ Src = Builder.CreateFNeg(Src);
+
+ // The rcp instruction doesn't support denormals, so scale the input
+ // out of the denormal range and convert at the end.
+ //
+ // Expand as 2^-n * (1.0 / (x * 2^n))
+
+ // TODO: Skip scaling if input is known never denormal and the input
+ // range won't underflow to denormal. The hard part is knowing the
+ // result. We need a range check, the result could be denormal for
+ // 0x1p+126 < den <= 0x1p+127.
+
+ Type *Ty = Src->getType();
+
+ auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
+ Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
+ Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
+ return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()},
+ {Rcp, ScaleFactor});
+}
+
+/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
+Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
+ Value *RHS,
+ FastMathFlags FMF) const {
+ // If we have have to work around the fract/frexp bug, we're worse off than
+ // using the fdiv.fast expansion. The full safe expansion is faster if we have
+ // fast FMA.
+ if (HasFP32DenormalFlush && ST->hasFractBug() && !ST->hasFastFMAF32() &&
+ (!FMF.noNaNs() || !FMF.noInfs()))
+ return nullptr;
+
+ // We're scaling the LHS to avoid a denormal input, and scale the denominator
+ // to avoid large values underflowing the result.
+ Type *Ty = LHS->getType();
+
+ auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
+
+ Value *Rcp =
+ Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
+
+ auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
+ Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
+
+ // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
+ // result.
+ Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
+ return Builder.CreateIntrinsic(Intrinsic::ldexp, {Ty, Builder.getInt32Ty()},
+ {Mul, ExpDiff});
+}
+
+/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
+static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
+ bool IsNegative) {
+ // bool need_scale = x < 0x1p-126f;
+ // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
+ // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
+ // rsq(x * input_scale) * output_scale;
+
+ Type *Ty = Src->getType();
+ APFloat SmallestNormal =
+ APFloat::getSmallestNormalized(Ty->getFltSemantics());
+ Value *NeedScale =
+ Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
+ Constant *One = ConstantFP::get(Ty, 1.0);
+ Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
+ Constant *OutputScale =
+ ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
+
+ Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
+
+ Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
+ Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
+ Value *OutputScaleFactor = Builder.CreateSelect(
+ NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
+
+ return Builder.CreateFMul(Rsq, OutputScaleFactor);
+}
+
+bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(const FPMathOperator *SqrtOp,
+ FastMathFlags DivFMF,
+ FastMathFlags SqrtFMF) const {
+ // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
+ if (!DivFMF.allowContract() || !SqrtFMF.allowContract())
+ return false;
+
+ // v_rsq_f32 gives 1ulp
+ return SqrtFMF.approxFunc() || HasUnsafeFPMath ||
+ SqrtOp->getFPAccuracy() >= 1.0f;
+}
+
+Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
+ IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
+ FastMathFlags SqrtFMF, const Instruction *CtxI) const {
+ // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
+ assert(DivFMF.allowContract() && SqrtFMF.allowContract());
+
+ // rsq_f16 is accurate to 0.51 ulp.
+ // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+ // rsq_f64 is never accurate.
+ const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
+ if (!CLHS)
+ return nullptr;
+
+ assert(Den->getType()->isFloatTy());
+
+ bool IsNegative = false;
+
+ // TODO: Handle other numerator values with arcp.
+ if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
+ // Add in the sqrt flags.
+ IRBuilder<>::FastMathFlagGuard Guard(Builder);
+ DivFMF |= SqrtFMF;
+ Builder.setFastMathFlags(DivFMF);
+
+ if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
+ canIgnoreDenormalInput(Den, CtxI)) {
+ Value *Result = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
+ // -1.0 / sqrt(x) -> fneg(rsq(x))
+ return IsNegative ? Builder.CreateFNeg(Result) : Result;
+ }
+
+ return emitRsqIEEE1ULP(Builder, Den, IsNegative);
+ }
+
+ return nullptr;
+}
+
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
// allowed with unsafe-fp-math or afn.
//
-// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
-static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
- bool RcpIsAccurate, IRBuilder<> &Builder,
- Module *Mod) {
-
- if (!AllowInaccurateRcp && !RcpIsAccurate)
- return nullptr;
+// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
+Value *
+AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
+ Value *Den, FastMathFlags FMF,
+ const Instruction *CtxI) const {
+ // rcp_f16 is accurate to 0.51 ulp.
+ // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+ // rcp_f64 is never accurate.
+ assert(Den->getType()->isFloatTy());
- Type *Ty = Den->getType();
if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
- if (AllowInaccurateRcp || RcpIsAccurate) {
- if (CLHS->isExactlyValue(1.0)) {
- Function *Decl = Intrinsic::getDeclaration(
- Mod, Intrinsic::amdgcn_rcp, Ty);
+ bool IsNegative = false;
+ if (CLHS->isExactlyValue(1.0) ||
+ (IsNegative = CLHS->isExactlyValue(-1.0))) {
+ Value *Src = Den;
+
+ if (HasFP32DenormalFlush || FMF.approxFunc()) {
+ // -1.0 / x -> 1.0 / fneg(x)
+ if (IsNegative)
+ Src = Builder.CreateFNeg(Src);
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
// the CI documentation has a worst case error of 1 ulp.
- // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
- // use it as long as we aren't trying to use denormals.
+ // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
+ // to use it as long as we aren't trying to use denormals.
//
// v_rcp_f16 and v_rsq_f16 DO support denormals.
@@ -684,30 +963,29 @@ static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
// insert rsq intrinsic here.
// 1.0 / x -> rcp(x)
- return Builder.CreateCall(Decl, { Den });
+ return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
}
- // Same as for 1.0, but expand the sign out of the constant.
- if (CLHS->isExactlyValue(-1.0)) {
- Function *Decl = Intrinsic::getDeclaration(
- Mod, Intrinsic::amdgcn_rcp, Ty);
-
- // -1.0 / x -> rcp (fneg x)
- Value *FNeg = Builder.CreateFNeg(Den);
- return Builder.CreateCall(Decl, { FNeg });
- }
+ // TODO: If the input isn't denormal, and we know the input exponent isn't
+ // big enough to introduce a denormal we can avoid the scaling.
+ return emitRcpIEEE1ULP(Builder, Src, IsNegative);
}
}
- if (AllowInaccurateRcp) {
- Function *Decl = Intrinsic::getDeclaration(
- Mod, Intrinsic::amdgcn_rcp, Ty);
-
- // Turn into multiply by the reciprocal.
+ if (FMF.allowReciprocal()) {
// x / y -> x * (1.0 / y)
- Value *Recip = Builder.CreateCall(Decl, { Den });
+
+ // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
+ // will never underflow.
+ if (HasFP32DenormalFlush || FMF.approxFunc()) {
+ Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
+ return Builder.CreateFMul(Num, Recip);
+ }
+
+ Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
return Builder.CreateFMul(Num, Recip);
}
+
return nullptr;
}
@@ -718,17 +996,14 @@ static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
-static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
- bool HasDenormals, IRBuilder<> &Builder,
- Module *Mod) {
+Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
+ IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
// fdiv.fast can achieve 2.5 ULP accuracy.
if (ReqdAccuracy < 2.5f)
return nullptr;
// Only have fdiv.fast for f32.
- Type *Ty = Den->getType();
- if (!Ty->isFloatTy())
- return nullptr;
+ assert(Den->getType()->isFloatTy());
bool NumIsOne = false;
if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
@@ -737,11 +1012,39 @@ static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
}
// fdiv does not support denormals. But 1.0/x is always fine to use it.
- if (HasDenormals && !NumIsOne)
+ //
+ // TODO: This works for any value with a specific known exponent range, don't
+ // just limit to constant 1.
+ if (!HasFP32DenormalFlush && !NumIsOne)
return nullptr;
- Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
- return Builder.CreateCall(Decl, { Num, Den });
+ return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {}, {Num, Den});
+}
+
+Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
+ IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
+ FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
+ float ReqdDivAccuracy) const {
+ if (RsqOp) {
+ Value *Rsq =
+ optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
+ if (Rsq)
+ return Rsq;
+ }
+
+ Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
+ if (Rcp)
+ return Rcp;
+
+ // In the basic case fdiv_fast has the same instruction count as the frexp div
+ // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
+ // potentially be fused into a user. Also, materialization of the constants
+ // can be reused for multiple instances.
+ Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
+ if (FDivFast)
+ return FDivFast;
+
+ return emitFrexpDiv(Builder, Num, Den, DivFMF);
}
// Optimizations is performed based on fpmath, fast math flags as well as
@@ -759,100 +1062,96 @@ static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
//
// NOTE: rcp is the preference in cases that both are legal.
-bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
+bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
+ if (DisableFDivExpand)
+ return false;
Type *Ty = FDiv.getType()->getScalarType();
+ if (!Ty->isFloatTy())
+ return false;
// The f64 rcp/rsq approximations are pretty inaccurate. We can do an
- // expansion around them in codegen.
- if (Ty->isDoubleTy())
+ // expansion around them in codegen. f16 is good enough to always use.
+
+ const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
+ const FastMathFlags DivFMF = FPOp->getFastMathFlags();
+ const float ReqdAccuracy = FPOp->getFPAccuracy();
+
+ // Inaccurate rcp is allowed with unsafe-fp-math or afn.
+ //
+ // Defer to codegen to handle this.
+ //
+ // TODO: Decide on an interpretation for interactions between afn + arcp +
+ // !fpmath, and make it consistent between here and codegen. For now, defer
+ // expansion of afn to codegen. The current interpretation is so aggressive we
+ // don't need any pre-consideration here when we have better information. A
+ // more conservative interpretation could use handling here.
+ const bool AllowInaccurateRcp = HasUnsafeFPMath || DivFMF.approxFunc();
+ if (AllowInaccurateRcp)
return false;
- // No intrinsic for fdiv16 if target does not support f16.
- if (Ty->isHalfTy() && !ST->has16BitInsts())
+ // Defer the correct implementations to codegen.
+ if (ReqdAccuracy < 1.0f)
return false;
- const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
- const float ReqdAccuracy = FPOp->getFPAccuracy();
+ FastMathFlags SqrtFMF;
- // Inaccurate rcp is allowed with unsafe-fp-math or afn.
- FastMathFlags FMF = FPOp->getFastMathFlags();
- const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
+ Value *Num = FDiv.getOperand(0);
+ Value *Den = FDiv.getOperand(1);
- // rcp_f16 is accurate for !fpmath >= 1.0ulp.
- // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
- // rcp_f64 is never accurate.
- const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
- (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
+ Value *RsqOp = nullptr;
+ auto *DenII = dyn_cast<IntrinsicInst>(Den);
+ if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
+ DenII->hasOneUse()) {
+ const auto *SqrtOp = cast<FPMathOperator>(DenII);
+ SqrtFMF = SqrtOp->getFastMathFlags();
+ if (canOptimizeWithRsq(SqrtOp, DivFMF, SqrtFMF))
+ RsqOp = SqrtOp->getOperand(0);
+ }
IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
- Builder.setFastMathFlags(FMF);
+ Builder.setFastMathFlags(DivFMF);
Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
- Value *Num = FDiv.getOperand(0);
- Value *Den = FDiv.getOperand(1);
+ SmallVector<Value *, 4> NumVals;
+ SmallVector<Value *, 4> DenVals;
+ SmallVector<Value *, 4> RsqDenVals;
+ extractValues(Builder, NumVals, Num);
+ extractValues(Builder, DenVals, Den);
- Value *NewFDiv = nullptr;
- if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
- NewFDiv = PoisonValue::get(VT);
+ if (RsqOp)
+ extractValues(Builder, RsqDenVals, RsqOp);
- // FIXME: Doesn't do the right thing for cases where the vector is partially
- // constant. This works when the scalarizer pass is run first.
- for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
- Value *NumEltI = Builder.CreateExtractElement(Num, I);
- Value *DenEltI = Builder.CreateExtractElement(Den, I);
- // Try rcp first.
- Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
- RcpIsAccurate, Builder, Mod);
- if (!NewElt) // Try fdiv.fast.
- NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
- HasFP32Denormals, Builder, Mod);
- if (!NewElt) // Keep the original.
- NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
+ SmallVector<Value *, 4> ResultVals(NumVals.size());
+ for (int I = 0, E = NumVals.size(); I != E; ++I) {
+ Value *NumElt = NumVals[I];
+ Value *DenElt = DenVals[I];
+ Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
- NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
- }
- } else { // Scalar FDiv.
- // Try rcp first.
- NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
- Builder, Mod);
- if (!NewFDiv) { // Try fdiv.fast.
- NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
- Builder, Mod);
+ Value *NewElt =
+ visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
+ cast<Instruction>(FPOp), ReqdAccuracy);
+ if (!NewElt) {
+ // Keep the original, but scalarized.
+
+ // This has the unfortunate side effect of sometimes scalarizing when
+ // we're not going to do anything.
+ NewElt = Builder.CreateFDiv(NumElt, DenElt);
+ if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
+ NewEltInst->copyMetadata(FDiv);
}
- }
- if (NewFDiv) {
- FDiv.replaceAllUsesWith(NewFDiv);
- NewFDiv->takeName(&FDiv);
- FDiv.eraseFromParent();
+ ResultVals[I] = NewElt;
}
- return !!NewFDiv;
-}
+ Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
-bool AMDGPUCodeGenPrepare::visitXor(BinaryOperator &I) {
- // Match the Xor instruction, its type and its operands
- IntrinsicInst *IntrinsicCall = dyn_cast<IntrinsicInst>(I.getOperand(0));
- ConstantInt *RHS = dyn_cast<ConstantInt>(I.getOperand(1));
- if (!RHS || !IntrinsicCall || RHS->getSExtValue() != -1)
- return visitBinaryOperator(I);
-
- // Check if the Call is an intrinsic instruction to amdgcn_class intrinsic
- // has only one use
- if (IntrinsicCall->getIntrinsicID() != Intrinsic::amdgcn_class ||
- !IntrinsicCall->hasOneUse())
- return visitBinaryOperator(I);
-
- // "Not" the second argument of the intrinsic call
- ConstantInt *Arg = dyn_cast<ConstantInt>(IntrinsicCall->getOperand(1));
- if (!Arg)
- return visitBinaryOperator(I);
+ if (NewVal) {
+ FDiv.replaceAllUsesWith(NewVal);
+ NewVal->takeName(&FDiv);
+ RecursivelyDeleteTriviallyDeadInstructions(&FDiv, TLInfo);
+ }
- IntrinsicCall->setOperand(
- 1, ConstantInt::get(Arg->getType(), Arg->getZExtValue() ^ 0x3ff));
- I.replaceAllUsesWith(IntrinsicCall);
- I.eraseFromParent();
return true;
}
@@ -882,9 +1181,9 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
/// Figure out how many bits are really needed for this division. \p AtLeast is
/// an optimization hint to bypass the second ComputeNumSignBits call if we the
/// first one is insufficient. Returns -1 on failure.
-int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
- Value *Num, Value *Den,
- unsigned AtLeast, bool IsSigned) const {
+int AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
+ Value *Den, unsigned AtLeast,
+ bool IsSigned) const {
const DataLayout &DL = Mod->getDataLayout();
unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
if (LHSSignBits < AtLeast)
@@ -903,21 +1202,19 @@ int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
// The fractional part of a float is enough to accurately represent up to
// a 24-bit signed integer.
-Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
- BinaryOperator &I,
- Value *Num, Value *Den,
- bool IsDiv, bool IsSigned) const {
+Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
+ BinaryOperator &I, Value *Num,
+ Value *Den, bool IsDiv,
+ bool IsSigned) const {
int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
if (DivBits == -1)
return nullptr;
return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
}
-Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
- BinaryOperator &I,
- Value *Num, Value *Den,
- unsigned DivBits,
- bool IsDiv, bool IsSigned) const {
+Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
+ IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
+ unsigned DivBits, bool IsDiv, bool IsSigned) const {
Type *I32Ty = Builder.getInt32Ty();
Num = Builder.CreateTrunc(Num, I32Ty);
Den = Builder.CreateTrunc(Den, I32Ty);
@@ -1017,8 +1314,9 @@ Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
// than the general expansion we do here.
// TODO: It would be better to just directly handle those optimizations here.
-bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
- BinaryOperator &I, Value *Num, Value *Den) const {
+bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
+ Value *Num,
+ Value *Den) const {
if (Constant *C = dyn_cast<Constant>(Den)) {
// Arbitrary constants get a better expansion as long as a wider mulhi is
// legal.
@@ -1059,9 +1357,9 @@ static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
return Builder.CreateAShr(V, Builder.getInt32(31));
}
-Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
- BinaryOperator &I, Value *X,
- Value *Y) const {
+Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
+ BinaryOperator &I, Value *X,
+ Value *Y) const {
Instruction::BinaryOps Opc = I.getOpcode();
assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
Opc == Instruction::SRem || Opc == Instruction::SDiv);
@@ -1147,7 +1445,7 @@ Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
- Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
+ Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
@@ -1184,9 +1482,9 @@ Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
return Res;
}
-Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
- BinaryOperator &I,
- Value *Num, Value *Den) const {
+Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
+ BinaryOperator &I, Value *Num,
+ Value *Den) const {
if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
return nullptr; // Keep it for later optimization.
@@ -1215,7 +1513,7 @@ Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
return nullptr;
}
-void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
+void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
Instruction::BinaryOps Opc = I.getOpcode();
// Do the general expansion.
if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
@@ -1231,12 +1529,12 @@ void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
llvm_unreachable("not a division");
}
-bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
if (foldBinOpIntoSelect(I))
return true;
if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
- DA->isUniform(&I) && promoteUniformOpToI32(I))
+ UA->isUniform(&I) && promoteUniformOpToI32(I))
return true;
if (UseMul24Intrin && replaceMulWithMul24(I))
@@ -1307,6 +1605,7 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
// TODO: We get much worse code in specially handled constant cases.
for (BinaryOperator *Div : Div64ToExpand) {
expandDivRem64(*Div);
+ FlowChanged = true;
Changed = true;
}
}
@@ -1314,7 +1613,7 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
return Changed;
}
-bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
+bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
if (!WidenLoads)
return false;
@@ -1325,9 +1624,7 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
Builder.SetCurrentDebugLocation(I.getDebugLoc());
Type *I32Ty = Builder.getInt32Ty();
- Type *PT = PointerType::get(I32Ty, I.getPointerAddressSpace());
- Value *BitCast= Builder.CreateBitCast(I.getPointerOperand(), PT);
- LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, BitCast);
+ LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
WidenLoad->copyMetadata(I);
// If we have range metadata, we need to convert the type, and not make
@@ -1362,48 +1659,420 @@ bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) {
return false;
}
-bool AMDGPUCodeGenPrepare::visitICmpInst(ICmpInst &I) {
+bool AMDGPUCodeGenPrepareImpl::visitICmpInst(ICmpInst &I) {
bool Changed = false;
if (ST->has16BitInsts() && needsPromotionToI32(I.getOperand(0)->getType()) &&
- DA->isUniform(&I))
+ UA->isUniform(&I))
Changed |= promoteUniformOpToI32(I);
return Changed;
}
-bool AMDGPUCodeGenPrepare::visitSelectInst(SelectInst &I) {
- bool Changed = false;
+bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
+ Value *Cond = I.getCondition();
+ Value *TrueVal = I.getTrueValue();
+ Value *FalseVal = I.getFalseValue();
+ Value *CmpVal;
+ FCmpInst::Predicate Pred;
- if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
- DA->isUniform(&I))
- Changed |= promoteUniformOpToI32(I);
+ if (ST->has16BitInsts() && needsPromotionToI32(I.getType())) {
+ if (UA->isUniform(&I))
+ return promoteUniformOpToI32(I);
+ return false;
+ }
- return Changed;
+ // Match fract pattern with nan check.
+ if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
+ return false;
+
+ FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
+ if (!FPOp)
+ return false;
+
+ IRBuilder<> Builder(&I);
+ Builder.setFastMathFlags(FPOp->getFastMathFlags());
+
+ auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
+ auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
+
+ Value *Fract = nullptr;
+ if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
+ CmpVal == matchFractPat(*IIFalse)) {
+ // isnan(x) ? x : fract(x)
+ Fract = applyFractPat(Builder, CmpVal);
+ } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
+ CmpVal == matchFractPat(*IITrue)) {
+ // !isnan(x) ? fract(x) : x
+ Fract = applyFractPat(Builder, CmpVal);
+ } else
+ return false;
+
+ Fract->takeName(&I);
+ I.replaceAllUsesWith(Fract);
+ RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
+ return true;
+}
+
+static bool areInSameBB(const Value *A, const Value *B) {
+ const auto *IA = dyn_cast<Instruction>(A);
+ const auto *IB = dyn_cast<Instruction>(B);
+ return IA && IB && IA->getParent() == IB->getParent();
+}
+
+// Helper for breaking large PHIs that returns true when an extractelement on V
+// is likely to be folded away by the DAG combiner.
+static bool isInterestingPHIIncomingValue(const Value *V) {
+ const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
+ if (!FVT)
+ return false;
+
+ const Value *CurVal = V;
+
+ // Check for insertelements, keeping track of the elements covered.
+ BitVector EltsCovered(FVT->getNumElements());
+ while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
+ const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
+
+ // Non constant index/out of bounds index -> folding is unlikely.
+ // The latter is more of a sanity check because canonical IR should just
+ // have replaced those with poison.
+ if (!Idx || Idx->getSExtValue() >= FVT->getNumElements())
+ return false;
+
+ const auto *VecSrc = IE->getOperand(0);
+
+ // If the vector source is another instruction, it must be in the same basic
+ // block. Otherwise, the DAGCombiner won't see the whole thing and is
+ // unlikely to be able to do anything interesting here.
+ if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
+ return false;
+
+ CurVal = VecSrc;
+ EltsCovered.set(Idx->getSExtValue());
+
+ // All elements covered.
+ if (EltsCovered.all())
+ return true;
+ }
+
+ // We either didn't find a single insertelement, or the insertelement chain
+ // ended before all elements were covered. Check for other interesting values.
+
+ // Constants are always interesting because we can just constant fold the
+ // extractelements.
+ if (isa<Constant>(CurVal))
+ return true;
+
+ // shufflevector is likely to be profitable if either operand is a constant,
+ // or if either source is in the same block.
+ // This is because shufflevector is most often lowered as a series of
+ // insert/extract elements anyway.
+ if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
+ return isa<Constant>(SV->getOperand(1)) ||
+ areInSameBB(SV, SV->getOperand(0)) ||
+ areInSameBB(SV, SV->getOperand(1));
+ }
+
+ return false;
}
-bool AMDGPUCodeGenPrepare::visitIntrinsicInst(IntrinsicInst &I) {
+bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
+ // Check in the cache, or add an entry for this node.
+ //
+ // We init with false because we consider all PHI nodes unbreakable until we
+ // reach a conclusion. Doing the opposite - assuming they're break-able until
+ // proven otherwise - can be harmful in some pathological cases so we're
+ // conservative for now.
+ const auto [It, DidInsert] = BreakPhiNodesCache.insert({&I, false});
+ if (!DidInsert)
+ return It->second;
+
+ // This function may recurse, so to guard against infinite looping, this PHI
+ // is conservatively considered unbreakable until we reach a conclusion.
+
+ // Don't break PHIs that have no interesting incoming values. That is, where
+ // there is no clear opportunity to fold the "extractelement" instructions we
+ // would add.
+ //
+ // Note: IC does not run after this pass, so we're only interested in the
+ // foldings that the DAG combiner can do.
+ if (none_of(I.incoming_values(),
+ [&](Value *V) { return isInterestingPHIIncomingValue(V); }))
+ return false;
+
+ // Now, check users for unbreakable PHI nodes. If we have an unbreakable PHI
+ // node as user, we don't want to break this PHI either because it's unlikely
+ // to be beneficial. We would just explode the vector and reassemble it
+ // directly, wasting instructions.
+ //
+ // In the case where multiple users are PHI nodes, we want at least half of
+ // them to be breakable.
+ int Score = 0;
+ for (const Value *U : I.users()) {
+ if (const auto *PU = dyn_cast<PHINode>(U))
+ Score += canBreakPHINode(*PU) ? 1 : -1;
+ }
+
+ if (Score < 0)
+ return false;
+
+ return BreakPhiNodesCache[&I] = true;
+}
+
+/// Helper class for "break large PHIs" (visitPHINode).
+///
+/// This represents a slice of a PHI's incoming value, which is made up of:
+/// - The type of the slice (Ty)
+/// - The index in the incoming value's vector where the slice starts (Idx)
+/// - The number of elements in the slice (NumElts).
+/// It also keeps track of the NewPHI node inserted for this particular slice.
+///
+/// Slice examples:
+/// <4 x i64> -> Split into four i64 slices.
+/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
+/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
+/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
+class VectorSlice {
+public:
+ VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
+ : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
+
+ Type *Ty = nullptr;
+ unsigned Idx = 0;
+ unsigned NumElts = 0;
+ PHINode *NewPHI = nullptr;
+
+ /// Slice \p Inc according to the information contained within this slice.
+ /// This is cached, so if called multiple times for the same \p BB & \p Inc
+ /// pair, it returns the same Sliced value as well.
+ ///
+ /// Note this *intentionally* does not return the same value for, say,
+ /// [%bb.0, %0] & [%bb.1, %0] as:
+ /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
+ /// the value in bb.1 may not be reachable from bb.0 if it's its
+ /// predecessor.)
+ /// - We also want to make our extract instructions as local as possible so
+ /// the DAG has better chances of folding them out. Duplicating them like
+ /// that is beneficial in that regard.
+ ///
+ /// This is both a minor optimization to avoid creating duplicate
+ /// instructions, but also a requirement for correctness. It is not forbidden
+ /// for a PHI node to have the same [BB, Val] pair multiple times. If we
+ /// returned a new value each time, those previously identical pairs would all
+ /// have different incoming values (from the same block) and it'd cause a "PHI
+ /// node has multiple entries for the same basic block with different incoming
+ /// values!" verifier error.
+ Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
+ Value *&Res = SlicedVals[{BB, Inc}];
+ if (Res)
+ return Res;
+
+ IRBuilder<> B(BB->getTerminator());
+ if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
+ B.SetCurrentDebugLocation(IncInst->getDebugLoc());
+
+ if (NumElts > 1) {
+ SmallVector<int, 4> Mask;
+ for (unsigned K = Idx; K < (Idx + NumElts); ++K)
+ Mask.push_back(K);
+ Res = B.CreateShuffleVector(Inc, Mask, NewValName);
+ } else
+ Res = B.CreateExtractElement(Inc, Idx, NewValName);
+
+ return Res;
+ }
+
+private:
+ SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
+};
+
+bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
+ // Break-up fixed-vector PHIs into smaller pieces.
+ // Default threshold is 32, so it breaks up any vector that's >32 bits into
+ // its elements, or into 32-bit pieces (for 8/16 bit elts).
+ //
+ // This is only helpful for DAGISel because it doesn't handle large PHIs as
+ // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
+ // With large, odd-sized PHIs we may end up needing many `build_vector`
+ // operations with most elements being "undef". This inhibits a lot of
+ // optimization opportunities and can result in unreasonably high register
+ // pressure and the inevitable stack spilling.
+ if (!ScalarizeLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
+ return false;
+
+ FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
+ if (!FVT || DL->getTypeSizeInBits(FVT) <= ScalarizeLargePHIsThreshold)
+ return false;
+
+ if (!ForceScalarizeLargePHIs && !canBreakPHINode(I))
+ return false;
+
+ std::vector<VectorSlice> Slices;
+
+ Type *EltTy = FVT->getElementType();
+ {
+ unsigned Idx = 0;
+ // For 8/16 bits type, don't scalarize fully but break it up into as many
+ // 32-bit slices as we can, and scalarize the tail.
+ const unsigned EltSize = DL->getTypeSizeInBits(EltTy);
+ const unsigned NumElts = FVT->getNumElements();
+ if (EltSize == 8 || EltSize == 16) {
+ const unsigned SubVecSize = (32 / EltSize);
+ Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
+ for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
+ Idx += SubVecSize)
+ Slices.emplace_back(SubVecTy, Idx, SubVecSize);
+ }
+
+ // Scalarize all remaining elements.
+ for (; Idx < NumElts; ++Idx)
+ Slices.emplace_back(EltTy, Idx, 1);
+ }
+
+ if (Slices.size() == 1)
+ return false;
+
+ // Create one PHI per vector piece. The "VectorSlice" class takes care of
+ // creating the necessary instruction to extract the relevant slices of each
+ // incoming value.
+ IRBuilder<> B(I.getParent());
+ B.SetCurrentDebugLocation(I.getDebugLoc());
+
+ unsigned IncNameSuffix = 0;
+ for (VectorSlice &S : Slices) {
+    // We need to reset the builder on each iteration, because getSlicedVal may
+ // have inserted something into I's BB.
+ B.SetInsertPoint(I.getParent()->getFirstNonPHI());
+ S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
+
+ for (const auto &[Idx, BB] : enumerate(I.blocks())) {
+ S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
+ "largephi.extractslice" +
+ std::to_string(IncNameSuffix++)),
+ BB);
+ }
+ }
+
+ // And replace this PHI with a vector of all the previous PHI values.
+ Value *Vec = PoisonValue::get(FVT);
+ unsigned NameSuffix = 0;
+ for (VectorSlice &S : Slices) {
+ const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
+ if (S.NumElts > 1)
+ Vec =
+ B.CreateInsertVector(FVT, Vec, S.NewPHI, B.getInt64(S.Idx), ValName);
+ else
+ Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
+ }
+
+ I.replaceAllUsesWith(Vec);
+ I.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case Intrinsic::bitreverse:
return visitBitreverseIntrinsicInst(I);
+ case Intrinsic::minnum:
+ return visitMinNum(I);
default:
return false;
}
}
-bool AMDGPUCodeGenPrepare::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
+bool AMDGPUCodeGenPrepareImpl::visitBitreverseIntrinsicInst(IntrinsicInst &I) {
bool Changed = false;
if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
- DA->isUniform(&I))
+ UA->isUniform(&I))
Changed |= promoteUniformBitreverseToI32(I);
return Changed;
}
+/// Match non-nan fract pattern.
+/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
+///
+/// Only matches if fract is a useful instruction for the subtarget. Does not
+/// account for the nan handling; the instruction has a nan check on the input
+Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
+ if (ST->hasFractBug())
+ return nullptr;
+
+ if (I.getIntrinsicID() != Intrinsic::minnum)
+ return nullptr;
+
+ Type *Ty = I.getType();
+ if (!isLegalFloatingTy(Ty->getScalarType()))
+ return nullptr;
+
+ Value *Arg0 = I.getArgOperand(0);
+ Value *Arg1 = I.getArgOperand(1);
+
+ const APFloat *C;
+ if (!match(Arg1, m_APFloat(C)))
+ return nullptr;
+
+ APFloat One(1.0);
+ bool LosesInfo;
+ One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
+
+ // Match nextafter(1.0, -1)
+ One.next(true);
+ if (One != *C)
+ return nullptr;
+
+ Value *FloorSrc;
+ if (match(Arg0, m_FSub(m_Value(FloorSrc),
+ m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
+ return FloorSrc;
+ return nullptr;
+}
+
+Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
+ Value *FractArg) {
+ SmallVector<Value *, 4> FractVals;
+ extractValues(Builder, FractVals, FractArg);
+
+ SmallVector<Value *, 4> ResultVals(FractVals.size());
+
+ Type *Ty = FractArg->getType()->getScalarType();
+ for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
+ ResultVals[I] =
+ Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
+ }
+
+ return insertValues(Builder, FractArg->getType(), ResultVals);
+}
+
+bool AMDGPUCodeGenPrepareImpl::visitMinNum(IntrinsicInst &I) {
+ Value *FractArg = matchFractPat(I);
+ if (!FractArg)
+ return false;
+
+ // Match pattern for fract intrinsic in contexts where the nan check has been
+ // optimized out (and hope the knowledge the source can't be nan wasn't lost).
+ if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, *DL, TLInfo))
+ return false;
+
+ IRBuilder<> Builder(&I);
+ FastMathFlags FMF = I.getFastMathFlags();
+ FMF.setNoNaNs();
+ Builder.setFastMathFlags(FMF);
+
+ Value *Fract = applyFractPat(Builder, FractArg);
+ Fract->takeName(&I);
+ I.replaceAllUsesWith(Fract);
+
+ RecursivelyDeleteTriviallyDeadInstructions(&I, TLInfo);
+ return true;
+}
+
bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
- Mod = &M;
- DL = &Mod->getDataLayout();
+ Impl.Mod = &M;
+ Impl.DL = &Impl.Mod->getDataLayout();
return false;
}
@@ -1416,49 +2085,44 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
return false;
const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
- ST = &TM.getSubtarget<GCNSubtarget>(F);
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- DA = &getAnalysis<LegacyDivergenceAnalysis>();
-
+ Impl.TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
+ Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DT = DTWP ? &DTWP->getDomTree() : nullptr;
-
- HasUnsafeFPMath = hasUnsafeFPMath(F);
-
- AMDGPU::SIModeRegisterDefaults Mode(F);
- HasFP32Denormals = Mode.allFP32Denormals();
-
- bool MadeChange = false;
-
- Function::iterator NextBB;
- for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
- BasicBlock *BB = &*FI;
- NextBB = std::next(FI);
-
- BasicBlock::iterator Next;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
- Next = std::next(I);
-
- MadeChange |= visit(*I);
-
- if (Next != E) { // Control flow changed
- BasicBlock *NextInstBB = Next->getParent();
- if (NextInstBB != BB) {
- BB = NextInstBB;
- E = BB->end();
- FE = F.end();
- }
- }
- }
- }
+ Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
+ SIModeRegisterDefaults Mode(F);
+ Impl.HasFP32DenormalFlush =
+ Mode.FP32Denormals == DenormalMode::getPreserveSign();
+ return Impl.run(F);
+}
- return MadeChange;
+PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ AMDGPUCodeGenPrepareImpl Impl;
+ Impl.Mod = F.getParent();
+ Impl.DL = &Impl.Mod->getDataLayout();
+ Impl.TLInfo = &FAM.getResult<TargetLibraryAnalysis>(F);
+ Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
+ Impl.AC = &FAM.getResult<AssumptionAnalysis>(F);
+ Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
+ Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
+ SIModeRegisterDefaults Mode(F);
+ Impl.HasFP32DenormalFlush =
+ Mode.FP32Denormals == DenormalMode::getPreserveSign();
+ PreservedAnalyses PA = PreservedAnalyses::none();
+ if (!Impl.FlowChanged)
+ PA.preserveSet<CFGAnalyses>();
+ return Impl.run(F) ? PA : PreservedAnalyses::all();
}
INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
index c11d4656db3f..892e1eef27a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -10,31 +10,31 @@ include "llvm/Target/GlobalISel/Combine.td"
// TODO: This really belongs after legalization after scalarization.
-def fmin_fmax_legacy_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::FMinFMaxLegacyInfo">;
+def fmin_fmax_legacy_matchdata : GIDefMatchData<"FMinFMaxLegacyInfo">;
let Predicates = [HasFminFmaxLegacy] in
def fcmp_select_to_fmin_fmax_legacy : GICombineRule<
(defs root:$select, fmin_fmax_legacy_matchdata:$matchinfo),
(match (wip_match_opcode G_SELECT):$select,
- [{ return PostLegalizerHelper.matchFMinFMaxLegacy(*${select}, ${matchinfo}); }]),
- (apply [{ PostLegalizerHelper.applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>;
+ [{ return matchFMinFMaxLegacy(*${select}, ${matchinfo}); }]),
+ (apply [{ applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>;
def uchar_to_float : GICombineRule<
(defs root:$itofp),
(match (wip_match_opcode G_UITOFP, G_SITOFP):$itofp,
- [{ return PostLegalizerHelper.matchUCharToFloat(*${itofp}); }]),
- (apply [{ PostLegalizerHelper.applyUCharToFloat(*${itofp}); }])>;
+ [{ return matchUCharToFloat(*${itofp}); }]),
+ (apply [{ applyUCharToFloat(*${itofp}); }])>;
def rcp_sqrt_to_rsq : GICombineRule<
(defs root:$rcp, build_fn_matchinfo:$matchinfo),
(match (wip_match_opcode G_INTRINSIC, G_FSQRT):$rcp,
- [{ return PostLegalizerHelper.matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
+ [{ return matchRcpSqrtToRsq(*${rcp}, ${matchinfo}); }]),
(apply [{ Helper.applyBuildFn(*${rcp}, ${matchinfo}); }])>;
-def cvt_f32_ubyteN_matchdata : GIDefMatchData<"AMDGPUPostLegalizerCombinerHelper::CvtF32UByteMatchInfo">;
+def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
def cvt_f32_ubyteN : GICombineRule<
(defs root:$cvt_f32_ubyteN, cvt_f32_ubyteN_matchdata:$matchinfo),
@@ -42,18 +42,18 @@ def cvt_f32_ubyteN : GICombineRule<
G_AMDGPU_CVT_F32_UBYTE1,
G_AMDGPU_CVT_F32_UBYTE2,
G_AMDGPU_CVT_F32_UBYTE3):$cvt_f32_ubyteN,
- [{ return PostLegalizerHelper.matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
- (apply [{ PostLegalizerHelper.applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
+ [{ return matchCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }]),
+ (apply [{ applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
-def clamp_i64_to_i16_matchdata : GIDefMatchData<"AMDGPUPreLegalizerCombinerHelper::ClampI64ToI16MatchInfo">;
+def clamp_i64_to_i16_matchdata : GIDefMatchData<"ClampI64ToI16MatchInfo">;
def clamp_i64_to_i16 : GICombineRule<
(defs root:$clamp_i64_to_i16, clamp_i64_to_i16_matchdata:$matchinfo),
(match (wip_match_opcode G_TRUNC):$clamp_i64_to_i16,
- [{ return PreLegalizerHelper.matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, *MF, ${matchinfo}); }]),
- (apply [{ PreLegalizerHelper.applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
+ [{ return matchClampI64ToI16(*${clamp_i64_to_i16}, MRI, MF, ${matchinfo}); }]),
+ (apply [{ applyClampI64ToI16(*${clamp_i64_to_i16}, ${matchinfo}); }])>;
-def med3_matchdata : GIDefMatchData<"AMDGPURegBankCombinerHelper::Med3MatchInfo">;
+def med3_matchdata : GIDefMatchData<"Med3MatchInfo">;
def int_minmax_to_med3 : GICombineRule<
(defs root:$min_or_max, med3_matchdata:$matchinfo),
@@ -61,8 +61,8 @@ def int_minmax_to_med3 : GICombineRule<
G_SMIN,
G_UMAX,
G_UMIN):$min_or_max,
- [{ return RegBankHelper.matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
- (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;
+ [{ return matchIntMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
+ (apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>;
def fp_minmax_to_med3 : GICombineRule<
(defs root:$min_or_max, med3_matchdata:$matchinfo),
@@ -70,8 +70,8 @@ def fp_minmax_to_med3 : GICombineRule<
G_FMINNUM,
G_FMAXNUM_IEEE,
G_FMINNUM_IEEE):$min_or_max,
- [{ return RegBankHelper.matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
- (apply [{ RegBankHelper.applyMed3(*${min_or_max}, ${matchinfo}); }])>;
+ [{ return matchFPMinMaxToMed3(*${min_or_max}, ${matchinfo}); }]),
+ (apply [{ applyMed3(*${min_or_max}, ${matchinfo}); }])>;
def fp_minmax_to_clamp : GICombineRule<
(defs root:$min_or_max, register_matchinfo:$matchinfo),
@@ -79,21 +79,21 @@ def fp_minmax_to_clamp : GICombineRule<
G_FMINNUM,
G_FMAXNUM_IEEE,
G_FMINNUM_IEEE):$min_or_max,
- [{ return RegBankHelper.matchFPMinMaxToClamp(*${min_or_max}, ${matchinfo}); }]),
- (apply [{ RegBankHelper.applyClamp(*${min_or_max}, ${matchinfo}); }])>;
+ [{ return matchFPMinMaxToClamp(*${min_or_max}, ${matchinfo}); }]),
+ (apply [{ applyClamp(*${min_or_max}, ${matchinfo}); }])>;
def fmed3_intrinsic_to_clamp : GICombineRule<
(defs root:$fmed3, register_matchinfo:$matchinfo),
- (match (wip_match_opcode G_INTRINSIC):$fmed3,
- [{ return RegBankHelper.matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]),
- (apply [{ RegBankHelper.applyClamp(*${fmed3}, ${matchinfo}); }])>;
+ (match (wip_match_opcode G_AMDGPU_FMED3):$fmed3,
+ [{ return matchFPMed3ToClamp(*${fmed3}, ${matchinfo}); }]),
+ (apply [{ applyClamp(*${fmed3}, ${matchinfo}); }])>;
def remove_fcanonicalize_matchinfo : GIDefMatchData<"Register">;
def remove_fcanonicalize : GICombineRule<
(defs root:$fcanonicalize, remove_fcanonicalize_matchinfo:$matchinfo),
(match (wip_match_opcode G_FCANONICALIZE):$fcanonicalize,
- [{ return PostLegalizerHelper.matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]),
+ [{ return matchRemoveFcanonicalize(*${fcanonicalize}, ${matchinfo}); }]),
(apply [{ Helper.replaceSingleDefInstWithReg(*${fcanonicalize}, ${matchinfo}); }])>;
def foldable_fneg_matchdata : GIDefMatchData<"MachineInstr *">;
@@ -104,32 +104,56 @@ def foldable_fneg : GICombineRule<
[{ return Helper.matchFoldableFneg(*${ffn}, ${matchinfo}); }]),
(apply [{ Helper.applyFoldableFneg(*${ffn}, ${matchinfo}); }])>;
-// Combines which should only apply on SI/VI
+def sign_exension_in_reg_matchdata : GIDefMatchData<"MachineInstr *">;
+
+def sign_extension_in_reg : GICombineRule<
+ (defs root:$sign_inreg, sign_exension_in_reg_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SEXT_INREG):$sign_inreg,
+ [{ return matchCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }]),
+ (apply [{ applyCombineSignExtendInReg(*${sign_inreg}, ${matchinfo}); }])>;
+
+
+let Predicates = [Has16BitInsts, NotHasMed3_16] in {
+// For gfx8, expand f16-fmed3-as-f32 into a min/max f16 sequence. This
+// saves one instruction compared to the promotion.
+//
+// FIXME: Should have ComplexPattern like in/out matchers
+//
+// FIXME: We should be able to match either G_AMDGPU_FMED3 or
+// G_INTRINSIC @llvm.amdgcn.fmed3. Currently the legalizer will
+// replace the intrinsic with G_AMDGPU_FMED3 since we can't write a
+// pattern to match it.
+def expand_promoted_fmed3 : GICombineRule<
+ (defs root:$fptrunc_dst),
+ (match (G_FPTRUNC $fptrunc_dst, $fmed3_dst):$fptrunc,
+ (G_AMDGPU_FMED3 $fmed3_dst, $src0, $src1, $src2),
+ [{ return Helper.matchExpandPromotedF16FMed3(*${fptrunc}, ${src0}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }]),
+ (apply [{ Helper.applyExpandPromotedF16FMed3(*${fptrunc}, ${src0}.getReg(), ${src1}.getReg(), ${src2}.getReg()); }])
+>;
+
+} // End Predicates = [Has16BitInsts, NotHasMed3_16]
+
+// Combines which should only apply on SI/CI
def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
-def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
- "AMDGPUGenPreLegalizerCombinerHelper",
+// Combines which should only apply on VI
+def gfx8_combines : GICombineGroup<[expand_promoted_fmed3]>;
+
+def AMDGPUPreLegalizerCombiner: GICombinerHelper<
+ "AMDGPUPreLegalizerCombinerImpl",
[all_combines, clamp_i64_to_i16, foldable_fneg]> {
- let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
- let StateClass = "AMDGPUPreLegalizerCombinerHelperState";
- let AdditionalArguments = [];
}
-def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
- "AMDGPUGenPostLegalizerCombinerHelper",
- [all_combines, gfx6gfx7_combines,
+def AMDGPUPostLegalizerCombiner: GICombinerHelper<
+ "AMDGPUPostLegalizerCombinerImpl",
+ [all_combines, gfx6gfx7_combines, gfx8_combines,
uchar_to_float, cvt_f32_ubyteN, remove_fcanonicalize, foldable_fneg,
- rcp_sqrt_to_rsq]> {
- let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
- let StateClass = "AMDGPUPostLegalizerCombinerHelperState";
- let AdditionalArguments = [];
+ rcp_sqrt_to_rsq, sign_extension_in_reg]> {
}
-def AMDGPURegBankCombinerHelper : GICombinerHelper<
- "AMDGPUGenRegBankCombinerHelper",
- [zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
+def AMDGPURegBankCombiner : GICombinerHelper<
+ "AMDGPURegBankCombinerImpl",
+ [unmerge_merge, unmerge_cst, unmerge_undef,
+ zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain,
fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> {
- let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
- let StateClass = "AMDGPURegBankCombinerHelperState";
- let AdditionalArguments = [];
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
index 069baf748bfa..78fdedc0b511 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -380,3 +380,56 @@ void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
MI.eraseFromParent();
}
+
+// TODO: Should return converted value / extension source and avoid introducing
+// intermediate fptruncs in the apply function.
+static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
+ Register Reg) {
+ const MachineInstr *Def = MRI.getVRegDef(Reg);
+ if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
+ Register SrcReg = Def->getOperand(1).getReg();
+ return MRI.getType(SrcReg) == LLT::scalar(16);
+ }
+
+ if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
+ APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
+ bool LosesInfo = true;
+ Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
+ return !LosesInfo;
+ }
+
+ return false;
+}
+
+bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
+ Register Src0,
+ Register Src1,
+ Register Src2) {
+ assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
+ return false;
+
+ return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
+ isFPExtFromF16OrConst(MRI, Src2);
+}
+
+void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
+ Register Src0,
+ Register Src1,
+ Register Src2) {
+ Builder.setInstrAndDebugLoc(MI);
+
+ // We expect fptrunc (fpext x) to fold out, and to constant fold any constant
+ // sources.
+ Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
+ Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
+ Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);
+
+ LLT Ty = MRI.getType(Src0);
+ auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
+ auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
+ auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
+ Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
+ MI.eraseFromParent();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
index 1d4747136bf7..a933e85ce3ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.h
@@ -1,4 +1,4 @@
-//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.h -----------------------===//
+//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.h -------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -12,6 +12,9 @@
///
//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H
+
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
@@ -23,4 +26,11 @@ public:
bool matchFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
void applyFoldableFneg(MachineInstr &MI, MachineInstr *&MatchInfo);
+
+ bool matchExpandPromotedF16FMed3(MachineInstr &MI, Register Src0,
+ Register Src1, Register Src2);
+ void applyExpandPromotedF16FMed3(MachineInstr &MI, Register Src0,
+ Register Src1, Register Src2);
};
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOMBINERHELPER_H
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
index ba5a8799792a..a13447586bd4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp
@@ -31,15 +31,14 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
StringRef InitOrFiniKernelName = "amdgcn.device.init";
if (!IsCtor)
InitOrFiniKernelName = "amdgcn.device.fini";
+ if (M.getFunction(InitOrFiniKernelName))
+ return nullptr;
Function *InitOrFiniKernel = Function::createWithDefaultAttr(
FunctionType::get(Type::getVoidTy(M.getContext()), false),
- GlobalValue::ExternalLinkage, 0, InitOrFiniKernelName, &M);
- BasicBlock *InitOrFiniKernelBB =
- BasicBlock::Create(M.getContext(), "", InitOrFiniKernel);
- ReturnInst::Create(M.getContext(), InitOrFiniKernelBB);
-
+ GlobalValue::WeakODRLinkage, 0, InitOrFiniKernelName, &M);
InitOrFiniKernel->setCallingConv(CallingConv::AMDGPU_KERNEL);
+ InitOrFiniKernel->addFnAttr("amdgpu-flat-work-group-size", "1,1");
if (IsCtor)
InitOrFiniKernel->addFnAttr("device-init");
else
@@ -47,6 +46,71 @@ static Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
return InitOrFiniKernel;
}
+// The linker will provide the associated symbols to allow us to traverse the
+// global constructors / destructors in priority order. We create the IR
+// required to call each callback in this section. This is equivalent to the
+// following code.
+//
+// extern "C" void * __init_array_start[];
+// extern "C" void * __init_array_end[];
+//
+// using InitCallback = void();
+//
+// void call_init_array_callbacks() {
+// for (auto start = __init_array_start; start != __init_array_end; ++start)
+// reinterpret_cast<InitCallback *>(*start)();
+// }
+static void createInitOrFiniCalls(Function &F, bool IsCtor) {
+ Module &M = *F.getParent();
+ LLVMContext &C = M.getContext();
+
+ IRBuilder<> IRB(BasicBlock::Create(C, "entry", &F));
+ auto *LoopBB = BasicBlock::Create(C, "while.entry", &F);
+ auto *ExitBB = BasicBlock::Create(C, "while.end", &F);
+ Type *PtrTy = IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS);
+
+ auto *Begin = M.getOrInsertGlobal(
+ IsCtor ? "__init_array_start" : "__fini_array_start",
+ ArrayType::get(PtrTy, 0), [&]() {
+ return new GlobalVariable(
+ M, ArrayType::get(PtrTy, 0),
+ /*isConstant=*/true, GlobalValue::ExternalLinkage,
+ /*Initializer=*/nullptr,
+ IsCtor ? "__init_array_start" : "__fini_array_start",
+ /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
+ /*AddressSpace=*/1);
+ });
+ auto *End = M.getOrInsertGlobal(
+ IsCtor ? "__init_array_end" : "__fini_array_end",
+ ArrayType::get(PtrTy, 0), [&]() {
+ return new GlobalVariable(
+ M, ArrayType::get(PtrTy, 0),
+ /*isConstant=*/true, GlobalValue::ExternalLinkage,
+ /*Initializer=*/nullptr,
+ IsCtor ? "__init_array_end" : "__fini_array_end",
+ /*InsertBefore=*/nullptr, GlobalVariable::NotThreadLocal,
+ /*AddressSpace=*/1);
+ });
+
+ // The constructor type is supposed to allow using the argument vectors, but
+ // for now we just call them with no arguments.
+ auto *CallBackTy = FunctionType::get(IRB.getVoidTy(), {});
+
+ IRB.CreateCondBr(IRB.CreateICmpNE(Begin, End), LoopBB, ExitBB);
+ IRB.SetInsertPoint(LoopBB);
+ auto *CallBackPHI = IRB.CreatePHI(PtrTy, 2, "ptr");
+ auto *CallBack = IRB.CreateLoad(CallBackTy->getPointerTo(F.getAddressSpace()),
+ CallBackPHI, "callback");
+ IRB.CreateCall(CallBackTy, CallBack);
+ auto *NewCallBack = IRB.CreateConstGEP1_64(PtrTy, CallBackPHI, 1, "next");
+ auto *EndCmp = IRB.CreateICmpEQ(NewCallBack, End, "end");
+ CallBackPHI->addIncoming(Begin, &F.getEntryBlock());
+ CallBackPHI->addIncoming(NewCallBack, LoopBB);
+ IRB.CreateCondBr(EndCmp, ExitBB, LoopBB);
+ IRB.SetInsertPoint(ExitBB);
+ IRB.CreateRetVoid();
+}
+
static bool createInitOrFiniKernel(Module &M, StringRef GlobalName,
bool IsCtor) {
GlobalVariable *GV = M.getGlobalVariable(GlobalName);
@@ -57,18 +121,12 @@ static bool createInitOrFiniKernel(Module &M, StringRef GlobalName,
return false;
Function *InitOrFiniKernel = createInitOrFiniKernelFunction(M, IsCtor);
- IRBuilder<> IRB(InitOrFiniKernel->getEntryBlock().getTerminator());
-
- FunctionType *ConstructorTy = InitOrFiniKernel->getFunctionType();
+ if (!InitOrFiniKernel)
+ return false;
- for (Value *V : GA->operands()) {
- auto *CS = cast<ConstantStruct>(V);
- IRB.CreateCall(ConstructorTy, CS->getOperand(1));
- }
+ createInitOrFiniCalls(*InitOrFiniKernel, IsCtor);
appendToUsed(M, {InitOrFiniKernel});
-
- GV->eraseFromParent();
return true;
}
@@ -83,17 +141,15 @@ class AMDGPUCtorDtorLoweringLegacy final : public ModulePass {
public:
static char ID;
AMDGPUCtorDtorLoweringLegacy() : ModulePass(ID) {}
- bool runOnModule(Module &M) override {
- return lowerCtorsAndDtors(M);
- }
+ bool runOnModule(Module &M) override { return lowerCtorsAndDtors(M); }
};
} // End anonymous namespace
PreservedAnalyses AMDGPUCtorDtorLoweringPass::run(Module &M,
ModuleAnalysisManager &AM) {
- lowerCtorsAndDtors(M);
- return PreservedAnalyses::all();
+ return lowerCtorsAndDtors(M) ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
}
char AMDGPUCtorDtorLoweringLegacy::ID = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 7e7dbacaac11..37df4f68c265 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -31,6 +31,10 @@ def gi_vop3mods :
GIComplexOperandMatcher<s32, "selectVOP3Mods">,
GIComplexPatternEquiv<VOP3Mods>;
+def gi_vop3modsnoncanonicalizing :
+ GIComplexOperandMatcher<s32, "selectVOP3ModsNonCanonicalizing">,
+ GIComplexPatternEquiv<VOP3ModsNonCanonicalizing>;
+
def gi_vop3_no_mods :
GIComplexOperandMatcher<s32, "selectVOP3NoMods">,
GIComplexPatternEquiv<VOP3NoMods>;
@@ -153,6 +157,10 @@ def gi_vop3_mad_mix_mods :
GIComplexOperandMatcher<s64, "selectVOP3PMadMixMods">,
GIComplexPatternEquiv<VOP3PMadMixMods>;
+def gi_vop3_mad_mix_mods_ext :
+ GIComplexOperandMatcher<s64, "selectVOP3PMadMixModsExt">,
+ GIComplexPatternEquiv<VOP3PMadMixModsExt>;
+
// Separate load nodes are defined to glue m0 initialization in
// SelectionDAG. The GISel selector can just insert m0 initialization
// directly before selecting a glue-less load, so hide this
@@ -227,10 +235,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT, SItbuffer_store>;
def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
-def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, SIatomic_dec>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, atomic_inc_glue>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, atomic_dec_glue>;
+def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, SIatomic_fmin>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_FMAX, SIatomic_fmax>;
def : GINodeEquiv<G_AMDGPU_ATOMIC_FMIN, atomic_load_fmin_glue>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 2ffc8b2a3a7b..09930dc9612c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -10,8 +10,8 @@
#include "GCNSubtarget.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/IR/Constants.h"
-#include "llvm/Support/LowLevelTypeImpl.h"
using namespace llvm;
using namespace MIPatternMatch;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index a71ba6b77565..dadc0c92ef8b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -418,9 +418,7 @@ void MetadataStreamerYamlV2::emitHiddenKernelArgs(const Function &Func,
}
if (HiddenArgNumBytes >= 48) {
- if (!Func.hasFnAttribute("amdgpu-no-completion-action") &&
- // FIXME: Hack for runtime bug if we fail to optimize this out
- Func.hasFnAttribute("calls-enqueue-kernel")) {
+ if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenCompletionAction);
} else {
emitKernelArg(DL, Int8PtrTy, Align(8), ValueKind::HiddenNone);
@@ -854,9 +852,7 @@ void MetadataStreamerMsgPackV3::emitHiddenKernelArgs(
}
if (HiddenArgNumBytes >= 48) {
- if (!Func.hasFnAttribute("amdgpu-no-completion-action") &&
- // FIXME: Hack for runtime bug if we fail to optimize this out
- Func.hasFnAttribute("calls-enqueue-kernel")) {
+ if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
Args);
} else {
@@ -876,7 +872,8 @@ void MetadataStreamerMsgPackV3::emitHiddenKernelArgs(
}
msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps(
- const MachineFunction &MF, const SIProgramInfo &ProgramInfo) const {
+ const MachineFunction &MF, const SIProgramInfo &ProgramInfo,
+ unsigned CodeObjectVersion) const {
const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
const Function &F = MF.getFunction();
@@ -890,10 +887,11 @@ msgpack::MapDocNode MetadataStreamerMsgPackV3::getHSAKernelProps(
Kern.getDocument()->getNode(ProgramInfo.LDSSize);
Kern[".private_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
- if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5)
+ if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
Kern[".uses_dynamic_stack"] =
Kern.getDocument()->getNode(ProgramInfo.DynamicCallStack);
- if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5 && STM.supportsWGP())
+
+ if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5 && STM.supportsWGP())
Kern[".workgroup_processor_mode"] =
Kern.getDocument()->getNode(ProgramInfo.WgpMode);
@@ -945,10 +943,12 @@ void MetadataStreamerMsgPackV3::end() {
void MetadataStreamerMsgPackV3::emitKernel(const MachineFunction &MF,
const SIProgramInfo &ProgramInfo) {
auto &Func = MF.getFunction();
- auto Kern = getHSAKernelProps(MF, ProgramInfo);
+ if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
+ Func.getCallingConv() != CallingConv::SPIR_KERNEL)
+ return;
- assert(Func.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
- Func.getCallingConv() == CallingConv::SPIR_KERNEL);
+ auto CodeObjectVersion = AMDGPU::getCodeObjectVersion(*Func.getParent());
+ auto Kern = getHSAKernelProps(MF, ProgramInfo, CodeObjectVersion);
auto Kernels =
getRootMetadata("amdhsa.kernels").getArray(/*Convert=*/true);
@@ -1079,9 +1079,7 @@ void MetadataStreamerMsgPackV5::emitHiddenKernelArgs(
Offset += 8; // Skipped.
}
- if (!Func.hasFnAttribute("amdgpu-no-completion-action") &&
- // FIXME: Hack for runtime bug
- Func.hasFnAttribute("calls-enqueue-kernel")) {
+ if (!Func.hasFnAttribute("amdgpu-no-completion-action")) {
emitKernelArg(DL, Int8PtrTy, Align(8), "hidden_completion_action", Offset,
Args);
} else {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 91670b9820a2..7d7080e920f5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -84,7 +84,8 @@ protected:
msgpack::ArrayDocNode getWorkGroupDimensions(MDNode *Node) const;
msgpack::MapDocNode getHSAKernelProps(const MachineFunction &MF,
- const SIProgramInfo &ProgramInfo) const;
+ const SIProgramInfo &ProgramInfo,
+ unsigned CodeObjectVersion) const;
void emitVersion() override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index fc0df61952e4..ffa6c88f9d41 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -80,6 +80,37 @@ enum class SchedGroupMask {
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
+class SchedGroup;
+
+// InstructionRule class is used to enact a filter which determines whether or
+// not an SU maps to a given SchedGroup. It contains complementary data
+// structures (e.g Cache) to help those filters.
+class InstructionRule {
+protected:
+ const SIInstrInfo *TII;
+ unsigned SGID;
+ // A cache made available to the Filter to store SUnits for subsequent
+ // invocations of the Filter
+ std::optional<SmallVector<SUnit *, 4>> Cache;
+
+public:
+ virtual bool
+ apply(const SUnit *, const ArrayRef<SUnit *>,
+ SmallVectorImpl<SchedGroup> &) {
+ return true;
+ };
+
+ InstructionRule(const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : TII(TII), SGID(SGID) {
+ if (NeedsCache) {
+ Cache = SmallVector<SUnit *, 4>();
+ }
+ }
+
+ virtual ~InstructionRule() = default;
+};
+
typedef DenseMap<SUnit *, SmallVector<int, 4>> SUnitsToCandidateSGsMap;
// Classify instructions into groups to enable fine tuned control over the
@@ -102,11 +133,12 @@ private:
// SGID is used to map instructions to candidate SchedGroups
unsigned SGID;
+ // The different rules each instruction in this SchedGroup must conform to
+ SmallVector<std::shared_ptr<InstructionRule>, 4> Rules;
+
// Count of the number of created SchedGroups, used to initialize SGID.
static unsigned NumSchedGroups;
- ScheduleDAGInstrs *DAG;
-
const SIInstrInfo *TII;
// Try to add and edge from SU A to SU B.
@@ -120,6 +152,8 @@ public:
// Collection of SUnits that are classified as members of this group.
SmallVector<SUnit *, 32> Collection;
+ ScheduleDAGInstrs *DAG;
+
// Returns true if SU can be added to this SchedGroup.
bool canAddSU(SUnit &SU) const;
@@ -145,6 +179,28 @@ public:
// Returns true if no more instructions may be added to this group.
bool isFull() const { return MaxSize && Collection.size() >= *MaxSize; }
+ // Append a constraint that SUs must meet in order to fit into this
+ // SchedGroup. Since many rules involve the relationship between a SchedGroup
+ // and the SUnits in other SchedGroups, rules are checked at Pipeline Solve
+ // time (rather than SchedGroup init time.)
+ void addRule(std::shared_ptr<InstructionRule> NewRule) {
+ Rules.push_back(NewRule);
+ }
+
+ // Returns true if the SU matches all rules
+ bool allowedByRules(const SUnit *SU,
+ SmallVectorImpl<SchedGroup> &SyncPipe) const {
+ if (Rules.empty())
+ return true;
+ for (size_t I = 0; I < Rules.size(); I++) {
+ auto TheRule = Rules[I].get();
+ if (!TheRule->apply(SU, Collection, SyncPipe)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
// Add SU to the SchedGroup.
void add(SUnit &SU) {
LLVM_DEBUG(dbgs() << "For SchedGroup with mask "
@@ -177,13 +233,13 @@ public:
SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : SGMask(SGMask), MaxSize(MaxSize), DAG(DAG), TII(TII) {
+ : SGMask(SGMask), MaxSize(MaxSize), TII(TII), DAG(DAG) {
SGID = NumSchedGroups++;
}
SchedGroup(SchedGroupMask SGMask, std::optional<unsigned> MaxSize, int SyncID,
ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), DAG(DAG), TII(TII) {
+ : SGMask(SGMask), MaxSize(MaxSize), SyncID(SyncID), TII(TII), DAG(DAG) {
SGID = NumSchedGroups++;
}
};
@@ -254,6 +310,9 @@ class PipelineSolver {
// How many branches we have explored
uint64_t BranchesExplored = 0;
+ // The direction in which we process the candidate SchedGroups per SU
+ bool IsBottomUp = 1;
+
// Update indices to fit next conflicting instruction
void advancePosition();
// Recede indices to attempt to find better fit for previous conflicting
@@ -264,19 +323,35 @@ class PipelineSolver {
bool solveExact();
// The polynomial time algorithm which attempts to find a good fit
bool solveGreedy();
+ // Find the best SchedGroup for the current SU using the heuristic given all
+ // current information. One step in the greedy algorithm. Templated against
+ // the SchedGroup iterator (either reverse or forward).
+ template <typename T>
+ void greedyFind(std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I,
+ T E);
// Whether or not the current solution is optimal
bool checkOptimal();
// Populate the ready list, prioiritizing fewest missed edges first
- void populateReadyList(SUToCandSGsPair &CurrSU,
- SmallVectorImpl<std::pair<int, int>> &ReadyList,
- SmallVectorImpl<SchedGroup> &SyncPipeline);
+ // Templated against the SchedGroup iterator (either reverse or forward).
+ template <typename T>
+ void populateReadyList(SmallVectorImpl<std::pair<int, int>> &ReadyList, T I,
+ T E);
// Add edges corresponding to the SchedGroups as assigned by solver
void makePipeline();
+ // Link the SchedGroups in the best found pipeline.
+ // Templated against the SchedGroup iterator (either reverse or forward).
+ template <typename T> void linkSchedGroups(T I, T E);
// Add the edges from the SU to the other SchedGroups in pipeline, and
// return the number of edges missed.
int addEdges(SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
- // Remove the edges passed via AddedEdges
+ // Link the pipeline as if \p SU was in the SchedGroup with ID \p SGID. It
+ // returns the cost (in terms of missed pipeline edges), and tracks the edges
+ // added in \p AddedEdges
+ template <typename T>
+ int linkSUnit(SUnit *SU, int SGID,
+ std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E);
+ // Remove the edges passed via \p AddedEdges
void removeEdges(const std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges);
// Convert the passed in maps to arrays for bidirectional iterators
void convertSyncMapsToArrays();
@@ -290,9 +365,9 @@ public:
PipelineSolver(DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups,
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
- ScheduleDAGMI *DAG)
+ ScheduleDAGMI *DAG, bool IsBottomUp = 1)
: DAG(DAG), SyncedInstrs(SyncedInstrs),
- SyncedSchedGroups(SyncedSchedGroups) {
+ SyncedSchedGroups(SyncedSchedGroups), IsBottomUp(IsBottomUp) {
for (auto &PipelineInstrs : SyncedInstrs) {
if (PipelineInstrs.second.size() > 0) {
@@ -363,14 +438,28 @@ void PipelineSolver::convertSyncMapsToArrays() {
}
}
+template <typename T> void PipelineSolver::linkSchedGroups(T I, T E) {
+ for (; I != E; ++I) {
+ auto &GroupA = *I;
+ for (auto J = std::next(I); J != E; ++J) {
+ auto &GroupB = *J;
+ GroupA.link(GroupB);
+ }
+ }
+}
+
void PipelineSolver::makePipeline() {
// Preserve the order of barrier for subsequent SchedGroupBarrier mutations
for (auto &SyncPipeline : BestPipeline) {
+ LLVM_DEBUG(dbgs() << "Printing SchedGroups\n");
for (auto &SG : SyncPipeline) {
+ LLVM_DEBUG(dbgs() << "SchedGroup with SGID " << SG.getSGID()
+ << " has: \n");
SUnit *SGBarr = nullptr;
for (auto &SU : SG.Collection) {
if (SU->getInstr()->getOpcode() == AMDGPU::SCHED_GROUP_BARRIER)
SGBarr = SU;
+ LLVM_DEBUG(dbgs() << "SU(" << SU->NodeNum << ")\n");
}
// Command line requested IGroupLP doesn't have SGBarr
if (!SGBarr)
@@ -381,43 +470,47 @@ void PipelineSolver::makePipeline() {
}
for (auto &SyncPipeline : BestPipeline) {
- auto I = SyncPipeline.rbegin();
- auto E = SyncPipeline.rend();
- for (; I != E; ++I) {
- auto &GroupA = *I;
- for (auto J = std::next(I); J != E; ++J) {
- auto &GroupB = *J;
- GroupA.link(GroupB);
- }
- }
+ IsBottomUp ? linkSchedGroups(SyncPipeline.rbegin(), SyncPipeline.rend())
+ : linkSchedGroups(SyncPipeline.begin(), SyncPipeline.end());
}
}
-int PipelineSolver::addEdges(
- SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
- std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
- int AddedCost = 0;
+template <typename T>
+int PipelineSolver::linkSUnit(
+ SUnit *SU, int SGID, std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges,
+ T I, T E) {
bool MakePred = false;
-
- // The groups in the pipeline are in reverse order. Thus,
- // by traversing them from last to first, we are traversing
- // them in the order as they were introduced in the code. After we
- // pass the group the SU is being assigned to, it should be
- // linked as a predecessor of the subsequent SchedGroups
- auto GroupNo = (int)SyncPipeline.size() - 1;
- for (; GroupNo >= 0; GroupNo--) {
- if (SyncPipeline[GroupNo].getSGID() == SGID) {
+ int AddedCost = 0;
+ for (; I < E; ++I) {
+ if (I->getSGID() == SGID) {
MakePred = true;
continue;
}
- auto Group = &SyncPipeline[GroupNo];
- AddedCost += Group->link(*SU, MakePred, AddedEdges);
+ auto Group = *I;
+ AddedCost += Group.link(*SU, MakePred, AddedEdges);
assert(AddedCost >= 0);
}
-
return AddedCost;
}
+int PipelineSolver::addEdges(
+ SmallVectorImpl<SchedGroup> &SyncPipeline, SUnit *SU, int SGID,
+ std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges) {
+
+ // For IsBottomUp, the first SchedGroup in SyncPipeline contains the
+ // instructions that are the ultimate successors in the resultant mutation.
+ // Therefore, in such a configuration, the SchedGroups occurring before the
+ // candidate SGID are successors of the candidate SchedGroup, thus the current
+ // SU should be linked as a predecessor to SUs in those SchedGroups. The
+ // opposite is true if !IsBottomUp. IsBottomUp occurs in the case of multiple
+ // SCHED_GROUP_BARRIERS, or if a user specifies IGLP_OPT SchedGroups using
+ // IsBottomUp (in reverse).
+ return IsBottomUp ? linkSUnit(SU, SGID, AddedEdges, SyncPipeline.rbegin(),
+ SyncPipeline.rend())
+ : linkSUnit(SU, SGID, AddedEdges, SyncPipeline.begin(),
+ SyncPipeline.end());
+}
+
void PipelineSolver::removeEdges(
const std::vector<std::pair<SUnit *, SUnit *>> &EdgesToRemove) {
// Only remove the edges that we have added when testing
@@ -490,12 +583,13 @@ bool PipelineSolver::checkOptimal() {
return (DoneExploring || BestCost == 0);
}
+template <typename T>
void PipelineSolver::populateReadyList(
- SUToCandSGsPair &CurrSU, SmallVectorImpl<std::pair<int, int>> &ReadyList,
- SmallVectorImpl<SchedGroup> &SyncPipeline) {
+ SmallVectorImpl<std::pair<int, int>> &ReadyList, T I, T E) {
+ SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
+ auto SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
assert(CurrSU.second.size() >= 1);
- auto I = CurrSU.second.rbegin();
- auto E = CurrSU.second.rend();
+
for (; I != E; ++I) {
std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
int CandSGID = *I;
@@ -545,7 +639,10 @@ bool PipelineSolver::solveExact() {
// SchedGroup -> Cost pairs
SmallVector<std::pair<int, int>, 4> ReadyList;
// Prioritize the candidate sched groups in terms of lowest cost first
- populateReadyList(CurrSU, ReadyList, CurrPipeline[CurrSyncGroupIdx]);
+ IsBottomUp ? populateReadyList(ReadyList, CurrSU.second.rbegin(),
+ CurrSU.second.rend())
+ : populateReadyList(ReadyList, CurrSU.second.begin(),
+ CurrSU.second.end());
auto I = ReadyList.begin();
auto E = ReadyList.end();
@@ -569,6 +666,9 @@ bool PipelineSolver::solveExact() {
if (Match->isFull())
continue;
+ if (!Match->allowedByRules(CurrSU.first, SyncPipeline))
+ continue;
+
LLVM_DEBUG(dbgs() << "Assigning to SchedGroup with Mask "
<< (int)Match->getMask() << "and ID " << CandSGID
<< "\n");
@@ -620,64 +720,75 @@ bool PipelineSolver::solveExact() {
return FinishedExploring;
}
-bool PipelineSolver::solveGreedy() {
- BestCost = 0;
- std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
-
- while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
- SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
- int BestNodeCost = -1;
- int TempCost;
- SchedGroup *BestGroup = nullptr;
- int BestGroupID = -1;
- auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
- LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
- << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
+template <typename T>
+void PipelineSolver::greedyFind(
+ std::vector<std::pair<SUnit *, SUnit *>> &AddedEdges, T I, T E) {
+ SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
+ int BestNodeCost = -1;
+ int TempCost;
+ SchedGroup *BestGroup = nullptr;
+ int BestGroupID = -1;
+ auto &SyncPipeline = CurrPipeline[CurrSyncGroupIdx];
+ LLVM_DEBUG(dbgs() << "Fitting SU(" << CurrSU.first->NodeNum
+ << ") in Pipeline # " << CurrSyncGroupIdx << "\n");
- // Since we have added the potential SchedGroups from bottom up, but
- // traversed the DAG from top down, parse over the groups from last to
- // first. If we fail to do this for the greedy algorithm, the solution will
- // likely not be good in more complex cases.
- auto I = CurrSU.second.rbegin();
- auto E = CurrSU.second.rend();
- for (; I != E; ++I) {
- std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
- int CandSGID = *I;
- SchedGroup *Match;
- for (auto &SG : SyncPipeline) {
- if (SG.getSGID() == CandSGID)
- Match = &SG;
- }
+ // Since we have added the potential SchedGroups from bottom up, but
+ // traversed the DAG from top down, parse over the groups from last to
+ // first. If we fail to do this for the greedy algorithm, the solution will
+ // likely not be good in more complex cases.
+ for (; I != E; ++I) {
+ std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+ int CandSGID = *I;
+ SchedGroup *Match;
+ for (auto &SG : SyncPipeline) {
+ if (SG.getSGID() == CandSGID)
+ Match = &SG;
+ }
- LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
- << (int)Match->getMask() << "\n");
+ LLVM_DEBUG(dbgs() << "Trying SGID # " << CandSGID << " with Mask "
+ << (int)Match->getMask() << "\n");
- if (Match->isFull()) {
- LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
- continue;
- }
- TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
- LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
- if (TempCost < BestNodeCost || BestNodeCost == -1) {
- BestGroup = Match;
- BestNodeCost = TempCost;
- BestGroupID = CandSGID;
- }
- removeEdges(AddedEdges);
- if (BestNodeCost == 0)
- break;
+ if (Match->isFull()) {
+ LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " is full\n");
+ continue;
+ }
+ if (!Match->allowedByRules(CurrSU.first, SyncPipeline)) {
+ LLVM_DEBUG(dbgs() << "SGID # " << CandSGID << " has conflicting rule\n");
+ continue;
}
+ TempCost = addEdges(SyncPipeline, CurrSU.first, CandSGID, AddedEdges);
+ LLVM_DEBUG(dbgs() << "Cost of Group " << TempCost << "\n");
+ if (TempCost < BestNodeCost || BestNodeCost == -1) {
+ BestGroup = Match;
+ BestNodeCost = TempCost;
+ BestGroupID = CandSGID;
+ }
+ removeEdges(AddedEdges);
+ if (BestNodeCost == 0)
+ break;
+ }
- if (BestGroupID != -1) {
- BestGroup->add(*CurrSU.first);
- addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
- LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
- << (int)BestGroup->getMask() << "\n");
- BestCost += TempCost;
- } else
- BestCost += MissPenalty;
+ if (BestGroupID != -1) {
+ BestGroup->add(*CurrSU.first);
+ addEdges(SyncPipeline, CurrSU.first, BestGroupID, AddedEdges);
+ LLVM_DEBUG(dbgs() << "Best Group has ID: " << BestGroupID << " and Mask"
+ << (int)BestGroup->getMask() << "\n");
+ BestCost += TempCost;
+ } else
+ BestCost += MissPenalty;
- CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
+ CurrPipeline[CurrSyncGroupIdx] = SyncPipeline;
+}
+
+bool PipelineSolver::solveGreedy() {
+ BestCost = 0;
+ std::vector<std::pair<SUnit *, SUnit *>> AddedEdges;
+
+ while (static_cast<size_t>(CurrSyncGroupIdx) < PipelineInstrs.size()) {
+ SUToCandSGsPair CurrSU = PipelineInstrs[CurrSyncGroupIdx][CurrConflInstNo];
+ IsBottomUp
+ ? greedyFind(AddedEdges, CurrSU.second.rbegin(), CurrSU.second.rend())
+ : greedyFind(AddedEdges, CurrSU.second.begin(), CurrSU.second.end());
advancePosition();
}
BestPipeline = CurrPipeline;
@@ -721,9 +832,14 @@ void PipelineSolver::solve() {
}
makePipeline();
+ LLVM_DEBUG(dbgs() << "After applying mutation\n");
+ LLVM_DEBUG(DAG->dump());
}
-enum IGLPStrategyID : int { MFMASmallGemmOptID = 0 };
+enum IGLPStrategyID : int {
+ MFMASmallGemmOptID = 0,
+ MFMASmallGemmSingleWaveOptID = 1,
+};
// Implement a IGLP scheduling strategy.
class IGLPStrategy {
@@ -741,6 +857,8 @@ public:
// Returns true if this strategy should be applied to a ScheduleDAG.
virtual bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) = 0;
+ bool IsBottomUp = 1;
+
IGLPStrategy(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
: DAG(DAG), TII(TII) {}
@@ -748,6 +866,7 @@ public:
};
class MFMASmallGemmOpt final : public IGLPStrategy {
+private:
public:
void applyIGLPStrategy(
DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
@@ -756,7 +875,9 @@ public:
bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
MFMASmallGemmOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
- : IGLPStrategy(DAG, TII) {}
+ : IGLPStrategy(DAG, TII) {
+ IsBottomUp = 1;
+ }
};
void MFMASmallGemmOpt::applyIGLPStrategy(
@@ -781,12 +902,456 @@ void MFMASmallGemmOpt::applyIGLPStrategy(
}
}
+class MFMASmallGemmSingleWaveOpt final : public IGLPStrategy {
+private:
+ // Whether the DS_READ is a predecessor of first four MFMA in region
+ class EnablesInitialMFMA final : public InstructionRule {
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ if (!SyncPipe.size())
+ return false;
+ int MFMAsFound = 0;
+ if (!Cache->size()) {
+ for (auto &Elt : SyncPipe[0].DAG->SUnits) {
+ if (TII->isMFMAorWMMA(*Elt.getInstr())) {
+ ++MFMAsFound;
+ if (MFMAsFound > 4)
+ break;
+ Cache->push_back(&Elt);
+ }
+ }
+ }
+
+ assert(Cache->size());
+ auto DAG = SyncPipe[0].DAG;
+ for (auto &Elt : *Cache) {
+ if (DAG->IsReachable(Elt, const_cast<SUnit *>(SU)))
+ return true;
+ }
+ return false;
+ }
+
+ EnablesInitialMFMA(const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+ // Whether the MI is a V_PERM and is a predecessor of a common DS_WRITE
+ class IsPermForDSW final : public InstructionRule {
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ auto MI = SU->getInstr();
+ if (MI->getOpcode() != AMDGPU::V_PERM_B32_e64)
+ return false;
+
+ bool FitsInGroup = false;
+ // Does the VALU have a DS_WRITE successor
+ if (!Collection.size()) {
+ for (auto &Succ : SU->Succs) {
+ SUnit *SuccUnit = Succ.getSUnit();
+ if (TII->isDS(*SuccUnit->getInstr()) &&
+ SuccUnit->getInstr()->mayStore()) {
+ Cache->push_back(SuccUnit);
+ FitsInGroup = true;
+ }
+ }
+ return FitsInGroup;
+ }
+
+ assert(Cache->size());
+
+ // Does the VALU have a DS_WRITE successor that is the same as other
+ // VALU already in the group. The V_PERMs will all share 1 DS_W succ
+ return std::any_of(Cache->begin(), Cache->end(), [&SU](SUnit *Elt) {
+ return std::any_of(SU->Succs.begin(), SU->Succs.end(),
+ [&Elt](const SDep &ThisSucc) {
+ return ThisSucc.getSUnit() == Elt;
+ });
+ });
+ }
+
+ IsPermForDSW(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+ // Whether the SU is a successor of any element in previous SchedGroup
+ class IsSuccOfPrevGroup final : public InstructionRule {
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ SchedGroup *OtherGroup = nullptr;
+ for (auto &PipeSG : SyncPipe) {
+ if ((unsigned)PipeSG.getSGID() == SGID - 1) {
+ OtherGroup = &PipeSG;
+ }
+ }
+
+ if (!OtherGroup)
+ return false;
+ if (!OtherGroup->Collection.size())
+ return true;
+
+ // Does the previous VALU have this DS_Write as a successor
+ return (std::any_of(OtherGroup->Collection.begin(),
+ OtherGroup->Collection.end(), [&SU](SUnit *Elt) {
+ return std::any_of(Elt->Succs.begin(),
+ Elt->Succs.end(),
+ [&SU](SDep &Succ) {
+ return Succ.getSUnit() == SU;
+ });
+ }));
+ }
+ IsSuccOfPrevGroup(const SIInstrInfo *TII, unsigned SGID,
+ bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+ // Whether the combined load width of group is 128 bits
+ class VMEMSize final : public InstructionRule {
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ auto MI = SU->getInstr();
+ if (MI->getOpcode() == TargetOpcode::BUNDLE)
+ return false;
+ if (!Collection.size())
+ return true;
+
+ int NumBits = 0;
+
+ auto TRI = TII->getRegisterInfo();
+ auto &MRI = MI->getParent()->getParent()->getRegInfo();
+ for (auto &Elt : Collection) {
+ auto Op = Elt->getInstr()->getOperand(0);
+ auto Size =
+ TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(MRI, Op));
+ NumBits += Size;
+ }
+
+ if (NumBits < 128) {
+ assert(TII->isVMEM(*MI) && MI->mayLoad());
+ if (NumBits + TRI.getRegSizeInBits(*TRI.getRegClassForOperandReg(
+ MRI, MI->getOperand(0))) <=
+ 128)
+ return true;
+ }
+
+ return false;
+ }
+
+ VMEMSize(const SIInstrInfo *TII, unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache) {}
+ };
+
+ // Whether the SU shares a V_PERM predecessor with any SU in the SchedGroup
+ // that is /p Distance steps away
+ class SharesPredWithPrevNthGroup final : public InstructionRule {
+ private:
+ unsigned Distance = 1;
+
+ public:
+ bool apply(const SUnit *SU, const ArrayRef<SUnit *> Collection,
+ SmallVectorImpl<SchedGroup> &SyncPipe) override {
+ SchedGroup *OtherGroup = nullptr;
+ if (!SyncPipe.size())
+ return false;
+
+ if (!Cache->size()) {
+
+ for (auto &PipeSG : SyncPipe) {
+ if ((unsigned)PipeSG.getSGID() == SGID - Distance) {
+ OtherGroup = &PipeSG;
+ }
+ }
+
+ if (!OtherGroup)
+ return false;
+ if (!OtherGroup->Collection.size())
+ return true;
+
+ for (auto &OtherEle : OtherGroup->Collection) {
+ for (auto &Pred : OtherEle->Preds) {
+ if (Pred.getSUnit()->getInstr()->getOpcode() ==
+ AMDGPU::V_PERM_B32_e64)
+ Cache->push_back(Pred.getSUnit());
+ }
+ }
+ }
+
+ assert(Cache->size());
+ auto DAG = SyncPipe[0].DAG;
+ // Does the previous DS_WRITE share a V_PERM predecessor with this
+ // VMEM_READ
+ return (
+ std::any_of(Cache->begin(), Cache->end(), [&SU, &DAG](SUnit *Elt) {
+ return DAG->IsReachable(const_cast<SUnit *>(SU), Elt);
+ }));
+ }
+ SharesPredWithPrevNthGroup(unsigned Distance, const SIInstrInfo *TII,
+ unsigned SGID, bool NeedsCache = false)
+ : InstructionRule(TII, SGID, NeedsCache), Distance(Distance) {}
+ };
+
+public:
+ void applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) override;
+
+ bool shouldApplyStrategy(ScheduleDAGInstrs *DAG) override { return true; }
+
+ MFMASmallGemmSingleWaveOpt(ScheduleDAGInstrs *DAG, const SIInstrInfo *TII)
+ : IGLPStrategy(DAG, TII) {
+ IsBottomUp = 0;
+ }
+};
+
+void MFMASmallGemmSingleWaveOpt::applyIGLPStrategy(
+ DenseMap<int, SUnitsToCandidateSGsMap> &SyncedInstrs,
+ DenseMap<int, SmallVector<SchedGroup, 4>> &SyncedSchedGroups) {
+ unsigned MFMACount = 0;
+ unsigned DSWCount = 0;
+ unsigned DSWWithPermCount = 0;
+ unsigned DSWWithSharedVMEMCount = 0;
+ unsigned DSRCount = 0;
+ SmallVector<SUnit *, 6> DSWithPerms;
+ for (auto &SU : DAG->SUnits) {
+ auto I = SU.getInstr();
+ if (TII->isMFMAorWMMA(*I))
+ ++MFMACount;
+ else if (TII->isDS(*I)) {
+ if (I->mayLoad())
+ ++DSRCount;
+ else if (I->mayStore()) {
+ ++DSWCount;
+ for (auto Pred : SU.Preds) {
+ if (Pred.getSUnit()->getInstr()->getOpcode() ==
+ AMDGPU::V_PERM_B32_e64) {
+ DSWithPerms.push_back(&SU);
+ break;
+ }
+ }
+ }
+ }
+ }
+ DSWWithPermCount = DSWithPerms.size();
+ auto I = DSWithPerms.begin();
+ auto E = DSWithPerms.end();
+
+ // Get the count of DS_WRITES with V_PERM predecessors which
+ // have loop carried dependencies (WAR) on the same VMEM_READs.
+ // We consider partial overlap as a miss -- in other words,
+ // for a given DS_W, we only consider another DS_W as matching
+ // if there is a corresponding (in terms of the VMEM_R it uses) V_PERM pred
+ // for every V_PERM pred of this DS_W.
+ DenseMap<MachineInstr *, SUnit *> VMEMLookup;
+ SmallVector<SUnit *, 6> Counted;
+ for (; I != E; I++) {
+ SUnit *Cand = nullptr;
+ bool MissedAny = false;
+ for (auto &Pred : (*I)->Preds) {
+ if (Pred.getSUnit()->getInstr()->getOpcode() != AMDGPU::V_PERM_B32_e64)
+ continue;
+
+ if (Cand &&
+ std::find(Counted.begin(), Counted.end(), Cand) != Counted.end())
+ break;
+
+ for (auto &Succ : Pred.getSUnit()->Succs) {
+ auto MI = Succ.getSUnit()->getInstr();
+ if (!TII->isVMEM(*MI) || !MI->mayLoad())
+ continue;
+
+ if (MissedAny || !VMEMLookup.size()) {
+ MissedAny = true;
+ VMEMLookup[MI] = *I;
+ continue;
+ }
+
+ if (!VMEMLookup.contains(MI)) {
+ MissedAny = true;
+ VMEMLookup[MI] = *I;
+ continue;
+ }
+
+ Cand = VMEMLookup[MI];
+ if (std::find(Counted.begin(), Counted.end(), Cand) != Counted.end()) {
+ MissedAny = true;
+ break;
+ }
+ }
+ }
+ if (!MissedAny && Cand) {
+ DSWWithSharedVMEMCount += 2;
+ Counted.push_back(Cand);
+ Counted.push_back(*I);
+ }
+ }
+
+ assert(DSWWithSharedVMEMCount <= DSWWithPermCount);
+ SchedGroup *SG;
+ unsigned PipelineSyncID = 0;
+ // For kernels with V_PERM, there are enough VALU to mix in between MFMAs
+ if (DSWWithPermCount) {
+ for (unsigned I = 0; I < MFMACount; I++) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, 2, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+ }
+
+ PipelineSyncID = 1;
+ // Phase 1: Break up DS_READ and MFMA clusters.
+ // First DS_READ to make ready initial MFMA, then interleave MFMA with DS_READ
+ // prefetch
+
+ // Make ready initial MFMA
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::DS_READ, 4, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<EnablesInitialMFMA>(TII, SG->getSGID(), true));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ // Interleave MFMA with DS_READ prefetch
+ for (unsigned I = 0; I < DSRCount - 4; ++I) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::DS_READ, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ // Phase 2a: Loop carried dependency with V_PERM
+ // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
+ // depend on. Interleave MFMA to keep XDL unit busy throughout.
+ for (unsigned I = 0; I < DSWWithPermCount - DSWWithSharedVMEMCount; ++I) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
+ 1, TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
+ 3, TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ // Phase 2b: Loop carried dependency without V_PERM
+ // Schedule DS_WRITE as closely as possible to the VMEM_READ they depend on.
+ // Interleave MFMA to keep XDL unit busy throughout.
+ for (unsigned I = 0; I < DSWCount - DSWWithPermCount; I++) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+
+ // Phase 2c: Loop carried dependency with V_PERM, VMEM_READs are
+ // ultimately used by two DS_WRITE
+ // Schedule VPerm & DS_WRITE as closely as possible to the VMEM_READ they
+ // depend on. Interleave MFMA to keep XDL unit busy throughout.
+
+ for (unsigned I = 0; I < DSWWithSharedVMEMCount; ++I) {
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VALU, 4, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<IsPermForDSW>(TII, SG->getSGID(), true));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::DS_WRITE, 1, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<IsSuccOfPrevGroup>(TII, SG->getSGID(), false));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
+ 2, TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::VMEM_READ, 4, PipelineSyncID, DAG, TII);
+ SG->addRule(std::make_shared<SharesPredWithPrevNthGroup>(
+ 4, TII, SG->getSGID(), true));
+ SG->addRule(std::make_shared<VMEMSize>(TII, SG->getSGID(), false));
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+
+ SG = &SyncedSchedGroups[PipelineSyncID].emplace_back(
+ SchedGroupMask::MFMA, 1, PipelineSyncID, DAG, TII);
+ SG->initSchedGroup(SyncedInstrs[SG->getSyncID()]);
+ }
+}
+
static std::unique_ptr<IGLPStrategy>
createIGLPStrategy(IGLPStrategyID ID, ScheduleDAGInstrs *DAG,
const SIInstrInfo *TII) {
switch (ID) {
case MFMASmallGemmOptID:
return std::make_unique<MFMASmallGemmOpt>(DAG, TII);
+ case MFMASmallGemmSingleWaveOptID:
+ return std::make_unique<MFMASmallGemmSingleWaveOpt>(DAG, TII);
}
llvm_unreachable("Unknown IGLPStrategyID");
@@ -829,6 +1394,13 @@ private:
public:
void apply(ScheduleDAGInstrs *DAGInstrs) override;
+ // The order in which the PipelineSolver should process the candidate
+ // SchedGroup for a PipelineInstr. BOTTOM_UP will try to add SUs to the last
+ // created SchedGroup first, and will consider that as the ultimate
+ // predecessor group when linking. TOP_DOWN instead links and processes the
+ // first created SchedGroup first.
+ bool IsBottomUp = 1;
+
IGroupLPDAGMutation() = default;
};
@@ -908,6 +1480,7 @@ int SchedGroup::link(SUnit &SU, bool MakePred,
if (DAG->IsReachable(B, A))
continue;
+
// tryAddEdge returns false if there is a dependency that makes adding
// the A->B edge impossible, otherwise it returns true;
bool Added = tryAddEdge(A, B);
@@ -1034,7 +1607,7 @@ void IGroupLPDAGMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
}
if (foundSB || foundIGLP) {
- PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG);
+ PipelineSolver PS(SyncedSchedGroups, SyncedInstrs, DAG, IsBottomUp);
// PipelineSolver performs the mutation by adding the edges it
// determined as the best
PS.solve();
@@ -1114,8 +1687,10 @@ void IGroupLPDAGMutation::initIGLPOpt(SUnit &SU) {
IGLPStrategyID StrategyID =
(IGLPStrategyID)SU.getInstr()->getOperand(0).getImm();
auto S = createIGLPStrategy(StrategyID, DAG, TII);
- if (S->shouldApplyStrategy(DAG))
+ if (S->shouldApplyStrategy(DAG)) {
+ IsBottomUp = S->IsBottomUp;
S->applyIGLPStrategy(SyncedInstrs, SyncedSchedGroups);
+ }
}
} // namespace
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 42d1f58e4239..825c6f0acd0f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -20,7 +20,7 @@
#include "MCTargetDesc/R600MCTargetDesc.h"
#include "R600RegisterInfo.h"
#include "SIMachineFunctionInfo.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
@@ -28,6 +28,7 @@
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/ErrorHandling.h"
#ifdef EXPENSIVE_CHECKS
#include "llvm/Analysis/LoopInfo.h"
@@ -101,7 +102,7 @@ INITIALIZE_PASS_BEGIN(AMDGPUDAGToDAGISel, "amdgpu-isel",
"AMDGPU DAG->DAG Pattern Instruction Selection", false, false)
INITIALIZE_PASS_DEPENDENCY(AMDGPUArgumentUsageInfo)
INITIALIZE_PASS_DEPENDENCY(AMDGPUPerfHintAnalysis)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
#ifdef EXPENSIVE_CHECKS
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
@@ -131,7 +132,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
}
#endif
Subtarget = &MF.getSubtarget<GCNSubtarget>();
- Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
+ Mode = SIModeRegisterDefaults(MF.getFunction());
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -167,6 +168,7 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
case ISD::FFLOOR:
case ISD::FMINNUM:
case ISD::FMAXNUM:
+ case ISD::FLDEXP:
case AMDGPUISD::FRACT:
case AMDGPUISD::CLAMP:
case AMDGPUISD::COS_HW:
@@ -178,7 +180,6 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
case AMDGPUISD::RCP:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_IFLAG:
- case AMDGPUISD::LDEXP:
// On gfx10, all 16-bit instructions preserve the high bits.
return Subtarget->getGeneration() <= AMDGPUSubtarget::GFX9;
case ISD::FP_ROUND:
@@ -199,7 +200,7 @@ bool AMDGPUDAGToDAGISel::fp16SrcZerosHighBits(unsigned Opc) const {
void AMDGPUDAGToDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<AMDGPUArgumentUsageInfo>();
- AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addRequired<UniformityInfoWrapperPass>();
#ifdef EXPENSIVE_CHECKS
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
@@ -503,10 +504,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
// isa<MemSDNode> almost works but is slightly too permissive for some DS
// intrinsics.
if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
- (Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
- Opc == ISD::ATOMIC_LOAD_FADD ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMAX) {
N = glueCopyToM0LDSInit(N);
SelectCode(N);
return;
@@ -528,8 +527,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectADD_SUB_I64(N);
return;
}
- case ISD::ADDCARRY:
- case ISD::SUBCARRY:
+ case ISD::UADDO_CARRY:
+ case ISD::USUBO_CARRY:
if (N->getValueType(0) != MVT::i32)
break;
@@ -665,10 +664,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
case ISD::BRCOND:
SelectBRCOND(N);
return;
- case ISD::FMAD:
- case ISD::FMA:
- SelectFMAD_FMA(N);
- return;
case AMDGPUISD::CVT_PKRTZ_F16_F32:
case AMDGPUISD::CVT_PKNORM_I16_F32:
case AMDGPUISD::CVT_PKNORM_U16_F32:
@@ -714,11 +709,11 @@ bool AMDGPUDAGToDAGISel::isUnneededShiftMask(const SDNode *N,
assert(N->getOpcode() == ISD::AND);
const APInt &RHS = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
- if (RHS.countTrailingOnes() >= ShAmtBits)
+ if (RHS.countr_one() >= ShAmtBits)
return true;
const APInt &LHSKnownZeros = CurDAG->computeKnownBits(N->getOperand(0)).Zero;
- return (LHSKnownZeros | RHS).countTrailingOnes() >= ShAmtBits;
+ return (LHSKnownZeros | RHS).countr_one() >= ShAmtBits;
}
static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
@@ -813,7 +808,7 @@ SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
return SDValue(Mov, 0);
}
-// FIXME: Should only handle addcarry/subcarry
+// FIXME: Should only handle uaddo_carry/usubo_carry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDLoc DL(N);
SDValue LHS = N->getOperand(0);
@@ -890,15 +885,15 @@ void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
SDValue CI = N->getOperand(2);
if (N->isDivergent()) {
- unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
- : AMDGPU::V_SUBB_U32_e64;
+ unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
CurDAG->SelectNodeTo(
N, Opc, N->getVTList(),
{LHS, RHS, CI,
CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
} else {
- unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
- : AMDGPU::S_SUB_CO_PSEUDO;
+ unsigned Opc = N->getOpcode() == ISD::UADDO_CARRY ? AMDGPU::S_ADD_CO_PSEUDO
+ : AMDGPU::S_SUB_CO_PSEUDO;
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
}
}
@@ -913,8 +908,8 @@ void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
++UI)
if (UI.getUse().getResNo() == 1) {
- if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
- (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
+ if ((IsAdd && (UI->getOpcode() != ISD::UADDO_CARRY)) ||
+ (!IsAdd && (UI->getOpcode() != ISD::USUBO_CARRY))) {
IsVALU = true;
break;
}
@@ -1141,6 +1136,15 @@ bool AMDGPUDAGToDAGISel::isDSOffset2Legal(SDValue Base, unsigned Offset0,
return CurDAG->SignBitIsZero(Base);
}
+bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Base,
+ uint64_t FlatVariant) const {
+ if (FlatVariant != SIInstrFlags::FlatScratch)
+ return true;
+ // When value in 32-bit Base can be negative calculate scratch offset using
+ // 32-bit add instruction, otherwise use Base(unsigned) + offset.
+ return CurDAG->SignBitIsZero(Base);
+}
+
// TODO: If offset is too big, put low 16-bit into offset.
bool AMDGPUDAGToDAGISel::SelectDS64Bit4ByteAligned(SDValue Addr, SDValue &Base,
SDValue &Offset0,
@@ -1283,7 +1287,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
Ptr = N2;
VAddr = N3;
}
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
} else if (N0->isDivergent()) {
// N0 is divergent. Use it as the addr64, and construct the resource from a
// 0 address.
@@ -1299,18 +1303,18 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &VAddr,
if (!C1) {
// No offset.
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
return true;
}
if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
// Legal offset for instruction.
- Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+ Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
return true;
}
// Illegal offset, store it in soffset.
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
SOffset =
SDValue(CurDAG->getMachineNode(
AMDGPU::S_MOV_B32, DL, MVT::i32,
@@ -1377,13 +1381,15 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
// Don't fold null pointer.
if (Imm != NullPtr) {
- SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
+ const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset();
+ SDValue HighBits =
+ CurDAG->getTargetConstant(Imm & ~MaxOffset, DL, MVT::i32);
MachineSDNode *MovHighBits = CurDAG->getMachineNode(
AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
VAddr = SDValue(MovHighBits, 0);
SOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
- ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
+ ImmOffset = CurDAG->getTargetConstant(Imm & MaxOffset, DL, MVT::i32);
return true;
}
}
@@ -1414,14 +1420,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
(!Subtarget->privateMemoryResourceIsRangeChecked() ||
CurDAG->SignBitIsZero(N0))) {
std::tie(VAddr, SOffset) = foldFrameIndex(N0);
- ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
+ ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
return true;
}
}
// (node)
std::tie(VAddr, SOffset) = foldFrameIndex(Addr);
- ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ ImmOffset = CurDAG->getTargetConstant(0, DL, MVT::i32);
return true;
}
@@ -1450,7 +1456,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
if (IsCopyFromSGPR(*TRI, Addr)) {
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
SOffset = Addr;
- Offset = CurDAG->getTargetConstant(0, DL, MVT::i16);
+ Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
return true;
}
@@ -1474,7 +1480,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
- Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
+ Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i32);
return true;
}
@@ -1532,7 +1538,8 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffsetImpl(SDNode *N, SDValue Addr,
if (Subtarget->hasFlatInstOffsets() && !CanHaveFlatSegmentOffsetBug) {
SDValue N0, N1;
- if (isBaseWithConstantOffset64(Addr, N0, N1)) {
+ if (isBaseWithConstantOffset64(Addr, N0, N1) &&
+ isFlatScratchBaseLegal(N0, FlatVariant)) {
int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
const SIInstrInfo *TII = Subtarget->getInstrInfo();
@@ -1764,7 +1771,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSAddr(SDNode *Parent, SDValue Addr,
int64_t COffsetVal = 0;
- if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ if (CurDAG->isBaseWithConstantOffset(Addr) &&
+ isFlatScratchBaseLegal(Addr.getOperand(0))) {
COffsetVal = cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
SAddr = Addr.getOperand(0);
} else {
@@ -1842,6 +1850,8 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
CurDAG->getTargetConstant(RemainderOffset, SDLoc(), MVT::i32));
VAddr = SDValue(VMov, 0);
SAddr = LHS;
+ if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ return false;
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, SplitImmOffset))
return false;
Offset = CurDAG->getTargetConstant(SplitImmOffset, SDLoc(), MVT::i16);
@@ -1866,6 +1876,9 @@ bool AMDGPUDAGToDAGISel::SelectScratchSVAddr(SDNode *N, SDValue Addr,
return false;
}
+ if (!isFlatScratchBaseLegal(SAddr) || !isFlatScratchBaseLegal(VAddr))
+ return false;
+
if (checkFlatScratchSVSSwizzleBug(VAddr, SAddr, ImmOffset))
return false;
SAddr = SelectSAddrFI(CurDAG, SAddr);
@@ -2283,52 +2296,6 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
VCC.getValue(0));
}
-void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
- MVT VT = N->getSimpleValueType(0);
- bool IsFMA = N->getOpcode() == ISD::FMA;
- if (VT != MVT::f32 || (!Subtarget->hasMadMixInsts() &&
- !Subtarget->hasFmaMixInsts()) ||
- ((IsFMA && Subtarget->hasMadMixInsts()) ||
- (!IsFMA && Subtarget->hasFmaMixInsts()))) {
- SelectCode(N);
- return;
- }
-
- SDValue Src0 = N->getOperand(0);
- SDValue Src1 = N->getOperand(1);
- SDValue Src2 = N->getOperand(2);
- unsigned Src0Mods, Src1Mods, Src2Mods;
-
- // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
- // using the conversion from f16.
- bool Sel0 = SelectVOP3PMadMixModsImpl(Src0, Src0, Src0Mods);
- bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
- bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
-
- assert((IsFMA || !Mode.allFP32Denormals()) &&
- "fmad selected with denormals enabled");
- // TODO: We can select this with f32 denormals enabled if all the sources are
- // converted from f16 (in which case fmad isn't legal).
-
- if (Sel0 || Sel1 || Sel2) {
- // For dummy operands.
- SDValue Zero = CurDAG->getTargetConstant(0, SDLoc(), MVT::i32);
- SDValue Ops[] = {
- CurDAG->getTargetConstant(Src0Mods, SDLoc(), MVT::i32), Src0,
- CurDAG->getTargetConstant(Src1Mods, SDLoc(), MVT::i32), Src1,
- CurDAG->getTargetConstant(Src2Mods, SDLoc(), MVT::i32), Src2,
- CurDAG->getTargetConstant(0, SDLoc(), MVT::i1),
- Zero, Zero
- };
-
- CurDAG->SelectNodeTo(N,
- IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32,
- MVT::f32, Ops);
- } else {
- SelectCode(N);
- }
-}
-
void AMDGPUDAGToDAGISel::SelectDSAppendConsume(SDNode *N, unsigned IntrID) {
// The address is assumed to be uniform, so if it ends up in a VGPR, it will
// be copied to an SGPR with readfirstlane.
@@ -2562,6 +2529,18 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_interp_p1_f16:
SelectInterpP1F16(N);
return;
+ case Intrinsic::amdgcn_inverse_ballot:
+ switch (N->getOperand(1).getValueSizeInBits()) {
+ case 32:
+ Opcode = AMDGPU::S_INVERSE_BALLOT_U32;
+ break;
+ case 64:
+ Opcode = AMDGPU::S_INVERSE_BALLOT_U64;
+ break;
+ default:
+ llvm_unreachable("Unsupported size for inverse ballot mask.");
+ }
+ break;
default:
SelectCode(N);
return;
@@ -2591,13 +2570,22 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
unsigned &Mods,
+ bool IsCanonicalizing,
bool AllowAbs) const {
- Mods = 0;
+ Mods = SISrcMods::NONE;
Src = In;
if (Src.getOpcode() == ISD::FNEG) {
Mods |= SISrcMods::NEG;
Src = Src.getOperand(0);
+ } else if (Src.getOpcode() == ISD::FSUB && IsCanonicalizing) {
+ // Fold fsub [+-]0 into fneg. This may not have folded depending on the
+ // denormal mode, but we're implicitly canonicalizing in a source operand.
+ auto *LHS = dyn_cast<ConstantFPSDNode>(Src.getOperand(0));
+ if (LHS && LHS->isZero()) {
+ Mods |= SISrcMods::NEG;
+ Src = Src.getOperand(1);
+ }
}
if (AllowAbs && Src.getOpcode() == ISD::FABS) {
@@ -2611,7 +2599,20 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods;
- if (SelectVOP3ModsImpl(In, Src, Mods)) {
+ if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/true,
+ /*AllowAbs=*/true)) {
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+ }
+
+ return false;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3ModsNonCanonicalizing(
+ SDValue In, SDValue &Src, SDValue &SrcMods) const {
+ unsigned Mods;
+ if (SelectVOP3ModsImpl(In, Src, Mods, /*IsCanonicalizing=*/false,
+ /*AllowAbs=*/true)) {
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
@@ -2622,7 +2623,9 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
bool AMDGPUDAGToDAGISel::SelectVOP3BMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods;
- if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
+ if (SelectVOP3ModsImpl(In, Src, Mods,
+ /*IsCanonicalizing=*/true,
+ /*AllowAbs=*/false)) {
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
@@ -2642,7 +2645,9 @@ bool AMDGPUDAGToDAGISel::SelectVINTERPModsImpl(SDValue In, SDValue &Src,
SDValue &SrcMods,
bool OpSel) const {
unsigned Mods;
- if (SelectVOP3ModsImpl(In, Src, Mods, /* AllowAbs */ false)) {
+ if (SelectVOP3ModsImpl(In, Src, Mods,
+ /*IsCanonicalizing=*/true,
+ /*AllowAbs=*/false)) {
if (OpSel)
Mods |= SISrcMods::OP_SEL_0;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
@@ -2695,9 +2700,10 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
SDValue &SrcMods, bool IsDOT) const {
- unsigned Mods = 0;
+ unsigned Mods = SISrcMods::NONE;
Src = In;
+ // TODO: Handle G_FSUB 0 as fneg
if (Src.getOpcode() == ISD::FNEG) {
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
Src = Src.getOperand(0);
@@ -2776,7 +2782,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
uint64_t Lit = cast<ConstantFPSDNode>(Lo)->getValueAPF()
.bitcastToAPInt().getZExtValue();
if (AMDGPU::isInlinableLiteral32(Lit, Subtarget->hasInv2PiInlineImm())) {
- Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);;
+ Src = CurDAG->getTargetConstant(Lit, SDLoc(In), MVT::i64);
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
@@ -2804,7 +2810,7 @@ bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
unsigned Mods = SISrcMods::OP_SEL_1;
- unsigned SrcSign = C->getAPIntValue().getZExtValue();
+ unsigned SrcSign = C->getZExtValue();
if (SrcSign == 1)
Mods ^= SISrcMods::NEG;
@@ -2818,7 +2824,7 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value");
unsigned Mods = SISrcMods::OP_SEL_1;
- unsigned SrcVal = C->getAPIntValue().getZExtValue();
+ unsigned SrcVal = C->getZExtValue();
if (SrcVal == 1)
Mods |= SISrcMods::OP_SEL_0;
@@ -2883,6 +2889,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
return false;
}
+bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const {
+ unsigned Mods = 0;
+ if (!SelectVOP3PMadMixModsImpl(In, Src, Mods))
+ return false;
+ SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+ return true;
+}
+
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
unsigned Mods = 0;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index 162b0340a6aa..0605baf3a0cc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -16,6 +16,7 @@
#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
+#include "SIModeRegisterDefaults.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Target/TargetMachine.h"
@@ -24,11 +25,7 @@ using namespace llvm;
namespace {
static inline bool isNullConstantOrUndef(SDValue V) {
- if (V.isUndef())
- return true;
-
- ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
- return Const != nullptr && Const->isZero();
+ return V.isUndef() || isNullConstant(V);
}
static inline bool getConstantValue(SDValue N, uint32_t &Out) {
@@ -82,7 +79,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
const GCNSubtarget *Subtarget;
// Default FP mode for the current function.
- AMDGPU::SIModeRegisterDefaults Mode;
+ SIModeRegisterDefaults Mode;
bool EnableLateStructurizeCFG;
@@ -157,6 +154,9 @@ private:
bool isDSOffsetLegal(SDValue Base, unsigned Offset) const;
bool isDSOffset2Legal(SDValue Base, unsigned Offset0, unsigned Offset1,
unsigned Size) const;
+ bool isFlatScratchBaseLegal(
+ SDValue Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
+
bool SelectDS1Addr1Offset(SDValue Ptr, SDValue &Base, SDValue &Offset) const;
bool SelectDS64Bit4ByteAligned(SDValue Ptr, SDValue &Base, SDValue &Offset0,
SDValue &Offset1) const;
@@ -216,8 +216,11 @@ private:
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods,
+ bool IsCanonicalizing = true,
bool AllowAbs = true) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+ bool SelectVOP3ModsNonCanonicalizing(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
bool SelectVOP3BMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -247,6 +250,8 @@ private:
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
unsigned &Mods) const;
+ bool SelectVOP3PMadMixModsExt(SDValue In, SDValue &Src,
+ SDValue &SrcMods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
SDValue getHi16Elt(SDValue In) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 8121b381e83f..254d02d4ce5b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -16,12 +16,13 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUMachineFunction.h"
-#include "GCNSubtarget.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Target/TargetMachine.h"
@@ -138,6 +139,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
+ setOperationAction(ISD::LOAD, MVT::i128, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
+
// There are no 64-bit extloads. These should be done as a 32-bit extload and
// an extension to 64-bit.
for (MVT VT : MVT::integer_valuetypes())
@@ -264,6 +268,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
+ setOperationAction(ISD::STORE, MVT::i128, Promote);
+ AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
+
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
@@ -321,14 +328,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
// Library functions. These default to Expand, but we have instructions
// for them.
- setOperationAction({ISD::FCEIL, ISD::FEXP2, ISD::FPOW, ISD::FLOG2, ISD::FABS,
- ISD::FFLOOR, ISD::FRINT, ISD::FTRUNC, ISD::FMINNUM,
- ISD::FMAXNUM},
+ setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, ISD::FRINT,
+ ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
MVT::f32, Legal);
+ setOperationAction(ISD::FLOG2, MVT::f32, Custom);
setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
- setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP}, MVT::f32, Custom);
+ setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2}, MVT::f32,
+ Custom);
setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
@@ -338,8 +346,12 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts())
setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
- else
+ else {
setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
+ setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
+ }
+
+ setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP}, MVT::f16, Custom);
// FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
// scalarization code. Can be removed when IS_FPCLASS expand isn't called by
@@ -556,7 +568,7 @@ bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
//===----------------------------------------------------------------------===//
LLVM_READNONE
-static bool fnegFoldsIntoOp(unsigned Opc) {
+static bool fnegFoldsIntoOpcode(unsigned Opc) {
switch (Opc) {
case ISD::FADD:
case ISD::FSUB:
@@ -567,6 +579,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::SELECT:
case ISD::FSIN:
case ISD::FTRUNC:
case ISD::FRINT:
@@ -582,17 +595,45 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
case AMDGPUISD::FMED3:
// TODO: handle llvm.amdgcn.fma.legacy
return true;
+ case ISD::BITCAST:
+ llvm_unreachable("bitcast is special cased");
default:
return false;
}
}
+static bool fnegFoldsIntoOp(const SDNode *N) {
+ unsigned Opc = N->getOpcode();
+ if (Opc == ISD::BITCAST) {
+ // TODO: Is there a benefit to checking the conditions performFNegCombine
+ // does? We don't for the other cases.
+ SDValue BCSrc = N->getOperand(0);
+ if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
+ return BCSrc.getNumOperands() == 2 &&
+ BCSrc.getOperand(1).getValueSizeInBits() == 32;
+ }
+
+ return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
+ }
+
+ return fnegFoldsIntoOpcode(Opc);
+}
+
/// \p returns true if the operation will definitely need to use a 64-bit
/// encoding, and thus will use a VOP3 encoding regardless of the source
/// modifiers.
LLVM_READONLY
static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
- return N->getNumOperands() > 2 || VT == MVT::f64;
+ return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
+ VT == MVT::f64;
+}
+
+/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
+/// type for ISD::SELECT.
+LLVM_READONLY
+static bool selectSupportsSourceMods(const SDNode *N) {
+ // TODO: Only applies if select will be vector
+ return N->getValueType(0) == MVT::f32;
}
// Most FP instructions support source modifiers, but this could be refined
@@ -604,7 +645,6 @@ static bool hasSourceMods(const SDNode *N) {
switch (N->getOpcode()) {
case ISD::CopyToReg:
- case ISD::SELECT:
case ISD::FDIV:
case ISD::FREM:
case ISD::INLINEASM:
@@ -629,6 +669,8 @@ static bool hasSourceMods(const SDNode *N) {
return true;
}
}
+ case ISD::SELECT:
+ return selectSupportsSourceMods(N);
default:
return true;
}
@@ -644,6 +686,8 @@ bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
unsigned NumMayIncreaseSize = 0;
MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
+ assert(!N->use_empty());
+
// XXX - Should this limit number of uses to check?
for (const SDNode *U : N->uses()) {
if (!hasSourceMods(U))
@@ -800,6 +844,17 @@ SDValue AMDGPUTargetLowering::getNegatedExpression(
return SDValue();
break;
}
+ case AMDGPUISD::RCP: {
+ SDValue Src = Op.getOperand(0);
+ EVT VT = Op.getValueType();
+ SDLoc SL(Op);
+
+ SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
+ ForCodeSize, Cost, Depth + 1);
+ if (NegSrc)
+ return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
+ return SDValue();
+ }
default:
break;
}
@@ -827,7 +882,7 @@ bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16;
}
-bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
+bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
unsigned NumElem,
unsigned AS) const {
return true;
@@ -888,10 +943,6 @@ bool AMDGPUTargetLowering::isZExtFree(EVT Src, EVT Dest) const {
return Src == MVT::i32 && Dest == MVT::i64;
}
-bool AMDGPUTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
- return isZExtFree(Val.getValueType(), VT2);
-}
-
bool AMDGPUTargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const {
// There aren't really 64-bit registers, but pairs of 32-bit ones and only a
// limited number of native 64-bit operations. Shrinking an operation to fit
@@ -1021,7 +1072,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
const Function &Fn = MF.getFunction();
LLVMContext &Ctx = Fn.getParent()->getContext();
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
- const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
+ const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
CallingConv::ID CC = Fn.getCallingConv();
Align MaxAlign = Align(1);
@@ -1258,12 +1309,15 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
return LowerFROUNDEVEN(Op, DAG);
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
+ case ISD::FLOG2:
+ return LowerFLOG2(Op, DAG);
case ISD::FLOG:
- return LowerFLOG(Op, DAG, numbers::ln2f);
case ISD::FLOG10:
- return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
+ return LowerFLOGCommon(Op, DAG);
case ISD::FEXP:
return lowerFEXP(Op, DAG);
+ case ISD::FEXP2:
+ return lowerFEXP2(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
@@ -1292,6 +1346,23 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
// ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
// nothing here and let the illegal result integer be handled normally.
return;
+ case ISD::FLOG2:
+ if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
+ Results.push_back(Lowered);
+ return;
+ case ISD::FLOG:
+ case ISD::FLOG10:
+ if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
+ Results.push_back(Lowered);
+ return;
+ case ISD::FEXP2:
+ if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
+ Results.push_back(Lowered);
+ return;
+ case ISD::FEXP:
+ if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
+ Results.push_back(Lowered);
+ return;
default:
return;
}
@@ -1305,6 +1376,13 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = G->getGlobal();
+ if (!MFI->isModuleEntryFunction()) {
+ if (std::optional<uint32_t> Address =
+ AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
+ return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
+ }
+ }
+
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isModuleEntryFunction() &&
@@ -1378,43 +1456,60 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
-
+ SDLoc SL(Op);
SmallVector<SDValue, 8> Args;
unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
EVT VT = Op.getValueType();
EVT SrcVT = Op.getOperand(0).getValueType();
- // For these types, we have some TableGen patterns except if the index is 1
- if (((SrcVT == MVT::v4f16 && VT == MVT::v2f16) ||
- (SrcVT == MVT::v4i16 && VT == MVT::v2i16)) &&
- Start != 1)
- return Op;
+ if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
+ unsigned NumElt = VT.getVectorNumElements();
+ unsigned NumSrcElt = SrcVT.getVectorNumElements();
+ assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
- if (((SrcVT == MVT::v8f16 && VT == MVT::v4f16) ||
- (SrcVT == MVT::v8i16 && VT == MVT::v4i16)) &&
- (Start == 0 || Start == 4))
- return Op;
+ // Extract 32-bit registers at a time.
+ EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
+ EVT NewVT = NumElt == 2
+ ? MVT::i32
+ : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
+ SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
- if (((SrcVT == MVT::v16f16 && VT == MVT::v8f16) ||
- (SrcVT == MVT::v16i16 && VT == MVT::v8i16)) &&
- (Start == 0 || Start == 8))
- return Op;
+ DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
+ if (NumElt == 2)
+ Tmp = Args[0];
+ else
+ Tmp = DAG.getBuildVector(NewVT, SL, Args);
+
+ return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
+ }
DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
VT.getVectorNumElements());
- return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
+ return DAG.getBuildVector(Op.getValueType(), SL, Args);
+}
+
+// TODO: Handle fabs too
+static SDValue peekFNeg(SDValue Val) {
+ if (Val.getOpcode() == ISD::FNEG)
+ return Val.getOperand(0);
+
+ return Val;
}
-/// Generate Min/Max node
-SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
- SDValue LHS, SDValue RHS,
- SDValue True, SDValue False,
- SDValue CC,
- DAGCombinerInfo &DCI) const {
- if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
- return SDValue();
+static SDValue peekFPSignOps(SDValue Val) {
+ if (Val.getOpcode() == ISD::FNEG)
+ Val = Val.getOperand(0);
+ if (Val.getOpcode() == ISD::FABS)
+ Val = Val.getOperand(0);
+ if (Val.getOpcode() == ISD::FCOPYSIGN)
+ Val = Val.getOperand(0);
+ return Val;
+}
+SDValue AMDGPUTargetLowering::combineFMinMaxLegacyImpl(
+ const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
+ SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
switch (CCOpcode) {
@@ -1480,6 +1575,45 @@ SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
return SDValue();
}
+/// Generate Min/Max node
+SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
+ SDValue LHS, SDValue RHS,
+ SDValue True, SDValue False,
+ SDValue CC,
+ DAGCombinerInfo &DCI) const {
+ if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
+ return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
+
+ SelectionDAG &DAG = DCI.DAG;
+
+ // If we can't directly match this, try to see if we can fold an fneg to
+ // match.
+
+ ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
+ ConstantFPSDNode *CFalse = dyn_cast<ConstantFPSDNode>(False);
+ SDValue NegTrue = peekFNeg(True);
+
+ // Undo the combine foldFreeOpFromSelect does if it helps us match the
+ // fmin/fmax.
+ //
+ // select (fcmp olt (lhs, K)), (fneg lhs), -K
+ // -> fneg (fmin_legacy lhs, K)
+ //
+ // TODO: Use getNegatedExpression
+ if (LHS == NegTrue && CFalse && CRHS) {
+ APFloat NegRHS = neg(CRHS->getValueAPF());
+ if (NegRHS == CFalse->getValueAPF()) {
+ SDValue Combined =
+ combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
+ if (Combined)
+ return DAG.getNode(ISD::FNEG, DL, VT, Combined);
+ return SDValue();
+ }
+ }
+
+ return SDValue();
+}
+
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::split64BitValue(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
@@ -1749,7 +1883,8 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
bool UseFmadFtz = false;
if (Subtarget->isGCN()) {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- UseFmadFtz = MFI->getMode().allFP32Denormals();
+ UseFmadFtz =
+ MFI->getMode().FP32Denormals != DenormalMode::getPreserveSign();
}
// float fr = mad(fqneg, fb, fa);
@@ -1811,13 +1946,13 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue Zero = DAG.getConstant(0, DL, HalfVT);
//HiLo split
+ SDValue LHS_Lo, LHS_Hi;
SDValue LHS = Op.getOperand(0);
- SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
- SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, One);
+ std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
+ SDValue RHS_Lo, RHS_Hi;
SDValue RHS = Op.getOperand(1);
- SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
- SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, One);
+ std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
@@ -1841,11 +1976,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// Compute denominator reciprocal.
- unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
- (unsigned)ISD::FMA :
- !MFI->getMode().allFP32Denormals() ?
- (unsigned)ISD::FMAD :
- (unsigned)AMDGPUISD::FMAD_FTZ;
+ unsigned FMAD =
+ !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
+ : MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign()
+ ? (unsigned)ISD::FMAD
+ : (unsigned)AMDGPUISD::FMAD_FTZ;
SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
@@ -1875,13 +2010,12 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
- SDValue Mulhi1_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1,
- Zero);
- SDValue Mulhi1_Hi =
- DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi1, One);
- SDValue Add1_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Lo,
+ SDValue Mulhi1_Lo, Mulhi1_Hi;
+ std::tie(Mulhi1_Lo, Mulhi1_Hi) =
+ DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
+ SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
Mulhi1_Lo, Zero1);
- SDValue Add1_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Rcp_Hi,
+ SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
Mulhi1_Hi, Add1_Lo.getValue(1));
SDValue Add1 = DAG.getBitcast(VT,
DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
@@ -1889,13 +2023,12 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
// Second round of UNR.
SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
- SDValue Mulhi2_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2,
- Zero);
- SDValue Mulhi2_Hi =
- DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mulhi2, One);
- SDValue Add2_Lo = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Lo,
+ SDValue Mulhi2_Lo, Mulhi2_Hi;
+ std::tie(Mulhi2_Lo, Mulhi2_Hi) =
+ DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
+ SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
Mulhi2_Lo, Zero1);
- SDValue Add2_Hi = DAG.getNode(ISD::ADDCARRY, DL, HalfCarryVT, Add1_Hi,
+ SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
Mulhi2_Hi, Add2_Lo.getValue(1));
SDValue Add2 = DAG.getBitcast(VT,
DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
@@ -1904,11 +2037,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
- SDValue Mul3_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, Zero);
- SDValue Mul3_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, Mul3, One);
- SDValue Sub1_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Lo,
+ SDValue Mul3_Lo, Mul3_Hi;
+ std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
+ SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
Mul3_Lo, Zero1);
- SDValue Sub1_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, LHS_Hi,
+ SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
Mul3_Hi, Sub1_Lo.getValue(1));
SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
SDValue Sub1 = DAG.getBitcast(VT,
@@ -1926,11 +2059,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
// potential endif to substitute PHIs.
// if C3 != 0 ...
- SDValue Sub2_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Lo,
+ SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
RHS_Lo, Zero1);
- SDValue Sub2_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub1_Mi,
+ SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
RHS_Hi, Sub1_Lo.getValue(1));
- SDValue Sub2_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
+ SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
Zero, Sub2_Lo.getValue(1));
SDValue Sub2 = DAG.getBitcast(VT,
DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
@@ -1946,11 +2079,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
// if (C6 != 0)
SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
- SDValue Sub3_Lo = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Lo,
+ SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
RHS_Lo, Zero1);
- SDValue Sub3_Mi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub2_Mi,
+ SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
RHS_Hi, Sub2_Lo.getValue(1));
- SDValue Sub3_Hi = DAG.getNode(ISD::SUBCARRY, DL, HalfCarryVT, Sub3_Mi,
+ SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
Zero, Sub3_Lo.getValue(1));
SDValue Sub3 = DAG.getBitcast(VT,
DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
@@ -2329,27 +2462,445 @@ SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
}
-SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
- double Log2BaseInverted) const {
- EVT VT = Op.getValueType();
+/// Return true if it's known that \p Src can never be an f32 denormal value.
+static bool valueIsKnownNeverF32Denorm(SDValue Src) {
+ switch (Src.getOpcode()) {
+ case ISD::FP_EXTEND:
+ return Src.getOperand(0).getValueType() == MVT::f16;
+ case ISD::FP16_TO_FP:
+ return true;
+ default:
+ return false;
+ }
+
+ llvm_unreachable("covered opcode switch");
+}
+
+static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags) {
+ if (Flags.hasApproximateFuncs())
+ return true;
+ auto &Options = DAG.getTarget().Options;
+ return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
+}
+
+static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src,
+ SDNodeFlags Flags) {
+ return !valueIsKnownNeverF32Denorm(Src) &&
+ DAG.getMachineFunction()
+ .getDenormalMode(APFloat::IEEEsingle())
+ .Input != DenormalMode::PreserveSign;
+}
+
+SDValue AMDGPUTargetLowering::getIsLtSmallestNormal(SelectionDAG &DAG,
+ SDValue Src,
+ SDNodeFlags Flags) const {
+ SDLoc SL(Src);
+ EVT VT = Src.getValueType();
+ const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
+ SDValue SmallestNormal =
+ DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
+
+ // Want to scale denormals up, but negatives and 0 work just as well on the
+ // scaled path.
+ SDValue IsLtSmallestNormal = DAG.getSetCC(
+ SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
+ SmallestNormal, ISD::SETOLT);
+
+ return IsLtSmallestNormal;
+}
+
+SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
+ SDNodeFlags Flags) const {
+ SDLoc SL(Src);
+ EVT VT = Src.getValueType();
+ const fltSemantics &Semantics = SelectionDAG::EVTToAPFloatSemantics(VT);
+ SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
+
+ SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
+ SDValue IsFinite = DAG.getSetCC(
+ SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
+ Inf, ISD::SETOLT);
+ return IsFinite;
+}
+
+/// If denormal handling is required return the scaled input to FLOG2, and the
+/// check for denormal range. Otherwise, return null values.
+std::pair<SDValue, SDValue>
+AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
+ SDValue Src, SDNodeFlags Flags) const {
+ if (allowApproxFunc(DAG, Flags) || !needsDenormHandlingF32(DAG, Src, Flags))
+ return {};
+
+ MVT VT = MVT::f32;
+ const fltSemantics &Semantics = APFloat::IEEEsingle();
+ SDValue SmallestNormal =
+ DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
+
+ SDValue IsLtSmallestNormal = DAG.getSetCC(
+ SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
+ SmallestNormal, ISD::SETOLT);
+
+ SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
+ SDValue One = DAG.getConstantFP(1.0, SL, VT);
+ SDValue ScaleFactor =
+ DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
+
+ SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
+ return {ScaledInput, IsLtSmallestNormal};
+}
+
+SDValue AMDGPUTargetLowering::LowerFLOG2(SDValue Op, SelectionDAG &DAG) const {
+ // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
+ // If we have to handle denormals, scale up the input and adjust the result.
+
+ // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
+ // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
SDLoc SL(Op);
- SDValue Operand = Op.getOperand(0);
- SDValue Log2Operand = DAG.getNode(ISD::FLOG2, SL, VT, Operand);
- SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
+ EVT VT = Op.getValueType();
+ SDValue Src = Op.getOperand(0);
+ SDNodeFlags Flags = Op->getFlags();
+
+ if (VT == MVT::f16) {
+ // Nothing in half is a denormal when promoted to f32.
+ assert(!Subtarget->has16BitInsts());
+ SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
+ SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
+ return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
+ DAG.getTargetConstant(0, SL, MVT::i32), Flags);
+ }
+
+ auto [ScaledInput, IsLtSmallestNormal] =
+ getScaledLogInput(DAG, SL, Src, Flags);
+ if (!ScaledInput)
+ return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
+
+ SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
- return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
+ SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
+ SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
+ SDValue ResultOffset =
+ DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
+ return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
}
-// exp2(M_LOG2E_F * f);
-SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
+static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
+ SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
+ return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
+}
+
+SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue X = Op.getOperand(0);
EVT VT = Op.getValueType();
+ SDNodeFlags Flags = Op->getFlags();
+ SDLoc DL(Op);
+
+ const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
+ assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
+
+ const auto &Options = getTargetMachine().Options;
+ if (VT == MVT::f16 || Flags.hasApproximateFuncs() ||
+ Options.ApproxFuncFPMath || Options.UnsafeFPMath) {
+
+ if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
+ // Log and multiply in f32 is good enough for f16.
+ X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
+ }
+
+ SDValue Lowered = LowerFLOGUnsafe(
+ X, DL, DAG, IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2,
+ Flags);
+ if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
+ return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
+ DAG.getTargetConstant(0, DL, MVT::i32), Flags);
+ }
+
+ return Lowered;
+ }
+
+ auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
+ if (ScaledInput)
+ X = ScaledInput;
+
+ SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
+
+ SDValue R;
+ if (Subtarget->hasFastFMAF32()) {
+ // c+cc are ln(2)/ln(10) to more than 49 bits
+ const float c_log10 = 0x1.344134p-2f;
+ const float cc_log10 = 0x1.09f79ep-26f;
+
+ // c + cc is ln(2) to more than 49 bits
+ const float c_log = 0x1.62e42ep-1f;
+ const float cc_log = 0x1.efa39ep-25f;
+
+ SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
+ SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
+
+ R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
+ SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
+ SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
+ SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
+ R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
+ } else {
+ // ch+ct is ln(2)/ln(10) to more than 36 bits
+ const float ch_log10 = 0x1.344000p-2f;
+ const float ct_log10 = 0x1.3509f6p-18f;
+
+ // ch + ct is ln(2) to more than 36 bits
+ const float ch_log = 0x1.62e000p-1f;
+ const float ct_log = 0x1.0bfbe8p-15f;
+
+ SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
+ SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
+
+ SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
+ SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
+ SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
+ SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
+ SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
+
+ SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
+ SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
+ SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
+ R = getMad(DAG, DL, VT, YH, CH, Mad1);
+ }
+
+ const bool IsFiniteOnly = (Flags.hasNoNaNs() || Options.NoNaNsFPMath) &&
+ (Flags.hasNoInfs() || Options.NoInfsFPMath);
+
+ // TODO: Check if known finite from source value.
+ if (!IsFiniteOnly) {
+ SDValue IsFinite = getIsFinite(DAG, Y, Flags);
+ R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
+ }
+
+ if (IsScaled) {
+ SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
+ SDValue ShiftK =
+ DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
+ SDValue Shift =
+ DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
+ R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
+ }
+
+ return R;
+}
+
+SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
+ return LowerFLOGCommon(Op, DAG);
+}
+
+// Do f32 fast math expansion for flog or flog10. This is accurate enough for a
+// promote f16 operation.
+SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
+ SelectionDAG &DAG,
+ double Log2BaseInverted,
+ SDNodeFlags Flags) const {
+ EVT VT = Src.getValueType();
+ unsigned LogOp = VT == MVT::f32 ? AMDGPUISD::LOG : ISD::FLOG2;
+ SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
+ SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
+
+ return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
+ Flags);
+}
+
+SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
+ // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
+ // If we have to handle denormals, scale up the input and adjust the result.
+
SDLoc SL(Op);
+ EVT VT = Op.getValueType();
SDValue Src = Op.getOperand(0);
+ SDNodeFlags Flags = Op->getFlags();
+
+ if (VT == MVT::f16) {
+ // Nothing in half is a denormal when promoted to f32.
+ assert(!Subtarget->has16BitInsts());
+ SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
+ SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
+ return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
+ DAG.getTargetConstant(0, SL, MVT::i32), Flags);
+ }
+ assert(VT == MVT::f32);
+
+ if (allowApproxFunc(DAG, Flags) || !needsDenormHandlingF32(DAG, Src, Flags))
+ return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
+
+ // bool needs_scaling = x < -0x1.f80000p+6f;
+ // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
+
+ // -nextafter(128.0, -1)
+ SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ SDValue NeedsScaling =
+ DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
+
+ SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
+ SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
+
+ SDValue AddOffset =
+ DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
+
+ SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
+ SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
+
+ SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
+ SDValue One = DAG.getConstantFP(1.0, SL, VT);
+ SDValue ResultScale =
+ DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
+
+ return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
+}
+
+SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue Op, const SDLoc &SL,
+ SelectionDAG &DAG,
+ SDNodeFlags Flags) const {
+ // exp2(M_LOG2E_F * f);
+ EVT VT = Op.getValueType();
const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
- return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Op, K, Flags);
+ return DAG.getNode(VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2, SL, VT, Mul,
+ Flags);
+}
+
+SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc SL(Op);
+ SDValue X = Op.getOperand(0);
+ SDNodeFlags Flags = Op->getFlags();
+ const bool IsExp10 = false; // TODO: For some reason exp10 is missing
+
+ if (VT.getScalarType() == MVT::f16) {
+ // v_exp_f16 (fmul x, log2e)
+ if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
+ return lowerFEXPUnsafe(X, SL, DAG, Flags);
+
+ if (VT.isVector())
+ return SDValue();
+
+ // exp(f16 x) ->
+ // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
+
+ // Nothing in half is a denormal when promoted to f32.
+ SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
+ SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
+ return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
+ DAG.getTargetConstant(0, SL, MVT::i32), Flags);
+ }
+
+ assert(VT == MVT::f32);
+
+ // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
+ // library behavior. Also, is known-not-daz source sufficient?
+ if (allowApproxFunc(DAG, Flags) && !needsDenormHandlingF32(DAG, X, Flags)) {
+ assert(!IsExp10 && "todo exp10 support");
+ return lowerFEXPUnsafe(X, SL, DAG, Flags);
+ }
+
+ // Algorithm:
+ //
+ // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
+ //
+ // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
+ // n = 64*m + j, 0 <= j < 64
+ //
+ // e^x = 2^((64*m + j + f)/64)
+ // = (2^m) * (2^(j/64)) * 2^(f/64)
+ // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
+ //
+ // f = x*(64/ln(2)) - n
+ // r = f*(ln(2)/64) = x - n*(ln(2)/64)
+ //
+ // e^x = (2^m) * (2^(j/64)) * e^r
+ //
+ // (2^(j/64)) is precomputed
+ //
+ // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
+ // e^r = 1 + q
+ //
+ // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
+ //
+ // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
+ SDNodeFlags FlagsNoContract = Flags;
+ FlagsNoContract.setAllowContract(false);
+
+ SDValue PH, PL;
+ if (Subtarget->hasFastFMAF32()) {
+ const float c_exp = numbers::log2ef;
+ const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
+ const float c_exp10 = 0x1.a934f0p+1f;
+ const float cc_exp10 = 0x1.2f346ep-24f;
+
+ SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
+ SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
+
+ PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
+ SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
+ SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
+ PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
+ } else {
+ const float ch_exp = 0x1.714000p+0f;
+ const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
+
+ const float ch_exp10 = 0x1.a92000p+1f;
+ const float cl_exp10 = 0x1.4f0978p-11f;
+
+ SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
+ SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
+
+ SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
+ SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
+ SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
+ SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
+ SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
+
+ PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
+
+ SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
+ SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
+ PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
+ }
+
+ SDValue E = DAG.getNode(ISD::FRINT, SL, VT, PH, Flags);
+
+ // It is unsafe to contract this fsub into the PH multiply.
+ SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
+
+ SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
+ SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
+ SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
+
+ SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
+
+ SDValue UnderflowCheckConst =
+ DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
+ SDValue Underflow =
+ DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
+
+ R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
+ const auto &Options = getTargetMachine().Options;
+
+ if (!Flags.hasNoInfs() && !Options.NoInfsFPMath) {
+ SDValue OverflowCheckConst =
+ DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
+ SDValue Overflow =
+ DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
+ SDValue Inf =
+ DAG.getConstantFP(APFloat::getInf(APFloat::IEEEsingle()), SL, VT);
+ R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
+ }
+
+ return R;
}
static bool isCtlzOpc(unsigned Opc) {
@@ -2518,7 +3069,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG,
ShAmt);
// On GCN, use LDEXP directly.
if (Subtarget->isGCN())
- return DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f32, FVal, ShAmt);
+ return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
// Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
// part directly to emulate the multiplication of 2^ShAmt. That 8-bit
@@ -2551,7 +3102,7 @@ SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
- SDValue LdExp = DAG.getNode(AMDGPUISD::LDEXP, SL, MVT::f64, CvtHi,
+ SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
DAG.getConstant(32, SL, MVT::i32));
// TODO: Should this propagate fast-math-flags?
return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
@@ -2670,15 +3221,17 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
SDValue K0, K1;
if (SrcVT == MVT::f64) {
- K0 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)),
- SL, SrcVT);
- K1 = DAG.getConstantFP(BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)),
- SL, SrcVT);
+ K0 = DAG.getConstantFP(
+ llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
+ SrcVT);
+ K1 = DAG.getConstantFP(
+ llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
+ SrcVT);
} else {
- K0 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)), SL,
- SrcVT);
- K1 = DAG.getConstantFP(BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)), SL,
- SrcVT);
+ K0 = DAG.getConstantFP(
+ llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
+ K1 = DAG.getConstantFP(
+ llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
}
// TODO: Should this propagate fast-math-flags?
SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
@@ -3128,6 +3681,17 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
SDValue Src = N->getOperand(1);
return Src.isUndef() ? Src : SDValue();
}
+ case Intrinsic::amdgcn_frexp_exp: {
+ // frexp_exp (fneg x) -> frexp_exp x
+ // frexp_exp (fabs x) -> frexp_exp x
+ // frexp_exp (fneg (fabs x)) -> frexp_exp x
+ SDValue Src = N->getOperand(1);
+ SDValue PeekSign = peekFPSignOps(Src);
+ if (PeekSign == Src)
+ return SDValue();
+ return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
+ 0);
+ }
default:
return SDValue();
}
@@ -3419,6 +3983,16 @@ static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
}
+/// If \p V is an add of a constant 1, returns the other operand. Otherwise
+/// return SDValue().
+static SDValue getAddOneOp(const SDNode *V) {
+ if (V->getOpcode() != ISD::ADD)
+ return SDValue();
+
+ auto *C = dyn_cast<ConstantSDNode>(V->getOperand(1));
+ return C && C->isOne() ? V->getOperand(0) : SDValue();
+}
+
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
@@ -3434,16 +4008,49 @@ SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
if (VT.isVector() || Size > 64)
return SDValue();
- // There are i16 integer mul/mad.
- if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
- return SDValue();
-
SelectionDAG &DAG = DCI.DAG;
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
+ // matching.
+
+ // mul x, (add y, 1) -> add (mul x, y), x
+ auto IsFoldableAdd = [](SDValue V) -> SDValue {
+ SDValue AddOp = getAddOneOp(V.getNode());
+ if (!AddOp)
+ return SDValue();
+
+ if (V.hasOneUse() || all_of(V->uses(), [](const SDNode *U) -> bool {
+ return U->getOpcode() == ISD::MUL;
+ }))
+ return AddOp;
+
+ return SDValue();
+ };
+
+ // FIXME: The selection pattern is not properly checking for commuted
+ // operands, so we have to place the mul in the LHS
+ if (SDValue MulOper = IsFoldableAdd(N0)) {
+ SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
+ return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
+ }
+
+ if (SDValue MulOper = IsFoldableAdd(N1)) {
+ SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
+ return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
+ }
+
+ // Skip if already mul24.
+ if (N->getOpcode() != ISD::MUL)
+ return SDValue();
+
+ // There are i16 integer mul/mad.
+ if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
+ return SDValue();
+
// SimplifyDemandedBits has the annoying habit of turning useful zero_extends
// in the source into any_extends if the result of the mul is truncated. Since
// we can assume the high bits are whatever we want, use the underlying value
@@ -3583,12 +4190,6 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
return DAG.getZExtOrTrunc(Mulhi, DL, VT);
}
-static bool isNegativeOne(SDValue Val) {
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
- return C->isAllOnes();
- return false;
-}
-
SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
SDValue Op,
const SDLoc &DL,
@@ -3631,7 +4232,7 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C
// select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
if (CCOpcode == ISD::SETEQ &&
(isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
- RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
+ RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
unsigned Opc =
isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
return getFFBX_U32(DAG, CmpLHS, SL, Opc);
@@ -3641,7 +4242,7 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C
// select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
if (CCOpcode == ISD::SETNE &&
(isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
- LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
+ LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
unsigned Opc =
isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
@@ -3673,8 +4274,9 @@ static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
-static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
- SDValue N) {
+SDValue
+AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) const {
SelectionDAG &DAG = DCI.DAG;
SDValue Cond = N.getOperand(0);
SDValue LHS = N.getOperand(1);
@@ -3683,6 +4285,9 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
EVT VT = N.getValueType();
if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
(LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
+ if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
+ return SDValue();
+
return distributeOpThroughSelect(DCI, LHS.getOpcode(),
SDLoc(N), Cond, LHS, RHS);
}
@@ -3695,7 +4300,8 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
// TODO: Support vector constants.
ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
- if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS) {
+ if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
+ !selectSupportsSourceMods(N.getNode())) {
SDLoc SL(N);
// If one side is an fneg/fabs and the other is a constant, we can push the
// fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
@@ -3707,17 +4313,31 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
if (NewLHS.hasOneUse()) {
unsigned Opc = NewLHS.getOpcode();
- if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(Opc))
+ if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
ShouldFoldNeg = false;
if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
ShouldFoldNeg = false;
}
if (ShouldFoldNeg) {
+ if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
+ return SDValue();
+
+ // We're going to be forced to use a source modifier anyway, there's no
+ // point to pulling the negate out unless we can get a size reduction by
+ // negating the constant.
+ //
+ // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
+ // about cheaper constants.
+ if (NewLHS.getOpcode() == ISD::FABS &&
+ getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
+ return SDValue();
+
+ if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
+ return SDValue();
+
if (LHS.getOpcode() == ISD::FNEG)
NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
- else if (CRHS->isNegative())
- return SDValue();
if (Inv)
std::swap(NewLHS, NewRHS);
@@ -3732,7 +4352,6 @@ static SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
-
SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
@@ -3791,15 +4410,26 @@ static bool isInv2Pi(const APFloat &APF) {
// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
// additional cost to negate them.
-bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
- if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N)) {
- if (C->isZero() && !C->isNegative())
- return true;
+TargetLowering::NegatibleCost
+AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
+ if (C->isZero())
+ return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
- if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
- return true;
- }
+ if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
+ return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
+
+ return NegatibleCost::Neutral;
+}
+
+bool AMDGPUTargetLowering::isConstantCostlierToNegate(SDValue N) const {
+ if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
+ return getConstantNegateCost(C) == NegatibleCost::Expensive;
+ return false;
+}
+bool AMDGPUTargetLowering::isConstantCheaperToNegate(SDValue N) const {
+ if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
+ return getConstantNegateCost(C) == NegatibleCost::Cheaper;
return false;
}
@@ -3822,14 +4452,9 @@ static unsigned inverseMinMax(unsigned Opc) {
}
}
-SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
- DAGCombinerInfo &DCI) const {
- SelectionDAG &DAG = DCI.DAG;
- SDValue N0 = N->getOperand(0);
- EVT VT = N->getValueType(0);
-
- unsigned Opc = N0.getOpcode();
-
+/// \return true if it's profitable to try to push an fneg into its source
+/// instruction.
+bool AMDGPUTargetLowering::shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
// If the input has multiple uses and we can either fold the negate down, or
// the other uses cannot, give up. This both prevents unprofitable
// transformations and infinite loops: we won't repeatedly try to fold around
@@ -3838,13 +4463,27 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
// This may be able to fold into the source, but at a code size cost. Don't
// fold if the fold into the user is free.
if (allUsesHaveSourceMods(N, 0))
- return SDValue();
+ return false;
} else {
- if (fnegFoldsIntoOp(Opc) &&
+ if (fnegFoldsIntoOp(N0.getNode()) &&
(allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
- return SDValue();
+ return false;
}
+ return true;
+}
+
+SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ unsigned Opc = N0.getOpcode();
+
+ if (!shouldFoldFNegIntoSrc(N, N0))
+ return SDValue();
+
SDLoc SL(N);
switch (Opc) {
case ISD::FADD: {
@@ -4027,6 +4666,67 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
DAG.getConstant(0x8000, SL, SrcVT));
return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
}
+ case ISD::SELECT: {
+ // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
+ // TODO: Invert conditions of foldFreeOpFromSelect
+ return SDValue();
+ }
+ case ISD::BITCAST: {
+ SDLoc SL(N);
+ SDValue BCSrc = N0.getOperand(0);
+ if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
+ if (HighBits.getValueType().getSizeInBits() != 32 ||
+ !fnegFoldsIntoOp(HighBits.getNode()))
+ return SDValue();
+
+ // f64 fneg only really needs to operate on the high half of the
+ // register, so try to force it to an f32 operation to help make use of
+ // source modifiers.
+ //
+ //
+ // fneg (f64 (bitcast (build_vector x, y))) ->
+ // f64 (bitcast (build_vector (bitcast i32:x to f32),
+ // (fneg (bitcast i32:y to f32)))
+
+ SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
+ SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
+ SDValue CastBack =
+ DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
+
+ SmallVector<SDValue, 8> Ops(BCSrc->op_begin(), BCSrc->op_end());
+ Ops.back() = CastBack;
+ DCI.AddToWorklist(NegHi.getNode());
+ SDValue Build =
+ DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
+ SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
+
+ if (!N0.hasOneUse())
+ DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
+ return Result;
+ }
+
+ if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
+ BCSrc.hasOneUse()) {
+ // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
+ // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
+
+ // TODO: Cast back result for multiple uses is beneficial in some cases.
+
+ SDValue LHS =
+ DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
+ SDValue RHS =
+ DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
+
+ SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
+ SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
+
+ return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
+ NegRHS);
+ }
+
+ return SDValue();
+ }
default:
return SDValue();
}
@@ -4158,6 +4858,15 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performTruncateCombine(N, DCI);
case ISD::MUL:
return performMulCombine(N, DCI);
+ case AMDGPUISD::MUL_U24:
+ case AMDGPUISD::MUL_I24: {
+ if (SDValue Simplified = simplifyMul24(N, DCI))
+ return Simplified;
+ return performMulCombine(N, DCI);
+ }
+ case AMDGPUISD::MULHI_I24:
+ case AMDGPUISD::MULHI_U24:
+ return simplifyMul24(N, DCI);
case ISD::SMUL_LOHI:
case ISD::UMUL_LOHI:
return performMulLoHiCombine(N, DCI);
@@ -4165,11 +4874,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performMulhsCombine(N, DCI);
case ISD::MULHU:
return performMulhuCombine(N, DCI);
- case AMDGPUISD::MUL_I24:
- case AMDGPUISD::MUL_U24:
- case AMDGPUISD::MULHI_I24:
- case AMDGPUISD::MULHI_U24:
- return simplifyMul24(N, DCI);
case ISD::SELECT:
return performSelectCombine(N, DCI);
case ISD::FNEG:
@@ -4365,7 +5069,7 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
return V;
unsigned Mask = Arg.getMask();
- unsigned Shift = countTrailingZeros<unsigned>(Mask);
+ unsigned Shift = llvm::countr_zero<unsigned>(Mask);
V = DAG.getNode(ISD::SRL, SL, VT, V,
DAG.getShiftAmountConstant(Shift, VT, SL));
return DAG.getNode(ISD::AND, SL, VT, V,
@@ -4373,14 +5077,11 @@ SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
}
uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
- const MachineFunction &MF, const ImplicitParameter Param) const {
- const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
- const AMDGPUSubtarget &ST =
- AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
- unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
- const Align Alignment = ST.getAlignmentForImplicitArgPtr();
- uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
- ExplicitArgOffset;
+ uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
+ unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
+ const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
+ uint64_t ArgOffset =
+ alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
switch (Param) {
case FIRST_IMPLICIT:
return ArgOffset;
@@ -4394,6 +5095,12 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
llvm_unreachable("unexpected implicit parameter type");
}
+uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
+ const MachineFunction &MF, const ImplicitParameter Param) const {
+ const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+ return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
+}
+
#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
@@ -4409,10 +5116,12 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(LOOP)
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(TC_RETURN)
+ NODE_NAME_CASE(TC_RETURN_GFX)
NODE_NAME_CASE(TRAP)
- NODE_NAME_CASE(RET_FLAG)
+ NODE_NAME_CASE(RET_GLUE)
NODE_NAME_CASE(RETURN_TO_EPILOG)
NODE_NAME_CASE(ENDPGM)
+ NODE_NAME_CASE(ENDPGM_TRAP)
NODE_NAME_CASE(DWORDADDR)
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(SETCC)
@@ -4444,9 +5153,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RCP_LEGACY)
NODE_NAME_CASE(RCP_IFLAG)
+ NODE_NAME_CASE(LOG)
+ NODE_NAME_CASE(EXP)
NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
- NODE_NAME_CASE(LDEXP)
NODE_NAME_CASE(FP_CLASS)
NODE_NAME_CASE(DOT4)
NODE_NAME_CASE(CARRY)
@@ -4508,8 +5218,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(DS_ORDERED_COUNT)
NODE_NAME_CASE(ATOMIC_CMP_SWAP)
- NODE_NAME_CASE(ATOMIC_INC)
- NODE_NAME_CASE(ATOMIC_DEC)
NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
NODE_NAME_CASE(BUFFER_LOAD)
@@ -4725,31 +5433,38 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.Zero.setLowBits(Log2(Alignment));
break;
}
+ case AMDGPUISD::SMIN3:
+ case AMDGPUISD::SMAX3:
+ case AMDGPUISD::SMED3:
+ case AMDGPUISD::UMIN3:
+ case AMDGPUISD::UMAX3:
+ case AMDGPUISD::UMED3: {
+ KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
+ if (Known2.isUnknown())
+ break;
+
+ KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
+ if (Known1.isUnknown())
+ break;
+
+ KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
+ if (Known0.isUnknown())
+ break;
+
+ // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
+ Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
+ Known.One = Known0.One & Known1.One & Known2.One;
+ break;
+ }
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
switch (IID) {
- case Intrinsic::amdgcn_mbcnt_lo:
- case Intrinsic::amdgcn_mbcnt_hi: {
- const GCNSubtarget &ST =
- DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
- // These return at most the (wavefront size - 1) + src1
- // As long as src1 is an immediate we can calc known bits
- KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
- unsigned Src1ValBits = Src1Known.countMaxActiveBits();
- unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
- // Cater for potential carry
- MaxActiveBits += Src1ValBits ? 1 : 0;
- unsigned Size = Op.getValueType().getSizeInBits();
- if (MaxActiveBits < Size)
- Known.Zero.setHighBits(Size - MaxActiveBits);
- break;
- }
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::amdgcn_workitem_id_z: {
unsigned MaxValue = Subtarget->getMaxWorkitemID(
DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
- Known.Zero.setHighBits(countLeadingZeros(MaxValue));
+ Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
break;
}
default:
@@ -4795,6 +5510,26 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
return 16;
case AMDGPUISD::FP_TO_FP16:
return 16;
+ case AMDGPUISD::SMIN3:
+ case AMDGPUISD::SMAX3:
+ case AMDGPUISD::SMED3:
+ case AMDGPUISD::UMIN3:
+ case AMDGPUISD::UMAX3:
+ case AMDGPUISD::UMED3: {
+ unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
+ if (Tmp2 == 1)
+ return 1; // Early out.
+
+ unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+ if (Tmp1 == 1)
+ return 1; // Early out.
+
+ unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ if (Tmp0 == 1)
+ return 1; // Early out.
+
+ return std::min(Tmp0, std::min(Tmp1, Tmp2));
+ }
default:
return 1;
}
@@ -4818,6 +5553,20 @@ unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
return 24;
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
return 16;
+ case AMDGPU::G_AMDGPU_SMED3:
+ case AMDGPU::G_AMDGPU_UMED3: {
+ auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
+ unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
+ if (Tmp2 == 1)
+ return 1;
+ unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
+ if (Tmp1 == 1)
+ return 1;
+ unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
+ if (Tmp0 == 1)
+ return 1;
+ return std::min(Tmp0, std::min(Tmp1, Tmp2));
+ }
default:
return 1;
}
@@ -4871,7 +5620,7 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
// TODO: Need is known positive check.
return false;
}
- case AMDGPUISD::LDEXP:
+ case ISD::FLDEXP:
case AMDGPUISD::FRACT: {
if (SNaN)
return true;
@@ -4936,6 +5685,11 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
}
}
+bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
+ Register N0, Register N1) const {
+ return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
+}
+
TargetLowering::AtomicExpansionKind
AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
switch (RMW->getOperation()) {
@@ -4962,3 +5716,22 @@ bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
Ty2 == LLT::scalar(32);
}
+
+/// Whether it is profitable to sink the operands of an
+/// Instruction I to the basic block of I.
+/// This helps using several modifiers (like abs and neg) more often.
+bool AMDGPUTargetLowering::shouldSinkOperands(
+ Instruction *I, SmallVectorImpl<Use *> &Ops) const {
+ using namespace PatternMatch;
+
+ for (auto &Op : I->operands()) {
+ // Ensure we are not already sinking this operand.
+ if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
+ continue;
+
+ if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value())))
+ Ops.push_back(&Op);
+ }
+
+ return !Ops.empty();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index bc3b57a82d08..26b91155ba85 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -60,8 +60,23 @@ protected:
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG,
- double Log2BaseInverted) const;
+
+ SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op,
+ SDNodeFlags Flags) const;
+ SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const;
+ std::pair<SDValue, SDValue> getScaledLogInput(SelectionDAG &DAG,
+ const SDLoc SL, SDValue Op,
+ SDNodeFlags Flags) const;
+
+ SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG,
+ double Log2BaseInverted, SDNodeFlags Flags) const;
+ SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const;
+
+ SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG,
+ SDNodeFlags Flags) const;
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;
@@ -97,9 +112,16 @@ protected:
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS,
SDValue RHS, DAGCombinerInfo &DCI) const;
+
+ SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
+ SDValue N) const;
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ TargetLowering::NegatibleCost
+ getConstantNegateCost(const ConstantFPSDNode *C) const;
+
bool isConstantCostlierToNegate(SDValue N) const;
+ bool isConstantCheaperToNegate(SDValue N) const;
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const;
@@ -156,6 +178,7 @@ public:
return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
}
+ static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc);
static bool allUsesHaveSourceMods(const SDNode *N,
unsigned CostThreshold = 4);
bool isFAbsFree(EVT VT) const override;
@@ -165,14 +188,13 @@ public:
bool isZExtFree(Type *Src, Type *Dest) const override;
bool isZExtFree(EVT Src, EVT Dest) const override;
- bool isZExtFree(SDValue Val, EVT VT2) const override;
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations, bool ForCodeSize,
NegatibleCost &Cost,
unsigned Depth) const override;
- bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+ bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override;
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;
@@ -193,7 +215,7 @@ public:
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG,
const MachineMemOperand &MMO) const final;
- bool storeOfVectorConstantIsCheap(EVT MemVT,
+ bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT,
unsigned NumElem,
unsigned AS) const override;
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override;
@@ -229,6 +251,10 @@ public:
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+ SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue RHS, SDValue True, SDValue False,
+ SDValue CC, DAGCombinerInfo &DCI) const;
+
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
SDValue RHS, SDValue True, SDValue False,
SDValue CC, DAGCombinerInfo &DCI) const;
@@ -281,6 +307,9 @@ public:
bool SNaN = false,
unsigned Depth = 0) const override;
+ bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0,
+ Register N1) const override;
+
/// Helper function that adds Reg to the LiveIn list of the DAG's
/// MachineFunction.
///
@@ -333,6 +362,8 @@ public:
/// type of implicit parameter.
uint32_t getImplicitParameterOffset(const MachineFunction &MF,
const ImplicitParameter Param) const;
+ uint32_t getImplicitParameterOffset(const uint64_t ExplicitKernArgSize,
+ const ImplicitParameter Param) const;
MVT getFenceOperandTy(const DataLayout &DL) const override {
return MVT::i32;
@@ -342,6 +373,9 @@ public:
bool isConstantUnsignedBitfieldExtractLegal(unsigned Opc, LLT Ty1,
LLT Ty2) const override;
+
+ bool shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const override;
};
namespace AMDGPUISD {
@@ -356,6 +390,7 @@ enum NodeType : unsigned {
// Function call.
CALL,
TC_RETURN,
+ TC_RETURN_GFX,
TRAP,
// Masked control flow nodes.
@@ -366,11 +401,14 @@ enum NodeType : unsigned {
// A uniform kernel return that terminates the wavefront.
ENDPGM,
+ // s_endpgm, but we may want to insert it in the middle of the block.
+ ENDPGM_TRAP,
+
// Return to a shader part's epilog code.
RETURN_TO_EPILOG,
// Return with values from a non-entry function.
- RET_FLAG,
+ RET_GLUE,
DWORDADDR,
FRACT,
@@ -421,9 +459,15 @@ enum NodeType : unsigned {
RSQ,
RCP_LEGACY,
RCP_IFLAG,
+
+ // log2, no denormal handling for f32.
+ LOG,
+
+ // exp2, no denormal handling for f32.
+ EXP,
+
FMUL_LEGACY,
RSQ_CLAMP,
- LDEXP,
FP_CLASS,
DOT4,
CARRY,
@@ -505,8 +549,6 @@ enum NodeType : unsigned {
TBUFFER_LOAD_FORMAT_D16,
DS_ORDERED_COUNT,
ATOMIC_CMP_SWAP,
- ATOMIC_INC,
- ATOMIC_DEC,
ATOMIC_LOAD_FMIN,
ATOMIC_LOAD_FMAX,
BUFFER_LOAD,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
index c9cdbc89f3a4..7619a39bac9c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp
@@ -51,7 +51,7 @@ public:
MI.getOpcode() == AMDGPU::S_SENDMSG_RTN_B64)
return true;
if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- (MI.getOperand(0).getImm() & 0xf000) == 0)
+ AMDGPU::DepCtr::decodeFieldVaVdst(MI.getOperand(0).getImm()) == 0)
return true;
return false;
}
@@ -77,11 +77,15 @@ public:
struct DelayInfo {
// One larger than the maximum number of (non-TRANS) VALU instructions we
// can encode in an s_delay_alu instruction.
- static const unsigned VALU_MAX = 5;
+ static constexpr unsigned VALU_MAX = 5;
// One larger than the maximum number of TRANS instructions we can encode in
// an s_delay_alu instruction.
- static const unsigned TRANS_MAX = 4;
+ static constexpr unsigned TRANS_MAX = 4;
+
+ // One larger than the maximum number of SALU cycles we can encode in an
+ // s_delay_alu instruction.
+ static constexpr unsigned SALU_CYCLES_MAX = 4;
// If it was written by a (non-TRANS) VALU, remember how many clock cycles
// are left until it completes, and how many other (non-TRANS) VALU we have
@@ -120,7 +124,9 @@ public:
TRANSNumVALU = 0;
break;
case SALU:
- SALUCycles = Cycles;
+ // Guard against pseudo-instructions like SI_CALL which are marked as
+ // SALU but with a very high latency.
+ SALUCycles = std::min(Cycles, SALU_CYCLES_MAX);
break;
}
}
@@ -278,6 +284,7 @@ public:
// Wait for an SALU instruction.
if (Delay.SALUCycles) {
+ assert(Delay.SALUCycles < DelayInfo::SALU_CYCLES_MAX);
if (Imm & 0x780) {
// We have already encoded a VALU and a TRANS delay. There's no room in
// the encoding for an SALU delay as well, so just drop it.
@@ -349,6 +356,7 @@ public:
if (instructionWaitsForVALU(MI)) {
// Forget about all outstanding VALU delays.
+ // TODO: This is overkill since it also forgets about SALU delays.
State = DelayState();
} else if (Type != OTHER) {
DelayInfo Delay;
@@ -360,11 +368,11 @@ public:
// ignore this operand.
if (MI.getOpcode() == AMDGPU::V_WRITELANE_B32 && Op.isTied())
continue;
- for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI) {
- auto It = State.find(*UI);
+ for (MCRegUnit Unit : TRI->regunits(Op.getReg())) {
+ auto It = State.find(Unit);
if (It != State.end()) {
Delay.merge(It->second);
- State.erase(*UI);
+ State.erase(Unit);
}
}
}
@@ -380,9 +388,9 @@ public:
// TODO: Scan implicit defs too?
for (const auto &Op : MI.defs()) {
unsigned Latency = SchedModel.computeOperandLatency(
- &MI, MI.getOperandNo(&Op), nullptr, 0);
- for (MCRegUnitIterator UI(Op.getReg(), TRI); UI.isValid(); ++UI)
- State[*UI] = DelayInfo(Type, Latency);
+ &MI, Op.getOperandNo(), nullptr, 0);
+ for (MCRegUnit Unit : TRI->regunits(Op.getReg()))
+ State[Unit] = DelayInfo(Type, Latency);
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 62c3eec41836..3c399e497227 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -23,6 +23,7 @@
#include <optional>
using namespace llvm;
+using namespace llvm::PatternMatch;
#define DEBUG_TYPE "AMDGPUtti"
@@ -328,7 +329,8 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
});
}
-bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
+bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
+ const Value *Op0, const Value *Op1,
InstCombiner &IC) const {
// The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
// infinity, gives +0.0. If we can prove we don't have one of the special
@@ -340,15 +342,72 @@ bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
// One operand is not zero or infinity or NaN.
return true;
}
+
auto *TLI = &IC.getTargetLibraryInfo();
- if (isKnownNeverInfinity(Op0, TLI) && isKnownNeverNaN(Op0, TLI) &&
- isKnownNeverInfinity(Op1, TLI) && isKnownNeverNaN(Op1, TLI)) {
+ if (isKnownNeverInfOrNaN(Op0, IC.getDataLayout(), TLI, 0,
+ &IC.getAssumptionCache(), &I,
+ &IC.getDominatorTree()) &&
+ isKnownNeverInfOrNaN(Op1, IC.getDataLayout(), TLI, 0,
+ &IC.getAssumptionCache(), &I,
+ &IC.getDominatorTree())) {
// Neither operand is infinity or NaN.
return true;
}
return false;
}
+/// Match an fpext from half to float, or a constant we can convert.
+static bool matchFPExtFromF16(Value *Arg, Value *&FPExtSrc) {
+ if (match(Arg, m_OneUse(m_FPExt(m_Value(FPExtSrc)))))
+ return FPExtSrc->getType()->isHalfTy();
+
+ ConstantFP *CFP;
+ if (match(Arg, m_ConstantFP(CFP))) {
+ bool LosesInfo;
+ APFloat Val(CFP->getValueAPF());
+ Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
+ if (LosesInfo)
+ return false;
+
+ FPExtSrc = ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
+ return true;
+ }
+
+ return false;
+}
+
+// Trim all zero components from the end of the vector \p UseV and return
+// an appropriate bitset with known elements.
+static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
+ Instruction *I) {
+ auto *VTy = cast<FixedVectorType>(UseV->getType());
+ unsigned VWidth = VTy->getNumElements();
+ APInt DemandedElts = APInt::getAllOnes(VWidth);
+
+ for (int i = VWidth - 1; i > 0; --i) {
+ auto *Elt = findScalarElement(UseV, i);
+ if (!Elt)
+ break;
+
+ if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
+ if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
+ break;
+ } else {
+ break;
+ }
+
+ DemandedElts.clearBit(i);
+ }
+
+ return DemandedElts;
+}
+
+static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
+ IntrinsicInst &II,
+ APInt DemandedElts,
+ int DMaskIdx = -1,
+ bool IsLoad = true);
+
std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Intrinsic::ID IID = II.getIntrinsicID();
@@ -393,6 +452,54 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
break;
}
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_exp2: {
+ const bool IsLog = IID == Intrinsic::amdgcn_log;
+ const bool IsExp = IID == Intrinsic::amdgcn_exp2;
+ Value *Src = II.getArgOperand(0);
+ Type *Ty = II.getType();
+
+ if (isa<PoisonValue>(Src))
+ return IC.replaceInstUsesWith(II, Src);
+
+ if (IC.getSimplifyQuery().isUndefValue(Src))
+ return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
+
+ if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
+ if (C->isInfinity()) {
+ // exp2(+inf) -> +inf
+ // log2(+inf) -> +inf
+ if (!C->isNegative())
+ return IC.replaceInstUsesWith(II, C);
+
+ // exp2(-inf) -> 0
+ if (IsExp && C->isNegative())
+ return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
+ }
+
+ if (II.isStrictFP())
+ break;
+
+ if (C->isNaN()) {
+ Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
+ return IC.replaceInstUsesWith(II, Quieted);
+ }
+
+ // f32 instruction doesn't handle denormals, f16 does.
+ if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
+ Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
+ : ConstantFP::get(Ty, 1.0);
+ return IC.replaceInstUsesWith(II, FoldedValue);
+ }
+
+ if (IsLog && C->isNegative())
+ return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));
+
+ // TODO: Full constant folding matching hardware behavior.
+ }
+
+ break;
+ }
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp: {
Value *Src = II.getArgOperand(0);
@@ -423,85 +530,31 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
Value *Src0 = II.getArgOperand(0);
Value *Src1 = II.getArgOperand(1);
const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
- if (!CMask) {
- if (isa<UndefValue>(Src0)) {
- return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
- }
+ if (CMask) {
+ II.setCalledOperand(Intrinsic::getDeclaration(
+ II.getModule(), Intrinsic::is_fpclass, Src0->getType()));
- if (isa<UndefValue>(Src1)) {
- return IC.replaceInstUsesWith(II,
- ConstantInt::get(II.getType(), false));
- }
- break;
+ // Clamp any excess bits, as they're illegal for the generic intrinsic.
+ II.setArgOperand(1, ConstantInt::get(Src1->getType(),
+ CMask->getZExtValue() & fcAllFlags));
+ return &II;
}
- uint32_t Mask = CMask->getZExtValue();
-
- // If all tests are made, it doesn't matter what the value is.
- if ((Mask & fcAllFlags) == fcAllFlags) {
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), true));
- }
+ // Propagate poison.
+ if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
+ return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
- if ((Mask & fcAllFlags) == 0) {
+ // llvm.amdgcn.class(_, undef) -> false
+ if (IC.getSimplifyQuery().isUndefValue(Src1))
return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));
- }
-
- if (Mask == fcNan && !II.isStrictFP()) {
- // Equivalent of isnan. Replace with standard fcmp.
- Value *FCmp = IC.Builder.CreateFCmpUNO(Src0, Src0);
- FCmp->takeName(&II);
- return IC.replaceInstUsesWith(II, FCmp);
- }
-
- if (Mask == fcZero && !II.isStrictFP()) {
- // Equivalent of == 0.
- Value *FCmp =
- IC.Builder.CreateFCmpOEQ(Src0, ConstantFP::get(Src0->getType(), 0.0));
- FCmp->takeName(&II);
- return IC.replaceInstUsesWith(II, FCmp);
+ // llvm.amdgcn.class(undef, mask) -> mask != 0
+ if (IC.getSimplifyQuery().isUndefValue(Src0)) {
+ Value *CmpMask = IC.Builder.CreateICmpNE(
+ Src1, ConstantInt::getNullValue(Src1->getType()));
+ return IC.replaceInstUsesWith(II, CmpMask);
}
-
- // fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
- if ((Mask & fcNan) && isKnownNeverNaN(Src0, &IC.getTargetLibraryInfo())) {
- return IC.replaceOperand(
- II, 1, ConstantInt::get(Src1->getType(), Mask & ~fcNan));
- }
-
- const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
- if (!CVal) {
- if (isa<UndefValue>(Src0)) {
- return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
- }
-
- // Clamp mask to used bits
- if ((Mask & fcAllFlags) != Mask) {
- CallInst *NewCall = IC.Builder.CreateCall(
- II.getCalledFunction(),
- {Src0, ConstantInt::get(Src1->getType(), Mask & fcAllFlags)});
-
- NewCall->takeName(&II);
- return IC.replaceInstUsesWith(II, NewCall);
- }
-
- break;
- }
-
- const APFloat &Val = CVal->getValueAPF();
-
- bool Result =
- ((Mask & fcSNan) && Val.isNaN() && Val.isSignaling()) ||
- ((Mask & fcQNan) && Val.isNaN() && !Val.isSignaling()) ||
- ((Mask & fcNegInf) && Val.isInfinity() && Val.isNegative()) ||
- ((Mask & fcNegNormal) && Val.isNormal() && Val.isNegative()) ||
- ((Mask & fcNegSubnormal) && Val.isDenormal() && Val.isNegative()) ||
- ((Mask & fcNegZero) && Val.isZero() && Val.isNegative()) ||
- ((Mask & fcPosZero) && Val.isZero() && !Val.isNegative()) ||
- ((Mask & fcPosSubnormal) && Val.isDenormal() && !Val.isNegative()) ||
- ((Mask & fcPosNormal) && Val.isNormal() && !Val.isNegative()) ||
- ((Mask & fcPosInf) && Val.isInfinity() && !Val.isNegative());
-
- return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), Result));
+ break;
}
case Intrinsic::amdgcn_cvt_pkrtz: {
Value *Src0 = II.getArgOperand(0);
@@ -695,6 +748,20 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
}
}
+ if (!ST->hasMed3_16())
+ break;
+
+ Value *X, *Y, *Z;
+
+ // Repeat floating-point width reduction done for minnum/maxnum.
+ // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
+ if (matchFPExtFromF16(Src0, X) && matchFPExtFromF16(Src1, Y) &&
+ matchFPExtFromF16(Src2, Z)) {
+ Value *NewCall = IC.Builder.CreateIntrinsic(IID, {X->getType()},
+ {X, Y, Z}, &II, II.getName());
+ return new FPExtInst(NewCall, II.getType());
+ }
+
break;
}
case Intrinsic::amdgcn_icmp:
@@ -835,31 +902,18 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
break;
}
+ case Intrinsic::amdgcn_mbcnt_hi: {
+ // exec_hi is all 0, so this is just a copy.
+ if (ST->isWave32())
+ return IC.replaceInstUsesWith(II, II.getArgOperand(1));
+ break;
+ }
case Intrinsic::amdgcn_ballot: {
if (auto *Src = dyn_cast<ConstantInt>(II.getArgOperand(0))) {
if (Src->isZero()) {
// amdgcn.ballot(i1 0) is zero.
return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
}
-
- if (Src->isOne()) {
- // amdgcn.ballot(i1 1) is exec.
- const char *RegName = "exec";
- if (II.getType()->isIntegerTy(32))
- RegName = "exec_lo";
- else if (!II.getType()->isIntegerTy(64))
- break;
-
- Function *NewF = Intrinsic::getDeclaration(
- II.getModule(), Intrinsic::read_register, II.getType());
- Metadata *MDArgs[] = {MDString::get(II.getContext(), RegName)};
- MDNode *MD = MDNode::get(II.getContext(), MDArgs);
- Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
- CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
- NewCall->addFnAttr(Attribute::Convergent);
- NewCall->takeName(&II);
- return IC.replaceInstUsesWith(II, NewCall);
- }
}
break;
}
@@ -981,13 +1035,8 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (II.isStrictFP())
break;
- if (C && C->isNaN()) {
- // FIXME: We just need to make the nan quiet here, but that's unavailable
- // on APFloat, only IEEEfloat
- auto *Quieted =
- ConstantFP::get(Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
- return IC.replaceInstUsesWith(II, Quieted);
- }
+ if (C && C->isNaN())
+ return IC.replaceInstUsesWith(II, ConstantFP::get(Ty, C->makeQuiet()));
// ldexp(x, 0) -> x
// ldexp(x, undef) -> x
@@ -1006,11 +1055,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
// TODO: Move to InstSimplify?
if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
match(Op1, PatternMatch::m_AnyZeroFP()))
- return IC.replaceInstUsesWith(II, ConstantFP::getNullValue(II.getType()));
+ return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
// If we can prove we don't have one of the special cases then we can use a
// normal fmul instruction instead.
- if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
+ if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
FMul->takeName(&II);
return IC.replaceInstUsesWith(II, FMul);
@@ -1029,7 +1078,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
match(Op1, PatternMatch::m_AnyZeroFP())) {
// It's tempting to just return Op2 here, but that would give the wrong
// result if Op2 was -0.0.
- auto *Zero = ConstantFP::getNullValue(II.getType());
+ auto *Zero = ConstantFP::getZero(II.getType());
auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
FAdd->takeName(&II);
return IC.replaceInstUsesWith(II, FAdd);
@@ -1037,7 +1086,7 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
// If we can prove we don't have one of the special cases then we can use a
// normal fma instead.
- if (canSimplifyLegacyMulToMul(Op0, Op1, IC)) {
+ if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
II.setCalledOperand(Intrinsic::getDeclaration(
II.getModule(), Intrinsic::fma, II.getType()));
return &II;
@@ -1053,26 +1102,62 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
break;
}
- default: {
- if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
- AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
- return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
+ case Intrinsic::amdgcn_buffer_store_format:
+ case Intrinsic::amdgcn_raw_buffer_store_format:
+ case Intrinsic::amdgcn_struct_buffer_store_format:
+ case Intrinsic::amdgcn_raw_tbuffer_store:
+ case Intrinsic::amdgcn_struct_tbuffer_store:
+ case Intrinsic::amdgcn_tbuffer_store:
+ case Intrinsic::amdgcn_image_store_1d:
+ case Intrinsic::amdgcn_image_store_1darray:
+ case Intrinsic::amdgcn_image_store_2d:
+ case Intrinsic::amdgcn_image_store_2darray:
+ case Intrinsic::amdgcn_image_store_2darraymsaa:
+ case Intrinsic::amdgcn_image_store_2dmsaa:
+ case Intrinsic::amdgcn_image_store_3d:
+ case Intrinsic::amdgcn_image_store_cube:
+ case Intrinsic::amdgcn_image_store_mip_1d:
+ case Intrinsic::amdgcn_image_store_mip_1darray:
+ case Intrinsic::amdgcn_image_store_mip_2d:
+ case Intrinsic::amdgcn_image_store_mip_2darray:
+ case Intrinsic::amdgcn_image_store_mip_3d:
+ case Intrinsic::amdgcn_image_store_mip_cube: {
+ if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
+ break;
+
+ APInt DemandedElts =
+ trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+
+ int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
+ if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
+ false)) {
+ return IC.eraseInstFromFunction(II);
}
+
+ break;
+ }
}
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
+ return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
}
return std::nullopt;
}
/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
///
+/// The result of simplifying amdgcn image and buffer store intrinsics is updating
+/// definitions of the intrinsics vector argument, not Uses of the result like
+/// image and buffer loads.
/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
/// struct returns.
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
IntrinsicInst &II,
APInt DemandedElts,
- int DMaskIdx = -1) {
+ int DMaskIdx, bool IsLoad) {
- auto *IIVTy = cast<FixedVectorType>(II.getType());
+ auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
+ : II.getOperand(0)->getType());
unsigned VWidth = IIVTy->getNumElements();
if (VWidth == 1)
return nullptr;
@@ -1088,7 +1173,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
// Buffer case.
const unsigned ActiveBits = DemandedElts.getActiveBits();
- const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
+ const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
// Start assuming the prefix of elements is demanded, but possibly clear
// some other bits if there are trailing zeros (unused components at front)
@@ -1101,6 +1186,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
unsigned OffsetIdx;
switch (II.getIntrinsicID()) {
case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load:
OffsetIdx = 1;
break;
case Intrinsic::amdgcn_s_buffer_load:
@@ -1113,6 +1199,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
OffsetIdx = 1;
break;
case Intrinsic::amdgcn_struct_buffer_load:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load:
OffsetIdx = 2;
break;
default:
@@ -1143,13 +1230,13 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
unsigned NewDMaskVal = 0;
- unsigned OrigLoadIdx = 0;
+ unsigned OrigLdStIdx = 0;
for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
const unsigned Bit = 1 << SrcIdx;
if (!!(DMaskVal & Bit)) {
- if (!!DemandedElts[OrigLoadIdx])
+ if (!!DemandedElts[OrigLdStIdx])
NewDMaskVal |= Bit;
- OrigLoadIdx++;
+ OrigLdStIdx++;
}
}
@@ -1157,7 +1244,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
}
- unsigned NewNumElts = DemandedElts.countPopulation();
+ unsigned NewNumElts = DemandedElts.popcount();
if (!NewNumElts)
return UndefValue::get(IIVTy);
@@ -1177,29 +1264,45 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
(NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
OverloadTys[0] = NewTy;
+ if (!IsLoad) {
+ SmallVector<int, 8> EltMask;
+ for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
+ if (DemandedElts[OrigStoreIdx])
+ EltMask.push_back(OrigStoreIdx);
+
+ if (NewNumElts == 1)
+ Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
+ else
+ Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
+ }
+
Function *NewIntrin = Intrinsic::getDeclaration(
II.getModule(), II.getIntrinsicID(), OverloadTys);
CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
NewCall->takeName(&II);
NewCall->copyMetadata(II);
- if (NewNumElts == 1) {
- return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
- DemandedElts.countTrailingZeros());
- }
+ if (IsLoad) {
+ if (NewNumElts == 1) {
+ return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
+ DemandedElts.countr_zero());
+ }
- SmallVector<int, 8> EltMask;
- unsigned NewLoadIdx = 0;
- for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
- if (!!DemandedElts[OrigLoadIdx])
- EltMask.push_back(NewLoadIdx++);
- else
- EltMask.push_back(NewNumElts);
- }
+ SmallVector<int, 8> EltMask;
+ unsigned NewLoadIdx = 0;
+ for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
+ if (!!DemandedElts[OrigLoadIdx])
+ EltMask.push_back(NewLoadIdx++);
+ else
+ EltMask.push_back(NewNumElts);
+ }
+
+ auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
- Value *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
+ return Shuffle;
+ }
- return Shuffle;
+ return NewCall;
}
std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
@@ -1211,12 +1314,18 @@ std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
case Intrinsic::amdgcn_buffer_load:
case Intrinsic::amdgcn_buffer_load_format:
case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
case Intrinsic::amdgcn_s_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_tbuffer_load:
+ case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
case Intrinsic::amdgcn_tbuffer_load:
return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
default: {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 15b7f971f09c..b69cae0c73b3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -18,10 +18,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
]>;
-def AMDGPULdExpOp : SDTypeProfile<1, 2,
- [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
->;
-
def AMDGPUFPClassOp : SDTypeProfile<1, 2,
[SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
>;
@@ -43,6 +39,7 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
[SDTCisFP<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisInt<4>]
>;
+def ImmOp : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def AMDGPUIfOp : SDTypeProfile<1, 2,
@@ -85,9 +82,16 @@ def AMDGPUcall : SDNode<"AMDGPUISD::CALL",
SDNPVariadic]
>;
-def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN",
- SDTypeProfile<0, 3, [SDTCisPtrTy<0>]>,
- [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+def AMDGPUTCReturnTP : SDTypeProfile<0, 3, [
+ SDTCisPtrTy<0>
+]>;
+
+def AMDGPUtc_return: SDNode<"AMDGPUISD::TC_RETURN", AMDGPUTCReturnTP,
+[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
+
+def AMDGPUtc_return_gfx: SDNode<"AMDGPUISD::TC_RETURN_GFX", AMDGPUTCReturnTP,
+[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
def AMDGPUtrap : SDNode<"AMDGPUISD::TRAP",
@@ -111,6 +115,12 @@ def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
// out = 1.0 / a
def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
+// v_log_f32, which is log2
+def AMDGPUlog_impl : SDNode<"AMDGPUISD::LOG", SDTFPUnaryOp>;
+
+// v_exp_f32, which is exp2
+def AMDGPUexp_impl : SDNode<"AMDGPUISD::EXP", SDTFPUnaryOp>;
+
// out = 1.0 / sqrt(a)
def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
@@ -121,8 +131,6 @@ def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
-def AMDGPUldexp_impl : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
-
def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
@@ -351,11 +359,13 @@ def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai
//===----------------------------------------------------------------------===//
def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
[SDNPHasChain, SDNPOptInGlue]>;
+def AMDGPUendpgm_trap : SDNode<"AMDGPUISD::ENDPGM_TRAP", SDTNone,
+ [SDNPHasChain]>;
def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+def AMDGPUret_glue : SDNode<"AMDGPUISD::RET_GLUE", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
@@ -381,10 +391,15 @@ def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src),
(AMDGPUcos_impl node:$src)]>;
def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src),
(AMDGPUfract_impl node:$src)]>;
+def AMDGPUlog : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
+ (AMDGPUlog_impl node:$src)]>;
+def AMDGPUlogf16 : PatFrags<(ops node:$src), [(int_amdgcn_log node:$src),
+ (flog2 node:$src)]>;
-def AMDGPUldexp : PatFrags<(ops node:$src0, node:$src1),
- [(int_amdgcn_ldexp node:$src0, node:$src1),
- (AMDGPUldexp_impl node:$src0, node:$src1)]>;
+def AMDGPUexp : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src),
+ (AMDGPUexp_impl node:$src)]>;
+def AMDGPUexpf16 : PatFrags<(ops node:$src), [(int_amdgcn_exp2 node:$src),
+ (fexp2 node:$src)]>;
def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1),
[(int_amdgcn_class node:$src0, node:$src1),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 2639f1f45565..747f9fe2f8ae 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -19,8 +19,8 @@
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
-#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -63,7 +63,7 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector(
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits *KB,
- CodeGenCoverage &CoverageInfo,
+ CodeGenCoverage *CoverageInfo,
ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI) {
MRI = &MF.getRegInfo();
@@ -523,60 +523,6 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
return true;
}
-bool AMDGPUInstructionSelector::selectG_FMA_FMAD(MachineInstr &I) const {
- assert(I.getOpcode() == AMDGPU::G_FMA || I.getOpcode() == AMDGPU::G_FMAD);
-
- // Try to manually select MAD_MIX/FMA_MIX.
- Register Dst = I.getOperand(0).getReg();
- LLT ResultTy = MRI->getType(Dst);
- bool IsFMA = I.getOpcode() == AMDGPU::G_FMA;
- if (ResultTy != LLT::scalar(32) ||
- (IsFMA ? !Subtarget->hasFmaMixInsts() : !Subtarget->hasMadMixInsts()))
- return false;
-
- // Avoid using v_mad_mix_f32/v_fma_mix_f32 unless there is actually an operand
- // using the conversion from f16.
- bool MatchedSrc0, MatchedSrc1, MatchedSrc2;
- auto [Src0, Src0Mods] =
- selectVOP3PMadMixModsImpl(I.getOperand(1), MatchedSrc0);
- auto [Src1, Src1Mods] =
- selectVOP3PMadMixModsImpl(I.getOperand(2), MatchedSrc1);
- auto [Src2, Src2Mods] =
- selectVOP3PMadMixModsImpl(I.getOperand(3), MatchedSrc2);
-
-#ifndef NDEBUG
- const SIMachineFunctionInfo *MFI =
- I.getMF()->getInfo<SIMachineFunctionInfo>();
- AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
- assert((IsFMA || !Mode.allFP32Denormals()) &&
- "fmad selected with denormals enabled");
-#endif
-
- // TODO: We can select this with f32 denormals enabled if all the sources are
- // converted from f16 (in which case fmad isn't legal).
- if (!MatchedSrc0 && !MatchedSrc1 && !MatchedSrc2)
- return false;
-
- const unsigned OpC = IsFMA ? AMDGPU::V_FMA_MIX_F32 : AMDGPU::V_MAD_MIX_F32;
- MachineInstr *MixInst =
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpC), Dst)
- .addImm(Src0Mods)
- .addReg(copyToVGPRIfSrcFolded(Src0, Src0Mods, I.getOperand(1), &I))
- .addImm(Src1Mods)
- .addReg(copyToVGPRIfSrcFolded(Src1, Src1Mods, I.getOperand(2), &I))
- .addImm(Src2Mods)
- .addReg(copyToVGPRIfSrcFolded(Src2, Src2Mods, I.getOperand(3), &I))
- .addImm(0)
- .addImm(0)
- .addImm(0);
-
- if (!constrainSelectedInstRegOperands(*MixInst, TII, TRI, RBI))
- return false;
-
- I.eraseFromParent();
- return true;
-}
-
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
MachineBasicBlock *BB = MI.getParent();
Register DstReg = MI.getOperand(0).getReg();
@@ -1100,6 +1046,8 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return selectIntrinsicCmp(I);
case Intrinsic::amdgcn_ballot:
return selectBallot(I);
+ case Intrinsic::amdgcn_inverse_ballot:
+ return selectInverseBallot(I);
case Intrinsic::amdgcn_reloc_constant:
return selectRelocConstant(I);
case Intrinsic::amdgcn_groupstaticsize:
@@ -1343,27 +1291,26 @@ bool AMDGPUInstructionSelector::selectIntrinsicCmp(MachineInstr &I) const {
if (Opcode == -1)
return false;
- MachineInstr *SelectedMI;
- if (CmpInst::isFPPredicate(Pred)) {
- MachineOperand &LHS = I.getOperand(2);
- MachineOperand &RHS = I.getOperand(3);
- auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
- auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
- Register Src0Reg =
- copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
- Register Src1Reg =
- copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
- SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
- .addImm(Src0Mods)
- .addReg(Src0Reg)
- .addImm(Src1Mods)
- .addReg(Src1Reg)
- .addImm(0); // clamp
- } else {
- SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
- .add(I.getOperand(2))
- .add(I.getOperand(3));
- }
+ MachineInstrBuilder SelectedMI;
+ MachineOperand &LHS = I.getOperand(2);
+ MachineOperand &RHS = I.getOperand(3);
+ auto [Src0, Src0Mods] = selectVOP3ModsImpl(LHS);
+ auto [Src1, Src1Mods] = selectVOP3ModsImpl(RHS);
+ Register Src0Reg =
+ copyToVGPRIfSrcFolded(Src0, Src0Mods, LHS, &I, /*ForceVGPR*/ true);
+ Register Src1Reg =
+ copyToVGPRIfSrcFolded(Src1, Src1Mods, RHS, &I, /*ForceVGPR*/ true);
+ SelectedMI = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst);
+ if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src0_modifiers))
+ SelectedMI.addImm(Src0Mods);
+ SelectedMI.addReg(Src0Reg);
+ if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::src1_modifiers))
+ SelectedMI.addImm(Src1Mods);
+ SelectedMI.addReg(Src1Reg);
+ if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::clamp))
+ SelectedMI.addImm(0); // clamp
+ if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel))
+ SelectedMI.addImm(0); // op_sel
RBI.constrainGenericRegister(Dst, *TRI.getBoolRC(), *MRI);
if (!constrainSelectedInstRegOperands(*SelectedMI, TII, TRI, RBI))
@@ -1379,28 +1326,56 @@ bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const unsigned Size = MRI->getType(DstReg).getSizeInBits();
const bool Is64 = Size == 64;
+ const bool IsWave32 = (STI.getWavefrontSize() == 32);
- if (Size != STI.getWavefrontSize())
+ // In the common case, the return type matches the wave size.
+ // However we also support emitting i64 ballots in wave32 mode.
+ if (Size != STI.getWavefrontSize() && (!Is64 || !IsWave32))
return false;
std::optional<ValueAndVReg> Arg =
getIConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI);
+ const auto BuildCopy = [&](Register SrcReg) {
+ if (Size == STI.getWavefrontSize()) {
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(SrcReg);
+ return;
+ }
+
+ // If emitting a i64 ballot in wave32, fill the upper bits with zeroes.
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B32), HiReg).addImm(0);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(SrcReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+ };
+
if (Arg) {
const int64_t Value = Arg->Value.getSExtValue();
if (Value == 0) {
unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
- } else if (Value == -1) { // all ones
- Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
- } else
+ } else if (Value == -1) // all ones
+ BuildCopy(IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC);
+ else
return false;
- } else {
- Register SrcReg = I.getOperand(2).getReg();
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
- }
+ } else
+ BuildCopy(I.getOperand(2).getReg());
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectInverseBallot(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register MaskReg = I.getOperand(2).getReg();
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(MaskReg);
I.eraseFromParent();
return true;
}
@@ -1635,7 +1610,7 @@ bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
.addImm(0);
} else {
std::tie(BaseOffset, ImmOffset) =
- AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KnownBits);
+ AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset, KB);
if (Readfirstlane) {
// We have the constant offset now, so put the readfirstlane back on the
@@ -1824,7 +1799,7 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
}
// Set G16 opcode
- if (IsG16 && !IsA16) {
+ if (Subtarget->hasG16() && IsG16) {
const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
assert(G16MappingInfo);
@@ -1859,7 +1834,10 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
// The legalizer preprocessed the intrinsic arguments. If we aren't using
// NSA, these should have been packed into a single value in the first
// address register
- const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
+ const bool UseNSA =
+ NumVAddrRegs != 1 &&
+ (STI.hasPartialNSAEncoding() ? NumVAddrDwords >= NumVAddrRegs
+ : NumVAddrDwords == NumVAddrRegs);
if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
return false;
@@ -1898,7 +1876,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
NumVDataDwords, NumVAddrDwords);
}
- assert(Opcode != -1);
+ if (Opcode == -1)
+ return false;
auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
.cloneMemRefs(MI);
@@ -2050,7 +2029,9 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
case Intrinsic::amdgcn_s_barrier:
return selectSBarrier(I);
case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
return selectBufferLoadLds(I);
case Intrinsic::amdgcn_global_load_lds:
return selectGlobalLoadLds(I);
@@ -2137,7 +2118,7 @@ static int sizeToSubRegIndex(unsigned Size) {
return AMDGPU::sub0;
if (Size > 256)
return -1;
- return sizeToSubRegIndex(PowerOf2Ceil(Size));
+ return sizeToSubRegIndex(llvm::bit_ceil(Size));
}
}
@@ -2801,7 +2782,7 @@ bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
// Try to avoid emitting a bit operation when we only need to touch half of
// the 64-bit pointer.
- APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zext(64);
+ APInt MaskOnes = KB->getKnownOnes(MaskReg).zext(64);
const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
@@ -2953,7 +2934,7 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
unsigned SubReg;
std::tie(IdxReg, SubReg) = computeIndirectRegIndex(
- *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KnownBits);
+ *MRI, TRI, SrcRC, IdxReg, DstTy.getSizeInBits() / 8, *KB);
if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
if (DstTy.getSizeInBits() != 32 && !Is64)
@@ -3033,8 +3014,8 @@ bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
return false;
unsigned SubReg;
- std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
- ValSize / 8, *KnownBits);
+ std::tie(IdxReg, SubReg) =
+ computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg, ValSize / 8, *KB);
const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
STI.useVGPRIndexMode();
@@ -3402,11 +3383,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectG_FABS(I);
case TargetOpcode::G_EXTRACT:
return selectG_EXTRACT(I);
- case TargetOpcode::G_FMA:
- case TargetOpcode::G_FMAD:
- if (selectG_FMA_FMAD(I))
- return true;
- return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_MERGE_VALUES:
case TargetOpcode::G_CONCAT_VECTORS:
return selectG_MERGE_VALUES(I);
@@ -3446,9 +3422,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_MAX:
case TargetOpcode::G_ATOMICRMW_UMIN:
case TargetOpcode::G_ATOMICRMW_UMAX:
+ case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
+ case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
case TargetOpcode::G_ATOMICRMW_FADD:
- case AMDGPU::G_AMDGPU_ATOMIC_INC:
- case AMDGPU::G_AMDGPU_ATOMIC_DEC:
case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_ATOMIC_FMAX:
return selectG_LOAD_STORE_ATOMICRMW(I);
@@ -3460,7 +3436,11 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_ANYEXT:
case TargetOpcode::G_SEXT_INREG:
- if (selectImpl(I, *CoverageInfo))
+ // This is a workaround. For extension from type i1, `selectImpl()` uses
+ // patterns from TD file and generates an illegal VGPR to SGPR COPY as type
+ // i1 can only be hold in a SGPR class.
+ if (MRI->getType(I.getOperand(1).getReg()) != LLT::scalar(1) &&
+ selectImpl(I, *CoverageInfo))
return true;
return selectG_SZA_EXT(I);
case TargetOpcode::G_BRCOND:
@@ -3506,8 +3486,10 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
-std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
- MachineOperand &Root, bool AllowAbs, bool OpSel) const {
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root,
+ bool IsCanonicalizing,
+ bool AllowAbs, bool OpSel) const {
Register Src = Root.getReg();
unsigned Mods = 0;
MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
@@ -3516,6 +3498,15 @@ std::pair<Register, unsigned> AMDGPUInstructionSelector::selectVOP3ModsImpl(
Src = MI->getOperand(1).getReg();
Mods |= SISrcMods::NEG;
MI = getDefIgnoringCopies(Src, *MRI);
+ } else if (MI->getOpcode() == AMDGPU::G_FSUB && IsCanonicalizing) {
+ // Fold fsub [+-]0 into fneg. This may not have folded depending on the
+ // denormal mode, but we're implicitly canonicalizing in a source operand.
+ const ConstantFP *LHS =
+ getConstantFPVRegVal(MI->getOperand(1).getReg(), *MRI);
+ if (LHS && LHS->isZero()) {
+ Mods |= SISrcMods::NEG;
+ Src = MI->getOperand(2).getReg();
+ }
}
if (AllowAbs && MI->getOpcode() == AMDGPU::G_FABS) {
@@ -3578,7 +3569,9 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
+ /*IsCanonicalizing=*/true,
+ /*AllowAbs=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3614,10 +3607,26 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3ModsNonCanonicalizing(
+ MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/false);
+
+ return {{
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addReg(copyToVGPRIfSrcFolded(Src, Mods, Root, MIB));
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3BMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /* AllowAbs */ false);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root, /*IsCanonicalizing=*/true,
+ /*AllowAbs=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3653,6 +3662,8 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl(
MI = MRI.getVRegDef(Src);
}
+ // TODO: Handle G_FSUB 0 as fneg
+
// TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
(void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard()
@@ -3739,8 +3750,9 @@ AMDGPUInstructionSelector::selectVINTERPMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
- /* AllowAbs */ false,
- /* OpSel */ false);
+ /*IsCanonicalizing=*/true,
+ /*AllowAbs=*/false,
+ /*OpSel=*/false);
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3756,8 +3768,9 @@ AMDGPUInstructionSelector::selectVINTERPModsHi(MachineOperand &Root) const {
Register Src;
unsigned Mods;
std::tie(Src, Mods) = selectVOP3ModsImpl(Root,
- /* AllowAbs */ false,
- /* OpSel */ true);
+ /*IsCanonicalizing=*/true,
+ /*AllowAbs=*/false,
+ /*OpSel=*/true);
return {{
[=](MachineInstrBuilder &MIB) {
@@ -3903,7 +3916,7 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root,
int64_t ConstOffset;
std::tie(PtrBase, ConstOffset) =
getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
- if (ConstOffset == 0)
+ if (ConstOffset == 0 || !isFlatScratchBaseLegal(PtrBase, FlatVariant))
return Default;
unsigned AddrSpace = (*MI->memoperands_begin())->getAddrSpace();
@@ -4066,7 +4079,7 @@ AMDGPUInstructionSelector::selectScratchSAddr(MachineOperand &Root) const {
// possible.
std::tie(PtrBase, ConstOffset) = getPtrBaseWithConstantOffset(Addr, *MRI);
- if (ConstOffset != 0 &&
+ if (ConstOffset != 0 && isFlatScratchBaseLegal(PtrBase) &&
TII.isLegalFLATOffset(ConstOffset, AMDGPUAS::PRIVATE_ADDRESS,
SIInstrFlags::FlatScratch)) {
Addr = PtrBase;
@@ -4122,9 +4135,9 @@ bool AMDGPUInstructionSelector::checkFlatScratchSVSSwizzleBug(
// The bug affects the swizzling of SVS accesses if there is any carry out
// from the two low order bits (i.e. from bit 1 into bit 2) when adding
// voffset to (soffset + inst_offset).
- auto VKnown = KnownBits->getKnownBits(VAddr);
+ auto VKnown = KB->getKnownBits(VAddr);
auto SKnown = KnownBits::computeForAddSub(
- true, false, KnownBits->getKnownBits(SAddr),
+ true, false, KB->getKnownBits(SAddr),
KnownBits::makeConstant(APInt(32, ImmOffset)));
uint64_t VMax = VKnown.getMaxValue().getZExtValue();
uint64_t SMax = SKnown.getMaxValue().getZExtValue();
@@ -4159,6 +4172,9 @@ AMDGPUInstructionSelector::selectScratchSVAddr(MachineOperand &Root) const {
Register LHS = AddrDef->MI->getOperand(1).getReg();
auto LHSDef = getDefSrcRegIgnoringCopies(LHS, *MRI);
+ if (!isFlatScratchBaseLegal(LHS) || !isFlatScratchBaseLegal(RHS))
+ return std::nullopt;
+
if (checkFlatScratchSVSSwizzleBug(RHS, LHS, ImmOffset))
return std::nullopt;
@@ -4195,9 +4211,10 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
// TODO: Should this be inside the render function? The iterator seems to
// move.
+ const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset();
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::V_MOV_B32_e32),
HighBits)
- .addImm(Offset & ~4095);
+ .addImm(Offset & ~MaxOffset);
return {{[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
@@ -4211,7 +4228,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { // offset
- MIB.addImm(Offset & 4095);
+ MIB.addImm(Offset & MaxOffset);
}}};
}
@@ -4228,7 +4245,7 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
if (ConstOffset != 0) {
if (SIInstrInfo::isLegalMUBUFImmOffset(ConstOffset) &&
(!STI.privateMemoryResourceIsRangeChecked() ||
- KnownBits->signBitIsZero(PtrBase))) {
+ KB->signBitIsZero(PtrBase))) {
const MachineInstr *PtrBaseDef = MRI->getVRegDef(PtrBase);
if (PtrBaseDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
FI = PtrBaseDef->getOperand(1).getIndex();
@@ -4270,7 +4287,7 @@ bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
// On Southern Islands instruction with a negative base value and an offset
// don't seem to work.
- return KnownBits->signBitIsZero(Base);
+ return KB->signBitIsZero(Base);
}
bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
@@ -4286,7 +4303,17 @@ bool AMDGPUInstructionSelector::isDSOffset2Legal(Register Base, int64_t Offset0,
// On Southern Islands instruction with a negative base value and an offset
// don't seem to work.
- return KnownBits->signBitIsZero(Base);
+ return KB->signBitIsZero(Base);
+}
+
+bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(
+ Register Base, uint64_t FlatVariant) const {
+ if (FlatVariant != SIInstrFlags::FlatScratch)
+ return true;
+
+ // When value in 32-bit Base can be negative calculate scratch offset using
+ // 32-bit add instruction, otherwise use Base(unsigned) + offset.
+ return KB->signBitIsZero(Base);
}
bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
@@ -4298,12 +4325,11 @@ bool AMDGPUInstructionSelector::isUnneededShiftMask(const MachineInstr &MI,
if (!RHS)
return false;
- if (RHS->countTrailingOnes() >= ShAmtBits)
+ if (RHS->countr_one() >= ShAmtBits)
return true;
- const APInt &LHSKnownZeros =
- KnownBits->getKnownZeroes(MI.getOperand(1).getReg());
- return (LHSKnownZeros | *RHS).countTrailingOnes() >= ShAmtBits;
+ const APInt &LHSKnownZeros = KB->getKnownZeroes(MI.getOperand(1).getReg());
+ return (LHSKnownZeros | *RHS).countr_one() >= ShAmtBits;
}
// Return the wave level SGPR base address if this is a wave address.
@@ -4746,64 +4772,6 @@ AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
}};
}
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
- Register VAddr;
- Register RSrcReg;
- Register SOffset;
- int64_t Offset = 0;
-
- if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
- return {};
-
- // FIXME: Use defaulted operands for trailing 0s and remove from the complex
- // pattern.
- return {{
- [=](MachineInstrBuilder &MIB) { // rsrc
- MIB.addReg(RSrcReg);
- },
- [=](MachineInstrBuilder &MIB) { // vaddr
- MIB.addReg(VAddr);
- },
- [=](MachineInstrBuilder &MIB) { // soffset
- if (SOffset)
- MIB.addReg(SOffset);
- else
- MIB.addImm(0);
- },
- [=](MachineInstrBuilder &MIB) { // offset
- MIB.addImm(Offset);
- },
- [=](MachineInstrBuilder &MIB) {
- MIB.addImm(AMDGPU::CPol::GLC); // cpol
- }
- }};
-}
-
-InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
- Register RSrcReg;
- Register SOffset;
- int64_t Offset = 0;
-
- if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
- return {};
-
- return {{
- [=](MachineInstrBuilder &MIB) { // rsrc
- MIB.addReg(RSrcReg);
- },
- [=](MachineInstrBuilder &MIB) { // soffset
- if (SOffset)
- MIB.addReg(SOffset);
- else
- MIB.addImm(0);
- },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(AMDGPU::CPol::GLC); } // cpol
- }};
-}
-
/// Get an immediate that must be 32-bits, and treated as zero extended.
static std::optional<uint64_t>
getConstantZext32Val(Register Reg, const MachineRegisterInfo &MRI) {
@@ -4851,7 +4819,7 @@ AMDGPUInstructionSelector::selectSMRDBufferSgprImm(MachineOperand &Root) const {
Register SOffset;
unsigned Offset;
std::tie(SOffset, Offset) =
- AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KnownBits);
+ AMDGPU::getBaseWithConstantOffset(*MRI, Root.getReg(), KB);
if (!SOffset)
return std::nullopt;
@@ -4984,6 +4952,22 @@ AMDGPUInstructionSelector::selectVOP3PMadMixModsImpl(MachineOperand &Root,
}
InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PMadMixModsExt(
+ MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ bool Matched;
+ std::tie(Src, Mods) = selectVOP3PMadMixModsImpl(Root, Matched);
+ if (!Matched)
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3PMadMixMods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
@@ -5031,7 +5015,7 @@ void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB,
int OpIdx) const {
assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
"Expected G_CONSTANT");
- MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation());
+ MIB.addImm(MI.getOperand(1).getCImm()->getValue().popcount());
}
/// This only really exists to satisfy DAG type checking machinery, so is a
@@ -5042,6 +5026,13 @@ void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
MIB.addImm(MI.getOperand(OpIdx).getImm());
}
+void AMDGPUInstructionSelector::renderOpSelTImm(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm(MI.getOperand(OpIdx).getImm() ? (int64_t)SISrcMods::OP_SEL_0 : 0);
+}
+
void AMDGPUInstructionSelector::renderExtractCPol(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 33a01ed0a1ce..243ff72e2979 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
+#include "SIDefines.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/IR/InstrTypes.h"
@@ -58,7 +59,7 @@ public:
static const char *getName();
void setupMF(MachineFunction &MF, GISelKnownBits *KB,
- CodeGenCoverage &CoverageInfo, ProfileSummaryInfo *PSI,
+ CodeGenCoverage *CoverageInfo, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI) override;
private:
@@ -111,6 +112,7 @@ private:
bool selectDivScale(MachineInstr &MI) const;
bool selectIntrinsicCmp(MachineInstr &MI) const;
bool selectBallot(MachineInstr &I) const;
+ bool selectInverseBallot(MachineInstr &I) const;
bool selectRelocConstant(MachineInstr &I) const;
bool selectGroupStaticSize(MachineInstr &I) const;
bool selectReturnAddress(MachineInstr &I) const;
@@ -146,9 +148,10 @@ private:
bool selectSMFMACIntrin(MachineInstr &I) const;
bool selectWaveAddress(MachineInstr &I) const;
- std::pair<Register, unsigned>
- selectVOP3ModsImpl(MachineOperand &Root, bool AllowAbs = true,
- bool OpSel = false) const;
+ std::pair<Register, unsigned> selectVOP3ModsImpl(MachineOperand &Root,
+ bool IsCanonicalizing = true,
+ bool AllowAbs = true,
+ bool OpSel = false) const;
Register copyToVGPRIfSrcFolded(Register Src, unsigned Mods,
MachineOperand Root, MachineInstr *InsertPt,
@@ -169,6 +172,8 @@ private:
InstructionSelector::ComplexRendererFns
selectVOP3Mods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectVOP3ModsNonCanonicalizing(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
selectVOP3BMods(MachineOperand &Root) const;
ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
@@ -236,6 +241,8 @@ private:
bool isDSOffsetLegal(Register Base, int64_t Offset) const;
bool isDSOffset2Legal(Register Base, int64_t Offset0, int64_t Offset1,
unsigned Size) const;
+ bool isFlatScratchBaseLegal(
+ Register Base, uint64_t FlatVariant = SIInstrFlags::FlatScratch) const;
std::pair<Register, unsigned>
selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
@@ -285,18 +292,13 @@ private:
InstructionSelector::ComplexRendererFns
selectMUBUFOffset(MachineOperand &Root) const;
- InstructionSelector::ComplexRendererFns
- selectMUBUFOffsetAtomic(MachineOperand &Root) const;
-
- InstructionSelector::ComplexRendererFns
- selectMUBUFAddr64Atomic(MachineOperand &Root) const;
-
ComplexRendererFns selectSMRDBufferImm(MachineOperand &Root) const;
ComplexRendererFns selectSMRDBufferImm32(MachineOperand &Root) const;
ComplexRendererFns selectSMRDBufferSgprImm(MachineOperand &Root) const;
std::pair<Register, unsigned> selectVOP3PMadMixModsImpl(MachineOperand &Root,
bool &Matched) const;
+ ComplexRendererFns selectVOP3PMadMixModsExt(MachineOperand &Root) const;
ComplexRendererFns selectVOP3PMadMixMods(MachineOperand &Root) const;
void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
@@ -305,6 +307,9 @@ private:
void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderOpSelTImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+
void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 22b327279211..2305097e3f94 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -104,15 +104,18 @@ class PredicateControl {
}
class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
- PredicateControl;
+ PredicateControl, GISelFlags;
+
+let GIIgnoreCopies = 1 in
+class AMDGPUPatIgnoreCopies<dag pattern, dag result> : AMDGPUPat<pattern, result>;
let RecomputePerFunction = 1 in {
-def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
-def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()">;
-def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
-def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
-def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()">;
-def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
+def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals != DenormalMode::getPreserveSign()">;
+def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals != DenormalMode::getPreserveSign()">;
+def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals != DenormalMode::getPreserveSign()">;
+def NoFP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
+def NoFP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals == DenormalMode::getPreserveSign()">;
+def NoFP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign()">;
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
}
@@ -120,37 +123,45 @@ def FMA : Predicate<"Subtarget->hasFMA()">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
-def u16ImmTarget : AsmOperandClass {
- let Name = "U16Imm";
- let RenderMethod = "addImmOperands";
-}
+def i1imm_0 : OperandWithDefaultOps<i1, (ops (i1 0))>;
-def s16ImmTarget : AsmOperandClass {
- let Name = "S16Imm";
+class CustomOperandClass<string name, bit optional, string parserMethod,
+ string defaultMethod>
+ : AsmOperandClass {
+ let Name = name;
+ let PredicateMethod = "is"#name;
+ let ParserMethod = parserMethod;
let RenderMethod = "addImmOperands";
+ let IsOptional = optional;
+ let DefaultMethod = defaultMethod;
}
-let OperandType = "OPERAND_IMMEDIATE" in {
-
-def u32imm : Operand<i32> {
- let PrintMethod = "printU32ImmOperand";
-}
-
-def u16imm : Operand<i16> {
- let PrintMethod = "printU16ImmOperand";
- let ParserMatchClass = u16ImmTarget;
+class CustomOperandProps<bit optional = 0, string name = NAME> {
+ string ImmTy = "ImmTy"#name;
+ string ParserMethod = "parse"#name;
+ string DefaultValue = "0";
+ string DefaultMethod = "[this]() { return "#
+ "AMDGPUOperand::CreateImm(this, "#DefaultValue#", SMLoc(), "#
+ "AMDGPUOperand::"#ImmTy#"); }";
+ string PrintMethod = "print"#name;
+ AsmOperandClass ParserMatchClass =
+ CustomOperandClass<name, optional, ParserMethod, DefaultMethod>;
+ string OperandType = "OPERAND_IMMEDIATE";
}
-def s16imm : Operand<i16> {
- let PrintMethod = "printU16ImmOperand";
- let ParserMatchClass = s16ImmTarget;
-}
+class CustomOperand<ValueType type, bit optional = 0, string name = NAME>
+ : Operand<type>, CustomOperandProps<optional, name>;
-def u8imm : Operand<i8> {
- let PrintMethod = "printU8ImmOperand";
+class ImmOperand<ValueType type, string name = NAME, bit optional = 0,
+ string printer = "print"#name>
+ : CustomOperand<type, optional, name> {
+ let ImmTy = "ImmTyNone";
+ let ParserMethod = "";
+ let PrintMethod = printer;
}
-} // End OperandType = "OPERAND_IMMEDIATE"
+def s16imm : ImmOperand<i16, "S16Imm", 0, "printU16ImmOperand">;
+def u16imm : ImmOperand<i16, "U16Imm", 0, "printU16ImmOperand">;
//===--------------------------------------------------------------------===//
// Custom Operands
@@ -210,6 +221,12 @@ class is_canonicalized<SDPatternOperator op> : PatFrag<
}];
}
+class FoldTernaryOpPat<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
+ (ops node:$src0, node:$src1, node:$src2),
+ (op2 (op1 node:$src0, node:$src1), node:$src2)
+>;
+
+def imad : FoldTernaryOpPat<mul, add>;
let Properties = [SDNPCommutative, SDNPAssociative] in {
def smax_oneuse : HasOneUseBinOp<smax>;
@@ -638,6 +655,8 @@ defm atomic_load_umax : binary_atomic_op_all_as<atomic_load_umax>;
defm atomic_load_umin : binary_atomic_op_all_as<atomic_load_umin>;
defm atomic_load_xor : binary_atomic_op_all_as<atomic_load_xor>;
defm atomic_load_fadd : binary_atomic_op_all_as<atomic_load_fadd, 0>;
+defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
+defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
let MemoryVT = v2f16 in
defm atomic_load_fadd_v2f16 : binary_atomic_op_all_as<atomic_load_fadd, 0>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index 9e86bd0c2b97..fb7148ba10ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -14,7 +14,7 @@
#include "AMDGPU.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -46,7 +46,7 @@ class AMDGPULateCodeGenPrepare
const DataLayout *DL = nullptr;
AssumptionCache *AC = nullptr;
- LegacyDivergenceAnalysis *DA = nullptr;
+ UniformityInfo *UA = nullptr;
public:
static char ID;
@@ -59,7 +59,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addRequired<UniformityInfoWrapperPass>();
AU.setPreservesAll();
}
@@ -91,7 +91,7 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
return false;
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- DA = &getAnalysis<LegacyDivergenceAnalysis>();
+ UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
bool Changed = false;
for (auto &BB : F)
@@ -122,7 +122,7 @@ bool AMDGPULateCodeGenPrepare::canWidenScalarExtLoad(LoadInst &LI) const {
if (LI.getAlign() < DL->getABITypeAlign(Ty))
return false;
// It should be uniform, i.e. a scalar load.
- return DA->isUniform(&LI);
+ return UA->isUniform(&LI);
}
bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
@@ -156,18 +156,14 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
IRBuilder<> IRB(&LI);
IRB.SetCurrentDebugLocation(LI.getDebugLoc());
- unsigned AS = LI.getPointerAddressSpace();
- unsigned LdBits = DL->getTypeStoreSize(LI.getType()) * 8;
+ unsigned LdBits = DL->getTypeStoreSizeInBits(LI.getType());
auto IntNTy = Type::getIntNTy(LI.getContext(), LdBits);
- PointerType *Int32PtrTy = Type::getInt32PtrTy(LI.getContext(), AS);
- PointerType *Int8PtrTy = Type::getInt8PtrTy(LI.getContext(), AS);
- auto *NewPtr = IRB.CreateBitCast(
- IRB.CreateConstGEP1_64(
- IRB.getInt8Ty(),
- IRB.CreatePointerBitCastOrAddrSpaceCast(Base, Int8PtrTy),
- Offset - Adjust),
- Int32PtrTy);
+ auto *NewPtr = IRB.CreateConstGEP1_64(
+ IRB.getInt8Ty(),
+ IRB.CreateAddrSpaceCast(Base, LI.getPointerOperand()->getType()),
+ Offset - Adjust);
+
LoadInst *NewLd = IRB.CreateAlignedLoad(IRB.getInt32Ty(), NewPtr, Align(4));
NewLd->copyMetadata(LI);
NewLd->setMetadata(LLVMContext::MD_range, nullptr);
@@ -184,7 +180,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR late optimizations", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR late optimizations", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 41cb0a99b420..120c00b14a36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -24,6 +24,7 @@
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
@@ -131,6 +132,42 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
};
}
+// Increase the number of vector elements to reach the next legal RegClass.
+static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ const unsigned NumElts = Ty.getNumElements();
+ const unsigned EltSize = Ty.getElementType().getSizeInBits();
+ const unsigned MaxNumElts = MaxRegisterSize / EltSize;
+
+ assert(EltSize == 32 || EltSize == 64);
+ assert(Ty.getSizeInBits() < MaxRegisterSize);
+
+ unsigned NewNumElts;
+ // Find the nearest legal RegClass that is larger than the current type.
+ for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
+ if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
+ break;
+ }
+
+ return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltSize));
+ };
+}
+
+static LLT getBufferRsrcScalarType(const LLT Ty) {
+ if (!Ty.isVector())
+ return LLT::scalar(128);
+ const ElementCount NumElems = Ty.getElementCount();
+ return LLT::vector(NumElems, LLT::scalar(128));
+}
+
+static LLT getBufferRsrcRegisterType(const LLT Ty) {
+ if (!Ty.isVector())
+ return LLT::fixed_vector(4, LLT::scalar(32));
+ const unsigned NumElems = Ty.getElementCount().getFixedValue();
+ return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
+}
+
static LLT getBitcastRegisterType(const LLT Ty) {
const unsigned Size = Ty.getSizeInBits();
@@ -215,6 +252,15 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) {
};
}
+// RegisterType that doesn't have a corresponding RegClass.
+static LegalityPredicate isIllegalRegisterType(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ LLT Ty = Query.Types[TypeIdx];
+ return isRegisterType(Ty) &&
+ !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
+ };
+}
+
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
@@ -239,7 +285,7 @@ static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
- bool IsLoad) {
+ bool IsLoad, bool IsAtomic) {
switch (AS) {
case AMDGPUAS::PRIVATE_ADDRESS:
// FIXME: Private element size.
@@ -249,6 +295,7 @@ static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
case AMDGPUAS::GLOBAL_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS:
case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ case AMDGPUAS::BUFFER_RESOURCE:
// Treat constant and global as identical. SMRD loads are sometimes usable for
// global loads (ideally constant address space should be eliminated)
// depending on the context. Legality cannot be context dependent, but
@@ -257,9 +304,10 @@ static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
// kernel.
return IsLoad ? 512 : 128;
default:
- // Flat addresses may contextually need to be split to 32-bit parts if they
- // may alias scratch depending on the subtarget.
- return 128;
+ // FIXME: Flat addresses may contextually need to be split to 32-bit parts
+ // if they may alias scratch depending on the subtarget. This needs to be
+ // moved to custom handling to use addressMayBeAccessedAsPrivate
+ return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
}
}
@@ -295,7 +343,9 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
if (MemSize != RegSize && RegSize != 32)
return false;
- if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
+ if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
+ Query.MMODescrs[0].Ordering !=
+ AtomicOrdering::NotAtomic))
return false;
switch (MemSize) {
@@ -329,6 +379,21 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
return true;
}
+// The newer buffer intrinsic forms take their resource arguments as
+// pointers in address space 8, aka s128 values. However, in order to not break
+// SelectionDAG, the underlying operations have to continue to take v4i32
+// arguments. Therefore, we convert resource pointers - or vectors of them
+// to integer values here.
+static bool hasBufferRsrcWorkaround(const LLT Ty) {
+ if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
+ return true;
+ if (Ty.isVector()) {
+ const LLT ElemTy = Ty.getElementType();
+ return hasBufferRsrcWorkaround(ElemTy);
+ }
+ return false;
+}
+
// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
// workaround this. Eventually it should ignore the type for loads and only care
// about the size. Return true in cases where we will workaround this for now by
@@ -340,6 +405,9 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
const unsigned Size = Ty.getSizeInBits();
if (Size <= 64)
return false;
+ // Address space 8 pointers get their own workaround.
+ if (hasBufferRsrcWorkaround(Ty))
+ return false;
if (!Ty.isVector())
return true;
@@ -354,7 +422,7 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
const LLT Ty = Query.Types[0];
return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query) &&
- !loadStoreBitcastWorkaround(Ty);
+ !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
}
/// Return true if a load or store of the type should be lowered with a bitcast
@@ -392,7 +460,7 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
return false;
- if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
+ if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
return false;
// A load is known dereferenceable up to the alignment, so it's legal to widen
@@ -422,6 +490,80 @@ static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
Query.Types[1].getAddressSpace(), Opcode);
}
+/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
+/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
+/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
+static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
+ MachineRegisterInfo &MRI, unsigned Idx) {
+ MachineOperand &MO = MI.getOperand(Idx);
+
+ const LLT PointerTy = MRI.getType(MO.getReg());
+
+ // Paranoidly prevent us from doing this multiple times.
+ if (!hasBufferRsrcWorkaround(PointerTy))
+ return PointerTy;
+
+ const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
+ const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
+ if (!PointerTy.isVector()) {
+ // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
+ const unsigned NumParts = PointerTy.getSizeInBits() / 32;
+ const LLT S32 = LLT::scalar(32);
+
+ Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
+ std::array<Register, 4> VectorElems;
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+ for (unsigned I = 0; I < NumParts; ++I)
+ VectorElems[I] =
+ B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
+ B.buildMergeValues(MO, VectorElems);
+ MO.setReg(VectorReg);
+ return VectorTy;
+ }
+ Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+ auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
+ B.buildIntToPtr(MO, Scalar);
+ MO.setReg(BitcastReg);
+
+ return VectorTy;
+}
+
+/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
+/// the form in which the value must be in order to be passed to the low-level
+/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
+/// needed in order to account for the fact that we can't define a register
+/// class for s128 without breaking SelectionDAG.
+static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ const LLT PointerTy = MRI.getType(Pointer);
+ const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
+ const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
+
+ if (!PointerTy.isVector()) {
+ // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
+ SmallVector<Register, 4> PointerParts;
+ const unsigned NumParts = PointerTy.getSizeInBits() / 32;
+ auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
+ for (unsigned I = 0; I < NumParts; ++I)
+ PointerParts.push_back(Unmerged.getReg(I));
+ return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
+ }
+ Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
+ return B.buildBitcast(VectorTy, Scalar).getReg(0);
+}
+
+static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
+ unsigned Idx) {
+ MachineOperand &MO = MI.getOperand(Idx);
+
+ const LLT PointerTy = B.getMRI()->getType(MO.getReg());
+ // Paranoidly prevent us from doing this multiple times.
+ if (!hasBufferRsrcWorkaround(PointerTy))
+ return;
+ MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
+}
+
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const GCNTargetMachine &TM)
: ST(ST_) {
@@ -484,6 +626,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
+ const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
+ const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
const LLT CodePtr = FlatPtr;
@@ -495,6 +639,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
};
+ const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
+
const std::initializer_list<LLT> FPTypesBase = {
S32, S64
};
@@ -515,17 +661,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
// elements for v3s16
getActionDefinitionsBuilder(G_PHI)
- .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
- .legalFor(AllS32Vectors)
- .legalFor(AllS64Vectors)
- .legalFor(AddrSpaces64)
- .legalFor(AddrSpaces32)
- .legalIf(isPointer(0))
- .clampScalar(0, S16, S256)
- .widenScalarToNextPow2(0, 32)
- .clampMaxNumElements(0, S32, 16)
- .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .scalarize(0);
+ .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
+ .legalFor(AllS32Vectors)
+ .legalFor(AllS64Vectors)
+ .legalFor(AddrSpaces64)
+ .legalFor(AddrSpaces32)
+ .legalFor(AddrSpaces128)
+ .legalIf(isPointer(0))
+ .clampScalar(0, S16, S256)
+ .widenScalarToNextPow2(0, 32)
+ .clampMaxNumElements(0, S32, 16)
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .scalarize(0);
if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
// Full set of gfx9 features.
@@ -760,13 +907,31 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, S16, S64);
if (ST.has16BitInsts()) {
- getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
+ getActionDefinitionsBuilder(G_FSQRT)
+ .legalFor({S32, S16})
+ .customFor({S64})
+ .scalarize(0)
+ .clampScalar(0, S16, S64);
+ getActionDefinitionsBuilder(G_FFLOOR)
.legalFor({S32, S64, S16})
.scalarize(0)
.clampScalar(0, S16, S64);
+
+ getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
+ .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
+ .scalarize(0)
+ .maxScalarIf(typeIs(0, S16), 1, S16)
+ .clampScalar(1, S32, S32)
+ .lower();
+
+ getActionDefinitionsBuilder(G_FFREXP)
+ .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
+ .scalarize(0)
+ .lower();
} else {
getActionDefinitionsBuilder(G_FSQRT)
- .legalFor({S32, S64})
+ .legalFor({S32})
+ .customFor({S64})
.scalarize(0)
.clampScalar(0, S32, S64);
@@ -782,6 +947,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.clampScalar(0, S32, S64);
}
+
+ getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
+ .legalFor({{S32, S32}, {S64, S32}})
+ .scalarize(0)
+ .clampScalar(0, S32, S64)
+ .clampScalar(1, S32, S32)
+ .lower();
+
+ getActionDefinitionsBuilder(G_FFREXP)
+ .customFor({{S32, S32}, {S64, S32}})
+ .scalarize(0)
+ .minScalar(0, S32)
+ .clampScalar(1, S32, S32)
+ .lower();
}
getActionDefinitionsBuilder(G_FPTRUNC)
@@ -906,9 +1085,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
getActionDefinitionsBuilder(G_PTR_ADD)
- .legalIf(all(isPointer(0), sameSize(0, 1)))
- .scalarize(0)
- .scalarSameSizeAs(1, 0);
+ .unsupportedFor({BufferFatPtr, RsrcPtr})
+ .legalIf(all(isPointer(0), sameSize(0, 1)))
+ .scalarize(0)
+ .scalarSameSizeAs(1, 0);
getActionDefinitionsBuilder(G_PTRMASK)
.legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
@@ -948,15 +1128,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
// FIXME: fpow has a selection pattern that should move to custom lowering.
- auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
- if (ST.has16BitInsts())
- Exp2Ops.legalFor({S32, S16});
- else
- Exp2Ops.legalFor({S32});
- Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
- Exp2Ops.scalarize(0);
-
- auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
+ auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
if (ST.has16BitInsts())
ExpOps.customFor({{S32}, {S16}});
else
@@ -968,6 +1140,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(0, MinScalarFPTy, S32)
.lower();
+ auto &Log2Ops = getActionDefinitionsBuilder({G_FLOG2, G_FEXP2});
+ Log2Ops.customFor({S32});
+ if (ST.has16BitInsts())
+ Log2Ops.legalFor({S16});
+ else
+ Log2Ops.customFor({S16});
+ Log2Ops.scalarize(0)
+ .lower();
+
+ auto &LogOps = getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP});
+ LogOps.customFor({S32, S16});
+ LogOps.clampScalar(0, MinScalarFPTy, S32)
+ .scalarize(0);
+
// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder(G_CTPOP)
.legalFor({{S32, S32}, {S32, S64}})
@@ -1115,7 +1301,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT PtrTy = Query.Types[1];
unsigned AS = PtrTy.getAddressSpace();
- if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
+ if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
+ Query.MMODescrs[0].Ordering !=
+ AtomicOrdering::NotAtomic))
return true;
// Catch weird sized loads that don't evenly divide into the access sizes
@@ -1178,6 +1366,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return isLoadStoreLegal(ST, Query);
});
+ // The custom pointers (fat pointers, buffer resources) don't work with load
+ // and store at this level. Fat pointers should have been lowered to
+ // intrinsics before the translation to MIR.
+ Actions.unsupportedIf(typeInSet(1, {BufferFatPtr, RsrcPtr}));
+
+ // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
+ // ptrtoint. This is needed to account for the fact that we can't have i128
+ // as a register class for SelectionDAG reasons.
+ Actions.customIf([=](const LegalityQuery &Query) -> bool {
+ return hasBufferRsrcWorkaround(Query.Types[0]);
+ });
+
// Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
// 64-bits.
//
@@ -1223,9 +1423,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (DstSize > MemSize)
return std::pair(0, LLT::scalar(MemSize));
- unsigned MaxSize = maxSizeForAddrSpace(ST,
- PtrTy.getAddressSpace(),
- Op == G_LOAD);
+ unsigned MaxSize = maxSizeForAddrSpace(
+ ST, PtrTy.getAddressSpace(), Op == G_LOAD,
+ Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
if (MemSize > MaxSize)
return std::pair(0, LLT::scalar(MaxSize));
@@ -1242,9 +1442,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT PtrTy = Query.Types[1];
LLT EltTy = DstTy.getElementType();
- unsigned MaxSize = maxSizeForAddrSpace(ST,
- PtrTy.getAddressSpace(),
- Op == G_LOAD);
+ unsigned MaxSize = maxSizeForAddrSpace(
+ ST, PtrTy.getAddressSpace(), Op == G_LOAD,
+ Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
// FIXME: Handle widened to power of 2 results better. This ends
// up scalarizing.
@@ -1284,7 +1484,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// We're probably decomposing an odd sized store. Try to split
// to the widest type. TODO: Account for alignment. As-is it
// should be OK, since the new parts will be further legalized.
- unsigned FloorSize = PowerOf2Floor(DstSize);
+ unsigned FloorSize = llvm::bit_floor(DstSize);
return std::pair(
0, LLT::scalarOrVector(
ElementCount::getFixed(FloorSize / EltSize), EltTy));
@@ -1335,7 +1535,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
{G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
- G_ATOMICRMW_UMIN})
+ G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
.legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
{S64, GlobalPtr}, {S64, LocalPtr},
{S32, RegionPtr}, {S64, RegionPtr}});
@@ -1348,7 +1548,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
if (ST.hasGFX90AInsts())
Atomic.legalFor({{S64, LocalPtr}});
- if (ST.hasGFX940Insts())
+ if (ST.hasAtomicDsPkAdd16Insts())
Atomic.legalFor({{V2S16, LocalPtr}});
}
if (ST.hasAtomicFaddInsts())
@@ -1450,10 +1650,21 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT VecTy = Query.Types[VecTypeIdx];
const LLT IdxTy = Query.Types[IdxTypeIdx];
const unsigned EltSize = EltTy.getSizeInBits();
+ const bool isLegalVecType =
+ !!SIRegisterInfo::getSGPRClassForBitWidth(VecTy.getSizeInBits());
+ // Address space 8 pointers are 128-bit wide values, but the logic
+ // below will try to bitcast them to 2N x s64, which will fail.
+ // Therefore, as an intermediate step, wrap extracts/insertions from a
+ // ptrtoint-ing the vector and scalar arguments (or inttoptring the
+ // extraction result) in order to produce a vector operation that can
+ // be handled by the logic below.
+ if (EltTy.isPointer() && EltSize > 64)
+ return true;
return (EltSize == 32 || EltSize == 64) &&
VecTy.getSizeInBits() % 32 == 0 &&
VecTy.getSizeInBits() <= MaxRegisterSize &&
- IdxTy.getSizeInBits() == 32;
+ IdxTy.getSizeInBits() == 32 &&
+ isLegalVecType;
})
.bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
bitcastToVectorElement32(VecTypeIdx))
@@ -1479,6 +1690,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(IdxTypeIdx, S32, S32)
.clampMaxNumElements(VecTypeIdx, S32, 32)
// TODO: Clamp elements for 64-bit vectors?
+ .moreElementsIf(
+ isIllegalRegisterType(VecTypeIdx),
+ moreElementsToNextExistingRegClass(VecTypeIdx))
// It should only be necessary with variable indexes.
// As a last resort, lower to the stack
.lower();
@@ -1533,7 +1747,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalForCartesianProduct(AllS64Vectors, {S64})
.clampNumElements(0, V16S32, V32S32)
.clampNumElements(0, V2S64, V16S64)
- .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
+ .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
+ .moreElementsIf(
+ isIllegalRegisterType(0),
+ moreElementsToNextExistingRegClass(0));
if (ST.hasScalarPackInsts()) {
BuildVector
@@ -1575,7 +1792,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT &EltTy = Ty.getElementType();
if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
return true;
- if (!isPowerOf2_32(EltTy.getSizeInBits()))
+ if (!llvm::has_single_bit<uint32_t>(EltTy.getSizeInBits()))
return true;
}
return false;
@@ -1623,8 +1840,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Builder.widenScalarIf(
[=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[BigTyIdx];
- return !isPowerOf2_32(Ty.getSizeInBits()) &&
- Ty.getSizeInBits() % 16 != 0;
+ return Ty.getSizeInBits() % 16 != 0;
},
[=](const LegalityQuery &Query) {
// Pick the next power of 2, or a multiple of 64 over 128.
@@ -1778,10 +1994,16 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
case TargetOpcode::G_SEXTLOAD:
case TargetOpcode::G_ZEXTLOAD:
return legalizeLoad(Helper, MI);
+ case TargetOpcode::G_STORE:
+ return legalizeStore(Helper, MI);
case TargetOpcode::G_FMAD:
return legalizeFMad(MI, MRI, B);
case TargetOpcode::G_FDIV:
return legalizeFDIV(MI, MRI, B);
+ case TargetOpcode::G_FFREXP:
+ return legalizeFFREXP(MI, MRI, B);
+ case TargetOpcode::G_FSQRT:
+ return legalizeFSQRT(MI, MRI, B);
case TargetOpcode::G_UDIV:
case TargetOpcode::G_UREM:
case TargetOpcode::G_UDIVREM:
@@ -1792,10 +2014,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeSignedDIV_REM(MI, MRI, B);
case TargetOpcode::G_ATOMIC_CMPXCHG:
return legalizeAtomicCmpXChg(MI, MRI, B);
+ case TargetOpcode::G_FLOG2:
+ return legalizeFlog2(MI, B);
case TargetOpcode::G_FLOG:
- return legalizeFlog(MI, B, numbers::ln2f);
case TargetOpcode::G_FLOG10:
- return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
+ return legalizeFlogCommon(MI, B);
+ case TargetOpcode::G_FEXP2:
+ return legalizeFExp2(MI, B);
case TargetOpcode::G_FEXP:
return legalizeFExp(MI, B);
case TargetOpcode::G_FPOW:
@@ -1856,7 +2081,8 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
// For code object version 5, private_base and shared_base are passed through
// implicit kernargs.
- if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+ if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
+ AMDGPU::AMDHSA_COV5) {
AMDGPUTargetLowering::ImplicitParameter Param =
AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
: AMDGPUTargetLowering::PRIVATE_BASE;
@@ -2192,9 +2418,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
: B.buildUITOFP(S64, Unmerge.getReg(1));
auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
- auto LdExp = B.buildIntrinsic(Intrinsic::amdgcn_ldexp, {S64}, false)
- .addUse(CvtHi.getReg(0))
- .addUse(ThirtyTwo.getReg(0));
+ auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
// TODO: Should this propagate fast-math-flags?
B.buildFAdd(Dst, LdExp, CvtLo);
@@ -2225,10 +2449,7 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
- B.buildIntrinsic(Intrinsic::amdgcn_ldexp, ArrayRef<Register>{Dst},
- /*HasSideEffects=*/false)
- .addUse(FVal.getReg(0))
- .addUse(Scale.getReg(0));
+ B.buildFLdexp(Dst, FVal, Scale);
MI.eraseFromParent();
return true;
}
@@ -2273,13 +2494,15 @@ bool AMDGPULegalizerInfo::legalizeFPTOI(MachineInstr &MI,
}
MachineInstrBuilder K0, K1;
if (SrcLT == S64) {
- K0 = B.buildFConstant(S64,
- BitsToDouble(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
- K1 = B.buildFConstant(S64,
- BitsToDouble(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
+ K0 = B.buildFConstant(
+ S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
+ K1 = B.buildFConstant(
+ S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
} else {
- K0 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*2^-32*/ 0x2f800000)));
- K1 = B.buildFConstant(S32, BitsToFloat(UINT32_C(/*-2^32*/ 0xcf800000)));
+ K0 = B.buildFConstant(
+ S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
+ K1 = B.buildFConstant(
+ S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
}
auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
@@ -2329,6 +2552,30 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
// TODO: Promote dynamic indexing of s16 to s32
+ Register Dst = MI.getOperand(0).getReg();
+ Register Vec = MI.getOperand(1).getReg();
+
+ LLT VecTy = MRI.getType(Vec);
+ LLT EltTy = VecTy.getElementType();
+ assert(EltTy == MRI.getType(Dst));
+
+ // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
+ // but we can't go directly to that logic becasue you can't bitcast a vector
+ // of pointers to a vector of integers. Therefore, introduce an intermediate
+ // vector of integers using ptrtoint (and inttoptr on the output) in order to
+ // drive the legalization forward.
+ if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
+ LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
+ LLT IntVecTy = VecTy.changeElementType(IntTy);
+
+ auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
+ auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
+ B.buildIntToPtr(Dst, IntElt);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
// FIXME: Artifact combiner probably should have replaced the truncated
// constant before this, so we shouldn't need
// getIConstantVRegValWithLookThrough.
@@ -2338,13 +2585,6 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
return true;
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
- Register Dst = MI.getOperand(0).getReg();
- Register Vec = MI.getOperand(1).getReg();
-
- LLT VecTy = MRI.getType(Vec);
- LLT EltTy = VecTy.getElementType();
- assert(EltTy == MRI.getType(Dst));
-
if (IdxVal < VecTy.getNumElements()) {
auto Unmerge = B.buildUnmerge(EltTy, Vec);
B.buildCopy(Dst, Unmerge.getReg(IdxVal));
@@ -2363,6 +2603,33 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
// TODO: Promote dynamic indexing of s16 to s32
+ Register Dst = MI.getOperand(0).getReg();
+ Register Vec = MI.getOperand(1).getReg();
+ Register Ins = MI.getOperand(2).getReg();
+
+ LLT VecTy = MRI.getType(Vec);
+ LLT EltTy = VecTy.getElementType();
+ assert(EltTy == MRI.getType(Ins));
+
+ // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
+ // but we can't go directly to that logic becasue you can't bitcast a vector
+ // of pointers to a vector of integers. Therefore, make the pointer vector
+ // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
+ // new value, and then inttoptr the result vector back. This will then allow
+ // the rest of legalization to take over.
+ if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
+ LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
+ LLT IntVecTy = VecTy.changeElementType(IntTy);
+
+ auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
+ auto IntIns = B.buildPtrToInt(IntTy, Ins);
+ auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
+ MI.getOperand(3));
+ B.buildIntToPtr(Dst, IntVecDest);
+ MI.eraseFromParent();
+ return true;
+ }
+
// FIXME: Artifact combiner probably should have replaced the truncated
// constant before this, so we shouldn't need
// getIConstantVRegValWithLookThrough.
@@ -2372,14 +2639,6 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
return true;
const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
- Register Dst = MI.getOperand(0).getReg();
- Register Vec = MI.getOperand(1).getReg();
- Register Ins = MI.getOperand(2).getReg();
-
- LLT VecTy = MRI.getType(Vec);
- LLT EltTy = VecTy.getElementType();
- assert(EltTy == MRI.getType(Ins));
- (void)Ins;
unsigned NumElts = VecTy.getNumElements();
if (IdxVal < NumElts) {
@@ -2479,7 +2738,8 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
else
MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
- B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
+ if (!B.getMRI()->getRegClassOrNull(PCReg))
+ B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
if (PtrTy.getSizeInBits() == 32)
B.buildExtract(DstReg, PCReg, 0);
@@ -2535,7 +2795,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
// allocated ones. They all share the same offset.
if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
// Adjust alignment for that dynamic shared memory array.
- MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
+ MFI->setDynLDSAlign(MF.getFunction(), *cast<GlobalVariable>(GV));
LLT S32 = LLT::scalar(32);
auto Sz =
B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
@@ -2620,6 +2880,13 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
Register ValReg = MI.getOperand(0).getReg();
LLT ValTy = MRI.getType(ValReg);
+ if (hasBufferRsrcWorkaround(ValTy)) {
+ Observer.changingInstr(MI);
+ castBufferRsrcFromV4I32(MI, B, MRI, 0);
+ Observer.changedInstr(MI);
+ return true;
+ }
+
MachineMemOperand *MMO = *MI.memoperands_begin();
const unsigned ValSize = ValTy.getSizeInBits();
const LLT MemTy = MMO->getMemoryType();
@@ -2677,6 +2944,24 @@ bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
return false;
}
+bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+ GISelChangeObserver &Observer = Helper.Observer;
+
+ Register DataReg = MI.getOperand(0).getReg();
+ LLT DataTy = MRI.getType(DataReg);
+
+ if (hasBufferRsrcWorkaround(DataTy)) {
+ Observer.changingInstr(MI);
+ castBufferRsrcArgToV4I32(MI, B, 0);
+ Observer.changedInstr(MI);
+ return true;
+ }
+ return false;
+}
+
bool AMDGPULegalizerInfo::legalizeFMad(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -2688,9 +2973,11 @@ bool AMDGPULegalizerInfo::legalizeFMad(
// TODO: Always legal with future ftz flag.
// FIXME: Do we need just output?
- if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
+ if (Ty == LLT::scalar(32) &&
+ MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
return true;
- if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
+ if (Ty == LLT::scalar(16) &&
+ MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
return true;
MachineIRBuilder HelperBuilder(MI);
@@ -2724,31 +3011,449 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
return true;
}
-bool AMDGPULegalizerInfo::legalizeFlog(
- MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
+/// Return true if it's known that \p Src can never be an f32 denormal value.
+static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
+ Register Src) {
+ Register ExtSrc;
+ if (mi_match(Src, MRI, m_GFPExt(m_Reg(ExtSrc))))
+ return MRI.getType(ExtSrc) == LLT::scalar(16);
+ return false;
+}
+
+static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
+ if (Flags & MachineInstr::FmAfn)
+ return true;
+ const auto &Options = MF.getTarget().Options;
+ return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
+}
+
+static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
+ unsigned Flags) {
+ return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
+ MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
+ DenormalMode::PreserveSign;
+}
+
+std::pair<Register, Register>
+AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
+ unsigned Flags) const {
+ if (allowApproxFunc(B.getMF(), Flags) ||
+ !needsDenormHandlingF32(B.getMF(), Src, Flags))
+ return {};
+
+ const LLT F32 = LLT::scalar(32);
+ auto SmallestNormal = B.buildFConstant(
+ F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
+ auto IsLtSmallestNormal =
+ B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
+
+ auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
+ auto One = B.buildFConstant(F32, 1.0);
+ auto ScaleFactor =
+ B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
+ auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
+
+ return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
+}
+
+bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
+ // If we have to handle denormals, scale up the input and adjust the result.
+
+ // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
+ // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
+
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
LLT Ty = B.getMRI()->getType(Dst);
unsigned Flags = MI.getFlags();
- auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
- auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
+ if (Ty == LLT::scalar(16)) {
+ const LLT F32 = LLT::scalar(32);
+ // Nothing in half is a denormal when promoted to f32.
+ auto Ext = B.buildFPExt(F32, Src, Flags);
+ auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32}, false)
+ .addUse(Ext.getReg(0))
+ .setMIFlags(Flags);
+ B.buildFPTrunc(Dst, Log2, Flags);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ assert(Ty == LLT::scalar(32));
+
+ auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
+ if (!ScaledInput) {
+ B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)}, false)
+ .addUse(Src)
+ .setMIFlags(Flags);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
+ .addUse(ScaledInput)
+ .setMIFlags(Flags);
+
+ auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
+ auto Zero = B.buildFConstant(Ty, 0.0);
+ auto ResultOffset =
+ B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
+ B.buildFSub(Dst, Log2, ResultOffset, Flags);
- B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
MI.eraseFromParent();
return true;
}
-bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
- MachineIRBuilder &B) const {
+static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
+ Register Z, unsigned Flags) {
+ auto FMul = B.buildFMul(Ty, X, Y, Flags);
+ return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
+}
+
+bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
+ assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
+
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register Dst = MI.getOperand(0).getReg();
+ Register X = MI.getOperand(1).getReg();
+ unsigned Flags = MI.getFlags();
+ const LLT Ty = MRI.getType(X);
+ MachineFunction &MF = B.getMF();
+
+ const LLT F32 = LLT::scalar(32);
+ const LLT F16 = LLT::scalar(16);
+
+ const AMDGPUTargetMachine &TM =
+ static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
+
+ if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn) ||
+ TM.Options.ApproxFuncFPMath || TM.Options.UnsafeFPMath) {
+ const double Log2BaseInv =
+ IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
+
+ if (Ty == F16 && !ST.has16BitInsts()) {
+ Register LogVal = MRI.createGenericVirtualRegister(F32);
+ auto PromoteSrc = B.buildFPExt(F32, X);
+ legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), Log2BaseInv, Flags);
+ B.buildFPTrunc(Dst, LogVal);
+ } else {
+ legalizeFlogUnsafe(B, Dst, X, Log2BaseInv, Flags);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
+ if (ScaledInput)
+ X = ScaledInput;
+
+ auto Y = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
+ .addUse(X)
+ .setMIFlags(Flags);
+
+ Register R;
+ if (ST.hasFastFMAF32()) {
+ // c+cc are ln(2)/ln(10) to more than 49 bits
+ const float c_log10 = 0x1.344134p-2f;
+ const float cc_log10 = 0x1.09f79ep-26f;
+
+ // c + cc is ln(2) to more than 49 bits
+ const float c_log = 0x1.62e42ep-1f;
+ const float cc_log = 0x1.efa39ep-25f;
+
+ auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
+ auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
+
+ R = B.buildFMul(Ty, Y, C, Flags).getReg(0);
+ auto NegR = B.buildFNeg(Ty, R, Flags);
+ auto FMA0 = B.buildFMA(Ty, Y, C, NegR, Flags);
+ auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, Flags);
+ R = B.buildFAdd(Ty, R, FMA1, Flags).getReg(0);
+ } else {
+ // ch+ct is ln(2)/ln(10) to more than 36 bits
+ const float ch_log10 = 0x1.344000p-2f;
+ const float ct_log10 = 0x1.3509f6p-18f;
+
+ // ch + ct is ln(2) to more than 36 bits
+ const float ch_log = 0x1.62e000p-1f;
+ const float ct_log = 0x1.0bfbe8p-15f;
+
+ auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
+ auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
+
+ auto MaskConst = B.buildConstant(Ty, 0xfffff000);
+ auto YH = B.buildAnd(Ty, Y, MaskConst);
+ auto YT = B.buildFSub(Ty, Y, YH, Flags);
+ auto YTCT = B.buildFMul(Ty, YT, CT, Flags);
+
+ Register Mad0 =
+ getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), Flags);
+ Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, Flags);
+ R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, Flags);
+ }
+
+ const bool IsFiniteOnly =
+ (MI.getFlag(MachineInstr::FmNoNans) || TM.Options.NoNaNsFPMath) &&
+ (MI.getFlag(MachineInstr::FmNoInfs) || TM.Options.NoInfsFPMath);
+
+ if (!IsFiniteOnly) {
+ // Expand isfinite(x) => fabs(x) < inf
+ auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
+ auto Fabs = B.buildFAbs(Ty, Y);
+ auto IsFinite =
+ B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
+ R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
+ }
+
+ if (ScaledInput) {
+ auto Zero = B.buildFConstant(Ty, 0.0);
+ auto ShiftK =
+ B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
+ auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
+ B.buildFSub(Dst, R, Shift, Flags);
+ } else {
+ B.buildCopy(Dst, R);
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
+ Register Src,
+ double Log2BaseInverted,
+ unsigned Flags) const {
+ LLT Ty = B.getMRI()->getType(Dst);
+ auto Log2Operand = Ty == LLT::scalar(16)
+ ? B.buildFLog2(Ty, Src, Flags)
+ : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}, false)
+ .addUse(Src)
+ .setMIFlags(Flags);
+ auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
+ B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
+ // If we have to handle denormals, scale up the input and adjust the result.
+
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
LLT Ty = B.getMRI()->getType(Dst);
+ const LLT F16 = LLT::scalar(16);
+ const LLT F32 = LLT::scalar(32);
+
+ if (Ty == F16) {
+ // Nothing in half is a denormal when promoted to f32.
+ auto Ext = B.buildFPExt(F32, Src, Flags);
+ auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32}, false)
+ .addUse(Ext.getReg(0))
+ .setMIFlags(Flags);
+ B.buildFPTrunc(Dst, Log2, Flags);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ assert(Ty == F32);
+
+ if (allowApproxFunc(B.getMF(), Flags) ||
+ !needsDenormHandlingF32(B.getMF(), Src, Flags)) {
+ B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
+ .addUse(Src)
+ .setMIFlags(Flags);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // bool needs_scaling = x < -0x1.f80000p+6f;
+ // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
+
+ // -nextafter(128.0, -1)
+ auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
+ auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
+ RangeCheckConst, Flags);
+
+ auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
+ auto Zero = B.buildFConstant(Ty, 0.0);
+ auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
+ auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
+ auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
+ .addUse(AddInput.getReg(0))
+ .setMIFlags(Flags);
+
+ auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
+ auto One = B.buildFConstant(Ty, 1.0);
+ auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
+ B.buildFMul(Dst, Exp2, ResultScale, Flags);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
+ Register Src,
+ unsigned Flags) const {
+ LLT Ty = B.getMRI()->getType(Dst);
auto K = B.buildFConstant(Ty, numbers::log2e);
auto Mul = B.buildFMul(Ty, Src, K, Flags);
- B.buildFExp2(Dst, Mul, Flags);
+
+ if (Ty == LLT::scalar(32)) {
+ B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst}, false)
+ .addUse(Mul.getReg(0))
+ .setMIFlags(Flags);
+ } else {
+ B.buildFExp2(Dst, Mul.getReg(0), Flags);
+ }
+
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register X = MI.getOperand(1).getReg();
+ const unsigned Flags = MI.getFlags();
+ MachineFunction &MF = B.getMF();
+ MachineRegisterInfo &MRI = *B.getMRI();
+ LLT Ty = MRI.getType(Dst);
+ const LLT F16 = LLT::scalar(16);
+ const LLT F32 = LLT::scalar(32);
+ const bool IsExp10 = false; // TODO: For some reason exp10 is missing
+
+ if (Ty == F16) {
+ // v_exp_f16 (fmul x, log2e)
+ if (allowApproxFunc(MF, Flags)) {
+ // TODO: Does this really require fast?
+ legalizeFExpUnsafe(B, Dst, X, Flags);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // exp(f16 x) ->
+ // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
+
+ // Nothing in half is a denormal when promoted to f32.
+ auto Ext = B.buildFPExt(F32, X, Flags);
+ Register Lowered = MRI.createGenericVirtualRegister(F32);
+ legalizeFExpUnsafe(B, Lowered, Ext.getReg(0), Flags);
+ B.buildFPTrunc(Dst, Lowered, Flags);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ assert(Ty == F32);
+
+ // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
+ // library behavior. Also, is known-not-daz source sufficient?
+ if (allowApproxFunc(MF, Flags) && !needsDenormHandlingF32(MF, X, Flags)) {
+ legalizeFExpUnsafe(B, Dst, X, Flags);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // Algorithm:
+ //
+ // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
+ //
+ // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
+ // n = 64*m + j, 0 <= j < 64
+ //
+ // e^x = 2^((64*m + j + f)/64)
+ // = (2^m) * (2^(j/64)) * 2^(f/64)
+ // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
+ //
+ // f = x*(64/ln(2)) - n
+ // r = f*(ln(2)/64) = x - n*(ln(2)/64)
+ //
+ // e^x = (2^m) * (2^(j/64)) * e^r
+ //
+ // (2^(j/64)) is precomputed
+ //
+ // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
+ // e^r = 1 + q
+ //
+ // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
+ //
+ // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
+ const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
+ Register PH, PL;
+
+ if (ST.hasFastFMAF32()) {
+ const float c_exp = numbers::log2ef;
+ const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
+ const float c_exp10 = 0x1.a934f0p+1f;
+ const float cc_exp10 = 0x1.2f346ep-24f;
+
+ auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
+ PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
+ auto NegPH = B.buildFNeg(Ty, PH, Flags);
+ auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
+
+ auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
+ PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
+ } else {
+ const float ch_exp = 0x1.714000p+0f;
+ const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
+
+ const float ch_exp10 = 0x1.a92000p+1f;
+ const float cl_exp10 = 0x1.4f0978p-11f;
+
+ auto MaskConst = B.buildConstant(Ty, 0xfffff000);
+ auto XH = B.buildAnd(Ty, X, MaskConst);
+ auto XL = B.buildFSub(Ty, X, XH, Flags);
+
+ auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
+ PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
+
+ auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
+ auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
+
+ Register Mad0 =
+ getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
+ PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
+ }
+
+ auto E = B.buildFRint(Ty, PH, Flags);
+
+ // It is unsafe to contract this fsub into the PH multiply.
+ auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
+ auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
+ auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
+
+ auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty}, false)
+ .addUse(A.getReg(0))
+ .setMIFlags(Flags);
+ auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
+
+ auto UnderflowCheckConst =
+ B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
+ auto Zero = B.buildFConstant(Ty, 0.0);
+ auto Underflow =
+ B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
+
+ R = B.buildSelect(Ty, Underflow, Zero, R);
+
+ const auto &Options = MF.getTarget().Options;
+
+ if (!(Flags & MachineInstr::FmNoInfs) && !Options.NoInfsFPMath) {
+ auto OverflowCheckConst =
+ B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
+
+ auto Overflow =
+ B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
+ auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
+ R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
+ }
+
+ B.buildCopy(Dst, R);
MI.eraseFromParent();
return true;
}
@@ -2831,7 +3536,8 @@ bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
// shouldn't matter?
Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
- auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
+ auto Const =
+ B.buildFConstant(S64, llvm::bit_cast<double>(0x3fefffffffffffff));
Register Min = MRI.createGenericVirtualRegister(S64);
@@ -2890,15 +3596,18 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
// the outer loop going over parts of the result, the outer loop should go
// over parts of one of the factors. This should result in instruction
// selection that makes full use of S_ADDC_U32 instructions.
-void AMDGPULegalizerInfo::buildMultiply(
- LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
- ArrayRef<Register> Src0, ArrayRef<Register> Src1,
- bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
+void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
+ MutableArrayRef<Register> Accum,
+ ArrayRef<Register> Src0,
+ ArrayRef<Register> Src1,
+ bool UsePartialMad64_32,
+ bool SeparateOddAlignedProducts) const {
// Use (possibly empty) vectors of S1 registers to represent the set of
// carries from one pair of positions to the next.
using Carry = SmallVector<Register, 2>;
MachineIRBuilder &B = Helper.MIRBuilder;
+ GISelKnownBits &KB = *Helper.getKnownBits();
const LLT S1 = LLT::scalar(1);
const LLT S32 = LLT::scalar(32);
@@ -2918,6 +3627,12 @@ void AMDGPULegalizerInfo::buildMultiply(
return Zero64;
};
+ SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
+ for (unsigned i = 0; i < Src0.size(); ++i) {
+ Src0KnownZeros.push_back(KB.getKnownBits(Src0[i]).isZero());
+ Src1KnownZeros.push_back(KB.getKnownBits(Src1[i]).isZero());
+ }
+
// Merge the given carries into the 32-bit LocalAccum, which is modified
// in-place.
//
@@ -2980,9 +3695,14 @@ void AMDGPULegalizerInfo::buildMultiply(
if (LocalAccum.size() == 1 &&
(!UsePartialMad64_32 || !CarryIn.empty())) {
do {
+ // Skip multiplication if one of the operands is 0
unsigned j1 = DstIndex - j0;
+ if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
+ ++j0;
+ continue;
+ }
auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
- if (!LocalAccum[0]) {
+ if (!LocalAccum[0] || KB.getKnownBits(LocalAccum[0]).isZero()) {
LocalAccum[0] = Mul.getReg(0);
} else {
if (CarryIn.empty()) {
@@ -3022,12 +3742,17 @@ void AMDGPULegalizerInfo::buildMultiply(
do {
unsigned j1 = DstIndex - j0;
+ if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
+ ++j0;
+ continue;
+ }
auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
{Src0[j0], Src1[j1], Tmp});
Tmp = Mad.getReg(0);
if (!HaveSmallAccum)
CarryOut.push_back(Mad.getReg(1));
HaveSmallAccum = false;
+
++j0;
} while (j0 <= DstIndex);
@@ -3170,7 +3895,6 @@ bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
B.buildMergeLikeInstr(DstReg, AccumRegs);
MI.eraseFromParent();
return true;
-
}
// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
@@ -3259,7 +3983,7 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
// TODO: Should we try to emit this once in the entry block?
const LLT S32 = LLT::scalar(32);
const unsigned Mask = Arg->getMask();
- const unsigned Shift = countTrailingZeros<unsigned>(Mask);
+ const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
Register AndMaskSrc = LiveIn;
@@ -3432,7 +4156,7 @@ void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
// Initial estimate of inv(y).
auto FloatY = B.buildUITOFP(S32, Y);
auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
- auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
+ auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
auto Z = B.buildFPTOUI(S32, ScaledY);
@@ -3482,21 +4206,23 @@ static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
- auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
- B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
+ auto Mad = B.buildFMAD(
+ S32, CvtHi, // 2**32
+ B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
- auto Mul1 =
- B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
+ auto Mul1 = B.buildFMul(
+ S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
// 2**(-32)
- auto Mul2 =
- B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
+ auto Mul2 = B.buildFMul(
+ S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
// -(2**32)
- auto Mad2 = B.buildFMAD(S32, Trunc,
- B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
+ auto Mad2 = B.buildFMAD(
+ S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
+ Mul1);
auto ResultLo = B.buildFPTOUI(S32, Mad2);
auto ResultHi = B.buildFPTOUI(S32, Trunc);
@@ -3734,13 +4460,20 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
LLT ResTy = MRI.getType(Res);
const MachineFunction &MF = B.getMF();
- bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
- MI.getFlag(MachineInstr::FmAfn);
-
- if (!AllowInaccurateRcp)
- return false;
+ bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn) ||
+ MF.getTarget().Options.UnsafeFPMath;
if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
+ if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
+ return false;
+
+ // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+ // the CI documentation has a worst case error of 1 ulp.
+ // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+ // use it as long as we aren't trying to use denormals.
+ //
+ // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
+
// 1 / x -> RCP(x)
if (CLHS->isExactlyValue(1.0)) {
B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
@@ -3751,6 +4484,8 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
return true;
}
+ // TODO: Match rsq
+
// -1 / x -> RCP( FNEG(x) )
if (CLHS->isExactlyValue(-1.0)) {
auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
@@ -3763,6 +4498,12 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
}
}
+ // For f16 require arcp only.
+ // For f32 require afn+arcp.
+ if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
+ !MI.getFlag(MachineInstr::FmArcp)))
+ return false;
+
// x / y -> x * (1.0 / y)
auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
.addUse(RHS)
@@ -3847,10 +4588,9 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
-static void toggleSPDenormMode(bool Enable,
- MachineIRBuilder &B,
+static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
const GCNSubtarget &ST,
- AMDGPU::SIModeRegisterDefaults Mode) {
+ SIModeRegisterDefaults Mode) {
// Set SP denorm mode to this value.
unsigned SPDenormMode =
Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
@@ -3885,7 +4625,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
- AMDGPU::SIModeRegisterDefaults Mode = MFI->getMode();
+ SIModeRegisterDefaults Mode = MFI->getMode();
uint16_t Flags = MI.getFlags();
@@ -3914,7 +4654,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
// FIXME: Doesn't correctly model the FP mode switch, and the FP operations
// aren't modeled as reading it.
- if (!Mode.allFP32Denormals())
+ if (Mode.FP32Denormals != DenormalMode::getIEEE())
toggleSPDenormMode(true, B, ST, Mode);
auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
@@ -3924,7 +4664,9 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
- if (!Mode.allFP32Denormals())
+ // FIXME: This mishandles dynamic denormal mode. We need to query the
+ // current mode and restore the original.
+ if (Mode.FP32Denormals != DenormalMode::getIEEE())
toggleSPDenormMode(false, B, ST, Mode);
auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
@@ -4025,6 +4767,41 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeFFREXP(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register Res0 = MI.getOperand(0).getReg();
+ Register Res1 = MI.getOperand(1).getReg();
+ Register Val = MI.getOperand(2).getReg();
+ uint16_t Flags = MI.getFlags();
+
+ LLT Ty = MRI.getType(Res0);
+ LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
+
+ auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty}, false)
+ .addUse(Val)
+ .setMIFlags(Flags);
+ auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy}, false)
+ .addUse(Val)
+ .setMIFlags(Flags);
+
+ if (ST.hasFractBug()) {
+ auto Fabs = B.buildFAbs(Ty, Val);
+ auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
+ auto IsFinite =
+ B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
+ auto Zero = B.buildConstant(InstrExpTy, 0);
+ Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
+ Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
+ }
+
+ B.buildCopy(Res0, Mant);
+ B.buildSExtOrTrunc(Res1, Exp);
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -4039,9 +4816,9 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
auto Abs = B.buildFAbs(S32, RHS, Flags);
const APFloat C0Val(1.0f);
- auto C0 = B.buildConstant(S32, 0x6f800000);
- auto C1 = B.buildConstant(S32, 0x2f800000);
- auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
+ auto C0 = B.buildFConstant(S32, 0x1p+96f);
+ auto C1 = B.buildFConstant(S32, 0x1p-32f);
+ auto C2 = B.buildFConstant(S32, 1.0f);
auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
@@ -4060,6 +4837,90 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeFSQRT(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // For double type, the SQRT and RSQ instructions don't have required
+ // precision, we apply Goldschmidt's algorithm to improve the result:
+ //
+ // y0 = rsq(x)
+ // g0 = x * y0
+ // h0 = 0.5 * y0
+ //
+ // r0 = 0.5 - h0 * g0
+ // g1 = g0 * r0 + g0
+ // h1 = h0 * r0 + h0
+ //
+ // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
+ // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
+ // h2 = h1 * r1 + h1
+ //
+ // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
+ // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
+ //
+ // sqrt(x) = g3
+
+ const LLT S1 = LLT::scalar(1);
+ const LLT S32 = LLT::scalar(32);
+ const LLT F64 = LLT::scalar(64);
+
+ Register Dst = MI.getOperand(0).getReg();
+ assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
+
+ Register X = MI.getOperand(1).getReg();
+ unsigned Flags = MI.getFlags();
+
+ auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
+
+ auto ZeroInt = B.buildConstant(S32, 0);
+ auto Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant);
+
+ // Scale up input if it is too small.
+ auto ScaleUpFactor = B.buildConstant(S32, 256);
+ auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
+ auto SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags);
+
+ auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}, false)
+ .addReg(SqrtX.getReg(0));
+
+ auto Half = B.buildFConstant(F64, 0.5);
+ auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
+ auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
+
+ auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
+ auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
+
+ auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
+ auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
+
+ auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
+ auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
+
+ auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
+
+ auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
+ auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
+
+ auto SqrtRet = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
+
+ // Scale down the result.
+ auto ScaleDownFactor = B.buildConstant(S32, -128);
+ auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
+ SqrtRet = B.buildFLdexp(F64, SqrtRet, ScaleDown, Flags);
+
+ // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
+ // with finite only or nsz because rsq(+/-0) = +/-inf
+
+ // TODO: Check for DAZ and expand to subnormals
+ auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
+
+ // If x is +INF, +0, or -0, use its original value
+ B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
+
+ MI.eraseFromParent();
+ return true;
+}
+
// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
// FIXME: Why do we handle this one but not other removed instructions?
//
@@ -4159,6 +5020,50 @@ bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
return true;
}
+/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
+/// bits of the pointer and replace them with the stride argument, then
+/// merge_values everything together. In the common case of a raw buffer (the
+/// stride component is 0), we can just AND off the upper half.
+bool AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ Register Result = MI.getOperand(0).getReg();
+ Register Pointer = MI.getOperand(2).getReg();
+ Register Stride = MI.getOperand(3).getReg();
+ Register NumRecords = MI.getOperand(4).getReg();
+ Register Flags = MI.getOperand(5).getReg();
+
+ LLT S32 = LLT::scalar(32);
+
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+ auto Unmerge = B.buildUnmerge(S32, Pointer);
+ Register LowHalf = Unmerge.getReg(0);
+ Register HighHalf = Unmerge.getReg(1);
+
+ auto AndMask = B.buildConstant(S32, 0x0000ffff);
+ auto Masked = B.buildAnd(S32, HighHalf, AndMask);
+
+ MachineInstrBuilder NewHighHalf = Masked;
+ std::optional<ValueAndVReg> StrideConst =
+ getIConstantVRegValWithLookThrough(Stride, MRI);
+ if (!StrideConst || !StrideConst->Value.isZero()) {
+ MachineInstrBuilder ShiftedStride;
+ if (StrideConst) {
+ uint32_t StrideVal = StrideConst->Value.getZExtValue();
+ uint32_t ShiftedStrideVal = StrideVal << 16;
+ ShiftedStride = B.buildConstant(S32, ShiftedStrideVal);
+ } else {
+ auto ExtStride = B.buildAnyExt(S32, Stride);
+ auto ShiftConst = B.buildConstant(S32, 16);
+ ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
+ }
+ NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
+ }
+ Register NewHighHalfReg = NewHighHalf.getReg(0);
+ B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -4227,7 +5132,7 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
std::pair<Register, unsigned>
AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
Register OrigOffset) const {
- const unsigned MaxImm = 4095;
+ const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
Register BaseReg;
unsigned ImmOffset;
const LLT S32 = LLT::scalar(32);
@@ -4240,13 +5145,14 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
if (MRI.getType(BaseReg).isPointer())
BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
- // If the immediate value is too big for the immoffset field, put the value
- // and -4096 into the immoffset field so that the value that is copied/added
- // for the voffset field is a multiple of 4096, and it stands more chance
- // of being CSEd with the copy/add for another similar load/store.
- // However, do not do that rounding down to a multiple of 4096 if that is a
- // negative number, as it appears to be illegal to have a negative offset
- // in the vgpr, even if adding the immediate offset makes it positive.
+ // If the immediate value is too big for the immoffset field, put only bits
+ // that would normally fit in the immoffset field. The remaining value that
+ // is copied/added for the voffset field is a large power of 2, and it
+ // stands more chance of being CSEd with the copy/add for another similar
+ // load/store.
+ // However, do not do that rounding down if that is a negative
+ // number, as it appears to be illegal to have a negative offset in the
+ // vgpr, even if adding the immediate offset makes it positive.
unsigned Overflow = ImmOffset & ~MaxImm;
ImmOffset -= Overflow;
if ((int32_t)Overflow < 0) {
@@ -4269,31 +5175,6 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
return std::pair(BaseReg, ImmOffset);
}
-/// Update \p MMO based on the offset inputs to a raw/struct buffer intrinsic.
-void AMDGPULegalizerInfo::updateBufferMMO(MachineMemOperand *MMO,
- Register VOffset, Register SOffset,
- unsigned ImmOffset, Register VIndex,
- MachineRegisterInfo &MRI) const {
- std::optional<ValueAndVReg> MaybeVOffsetVal =
- getIConstantVRegValWithLookThrough(VOffset, MRI);
- std::optional<ValueAndVReg> MaybeSOffsetVal =
- getIConstantVRegValWithLookThrough(SOffset, MRI);
- std::optional<ValueAndVReg> MaybeVIndexVal =
- getIConstantVRegValWithLookThrough(VIndex, MRI);
- // If the combined VOffset + SOffset + ImmOffset + strided VIndex is constant,
- // update the MMO with that offset. The stride is unknown so we can only do
- // this if VIndex is constant 0.
- if (MaybeVOffsetVal && MaybeSOffsetVal && MaybeVIndexVal &&
- MaybeVIndexVal->Value == 0) {
- uint64_t TotalOffset = MaybeVOffsetVal->Value.getZExtValue() +
- MaybeSOffsetVal->Value.getZExtValue() + ImmOffset;
- MMO->setOffset(TotalOffset);
- } else {
- // We don't have a constant combined offset to use in the MMO. Give up.
- MMO->setValue((Value *)nullptr);
- }
-}
-
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
MachineRegisterInfo &MRI,
@@ -4365,6 +5246,10 @@ Register AMDGPULegalizerInfo::fixStoreSourceType(
const LLT S16 = LLT::scalar(16);
+ // Fixup buffer resources themselves needing to be v4i32.
+ if (hasBufferRsrcWorkaround(Ty))
+ return castBufferRsrcToV4I32(VData, B);
+
// Fixup illegal register types for i8 stores.
if (Ty == LLT::scalar(8) || Ty == S16) {
Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
@@ -4393,6 +5278,7 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
const LLT S32 = LLT::scalar(32);
VData = fixStoreSourceType(B, VData, IsFormat);
+ castBufferRsrcArgToV4I32(MI, B, 2);
Register RSrc = MI.getOperand(2).getReg();
MachineMemOperand *MMO = *MI.memoperands_begin();
@@ -4426,7 +5312,6 @@ bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
- updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
unsigned Opc;
if (IsTyped) {
@@ -4510,6 +5395,7 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
++OpOffset;
}
+ castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
Register RSrc = MI.getOperand(2 + OpOffset).getReg();
// The typed intrinsics add an immediate after the registers.
@@ -4538,12 +5424,17 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
unsigned ImmOffset;
LLT Ty = MRI.getType(Dst);
+ // Make addrspace 8 pointer loads into 4xs32 loads here, so the rest of the
+ // logic doesn't have to handle that case.
+ if (hasBufferRsrcWorkaround(Ty)) {
+ Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
+ Dst = MI.getOperand(0).getReg();
+ }
LLT EltTy = Ty.getScalarType();
const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
const bool Unpacked = ST.hasUnpackedD16VMem();
std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
- updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, MRI);
unsigned Opc;
@@ -4624,69 +5515,87 @@ bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
return true;
}
-bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
- MachineIRBuilder &B,
- bool IsInc) const {
- unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
- AMDGPU::G_AMDGPU_ATOMIC_DEC;
- B.buildInstr(Opc)
- .addDef(MI.getOperand(0).getReg())
- .addUse(MI.getOperand(2).getReg())
- .addUse(MI.getOperand(3).getReg())
- .cloneMemRefs(MI);
- MI.eraseFromParent();
- return true;
-}
-
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
switch (IntrID) {
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
default:
llvm_unreachable("unhandled atomic opcode");
@@ -4696,8 +5605,11 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
MachineIRBuilder &B,
Intrinsic::ID IID) const {
- const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
- IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
+ const bool IsCmpSwap =
+ IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
+ IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
+ IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
+ IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
const bool HasReturn = MI.getNumExplicitDefs() != 0;
Register Dst;
@@ -4710,6 +5622,8 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
OpOffset = -1;
}
+ // Since we don't have 128-bit atomics, we don't need to handle the case of
+ // p8 arguments to the atomic itself.
Register VData = MI.getOperand(2 + OpOffset).getReg();
Register CmpVal;
@@ -4718,6 +5632,7 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
++OpOffset;
}
+ castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
Register RSrc = MI.getOperand(3 + OpOffset).getReg();
const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
@@ -4739,7 +5654,6 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
unsigned ImmOffset;
std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
- updateBufferMMO(MMO, VOffset, SOffset, ImmOffset, VIndex, *B.getMRI());
auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
@@ -4896,7 +5810,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
LLT AddrTy =
MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
- const bool IsG16 = GradTy == S16;
+ const bool IsG16 =
+ ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
const bool IsA16 = AddrTy == S16;
const bool IsD16 = Ty.getScalarType() == S16;
@@ -4967,6 +5882,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
return false;
}
+ const unsigned NSAMaxSize = ST.getNSAMaxSize();
+ const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
+
if (IsA16 || IsG16) {
if (Intr->NumVAddrs > 1) {
SmallVector<Register, 4> PackedRegs;
@@ -4977,9 +5895,19 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
// See also below in the non-a16 branch
const bool UseNSA = ST.hasNSAEncoding() &&
PackedRegs.size() >= ST.getNSAThreshold(MF) &&
- PackedRegs.size() <= ST.getNSAMaxSize();
+ (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
+ const bool UsePartialNSA =
+ UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
- if (!UseNSA && PackedRegs.size() > 1) {
+ if (UsePartialNSA) {
+ // Pack registers that would go over NSAMaxSize into last VAddr register
+ LLT PackedAddrTy =
+ LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
+ auto Concat = B.buildConcatVectors(
+ PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
+ PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
+ PackedRegs.resize(NSAMaxSize);
+ } else if (!UseNSA && PackedRegs.size() > 1) {
LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
PackedRegs[0] = Concat.getReg(0);
@@ -5015,16 +5943,22 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
//
- // TODO: we can actually allow partial NSA where the final register is a
- // contiguous set of the remaining addresses.
- // This could help where there are more addresses than supported.
+ // Partial NSA is allowed on GFX11 where the final register is a contiguous
+ // set of the remaining addresses.
const bool UseNSA = ST.hasNSAEncoding() &&
CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
- CorrectedNumVAddrs <= ST.getNSAMaxSize();
+ (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
+ const bool UsePartialNSA =
+ UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
- if (!UseNSA && Intr->NumVAddrs > 1)
+ if (UsePartialNSA) {
+ convertImageAddrToPacked(B, MI,
+ ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
+ Intr->NumVAddrs - NSAMaxSize + 1);
+ } else if (!UseNSA && Intr->NumVAddrs > 1) {
convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
Intr->NumVAddrs);
+ }
}
int Flags = 0;
@@ -5237,6 +6171,12 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
Observer.changingInstr(MI);
+ // Handle needing to s.buffer.load() a p8 value.
+ if (hasBufferRsrcWorkaround(Ty)) {
+ Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
+ Dst = MI.getOperand(0).getReg();
+ B.setInsertPt(B.getMBB(), MI);
+ }
if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
Ty = getBitcastRegisterType(Ty);
Helper.bitcastDst(MI, Ty, 0);
@@ -5283,25 +6223,40 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
return legalizeTrapEndpgm(MI, MRI, B);
- if (std::optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(&ST)) {
- switch (*HsaAbiVer) {
- case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
- case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
- return legalizeTrapHsaQueuePtr(MI, MRI, B);
- case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
- case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
- return ST.supportsGetDoorbellID() ?
- legalizeTrapHsa(MI, MRI, B) :
- legalizeTrapHsaQueuePtr(MI, MRI, B);
- }
- }
+ const Module *M = B.getMF().getFunction().getParent();
+ unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
+ if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
+ return legalizeTrapHsaQueuePtr(MI, MRI, B);
- llvm_unreachable("Unknown trap handler");
+ return ST.supportsGetDoorbellID() ?
+ legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
}
bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
- B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock &BB = B.getMBB();
+ MachineFunction *MF = BB.getParent();
+
+ if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
+ BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
+ .addImm(0);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // We need a block split to make the real endpgm a terminator. We also don't
+ // want to break phis in successor blocks, so we can't just delete to the
+ // end of the block.
+ BB.splitAt(MI, false /*UpdateLiveIns*/);
+ MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+ MF->push_back(TrapBB);
+ BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
+ .addImm(0);
+ BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(TrapBB);
+
+ BB.addSuccessor(TrapBB);
MI.eraseFromParent();
return true;
}
@@ -5313,7 +6268,8 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
Register SGPR01(AMDGPU::SGPR0_SGPR1);
// For code object version 5, queue_ptr is passed through implicit kernarg.
- if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+ if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
+ AMDGPU::AMDHSA_COV5) {
AMDGPUTargetLowering::ImplicitParameter Param =
AMDGPUTargetLowering::QUEUE_PTR;
uint64_t Offset =
@@ -5652,6 +6608,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return false;
}
+ case Intrinsic::amdgcn_make_buffer_rsrc:
+ return legalizePointerAsRsrcIntrin(MI, MRI, B);
case Intrinsic::amdgcn_kernarg_segment_ptr:
if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
// This only makes sense to call in a kernel, so just lower to null.
@@ -5736,60 +6694,100 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_s_buffer_load:
return legalizeSBufferLoad(Helper, MI);
case Intrinsic::amdgcn_raw_buffer_store:
+ case Intrinsic::amdgcn_raw_ptr_buffer_store:
case Intrinsic::amdgcn_struct_buffer_store:
+ case Intrinsic::amdgcn_struct_ptr_buffer_store:
return legalizeBufferStore(MI, MRI, B, false, false);
case Intrinsic::amdgcn_raw_buffer_store_format:
+ case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
case Intrinsic::amdgcn_struct_buffer_store_format:
+ case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
return legalizeBufferStore(MI, MRI, B, false, true);
case Intrinsic::amdgcn_raw_tbuffer_store:
+ case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
case Intrinsic::amdgcn_struct_tbuffer_store:
+ case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
return legalizeBufferStore(MI, MRI, B, true, true);
case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load:
return legalizeBufferLoad(MI, MRI, B, false, false);
case Intrinsic::amdgcn_raw_buffer_load_format:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
case Intrinsic::amdgcn_struct_buffer_load_format:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
return legalizeBufferLoad(MI, MRI, B, true, false);
case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
case Intrinsic::amdgcn_struct_tbuffer_load:
+ case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
return legalizeBufferLoad(MI, MRI, B, true, true);
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return legalizeBufferAtomic(MI, B, IntrID);
- case Intrinsic::amdgcn_atomic_inc:
- return legalizeAtomicIncDec(MI, B, true);
- case Intrinsic::amdgcn_atomic_dec:
- return legalizeAtomicIncDec(MI, B, false);
case Intrinsic::trap:
return legalizeTrapIntrinsic(MI, MRI, B);
case Intrinsic::debugtrap:
@@ -5802,6 +6800,17 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
case Intrinsic::amdgcn_image_bvh_intersect_ray:
return legalizeBVHIntrinsic(MI, B);
+ case Intrinsic::amdgcn_fmed3: {
+ GISelChangeObserver &Observer = Helper.Observer;
+
+ // FIXME: This is to workaround the inability of tablegen match combiners to
+ // match intrinsics in patterns.
+ Observer.changingInstr(MI);
+ MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
+ MI.removeOperand(1);
+ Observer.changedInstr(MI);
+ return true;
+ }
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 37c987108bc4..04773f275c87 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -71,14 +71,24 @@ public:
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
+ bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
- bool legalizeFlog(MachineInstr &MI, MachineIRBuilder &B,
- double Log2BaseInverted) const;
+
+ std::pair<Register, Register>
+ getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const;
+
+ bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src,
+ double Log2BaseInverted, unsigned Flags) const;
+ bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src,
+ unsigned Flags) const;
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -101,6 +111,9 @@ public:
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+ bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizePreloadedArgIntrin(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
@@ -135,6 +148,8 @@ public:
MachineIRBuilder &B) const;
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -142,6 +157,9 @@ public:
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
@@ -165,16 +183,9 @@ public:
std::pair<Register, unsigned> splitBufferOffsets(MachineIRBuilder &B,
Register OrigOffset) const;
- void updateBufferMMO(MachineMemOperand *MMO, Register VOffset,
- Register SOffset, unsigned ImmOffset, Register VIndex,
- MachineRegisterInfo &MRI) const;
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg, bool ImageStore = false) const;
- bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B, bool IsFormat) const;
- bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B, bool IsFormat) const;
Register fixStoreSourceType(MachineIRBuilder &B, Register VData,
bool IsFormat) const;
@@ -198,9 +209,6 @@ public:
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const;
- bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
- bool IsInc) const;
-
bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
deleted file mode 100644
index 93d1eed2cf63..000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/IntrinsicsR600.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
-
-#define DEBUG_TYPE "amdgpu-lower-intrinsics"
-
-using namespace llvm;
-
-namespace {
-
-static int MaxStaticSize;
-
-static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt(
- "amdgpu-mem-intrinsic-expand-size",
- cl::desc("Set minimum mem intrinsic size to expand in IR"),
- cl::location(MaxStaticSize),
- cl::init(1024),
- cl::Hidden);
-
-
-class AMDGPULowerIntrinsics : public ModulePass {
-private:
- bool makeLIDRangeMetadata(Function &F) const;
-
-public:
- static char ID;
-
- AMDGPULowerIntrinsics() : ModulePass(ID) {}
-
- bool runOnModule(Module &M) override;
- bool expandMemIntrinsicUses(Function &F);
- StringRef getPassName() const override {
- return "AMDGPU Lower Intrinsics";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-};
-
-}
-
-char AMDGPULowerIntrinsics::ID = 0;
-
-char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
-
-INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
- false)
-
-// TODO: Should refine based on estimated number of accesses (e.g. does it
-// require splitting based on alignment)
-static bool shouldExpandOperationWithSize(Value *Size) {
- ConstantInt *CI = dyn_cast<ConstantInt>(Size);
- return !CI || (CI->getSExtValue() > MaxStaticSize);
-}
-
-bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
- Intrinsic::ID ID = F.getIntrinsicID();
- bool Changed = false;
-
- for (User *U : llvm::make_early_inc_range(F.users())) {
- Instruction *Inst = cast<Instruction>(U);
-
- switch (ID) {
- case Intrinsic::memcpy: {
- auto *Memcpy = cast<MemCpyInst>(Inst);
- if (shouldExpandOperationWithSize(Memcpy->getLength())) {
- Function *ParentFunc = Memcpy->getParent()->getParent();
- const TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc);
- expandMemCpyAsLoop(Memcpy, TTI);
- Changed = true;
- Memcpy->eraseFromParent();
- }
-
- break;
- }
- case Intrinsic::memmove: {
- auto *Memmove = cast<MemMoveInst>(Inst);
- if (shouldExpandOperationWithSize(Memmove->getLength())) {
- expandMemMoveAsLoop(Memmove);
- Changed = true;
- Memmove->eraseFromParent();
- }
-
- break;
- }
- case Intrinsic::memset: {
- auto *Memset = cast<MemSetInst>(Inst);
- if (shouldExpandOperationWithSize(Memset->getLength())) {
- expandMemSetAsLoop(Memset);
- Changed = true;
- Memset->eraseFromParent();
- }
-
- break;
- }
- default:
- break;
- }
- }
-
- return Changed;
-}
-
-bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
- auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
- if (!TPC)
- return false;
-
- const TargetMachine &TM = TPC->getTM<TargetMachine>();
- bool Changed = false;
-
- for (auto *U : F.users()) {
- auto *CI = dyn_cast<CallInst>(U);
- if (!CI)
- continue;
-
- Function *Caller = CI->getParent()->getParent();
- const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *Caller);
- Changed |= ST.makeLIDRangeMetadata(CI);
- }
- return Changed;
-}
-
-bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
- bool Changed = false;
-
- for (Function &F : M) {
- if (!F.isDeclaration())
- continue;
-
- switch (F.getIntrinsicID()) {
- case Intrinsic::memcpy:
- case Intrinsic::memmove:
- case Intrinsic::memset:
- if (expandMemIntrinsicUses(F))
- Changed = true;
- break;
-
- case Intrinsic::r600_read_tidig_x:
- case Intrinsic::r600_read_tidig_y:
- case Intrinsic::r600_read_tidig_z:
- case Intrinsic::r600_read_local_size_x:
- case Intrinsic::r600_read_local_size_y:
- case Intrinsic::r600_read_local_size_z:
- Changed |= makeLIDRangeMetadata(F);
- break;
-
- default:
- break;
- }
- }
-
- return Changed;
-}
-
-ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
- return new AMDGPULowerIntrinsics();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index f3ff9b753585..f5323725250f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -70,7 +70,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
IRBuilder<> Builder(&*getInsertPt(EntryBlock));
const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
- const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
+ const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
Align MaxAlign;
// FIXME: Alignment is broken with explicit arg offset.;
@@ -86,7 +86,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
KernArgSegment->addRetAttr(
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
- unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
uint64_t ExplicitArgOffset = 0;
for (Argument &Arg : F.args()) {
@@ -111,8 +110,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
Builder.getInt8Ty(), KernArgSegment, EltOffset,
Arg.getName() + ".byval.kernarg.offset");
- Value *CastOffsetPtr = Builder.CreatePointerBitCastOrAddrSpaceCast(
- ArgOffsetPtr, Arg.getType());
+ Value *CastOffsetPtr =
+ Builder.CreateAddrSpaceCast(ArgOffsetPtr, Arg.getType());
Arg.replaceAllUsesWith(CastOffsetPtr);
continue;
}
@@ -170,8 +169,6 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
AdjustedArgTy = V4Ty;
}
- ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
- ArgPtr->getName() + ".cast");
LoadInst *Load =
Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 56e5e0708492..26074cf06071 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -322,7 +322,7 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
// TargetPassConfig for subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
bool MadeChange = false;
- bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
+ bool IsV5OrAbove = AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
@@ -354,7 +354,8 @@ ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
- bool IsV5OrAbove = AMDGPU::getAmdhsaCodeObjectVersion() >= 5;
+ bool IsV5OrAbove =
+ AMDGPU::getCodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);
if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index 11ba5c91dae9..e3a645977f92 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -20,9 +20,8 @@
// This model means the GPU runtime can specify the amount of memory allocated.
// If this is more than the kernel assumed, the excess can be made available
// using a language specific feature, which IR represents as a variable with
-// no initializer. This feature is not yet implemented for non-kernel functions.
-// This lowering could be extended to handle that use case, but would probably
-// require closer integration with promoteAllocaToLDS.
+// no initializer. This feature is referred to here as "Dynamic LDS" and is
+// lowered slightly differently to the normal case.
//
// Consequences of this GPU feature:
// - memory is limited and exceeding it halts compilation
@@ -65,17 +64,15 @@
// Kernel | Yes | Yes | No |
// Hybrid | Yes | Partial | Yes |
//
-// Module spends LDS memory to save cycles. Table spends cycles and global
-// memory to save LDS. Kernel is as fast as kernel allocation but only works
-// for variables that are known reachable from a single kernel. Hybrid picks
-// between all three. When forced to choose between LDS and cycles it minimises
+// "Module" spends LDS memory to save cycles. "Table" spends cycles and global
+// memory to save LDS. "Kernel" is as fast as kernel allocation but only works
+// for variables that are known reachable from a single kernel. "Hybrid" picks
+// between all three. When forced to choose between LDS and cycles we minimise
// LDS use.
// The "module" lowering implemented here finds LDS variables which are used by
// non-kernel functions and creates a new struct with a field for each of those
// LDS variables. Variables that are only used from kernels are excluded.
-// Kernels that do not use this struct are annoteated with the attribute
-// amdgpu-elide-module-lds which allows the back end to elide the allocation.
//
// The "table" lowering implemented here has three components.
// First kernels are assigned a unique integer identifier which is available in
@@ -115,6 +112,68 @@
// use LDS are expected to hit the "Kernel" lowering strategy
// - The runtime properties impose a cost in compiler implementation complexity
//
+// Dynamic LDS implementation
+// Dynamic LDS is lowered similarly to the "table" strategy above and uses the
+// same intrinsic to identify which kernel is at the root of the dynamic call
+// graph. This relies on the specified behaviour that all dynamic LDS variables
+// alias one another, i.e. are at the same address, with respect to a given
+// kernel. Therefore this pass creates new dynamic LDS variables for each kernel
+// that allocates any dynamic LDS and builds a table of addresses out of those.
+// The AMDGPUPromoteAlloca pass skips kernels that use dynamic LDS.
+// The corresponding optimisation for "kernel" lowering where the table lookup
+// is elided is not implemented.
+//
+//
+// Implementation notes / limitations
+// A single LDS global variable represents an instance per kernel that can reach
+// said variables. This pass essentially specialises said variables per kernel.
+// Handling ConstantExpr during the pass complicated this significantly so now
+// all ConstantExpr uses of LDS variables are expanded to instructions. This
+// may need amending when implementing non-undef initialisers.
+//
+// Lowering is split between this IR pass and the back end. This pass chooses
+// where given variables should be allocated and marks them with metadata,
+// MD_absolute_symbol. The backend places the variables in coincidentally the
+// same location and raises a fatal error if something has gone awry. This works
+// in practice because the only pass between this one and the backend that
+// changes LDS is PromoteAlloca and the changes it makes do not conflict.
+//
+// Addresses are written to constant global arrays based on the same metadata.
+//
+// The backend lowers LDS variables in the order of traversal of the function.
+// This is at odds with the deterministic layout required. The workaround is to
+// allocate the fixed-address variables immediately upon starting the function
+// where they can be placed as intended. This requires a means of mapping from
+// the function to the variables that it allocates. For the module scope lds,
+// this is via metadata indicating whether the variable is not required. If a
+// pass deletes that metadata, a fatal error on disagreement with the absolute
+// symbol metadata will occur. For kernel scope and dynamic, this is by _name_
+// correspondence between the function and the variable. It requires the
+// kernel to have a name (which is only a limitation for tests in practice) and
+// for nothing to rename the corresponding symbols. This is a hazard if the pass
+// is run multiple times during debugging. Alternative schemes considered all
+// involve bespoke metadata.
+//
+// If the name correspondence can be replaced, multiple distinct kernels that
+// have the same memory layout can map to the same kernel id (as the address
+// itself is handled by the absolute symbol metadata) and that will allow more
+// uses of the "kernel" style faster lowering and reduce the size of the lookup
+// tables.
+//
+// There is a test that checks this does not fire for a graphics shader. This
+// lowering is expected to work for graphics if the isKernel test is changed.
+//
+// The current markUsedByKernel is sufficient for PromoteAlloca but is elided
+// before codegen. Replacing this with an equivalent intrinsic which lasts until
+// shortly after the machine function lowering of LDS would help break the name
+// mapping. The other part needed is probably to amend PromoteAlloca to embed
+// the LDS variables it creates in the same struct created here. That avoids the
+// current hazard where a PromoteAlloca LDS variable might be allocated before
+// the kernel scope (and thus error on the address check). Given a new invariant
+// that no LDS variables exist outside of the structs managed here, and an
+// intrinsic that lasts until after the LDS frame lowering, it should be
+// possible to drop the name mapping and fold equivalent memory layouts.
+//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
@@ -134,11 +193,14 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/ReplaceConstant.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/OptimizedStructLayout.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -162,7 +224,7 @@ enum class LoweringKind { module, table, kernel, hybrid };
cl::opt<LoweringKind> LoweringKindLoc(
"amdgpu-lower-module-lds-strategy",
cl::desc("Specify lowering strategy for function LDS access:"), cl::Hidden,
- cl::init(LoweringKind::module),
+ cl::init(LoweringKind::hybrid),
cl::values(
clEnumValN(LoweringKind::table, "table", "Lower via table lookup"),
clEnumValN(LoweringKind::module, "module", "Lower via module struct"),
@@ -183,6 +245,13 @@ bool isKernelLDS(const Function *F) {
return AMDGPU::isKernel(F->getCallingConv());
}
+template <typename T> std::vector<T> sortByName(std::vector<T> &&V) {
+ llvm::sort(V.begin(), V.end(), [](const auto *L, const auto *R) {
+ return L->getName() < R->getName();
+ });
+ return {std::move(V)};
+}
+
class AMDGPULowerModuleLDS : public ModulePass {
static void
@@ -201,8 +270,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
LocalVar->removeDeadConstantUsers();
}
- static void markUsedByKernel(IRBuilder<> &Builder, Function *Func,
- GlobalVariable *SGV) {
+ static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
// The llvm.amdgcn.module.lds instance is implicitly used by all kernels
// that might call a function which accesses a field within it. This is
// presently approximated to 'all kernels' if there are any such functions
@@ -217,21 +285,22 @@ class AMDGPULowerModuleLDS : public ModulePass {
// llvm.donothing that takes a pointer to the instance and is lowered to a
// no-op after LDS is allocated, but that is not presently necessary.
- LLVMContext &Ctx = Func->getContext();
-
- Builder.SetInsertPoint(Func->getEntryBlock().getFirstNonPHI());
-
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), {});
+ // This intrinsic is eliminated shortly before instruction selection. It
+ // does not suffice to indicate to ISel that a given global which is not
+ // immediately used by the kernel must still be allocated by it. An
+ // equivalent target specific intrinsic which lasts until immediately after
+ // codegen would suffice for that, but one would still need to ensure that
+ // the variables are allocated in the anticpated order.
+ IRBuilder<> Builder(Func->getEntryBlock().getFirstNonPHI());
Function *Decl =
Intrinsic::getDeclaration(Func->getParent(), Intrinsic::donothing, {});
- Value *UseInstance[1] = {Builder.CreateInBoundsGEP(
- SGV->getValueType(), SGV, ConstantInt::get(Type::getInt32Ty(Ctx), 0))};
+ Value *UseInstance[1] = {
+ Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};
- Builder.CreateCall(FTy, Decl, {},
- {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)},
- "");
+ Builder.CreateCall(
+ Decl, {}, {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
}
static bool eliminateConstantExprUsesOfLDSFromAllInstructions(Module &M) {
@@ -240,7 +309,7 @@ class AMDGPULowerModuleLDS : public ModulePass {
// This pass specialises LDS variables with respect to the kernel that
// allocates them.
- // This is semantically equivalent to:
+ // This is semantically equivalent to (the unimplemented as slow):
// for (auto &F : M.functions())
// for (auto &BB : F)
// for (auto &I : BB)
@@ -248,63 +317,12 @@ class AMDGPULowerModuleLDS : public ModulePass {
// if (constantExprUsesLDS(Op))
// replaceConstantExprInFunction(I, Op);
- bool Changed = false;
-
- // Find all ConstantExpr that are direct users of an LDS global
- SmallVector<ConstantExpr *> Stack;
+ SmallVector<Constant *> LDSGlobals;
for (auto &GV : M.globals())
if (AMDGPU::isLDSVariableToLower(GV))
- for (User *U : GV.users())
- if (ConstantExpr *C = dyn_cast<ConstantExpr>(U))
- Stack.push_back(C);
-
- // Expand to include constexpr users of direct users
- SetVector<ConstantExpr *> ConstExprUsersOfLDS;
- while (!Stack.empty()) {
- ConstantExpr *V = Stack.pop_back_val();
- if (ConstExprUsersOfLDS.contains(V))
- continue;
-
- ConstExprUsersOfLDS.insert(V);
+ LDSGlobals.push_back(&GV);
- for (auto *Nested : V->users())
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Nested))
- Stack.push_back(CE);
- }
-
- // Find all instructions that use any of the ConstExpr users of LDS
- SetVector<Instruction *> InstructionWorklist;
- for (ConstantExpr *CE : ConstExprUsersOfLDS)
- for (User *U : CE->users())
- if (auto *I = dyn_cast<Instruction>(U))
- InstructionWorklist.insert(I);
-
- // Replace those ConstExpr operands with instructions
- while (!InstructionWorklist.empty()) {
- Instruction *I = InstructionWorklist.pop_back_val();
- for (Use &U : I->operands()) {
-
- auto *BI = I;
- if (auto *Phi = dyn_cast<PHINode>(I)) {
- BasicBlock *BB = Phi->getIncomingBlock(U);
- BasicBlock::iterator It = BB->getFirstInsertionPt();
- assert(It != BB->end() && "Unexpected empty basic block");
- BI = &(*(It));
- }
-
- if (ConstantExpr *C = dyn_cast<ConstantExpr>(U.get())) {
- if (ConstExprUsersOfLDS.contains(C)) {
- Changed = true;
- Instruction *NI = C->getAsInstruction(BI);
- InstructionWorklist.insert(NI);
- U.set(NI);
- C->removeDeadConstantUsers();
- }
- }
- }
- }
-
- return Changed;
+ return convertUsersOfConstantsToInstructions(LDSGlobals);
}
public:
@@ -329,7 +347,11 @@ public:
continue;
}
- SmallVector<User *, 16> Stack(GV.users());
+ if (GV.isAbsoluteSymbolRef()) {
+ report_fatal_error(
+ "LDS variables with absolute addresses are unimplemented.");
+ }
+
for (User *V : GV.users()) {
if (auto *I = dyn_cast<Instruction>(V)) {
Function *F = I->getFunction();
@@ -358,11 +380,11 @@ public:
DenseSet<GlobalVariable *> VariablesReachableThroughFunctionPointer;
for (Function &F : M.functions()) {
if (!isKernelLDS(&F))
- if (F.hasAddressTaken(nullptr,
- /* IgnoreCallbackUses */ false,
- /* IgnoreAssumeLikeCalls */ false,
- /* IgnoreLLVMUsed */ true,
- /* IgnoreArcAttachedCall */ false)) {
+ if (F.hasAddressTaken(nullptr,
+ /* IgnoreCallbackUses */ false,
+ /* IgnoreAssumeLikeCalls */ false,
+ /* IgnoreLLVMUsed */ true,
+ /* IgnoreArcAttachedCall */ false)) {
set_union(VariablesReachableThroughFunctionPointer,
direct_map_function[&F]);
}
@@ -370,7 +392,7 @@ public:
auto functionMakesUnknownCall = [&](const Function *F) -> bool {
assert(!F->isDeclaration());
- for (CallGraphNode::CallRecord R : *CG[F]) {
+ for (const CallGraphNode::CallRecord &R : *CG[F]) {
if (!R.second->getFunction()) {
return true;
}
@@ -408,7 +430,7 @@ public:
// have already been computed, with more care than this
set_union(transitive_map_function[&Func], direct_map_function[F]);
- for (CallGraphNode::CallRecord R : *CG[F]) {
+ for (const CallGraphNode::CallRecord &R : *CG[F]) {
Function *ith = R.second->getFunction();
if (ith) {
if (!seen.contains(ith)) {
@@ -428,7 +450,7 @@ public:
if (Func.isDeclaration() || !isKernelLDS(&Func))
continue;
- for (CallGraphNode::CallRecord R : *CG[&Func]) {
+ for (const CallGraphNode::CallRecord &R : *CG[&Func]) {
Function *ith = R.second->getFunction();
if (ith) {
set_union(indirect_map_kernel[&Func], transitive_map_function[ith]);
@@ -454,7 +476,7 @@ public:
static Constant *getAddressesOfVariablesInKernel(
LLVMContext &Ctx, ArrayRef<GlobalVariable *> Variables,
- DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
+ const DenseMap<GlobalVariable *, Constant *> &LDSVarsToConstantGEP) {
// Create a ConstantArray containing the address of each Variable within the
// kernel corresponding to LDSVarsToConstantGEP, or poison if that kernel
// does not allocate it
@@ -467,8 +489,9 @@ public:
SmallVector<Constant *> Elements;
for (size_t i = 0; i < Variables.size(); i++) {
GlobalVariable *GV = Variables[i];
- if (LDSVarsToConstantGEP.count(GV) != 0) {
- auto elt = ConstantExpr::getPtrToInt(LDSVarsToConstantGEP[GV], I32);
+ auto ConstantGepIt = LDSVarsToConstantGEP.find(GV);
+ if (ConstantGepIt != LDSVarsToConstantGEP.end()) {
+ auto elt = ConstantExpr::getPtrToInt(ConstantGepIt->second, I32);
Elements.push_back(elt);
} else {
Elements.push_back(PoisonValue::get(I32));
@@ -495,11 +518,15 @@ public:
ArrayType *AllKernelsOffsetsType =
ArrayType::get(KernelOffsetsType, NumberKernels);
+ Constant *Missing = PoisonValue::get(KernelOffsetsType);
std::vector<Constant *> overallConstantExprElts(NumberKernels);
for (size_t i = 0; i < NumberKernels; i++) {
- LDSVariableReplacement Replacement = KernelToReplacement[kernels[i]];
- overallConstantExprElts[i] = getAddressesOfVariablesInKernel(
- Ctx, Variables, Replacement.LDSVarsToConstantGEP);
+ auto Replacement = KernelToReplacement.find(kernels[i]);
+ overallConstantExprElts[i] =
+ (Replacement == KernelToReplacement.end())
+ ? Missing
+ : getAddressesOfVariablesInKernel(
+ Ctx, Variables, Replacement->second.LDSVarsToConstantGEP);
}
Constant *init =
@@ -511,36 +538,49 @@ public:
AMDGPUAS::CONSTANT_ADDRESS);
}
- void replaceUsesInInstructionsWithTableLookup(
- Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
- GlobalVariable *LookupTable) {
-
+ void replaceUseWithTableLookup(Module &M, IRBuilder<> &Builder,
+ GlobalVariable *LookupTable,
+ GlobalVariable *GV, Use &U,
+ Value *OptionalIndex) {
+ // Table is a constant array of the same length as OrderedKernels
LLVMContext &Ctx = M.getContext();
- IRBuilder<> Builder(Ctx);
Type *I32 = Type::getInt32Ty(Ctx);
+ auto *I = cast<Instruction>(U.getUser());
- // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
- // lowers to a read from a live in register. Emit it once in the entry
- // block to spare deduplicating it later.
+ Value *tableKernelIndex = getTableLookupKernelIndex(M, I->getFunction());
- DenseMap<Function *, Value *> tableKernelIndexCache;
- auto getTableKernelIndex = [&](Function *F) -> Value * {
- if (tableKernelIndexCache.count(F) == 0) {
- LLVMContext &Ctx = M.getContext();
- FunctionType *FTy = FunctionType::get(Type::getInt32Ty(Ctx), {});
- Function *Decl =
- Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
+ if (auto *Phi = dyn_cast<PHINode>(I)) {
+ BasicBlock *BB = Phi->getIncomingBlock(U);
+ Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
+ } else {
+ Builder.SetInsertPoint(I);
+ }
- BasicBlock::iterator it =
- F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
- Instruction &i = *it;
- Builder.SetInsertPoint(&i);
+ SmallVector<Value *, 3> GEPIdx = {
+ ConstantInt::get(I32, 0),
+ tableKernelIndex,
+ };
+ if (OptionalIndex)
+ GEPIdx.push_back(OptionalIndex);
- tableKernelIndexCache[F] = Builder.CreateCall(FTy, Decl, {});
- }
+ Value *Address = Builder.CreateInBoundsGEP(
+ LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
- return tableKernelIndexCache[F];
- };
+ Value *loaded = Builder.CreateLoad(I32, Address);
+
+ Value *replacement =
+ Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
+
+ U.set(replacement);
+ }
+
+ void replaceUsesInInstructionsWithTableLookup(
+ Module &M, ArrayRef<GlobalVariable *> ModuleScopeVariables,
+ GlobalVariable *LookupTable) {
+
+ LLVMContext &Ctx = M.getContext();
+ IRBuilder<> Builder(Ctx);
+ Type *I32 = Type::getInt32Ty(Ctx);
for (size_t Index = 0; Index < ModuleScopeVariables.size(); Index++) {
auto *GV = ModuleScopeVariables[Index];
@@ -550,32 +590,8 @@ public:
if (!I)
continue;
- Value *tableKernelIndex = getTableKernelIndex(I->getFunction());
-
- // So if the phi uses this value multiple times, what does this look
- // like?
- if (auto *Phi = dyn_cast<PHINode>(I)) {
- BasicBlock *BB = Phi->getIncomingBlock(U);
- Builder.SetInsertPoint(&(*(BB->getFirstInsertionPt())));
- } else {
- Builder.SetInsertPoint(I);
- }
-
- Value *GEPIdx[3] = {
- ConstantInt::get(I32, 0),
- tableKernelIndex,
- ConstantInt::get(I32, Index),
- };
-
- Value *Address = Builder.CreateInBoundsGEP(
- LookupTable->getValueType(), LookupTable, GEPIdx, GV->getName());
-
- Value *loaded = Builder.CreateLoad(I32, Address);
-
- Value *replacement =
- Builder.CreateIntToPtr(loaded, GV->getType(), GV->getName());
-
- U.set(replacement);
+ replaceUseWithTableLookup(M, Builder, LookupTable, GV, U,
+ ConstantInt::get(I32, Index));
}
}
}
@@ -586,7 +602,8 @@ public:
DenseSet<Function *> KernelSet;
- if (VariableSet.empty()) return KernelSet;
+ if (VariableSet.empty())
+ return KernelSet;
for (Function &Func : M.functions()) {
if (Func.isDeclaration() || !isKernelLDS(&Func))
@@ -649,8 +666,9 @@ public:
// strategy
continue;
}
- CandidateTy Candidate(GV, K.second.size(),
- DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
+ CandidateTy Candidate(
+ GV, K.second.size(),
+ DL.getTypeAllocSize(GV->getValueType()).getFixedValue());
if (MostUsed < Candidate)
MostUsed = Candidate;
}
@@ -658,173 +676,258 @@ public:
return MostUsed.GV;
}
- bool runOnModule(Module &M) override {
- LLVMContext &Ctx = M.getContext();
- CallGraph CG = CallGraph(M);
- bool Changed = superAlignLDSGlobals(M);
+ static void recordLDSAbsoluteAddress(Module *M, GlobalVariable *GV,
+ uint32_t Address) {
+ // Write the specified address into metadata where it can be retrieved by
+ // the assembler. Format is a half open range, [Address Address+1)
+ LLVMContext &Ctx = M->getContext();
+ auto *IntTy =
+ M->getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntTy, Address + 1));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(Ctx, {MinC, MaxC}));
+ }
- Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+ DenseMap<Function *, Value *> tableKernelIndexCache;
+ Value *getTableLookupKernelIndex(Module &M, Function *F) {
+ // Accesses from a function use the amdgcn_lds_kernel_id intrinsic which
+ // lowers to a read from a live in register. Emit it once in the entry
+ // block to spare deduplicating it later.
+ auto [It, Inserted] = tableKernelIndexCache.try_emplace(F);
+ if (Inserted) {
+ Function *Decl =
+ Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_lds_kernel_id, {});
- Changed = true; // todo: narrow this down
+ auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
+ IRBuilder<> Builder(&*InsertAt);
- // For each kernel, what variables does it access directly or through
- // callees
- LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+ It->second = Builder.CreateCall(Decl, {});
+ }
- // For each variable accessed through callees, which kernels access it
- VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
- for (auto &K : LDSUsesInfo.indirect_access) {
- Function *F = K.first;
- assert(isKernelLDS(F));
- for (GlobalVariable *GV : K.second) {
- LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ return It->second;
+ }
+
+ static std::vector<Function *> assignLDSKernelIDToEachKernel(
+ Module *M, DenseSet<Function *> const &KernelsThatAllocateTableLDS,
+ DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS) {
+ // Associate kernels in the set with an arbirary but reproducible order and
+ // annotate them with that order in metadata. This metadata is recognised by
+ // the backend and lowered to a SGPR which can be read from using
+ // amdgcn_lds_kernel_id.
+
+ std::vector<Function *> OrderedKernels;
+ if (!KernelsThatAllocateTableLDS.empty() ||
+ !KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
+
+ for (Function &Func : M->functions()) {
+ if (Func.isDeclaration())
+ continue;
+ if (!isKernelLDS(&Func))
+ continue;
+
+ if (KernelsThatAllocateTableLDS.contains(&Func) ||
+ KernelsThatIndirectlyAllocateDynamicLDS.contains(&Func)) {
+ assert(Func.hasName()); // else fatal error earlier
+ OrderedKernels.push_back(&Func);
+ }
+ }
+
+ // Put them in an arbitrary but reproducible order
+ OrderedKernels = sortByName(std::move(OrderedKernels));
+
+ // Annotate the kernels with their order in this vector
+ LLVMContext &Ctx = M->getContext();
+ IRBuilder<> Builder(Ctx);
+
+ if (OrderedKernels.size() > UINT32_MAX) {
+ // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
+ report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
+ }
+
+ for (size_t i = 0; i < OrderedKernels.size(); i++) {
+ Metadata *AttrMDArgs[1] = {
+ ConstantAsMetadata::get(Builder.getInt32(i)),
+ };
+ OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
+ MDNode::get(Ctx, AttrMDArgs));
}
}
+ return OrderedKernels;
+ }
- // Partition variables into the different strategies
- DenseSet<GlobalVariable *> ModuleScopeVariables;
- DenseSet<GlobalVariable *> TableLookupVariables;
- DenseSet<GlobalVariable *> KernelAccessVariables;
+ static void partitionVariablesIntoIndirectStrategies(
+ Module &M, LDSUsesInfoTy const &LDSUsesInfo,
+ VariableFunctionMap &LDSToKernelsThatNeedToAccessItIndirectly,
+ DenseSet<GlobalVariable *> &ModuleScopeVariables,
+ DenseSet<GlobalVariable *> &TableLookupVariables,
+ DenseSet<GlobalVariable *> &KernelAccessVariables,
+ DenseSet<GlobalVariable *> &DynamicVariables) {
- {
- GlobalVariable *HybridModuleRoot =
- LoweringKindLoc != LoweringKind::hybrid
- ? nullptr
- : chooseBestVariableForModuleStrategy(
- M.getDataLayout(),
- LDSToKernelsThatNeedToAccessItIndirectly);
+ GlobalVariable *HybridModuleRoot =
+ LoweringKindLoc != LoweringKind::hybrid
+ ? nullptr
+ : chooseBestVariableForModuleStrategy(
+ M.getDataLayout(), LDSToKernelsThatNeedToAccessItIndirectly);
- DenseSet<Function *> const EmptySet;
- DenseSet<Function *> const &HybridModuleRootKernels =
- HybridModuleRoot
- ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
- : EmptySet;
+ DenseSet<Function *> const EmptySet;
+ DenseSet<Function *> const &HybridModuleRootKernels =
+ HybridModuleRoot
+ ? LDSToKernelsThatNeedToAccessItIndirectly[HybridModuleRoot]
+ : EmptySet;
- for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
- // Each iteration of this loop assigns exactly one global variable to
- // exactly one of the implementation strategies.
+ for (auto &K : LDSToKernelsThatNeedToAccessItIndirectly) {
+ // Each iteration of this loop assigns exactly one global variable to
+ // exactly one of the implementation strategies.
- GlobalVariable *GV = K.first;
- assert(AMDGPU::isLDSVariableToLower(*GV));
- assert(K.second.size() != 0);
+ GlobalVariable *GV = K.first;
+ assert(AMDGPU::isLDSVariableToLower(*GV));
+ assert(K.second.size() != 0);
- switch (LoweringKindLoc) {
- case LoweringKind::module:
- ModuleScopeVariables.insert(GV);
- break;
+ if (AMDGPU::isDynamicLDS(*GV)) {
+ DynamicVariables.insert(GV);
+ continue;
+ }
- case LoweringKind::table:
- TableLookupVariables.insert(GV);
- break;
+ switch (LoweringKindLoc) {
+ case LoweringKind::module:
+ ModuleScopeVariables.insert(GV);
+ break;
- case LoweringKind::kernel:
- if (K.second.size() == 1) {
- KernelAccessVariables.insert(GV);
- } else {
- report_fatal_error(
- "cannot lower LDS '" + GV->getName() +
- "' to kernel access as it is reachable from multiple kernels");
- }
- break;
+ case LoweringKind::table:
+ TableLookupVariables.insert(GV);
+ break;
- case LoweringKind::hybrid: {
- if (GV == HybridModuleRoot) {
- assert(K.second.size() != 1);
- ModuleScopeVariables.insert(GV);
- } else if (K.second.size() == 1) {
- KernelAccessVariables.insert(GV);
- } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
- ModuleScopeVariables.insert(GV);
- } else {
- TableLookupVariables.insert(GV);
- }
- break;
+ case LoweringKind::kernel:
+ if (K.second.size() == 1) {
+ KernelAccessVariables.insert(GV);
+ } else {
+ report_fatal_error(
+ "cannot lower LDS '" + GV->getName() +
+ "' to kernel access as it is reachable from multiple kernels");
}
+ break;
+
+ case LoweringKind::hybrid: {
+ if (GV == HybridModuleRoot) {
+ assert(K.second.size() != 1);
+ ModuleScopeVariables.insert(GV);
+ } else if (K.second.size() == 1) {
+ KernelAccessVariables.insert(GV);
+ } else if (set_is_subset(K.second, HybridModuleRootKernels)) {
+ ModuleScopeVariables.insert(GV);
+ } else {
+ TableLookupVariables.insert(GV);
}
+ break;
+ }
}
+ }
- assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
- KernelAccessVariables.size() ==
- LDSToKernelsThatNeedToAccessItIndirectly.size());
- } // Variables have now been partitioned into the three lowering strategies.
+ // All LDS variables accessed indirectly have now been partitioned into
+ // the distinct lowering strategies.
+ assert(ModuleScopeVariables.size() + TableLookupVariables.size() +
+ KernelAccessVariables.size() + DynamicVariables.size() ==
+ LDSToKernelsThatNeedToAccessItIndirectly.size());
+ }
- // If the kernel accesses a variable that is going to be stored in the
- // module instance through a call then that kernel needs to allocate the
- // module instance
- DenseSet<Function *> KernelsThatAllocateModuleLDS =
- kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
- ModuleScopeVariables);
- DenseSet<Function *> KernelsThatAllocateTableLDS =
- kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
- TableLookupVariables);
+ static GlobalVariable *lowerModuleScopeStructVariables(
+ Module &M, DenseSet<GlobalVariable *> const &ModuleScopeVariables,
+ DenseSet<Function *> const &KernelsThatAllocateModuleLDS) {
+ // Create a struct to hold the ModuleScopeVariables
+ // Replace all uses of those variables from non-kernel functions with the
+ // new struct instance Replace only the uses from kernel functions that will
+ // allocate this instance. That is a space optimisation - kernels that use a
+ // subset of the module scope struct and do not need to allocate it for
+ // indirect calls will only allocate the subset they use (they do so as part
+ // of the per-kernel lowering).
+ if (ModuleScopeVariables.empty()) {
+ return nullptr;
+ }
- if (!ModuleScopeVariables.empty()) {
- LDSVariableReplacement ModuleScopeReplacement =
- createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
- ModuleScopeVariables);
+ LLVMContext &Ctx = M.getContext();
- appendToCompilerUsed(M,
- {static_cast<GlobalValue *>(
- ConstantExpr::getPointerBitCastOrAddrSpaceCast(
- cast<Constant>(ModuleScopeReplacement.SGV),
- Type::getInt8PtrTy(Ctx)))});
+ LDSVariableReplacement ModuleScopeReplacement =
+ createLDSVariableReplacement(M, "llvm.amdgcn.module.lds",
+ ModuleScopeVariables);
- // historic
- removeLocalVarsFromUsedLists(M, ModuleScopeVariables);
+ appendToCompilerUsed(M, {static_cast<GlobalValue *>(
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ cast<Constant>(ModuleScopeReplacement.SGV),
+ Type::getInt8PtrTy(Ctx)))});
- // Replace all uses of module scope variable from non-kernel functions
- replaceLDSVariablesWithStruct(
- M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
- Instruction *I = dyn_cast<Instruction>(U.getUser());
- if (!I) {
- return false;
- }
- Function *F = I->getFunction();
- return !isKernelLDS(F);
- });
+ // module.lds will be allocated at zero in any kernel that allocates it
+ recordLDSAbsoluteAddress(&M, ModuleScopeReplacement.SGV, 0);
- // Replace uses of module scope variable from kernel functions that
- // allocate the module scope variable, otherwise leave them unchanged
- // Record on each kernel whether the module scope global is used by it
+ // historic
+ removeLocalVarsFromUsedLists(M, ModuleScopeVariables);
- LLVMContext &Ctx = M.getContext();
- IRBuilder<> Builder(Ctx);
+ // Replace all uses of module scope variable from non-kernel functions
+ replaceLDSVariablesWithStruct(
+ M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
+ Instruction *I = dyn_cast<Instruction>(U.getUser());
+ if (!I) {
+ return false;
+ }
+ Function *F = I->getFunction();
+ return !isKernelLDS(F);
+ });
- for (Function &Func : M.functions()) {
- if (Func.isDeclaration() || !isKernelLDS(&Func))
- continue;
+ // Replace uses of module scope variable from kernel functions that
+ // allocate the module scope variable, otherwise leave them unchanged
+ // Record on each kernel whether the module scope global is used by it
- if (KernelsThatAllocateModuleLDS.contains(&Func)) {
- replaceLDSVariablesWithStruct(
- M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
- Instruction *I = dyn_cast<Instruction>(U.getUser());
- if (!I) {
- return false;
- }
- Function *F = I->getFunction();
- return F == &Func;
- });
+ for (Function &Func : M.functions()) {
+ if (Func.isDeclaration() || !isKernelLDS(&Func))
+ continue;
- markUsedByKernel(Builder, &Func, ModuleScopeReplacement.SGV);
+ if (KernelsThatAllocateModuleLDS.contains(&Func)) {
+ replaceLDSVariablesWithStruct(
+ M, ModuleScopeVariables, ModuleScopeReplacement, [&](Use &U) {
+ Instruction *I = dyn_cast<Instruction>(U.getUser());
+ if (!I) {
+ return false;
+ }
+ Function *F = I->getFunction();
+ return F == &Func;
+ });
- } else {
- Func.addFnAttr("amdgpu-elide-module-lds");
- }
+ markUsedByKernel(&Func, ModuleScopeReplacement.SGV);
}
}
- // Create a struct for each kernel for the non-module-scope variables
+ return ModuleScopeReplacement.SGV;
+ }
+
+ static DenseMap<Function *, LDSVariableReplacement>
+ lowerKernelScopeStructVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ DenseSet<GlobalVariable *> const &ModuleScopeVariables,
+ DenseSet<Function *> const &KernelsThatAllocateModuleLDS,
+ GlobalVariable *MaybeModuleScopeStruct) {
+
+ // Create a struct for each kernel for the non-module-scope variables.
+
DenseMap<Function *, LDSVariableReplacement> KernelToReplacement;
for (Function &Func : M.functions()) {
if (Func.isDeclaration() || !isKernelLDS(&Func))
continue;
DenseSet<GlobalVariable *> KernelUsedVariables;
+ // Allocating variables that are used directly in this struct to get
+ // alignment aware allocation and predictable frame size.
for (auto &v : LDSUsesInfo.direct_access[&Func]) {
- KernelUsedVariables.insert(v);
+ if (!AMDGPU::isDynamicLDS(*v)) {
+ KernelUsedVariables.insert(v);
+ }
}
+
+ // Allocating variables that are accessed indirectly so that a lookup of
+ // this struct instance can find them from nested functions.
for (auto &v : LDSUsesInfo.indirect_access[&Func]) {
- KernelUsedVariables.insert(v);
+ if (!AMDGPU::isDynamicLDS(*v)) {
+ KernelUsedVariables.insert(v);
+ }
}
// Variables allocated in module lds must all resolve to that struct,
@@ -836,7 +939,8 @@ public:
}
if (KernelUsedVariables.empty()) {
- // Either used no LDS, or all the LDS it used was also in module
+ // Either used no LDS, or the LDS it used was all in the module struct
+ // or dynamically sized
continue;
}
@@ -856,6 +960,14 @@ public:
auto Replacement =
createLDSVariableReplacement(M, VarName, KernelUsedVariables);
+ // If any indirect uses, create a direct use to ensure allocation
+ // TODO: Simpler to unconditionally mark used but that regresses
+ // codegen in test/CodeGen/AMDGPU/noclobber-barrier.ll
+ auto Accesses = LDSUsesInfo.indirect_access.find(&Func);
+ if ((Accesses != LDSUsesInfo.indirect_access.end()) &&
+ !Accesses->second.empty())
+ markUsedByKernel(&Func, Replacement.SGV);
+
// remove preserves existing codegen
removeLocalVarsFromUsedLists(M, KernelUsedVariables);
KernelToReplacement[&Func] = Replacement;
@@ -867,6 +979,169 @@ public:
return I && I->getFunction() == &Func;
});
}
+ return KernelToReplacement;
+ }
+
+ static GlobalVariable *
+ buildRepresentativeDynamicLDSInstance(Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ Function *func) {
+ // Create a dynamic lds variable with a name associated with the passed
+ // function that has the maximum alignment of any dynamic lds variable
+ // reachable from this kernel. Dynamic LDS is allocated after the static LDS
+ // allocation, possibly after alignment padding. The representative variable
+ // created here has the maximum alignment of any other dynamic variable
+ // reachable by that kernel. All dynamic LDS variables are allocated at the
+ // same address in each kernel in order to provide the documented aliasing
+ // semantics. Setting the alignment here allows this IR pass to accurately
+ // predict the exact constant at which it will be allocated.
+
+ assert(isKernelLDS(func));
+
+ LLVMContext &Ctx = M.getContext();
+ const DataLayout &DL = M.getDataLayout();
+ Align MaxDynamicAlignment(1);
+
+ auto UpdateMaxAlignment = [&MaxDynamicAlignment, &DL](GlobalVariable *GV) {
+ if (AMDGPU::isDynamicLDS(*GV)) {
+ MaxDynamicAlignment =
+ std::max(MaxDynamicAlignment, AMDGPU::getAlign(DL, GV));
+ }
+ };
+
+ for (GlobalVariable *GV : LDSUsesInfo.indirect_access[func]) {
+ UpdateMaxAlignment(GV);
+ }
+
+ for (GlobalVariable *GV : LDSUsesInfo.direct_access[func]) {
+ UpdateMaxAlignment(GV);
+ }
+
+ assert(func->hasName()); // Checked by caller
+ auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+ GlobalVariable *N = new GlobalVariable(
+ M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
+ Twine("llvm.amdgcn." + func->getName() + ".dynlds"), nullptr, GlobalValue::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS,
+ false);
+ N->setAlignment(MaxDynamicAlignment);
+
+ assert(AMDGPU::isDynamicLDS(*N));
+ return N;
+ }
+
+ DenseMap<Function *, GlobalVariable *> lowerDynamicLDSVariables(
+ Module &M, LDSUsesInfoTy &LDSUsesInfo,
+ DenseSet<Function *> const &KernelsThatIndirectlyAllocateDynamicLDS,
+ DenseSet<GlobalVariable *> const &DynamicVariables,
+ std::vector<Function *> const &OrderedKernels) {
+ DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS;
+ if (!KernelsThatIndirectlyAllocateDynamicLDS.empty()) {
+ LLVMContext &Ctx = M.getContext();
+ IRBuilder<> Builder(Ctx);
+ Type *I32 = Type::getInt32Ty(Ctx);
+
+ std::vector<Constant *> newDynamicLDS;
+
+ // Table is built in the same order as OrderedKernels
+ for (auto &func : OrderedKernels) {
+
+ if (KernelsThatIndirectlyAllocateDynamicLDS.contains(func)) {
+ assert(isKernelLDS(func));
+ if (!func->hasName()) {
+ report_fatal_error("Anonymous kernels cannot use LDS variables");
+ }
+
+ GlobalVariable *N =
+ buildRepresentativeDynamicLDSInstance(M, LDSUsesInfo, func);
+
+ KernelToCreatedDynamicLDS[func] = N;
+
+ markUsedByKernel(func, N);
+
+ auto emptyCharArray = ArrayType::get(Type::getInt8Ty(Ctx), 0);
+ auto GEP = ConstantExpr::getGetElementPtr(
+ emptyCharArray, N, ConstantInt::get(I32, 0), true);
+ newDynamicLDS.push_back(ConstantExpr::getPtrToInt(GEP, I32));
+ } else {
+ newDynamicLDS.push_back(PoisonValue::get(I32));
+ }
+ }
+ assert(OrderedKernels.size() == newDynamicLDS.size());
+
+ ArrayType *t = ArrayType::get(I32, newDynamicLDS.size());
+ Constant *init = ConstantArray::get(t, newDynamicLDS);
+ GlobalVariable *table = new GlobalVariable(
+ M, t, true, GlobalValue::InternalLinkage, init,
+ "llvm.amdgcn.dynlds.offset.table", nullptr,
+ GlobalValue::NotThreadLocal, AMDGPUAS::CONSTANT_ADDRESS);
+
+ for (GlobalVariable *GV : DynamicVariables) {
+ for (Use &U : make_early_inc_range(GV->uses())) {
+ auto *I = dyn_cast<Instruction>(U.getUser());
+ if (!I)
+ continue;
+ if (isKernelLDS(I->getFunction()))
+ continue;
+
+ replaceUseWithTableLookup(M, Builder, table, GV, U, nullptr);
+ }
+ }
+ }
+ return KernelToCreatedDynamicLDS;
+ }
+
+ bool runOnModule(Module &M) override {
+ CallGraph CG = CallGraph(M);
+ bool Changed = superAlignLDSGlobals(M);
+
+ Changed |= eliminateConstantExprUsesOfLDSFromAllInstructions(M);
+
+ Changed = true; // todo: narrow this down
+
+ // For each kernel, what variables does it access directly or through
+ // callees
+ LDSUsesInfoTy LDSUsesInfo = getTransitiveUsesOfLDS(CG, M);
+
+ // For each variable accessed through callees, which kernels access it
+ VariableFunctionMap LDSToKernelsThatNeedToAccessItIndirectly;
+ for (auto &K : LDSUsesInfo.indirect_access) {
+ Function *F = K.first;
+ assert(isKernelLDS(F));
+ for (GlobalVariable *GV : K.second) {
+ LDSToKernelsThatNeedToAccessItIndirectly[GV].insert(F);
+ }
+ }
+
+ // Partition variables accessed indirectly into the different strategies
+ DenseSet<GlobalVariable *> ModuleScopeVariables;
+ DenseSet<GlobalVariable *> TableLookupVariables;
+ DenseSet<GlobalVariable *> KernelAccessVariables;
+ DenseSet<GlobalVariable *> DynamicVariables;
+ partitionVariablesIntoIndirectStrategies(
+ M, LDSUsesInfo, LDSToKernelsThatNeedToAccessItIndirectly,
+ ModuleScopeVariables, TableLookupVariables, KernelAccessVariables,
+ DynamicVariables);
+
+ // If the kernel accesses a variable that is going to be stored in the
+ // module instance through a call then that kernel needs to allocate the
+ // module instance
+ const DenseSet<Function *> KernelsThatAllocateModuleLDS =
+ kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+ ModuleScopeVariables);
+ const DenseSet<Function *> KernelsThatAllocateTableLDS =
+ kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+ TableLookupVariables);
+
+ const DenseSet<Function *> KernelsThatIndirectlyAllocateDynamicLDS =
+ kernelsThatIndirectlyAccessAnyOfPassedVariables(M, LDSUsesInfo,
+ DynamicVariables);
+
+ GlobalVariable *MaybeModuleScopeStruct = lowerModuleScopeStructVariables(
+ M, ModuleScopeVariables, KernelsThatAllocateModuleLDS);
+
+ DenseMap<Function *, LDSVariableReplacement> KernelToReplacement =
+ lowerKernelScopeStructVariables(M, LDSUsesInfo, ModuleScopeVariables,
+ KernelsThatAllocateModuleLDS,
+ MaybeModuleScopeStruct);
// Lower zero cost accesses to the kernel instances just created
for (auto &GV : KernelAccessVariables) {
@@ -879,72 +1154,113 @@ public:
Vec.insert(GV);
replaceLDSVariablesWithStruct(M, Vec, Replacement, [](Use &U) {
- return isa<Instruction>(U.getUser());
+ return isa<Instruction>(U.getUser());
});
}
+ // The ith element of this vector is kernel id i
+ std::vector<Function *> OrderedKernels =
+ assignLDSKernelIDToEachKernel(&M, KernelsThatAllocateTableLDS,
+ KernelsThatIndirectlyAllocateDynamicLDS);
+
if (!KernelsThatAllocateTableLDS.empty()) {
- // Collect the kernels that allocate table lookup LDS
- std::vector<Function *> OrderedKernels;
- {
- for (Function &Func : M.functions()) {
- if (Func.isDeclaration())
- continue;
- if (!isKernelLDS(&Func))
- continue;
+ LLVMContext &Ctx = M.getContext();
+ IRBuilder<> Builder(Ctx);
- if (KernelsThatAllocateTableLDS.contains(&Func)) {
- assert(Func.hasName()); // else fatal error earlier
- OrderedKernels.push_back(&Func);
- }
- }
+ // The order must be consistent between lookup table and accesses to
+ // lookup table
+ auto TableLookupVariablesOrdered =
+ sortByName(std::vector<GlobalVariable *>(TableLookupVariables.begin(),
+ TableLookupVariables.end()));
+
+ GlobalVariable *LookupTable = buildLookupTable(
+ M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
+ replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
+ LookupTable);
+ }
- // Put them in an arbitrary but reproducible order
- llvm::sort(OrderedKernels.begin(), OrderedKernels.end(),
- [](const Function *lhs, const Function *rhs) -> bool {
- return lhs->getName() < rhs->getName();
- });
+ DenseMap<Function *, GlobalVariable *> KernelToCreatedDynamicLDS =
+ lowerDynamicLDSVariables(M, LDSUsesInfo,
+ KernelsThatIndirectlyAllocateDynamicLDS,
+ DynamicVariables, OrderedKernels);
+
+ // All kernel frames have been allocated. Calculate and record the
+ // addresses.
+ {
+ const DataLayout &DL = M.getDataLayout();
+
+ for (Function &Func : M.functions()) {
+ if (Func.isDeclaration() || !isKernelLDS(&Func))
+ continue;
- // Annotate the kernels with their order in this vector
- LLVMContext &Ctx = M.getContext();
- IRBuilder<> Builder(Ctx);
+ // All three of these are optional. The first variable is allocated at
+ // zero. They are allocated by AMDGPUMachineFunction as one block.
+ // Layout:
+ //{
+ // module.lds
+ // alignment padding
+ // kernel instance
+ // alignment padding
+ // dynamic lds variables
+ //}
- if (OrderedKernels.size() > UINT32_MAX) {
- // 32 bit keeps it in one SGPR. > 2**32 kernels won't fit on the GPU
- report_fatal_error("Unimplemented LDS lowering for > 2**32 kernels");
+ const bool AllocateModuleScopeStruct =
+ MaybeModuleScopeStruct &&
+ KernelsThatAllocateModuleLDS.contains(&Func);
+
+ auto Replacement = KernelToReplacement.find(&Func);
+ const bool AllocateKernelScopeStruct =
+ Replacement != KernelToReplacement.end();
+
+ const bool AllocateDynamicVariable =
+ KernelToCreatedDynamicLDS.contains(&Func);
+
+ uint32_t Offset = 0;
+
+ if (AllocateModuleScopeStruct) {
+ // Allocated at zero, recorded once on construction, not once per
+ // kernel
+ Offset += DL.getTypeAllocSize(MaybeModuleScopeStruct->getValueType());
}
- for (size_t i = 0; i < OrderedKernels.size(); i++) {
- Metadata *AttrMDArgs[1] = {
- ConstantAsMetadata::get(Builder.getInt32(i)),
- };
- OrderedKernels[i]->setMetadata("llvm.amdgcn.lds.kernel.id",
- MDNode::get(Ctx, AttrMDArgs));
+ if (AllocateKernelScopeStruct) {
+ GlobalVariable *KernelStruct = Replacement->second.SGV;
+ Offset = alignTo(Offset, AMDGPU::getAlign(DL, KernelStruct));
+ recordLDSAbsoluteAddress(&M, KernelStruct, Offset);
+ Offset += DL.getTypeAllocSize(KernelStruct->getValueType());
+ }
- markUsedByKernel(Builder, OrderedKernels[i],
- KernelToReplacement[OrderedKernels[i]].SGV);
+ // If there is dynamic allocation, the alignment needed is included in
+ // the static frame size. There may be no reference to the dynamic
+ // variable in the kernel itself, so without including it here, that
+ // alignment padding could be missed.
+ if (AllocateDynamicVariable) {
+ GlobalVariable *DynamicVariable = KernelToCreatedDynamicLDS[&Func];
+ Offset = alignTo(Offset, AMDGPU::getAlign(DL, DynamicVariable));
+ recordLDSAbsoluteAddress(&M, DynamicVariable, Offset);
}
- }
- // The order must be consistent between lookup table and accesses to
- // lookup table
- std::vector<GlobalVariable *> TableLookupVariablesOrdered(
- TableLookupVariables.begin(), TableLookupVariables.end());
- llvm::sort(TableLookupVariablesOrdered.begin(),
- TableLookupVariablesOrdered.end(),
- [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
- return lhs->getName() < rhs->getName();
- });
+ if (Offset != 0) {
+ std::string Buffer;
+ raw_string_ostream SS{Buffer};
+ SS << format("%u", Offset);
- GlobalVariable *LookupTable = buildLookupTable(
- M, TableLookupVariablesOrdered, OrderedKernels, KernelToReplacement);
- replaceUsesInInstructionsWithTableLookup(M, TableLookupVariablesOrdered,
- LookupTable);
+ // Instead of explictly marking kernels that access dynamic variables
+ // using special case metadata, annotate with min-lds == max-lds, i.e.
+ // that there is no more space available for allocating more static
+ // LDS variables. That is the right condition to prevent allocating
+ // more variables which would collide with the addresses assigned to
+ // dynamic variables.
+ if (AllocateDynamicVariable)
+ SS << format(",%u", Offset);
+
+ Func.addFnAttr("amdgpu-lds-size", Buffer);
+ }
+ }
}
for (auto &GV : make_early_inc_range(M.globals()))
if (AMDGPU::isLDSVariableToLower(GV)) {
-
// probably want to remove from used lists
GV.removeDeadConstantUsers();
if (GV.use_empty())
@@ -1017,12 +1333,9 @@ private:
// The order of fields in this struct depends on the order of
// varables in the argument which varies when changing how they
// are identified, leading to spurious test breakage.
- std::vector<GlobalVariable *> Sorted(LDSVarsToTransform.begin(),
- LDSVarsToTransform.end());
- llvm::sort(Sorted.begin(), Sorted.end(),
- [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
- return lhs->getName() < rhs->getName();
- });
+ auto Sorted = sortByName(std::vector<GlobalVariable *>(
+ LDSVarsToTransform.begin(), LDSVarsToTransform.end()));
+
for (GlobalVariable *GV : Sorted) {
OptimizedStructLayoutField F(GV,
DL.getTypeAllocSize(GV->getValueType()),
@@ -1101,21 +1414,17 @@ private:
}
template <typename PredicateTy>
- void replaceLDSVariablesWithStruct(
+ static void replaceLDSVariablesWithStruct(
Module &M, DenseSet<GlobalVariable *> const &LDSVarsToTransformArg,
- LDSVariableReplacement Replacement, PredicateTy Predicate) {
+ const LDSVariableReplacement &Replacement, PredicateTy Predicate) {
LLVMContext &Ctx = M.getContext();
const DataLayout &DL = M.getDataLayout();
// A hack... we need to insert the aliasing info in a predictable order for
// lit tests. Would like to have them in a stable order already, ideally the
// same order they get allocated, which might mean an ordered set container
- std::vector<GlobalVariable *> LDSVarsToTransform(
- LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end());
- llvm::sort(LDSVarsToTransform.begin(), LDSVarsToTransform.end(),
- [](const GlobalVariable *lhs, const GlobalVariable *rhs) {
- return lhs->getName() < rhs->getName();
- });
+ auto LDSVarsToTransform = sortByName(std::vector<GlobalVariable *>(
+ LDSVarsToTransformArg.begin(), LDSVarsToTransformArg.end()));
// Create alias.scope and their lists. Each field in the new structure
// does not alias with all other fields.
@@ -1137,7 +1446,7 @@ private:
// field of the instance that will be allocated by AMDGPUMachineFunction
for (size_t I = 0; I < NumberVars; I++) {
GlobalVariable *GV = LDSVarsToTransform[I];
- Constant *GEP = Replacement.LDSVarsToConstantGEP[GV];
+ Constant *GEP = Replacement.LDSVarsToConstantGEP.at(GV);
GV->replaceUsesWithIf(GEP, Predicate);
@@ -1159,9 +1468,9 @@ private:
}
}
- void refineUsesAlignmentAndAA(Value *Ptr, Align A, const DataLayout &DL,
- MDNode *AliasScope, MDNode *NoAlias,
- unsigned MaxDepth = 5) {
+ static void refineUsesAlignmentAndAA(Value *Ptr, Align A,
+ const DataLayout &DL, MDNode *AliasScope,
+ MDNode *NoAlias, unsigned MaxDepth = 5) {
if (!MaxDepth || (A == 1 && !AliasScope))
return;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index d88a2cd961b2..c24d39b9e5fd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -13,6 +13,7 @@
//
#include "AMDGPUMCInstLower.h"
+#include "AMDGPU.h"
#include "AMDGPUAsmPrinter.h"
#include "AMDGPUMachineFunction.h"
#include "AMDGPUTargetMachine.h"
@@ -133,7 +134,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
OutMI.addOperand(Dest);
OutMI.addOperand(Src);
return;
- } else if (Opcode == AMDGPU::SI_TCRETURN) {
+ } else if (Opcode == AMDGPU::SI_TCRETURN ||
+ Opcode == AMDGPU::SI_TCRETURN_GFX) {
// TODO: How to use branch immediate and avoid register+add?
Opcode = AMDGPU::S_SETPC_B64;
}
@@ -168,12 +170,11 @@ bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
// Intercept LDS variables with known addresses
- if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(CV)) {
- if (AMDGPUMachineFunction::isKnownAddressLDSGlobal(*GV)) {
- unsigned offset =
- AMDGPUMachineFunction::calculateKnownAddressOfLDSGlobal(*GV);
- Constant *C = ConstantInt::get(CV->getContext(), APInt(32, offset));
- return AsmPrinter::lowerConstant(C);
+ if (const GlobalVariable *GV = dyn_cast<const GlobalVariable>(CV)) {
+ if (std::optional<uint32_t> Address =
+ AMDGPUMachineFunction::getLDSAbsoluteAddress(*GV)) {
+ auto *IntTy = Type::getInt32Ty(CV->getContext());
+ return AsmPrinter::lowerConstant(ConstantInt::get(IntTy, *Address));
}
}
@@ -285,11 +286,10 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
(!STI.hasOffset3fBug() || !MI->isBranch())) {
SmallVector<MCFixup, 4> Fixups;
SmallVector<char, 16> CodeBytes;
- raw_svector_ostream CodeStream(CodeBytes);
- std::unique_ptr<MCCodeEmitter> InstEmitter(createSIMCCodeEmitter(
+ std::unique_ptr<MCCodeEmitter> InstEmitter(createAMDGPUMCCodeEmitter(
*STI.getInstrInfo(), OutContext));
- InstEmitter->encodeInstruction(TmpInst, CodeStream, Fixups, STI);
+ InstEmitter->encodeInstruction(TmpInst, CodeBytes, Fixups, STI);
assert(CodeBytes.size() == STI.getInstrInfo()->getInstSizeInBytes(*MI));
}
@@ -308,10 +308,9 @@ void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
// Disassemble instruction/operands to hex representation.
SmallVector<MCFixup, 4> Fixups;
SmallVector<char, 16> CodeBytes;
- raw_svector_ostream CodeStream(CodeBytes);
DumpCodeInstEmitter->encodeInstruction(
- TmpInst, CodeStream, Fixups, MF->getSubtarget<MCSubtargetInfo>());
+ TmpInst, CodeBytes, Fixups, MF->getSubtarget<MCSubtargetInfo>());
HexLines.resize(HexLines.size() + 1);
std::string &HexLine = HexLines.back();
raw_string_ostream HexStream(HexLine);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index c96fab08a267..d90fcac87540 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -2600,9 +2600,6 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
LLVM_DEBUG(dbgs() << "CurrentRegion: \n");
LLVM_DEBUG(LRegion->print(dbgs(), TRI));
- auto CNI = CI;
- ++CNI;
-
MRT *Child = (*CI);
if (Child->isRegion()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index a6a32b98f44c..44bbfe6f13d9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -10,8 +10,11 @@
#include "AMDGPU.h"
#include "AMDGPUPerfHintAnalysis.h"
#include "AMDGPUSubtarget.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -41,6 +44,18 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const Function &F,
// Assume the attribute allocates before any known GDS globals.
StaticGDSSize = GDSSize;
+ // Second value, if present, is the maximum value that can be assigned.
+ // Useful in PromoteAlloca or for LDS spills. Could be used for diagnostics
+ // during codegen.
+ std::pair<unsigned, unsigned> LDSSizeRange = AMDGPU::getIntegerPairAttribute(
+ F, "amdgpu-lds-size", {0, UINT32_MAX}, true);
+
+ // The two separate variables are only profitable when the LDS module lowering
+ // pass is disabled. If graphics does not use dynamic LDS, this is never
+ // profitable. Leaving cleanup for a later change.
+ LDSSize = LDSSizeRange.first;
+ StaticLDSSize = LDSSize;
+
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL)
ExplicitKernArgSize = ST.getExplicitKernArgSize(F, MaxKernArgAlign);
@@ -63,6 +78,42 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
unsigned Offset;
if (GV.getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+
+ std::optional<uint32_t> MaybeAbs = getLDSAbsoluteAddress(GV);
+ if (MaybeAbs) {
+ // Absolute address LDS variables that exist prior to the LDS lowering
+ // pass raise a fatal error in that pass. These failure modes are only
+ // reachable if that lowering pass is disabled or broken. If/when adding
+ // support for absolute addresses on user specified variables, the
+ // alignment check moves to the lowering pass and the frame calculation
+ // needs to take the user variables into consideration.
+
+ uint32_t ObjectStart = *MaybeAbs;
+
+ if (ObjectStart != alignTo(ObjectStart, Alignment)) {
+ report_fatal_error("Absolute address LDS variable inconsistent with "
+ "variable alignment");
+ }
+
+ if (isModuleEntryFunction()) {
+ // If this is a module entry function, we can also sanity check against
+ // the static frame. Strictly it would be better to check against the
+ // attribute, i.e. that the variable is within the always-allocated
+ // section, and not within some other non-absolute-address object
+ // allocated here, but the extra error detection is minimal and we would
+ // have to pass the Function around or cache the attribute value.
+ uint32_t ObjectEnd =
+ ObjectStart + DL.getTypeAllocSize(GV.getValueType());
+ if (ObjectEnd > StaticLDSSize) {
+ report_fatal_error(
+ "Absolute address LDS variable outside of static frame");
+ }
+ }
+
+ Entry.first->second = ObjectStart;
+ return ObjectStart;
+ }
+
/// TODO: We should sort these to minimize wasted space due to alignment
/// padding. Currently the padding is decided by the first encountered use
/// during lowering.
@@ -87,135 +138,54 @@ unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
return Offset;
}
-static constexpr StringLiteral ModuleLDSName = "llvm.amdgcn.module.lds";
-
-bool AMDGPUMachineFunction::isKnownAddressLDSGlobal(const GlobalVariable &GV) {
- auto name = GV.getName();
- return (name == ModuleLDSName) ||
- (name.startswith("llvm.amdgcn.kernel.") && name.endswith(".lds"));
-}
-
-const Function *AMDGPUMachineFunction::getKernelLDSFunctionFromGlobal(
- const GlobalVariable &GV) {
- const Module &M = *GV.getParent();
- StringRef N(GV.getName());
- if (N.consume_front("llvm.amdgcn.kernel.") && N.consume_back(".lds")) {
- return M.getFunction(N);
- }
- return nullptr;
-}
-
-const GlobalVariable *
-AMDGPUMachineFunction::getKernelLDSGlobalFromFunction(const Function &F) {
+static const GlobalVariable *
+getKernelDynLDSGlobalFromFunction(const Function &F) {
const Module *M = F.getParent();
- std::string KernelLDSName = "llvm.amdgcn.kernel.";
- KernelLDSName += F.getName();
- KernelLDSName += ".lds";
- return M->getNamedGlobal(KernelLDSName);
+ std::string KernelDynLDSName = "llvm.amdgcn.";
+ KernelDynLDSName += F.getName();
+ KernelDynLDSName += ".dynlds";
+ return M->getNamedGlobal(KernelDynLDSName);
}
-// This kernel calls no functions that require the module lds struct
-static bool canElideModuleLDS(const Function &F) {
- return F.hasFnAttribute("amdgpu-elide-module-lds");
-}
-
-unsigned AMDGPUMachineFunction::calculateKnownAddressOfLDSGlobal(
- const GlobalVariable &GV) {
- // module.lds, then alignment padding, then kernel.lds, then other variables
- // if any
-
- assert(isKnownAddressLDSGlobal(GV));
- unsigned Offset = 0;
-
- if (GV.getName() == ModuleLDSName) {
- return 0;
- }
-
- const Module *M = GV.getParent();
- const DataLayout &DL = M->getDataLayout();
-
- const GlobalVariable *GVM = M->getNamedGlobal(ModuleLDSName);
- const Function *f = getKernelLDSFunctionFromGlobal(GV);
-
- // Account for module.lds if allocated for this function
- if (GVM && f && !canElideModuleLDS(*f)) {
- // allocator aligns this to var align, but it's zero to begin with
- Offset += DL.getTypeAllocSize(GVM->getValueType());
+std::optional<uint32_t>
+AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
+ // TODO: Would be more consistent with the abs symbols to use a range
+ MDNode *MD = F.getMetadata("llvm.amdgcn.lds.kernel.id");
+ if (MD && MD->getNumOperands() == 1) {
+ if (ConstantInt *KnownSize =
+ mdconst::extract<ConstantInt>(MD->getOperand(0))) {
+ uint64_t ZExt = KnownSize->getZExtValue();
+ if (ZExt <= UINT32_MAX) {
+ return ZExt;
+ }
+ }
}
-
- // No dynamic LDS alignment done by allocateModuleLDSGlobal
- Offset = alignTo(
- Offset, DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType()));
-
- return Offset;
+ return {};
}
-void AMDGPUMachineFunction::allocateKnownAddressLDSGlobal(const Function &F) {
- const Module *M = F.getParent();
-
- // This function is called before allocating any other LDS so that it can
- // reliably put values at known addresses. Consequently, dynamic LDS, if
- // present, will not yet have been allocated
-
- assert(getDynLDSAlign() == Align() && "dynamic LDS not yet allocated");
-
- if (isModuleEntryFunction()) {
-
- // Pointer values start from zero, memory allocated per-kernel-launch
- // Variables can be grouped into a module level struct and a struct per
- // kernel function by AMDGPULowerModuleLDSPass. If that is done, they
- // are allocated at statically computable addresses here.
- //
- // Address 0
- // {
- // llvm.amdgcn.module.lds
- // }
- // alignment padding
- // {
- // llvm.amdgcn.kernel.some-name.lds
- // }
- // other variables, e.g. dynamic lds, allocated after this call
+std::optional<uint32_t>
+AMDGPUMachineFunction::getLDSAbsoluteAddress(const GlobalValue &GV) {
+ if (GV.getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ return {};
- const GlobalVariable *GV = M->getNamedGlobal(ModuleLDSName);
- const GlobalVariable *KV = getKernelLDSGlobalFromFunction(F);
+ std::optional<ConstantRange> AbsSymRange = GV.getAbsoluteSymbolRange();
+ if (!AbsSymRange)
+ return {};
- if (GV && !canElideModuleLDS(F)) {
- assert(isKnownAddressLDSGlobal(*GV));
- unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *GV, Align());
- (void)Offset;
- assert(Offset == calculateKnownAddressOfLDSGlobal(*GV) &&
- "Module LDS expected to be allocated before other LDS");
- }
-
- if (KV) {
- // The per-kernel offset is deterministic because it is allocated
- // before any other non-module LDS variables.
- assert(isKnownAddressLDSGlobal(*KV));
- unsigned Offset = allocateLDSGlobal(M->getDataLayout(), *KV, Align());
- (void)Offset;
- assert(Offset == calculateKnownAddressOfLDSGlobal(*KV) &&
- "Kernel LDS expected to be immediately after module LDS");
+ if (const APInt *V = AbsSymRange->getSingleElement()) {
+ std::optional<uint64_t> ZExt = V->tryZExtValue();
+ if (ZExt && (*ZExt <= UINT32_MAX)) {
+ return *ZExt;
}
}
-}
-std::optional<uint32_t>
-AMDGPUMachineFunction::getLDSKernelIdMetadata(const Function &F) {
- auto MD = F.getMetadata("llvm.amdgcn.lds.kernel.id");
- if (MD && MD->getNumOperands() == 1) {
- ConstantInt *KnownSize = mdconst::extract<ConstantInt>(MD->getOperand(0));
- if (KnownSize) {
- uint64_t V = KnownSize->getZExtValue();
- if (V <= UINT32_MAX) {
- return V;
- }
- }
- }
return {};
}
-void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
+void AMDGPUMachineFunction::setDynLDSAlign(const Function &F,
const GlobalVariable &GV) {
+ const Module *M = F.getParent();
+ const DataLayout &DL = M->getDataLayout();
assert(DL.getTypeAllocSize(GV.getValueType()).isZero());
Align Alignment =
@@ -225,4 +195,17 @@ void AMDGPUMachineFunction::setDynLDSAlign(const DataLayout &DL,
LDSSize = alignTo(StaticLDSSize, Alignment);
DynLDSAlign = Alignment;
+
+ // If there is a dynamic LDS variable associated with this function F, every
+ // further dynamic LDS instance (allocated by calling setDynLDSAlign) must
+ // map to the same address. This holds because no LDS is allocated after the
+ // lowering pass if there are dynamic LDS variables present.
+ const GlobalVariable *Dyn = getKernelDynLDSGlobalFromFunction(F);
+ if (Dyn) {
+ unsigned Offset = LDSSize; // return this?
+ std::optional<uint32_t> Expect = getLDSAbsoluteAddress(*Dyn);
+ if (!Expect || (Offset != *Expect)) {
+ report_fatal_error("Inconsistent metadata on dynamic LDS variable");
+ }
+ }
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index f27f8252a4d8..5780fa64a7e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -104,26 +104,12 @@ public:
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV,
Align Trailing);
- void allocateKnownAddressLDSGlobal(const Function &F);
-
- // A kernel function may have an associated LDS allocation, and a kernel-scope
- // LDS allocation must have an associated kernel function
-
- // LDS allocation should have an associated kernel function
- static const Function *
- getKernelLDSFunctionFromGlobal(const GlobalVariable &GV);
- static const GlobalVariable *
- getKernelLDSGlobalFromFunction(const Function &F);
-
- // Module or kernel scope LDS variable
- static bool isKnownAddressLDSGlobal(const GlobalVariable &GV);
- static unsigned calculateKnownAddressOfLDSGlobal(const GlobalVariable &GV);
-
static std::optional<uint32_t> getLDSKernelIdMetadata(const Function &F);
+ static std::optional<uint32_t> getLDSAbsoluteAddress(const GlobalValue &GV);
Align getDynLDSAlign() const { return DynLDSAlign; }
- void setDynLDSAlign(const DataLayout &DL, const GlobalVariable &GV);
+ void setDynLDSAlign(const Function &F, const GlobalVariable &GV);
};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index 98c5c96cd4b2..2092707c8a3f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -72,31 +72,6 @@ ModulePass* llvm::createAMDGPUOpenCLEnqueuedBlockLoweringPass() {
return new AMDGPUOpenCLEnqueuedBlockLowering();
}
-/// Collect direct or indirect callers of \p F and save them
-/// to \p Callers.
-static void collectCallers(Function *F, DenseSet<Function *> &Callers) {
- for (auto *U : F->users()) {
- if (auto *CI = dyn_cast<CallInst>(&*U)) {
- auto *Caller = CI->getParent()->getParent();
- if (Callers.insert(Caller).second)
- collectCallers(Caller, Callers);
- }
- }
-}
-
-/// If \p U is instruction or constant, collect functions which directly or
-/// indirectly use it.
-static void collectFunctionUsers(User *U, DenseSet<Function *> &Funcs) {
- if (auto *I = dyn_cast<Instruction>(U)) {
- auto *F = I->getParent()->getParent();
- if (Funcs.insert(F).second)
- collectCallers(F, Funcs);
- return;
- }
- for (User *U : U->users())
- collectFunctionUsers(U, Funcs);
-}
-
bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
DenseSet<Function *> Callers;
auto &C = M.getContext();
@@ -131,9 +106,6 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
/*isExternallyInitialized=*/true);
LLVM_DEBUG(dbgs() << "runtime handle created: " << *GV << '\n');
- for (User *U : F.users())
- collectFunctionUsers(U, Callers);
-
F.replaceAllUsesWith(ConstantExpr::getAddrSpaceCast(GV, F.getType()));
F.addFnAttr("runtime-handle", RuntimeHandle);
F.setLinkage(GlobalValue::ExternalLinkage);
@@ -141,15 +113,5 @@ bool AMDGPUOpenCLEnqueuedBlockLowering::runOnModule(Module &M) {
}
}
- // FIXME: This call graph analysis is broken and should be
- // removed. AMDGPUAttributor infers the individual implicit argument fields
- // are needed or not, but the runtime crashes in cases where we fail to
- // optimize these out at -O0.
- for (auto *F : Callers) {
- if (F->getCallingConv() != CallingConv::AMDGPU_KERNEL)
- continue;
- F->addFnAttr("calls-enqueue-kernel");
- LLVM_DEBUG(dbgs() << "mark enqueue_kernel caller:" << F->getName() << '\n');
- }
return Changed;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index 9c04df0b3683..536fb02cb4ec 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -1,4 +1,4 @@
-//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp ---------------===//
+//=== lib/CodeGen/GlobalISel/AMDGPUPostLegalizerCombiner.cpp --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -19,6 +19,8 @@
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -26,22 +28,41 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"
+#define GET_GICOMBINER_DEPS
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_DEPS
+
#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
using namespace llvm;
using namespace MIPatternMatch;
-class AMDGPUPostLegalizerCombinerHelper {
+namespace {
+#define GET_GICOMBINER_TYPES
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_TYPES
+
+class AMDGPUPostLegalizerCombinerImpl : public GIMatchTableExecutor {
protected:
+ const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig;
+
MachineIRBuilder &B;
MachineFunction &MF;
MachineRegisterInfo &MRI;
+ const GCNSubtarget &STI;
+ const SIInstrInfo &TII;
AMDGPUCombinerHelper &Helper;
+ GISelChangeObserver &Observer;
public:
- AMDGPUPostLegalizerCombinerHelper(MachineIRBuilder &B,
- AMDGPUCombinerHelper &Helper)
- : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
+ AMDGPUPostLegalizerCombinerImpl(
+ const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
+ MachineIRBuilder &B, AMDGPUCombinerHelper &Helper,
+ GISelChangeObserver &Observer);
+
+ static const char *getName() { return "AMDGPUPostLegalizerCombinerImpl"; }
+
+ bool tryCombineAll(MachineInstr &I) const;
struct FMinFMaxLegacyInfo {
Register LHS;
@@ -52,15 +73,16 @@ public:
};
// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
- bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info);
+ bool matchFMinFMaxLegacy(MachineInstr &MI, FMinFMaxLegacyInfo &Info) const;
void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
- const FMinFMaxLegacyInfo &Info);
+ const FMinFMaxLegacyInfo &Info) const;
- bool matchUCharToFloat(MachineInstr &MI);
- void applyUCharToFloat(MachineInstr &MI);
+ bool matchUCharToFloat(MachineInstr &MI) const;
+ void applyUCharToFloat(MachineInstr &MI) const;
- bool matchRcpSqrtToRsq(MachineInstr &MI,
- std::function<void(MachineIRBuilder &)> &MatchInfo);
+ bool
+ matchRcpSqrtToRsq(MachineInstr &MI,
+ std::function<void(MachineIRBuilder &)> &MatchInfo) const;
// FIXME: Should be able to have 2 separate matchdatas rather than custom
// struct boilerplate.
@@ -69,15 +91,49 @@ public:
unsigned ShiftOffset;
};
- bool matchCvtF32UByteN(MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo);
+ bool matchCvtF32UByteN(MachineInstr &MI,
+ CvtF32UByteMatchInfo &MatchInfo) const;
void applyCvtF32UByteN(MachineInstr &MI,
- const CvtF32UByteMatchInfo &MatchInfo);
+ const CvtF32UByteMatchInfo &MatchInfo) const;
+
+ bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg) const;
- bool matchRemoveFcanonicalize(MachineInstr &MI, Register &Reg);
+ // Combine unsigned buffer load and signed extension instructions to generate
+ // signed buffer load instructions.
+ bool matchCombineSignExtendInReg(MachineInstr &MI,
+ MachineInstr *&MatchInfo) const;
+ void applyCombineSignExtendInReg(MachineInstr &MI,
+ MachineInstr *&MatchInfo) const;
+
+private:
+#define GET_GICOMBINER_CLASS_MEMBERS
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_CLASS_MEMBERS
+#undef AMDGPUSubtarget
};
-bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
- MachineInstr &MI, FMinFMaxLegacyInfo &Info) {
+#define GET_GICOMBINER_IMPL
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUSubtarget
+#undef GET_GICOMBINER_IMPL
+
+AMDGPUPostLegalizerCombinerImpl::AMDGPUPostLegalizerCombinerImpl(
+ const AMDGPUPostLegalizerCombinerImplRuleConfig &RuleConfig,
+ MachineIRBuilder &B, AMDGPUCombinerHelper &Helper,
+ GISelChangeObserver &Observer)
+ : RuleConfig(RuleConfig), B(B), MF(B.getMF()), MRI(*B.getMRI()),
+ STI(MF.getSubtarget<GCNSubtarget>()), TII(*STI.getInstrInfo()),
+ Helper(Helper), Observer(Observer),
+#define GET_GICOMBINER_CONSTRUCTOR_INITS
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_CONSTRUCTOR_INITS
+{
+}
+
+bool AMDGPUPostLegalizerCombinerImpl::matchFMinFMaxLegacy(
+ MachineInstr &MI, FMinFMaxLegacyInfo &Info) const {
// FIXME: Type predicate on pattern
if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
return false;
@@ -91,6 +147,8 @@ bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
Info.True = MI.getOperand(2).getReg();
Info.False = MI.getOperand(3).getReg();
+ // TODO: Handle case where the selected value is an fneg and the compared
+ // constant is the negation of the selected value.
if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
!(Info.LHS == Info.False && Info.RHS == Info.True))
return false;
@@ -110,8 +168,8 @@ bool AMDGPUPostLegalizerCombinerHelper::matchFMinFMaxLegacy(
}
}
-void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
- MachineInstr &MI, const FMinFMaxLegacyInfo &Info) {
+void AMDGPUPostLegalizerCombinerImpl::applySelectFCmpToFMinToFMaxLegacy(
+ MachineInstr &MI, const FMinFMaxLegacyInfo &Info) const {
B.setInstrAndDebugLoc(MI);
auto buildNewInst = [&MI, this](unsigned Opc, Register X, Register Y) {
B.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
@@ -159,7 +217,8 @@ void AMDGPUPostLegalizerCombinerHelper::applySelectFCmpToFMinToFMaxLegacy(
MI.eraseFromParent();
}
-bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
+bool AMDGPUPostLegalizerCombinerImpl::matchUCharToFloat(
+ MachineInstr &MI) const {
Register DstReg = MI.getOperand(0).getReg();
// TODO: We could try to match extracting the higher bytes, which would be
@@ -178,7 +237,8 @@ bool AMDGPUPostLegalizerCombinerHelper::matchUCharToFloat(MachineInstr &MI) {
return false;
}
-void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
+void AMDGPUPostLegalizerCombinerImpl::applyUCharToFloat(
+ MachineInstr &MI) const {
B.setInstrAndDebugLoc(MI);
const LLT S32 = LLT::scalar(32);
@@ -191,19 +251,20 @@ void AMDGPUPostLegalizerCombinerHelper::applyUCharToFloat(MachineInstr &MI) {
SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
if (Ty == S32) {
- B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
- {SrcReg}, MI.getFlags());
+ B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg}, {SrcReg},
+ MI.getFlags());
} else {
- auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
- {SrcReg}, MI.getFlags());
+ auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32}, {SrcReg},
+ MI.getFlags());
B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
}
MI.eraseFromParent();
}
-bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
- MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+bool AMDGPUPostLegalizerCombinerImpl::matchRcpSqrtToRsq(
+ MachineInstr &MI,
+ std::function<void(MachineIRBuilder &)> &MatchInfo) const {
auto getRcpSrc = [=](const MachineInstr &MI) {
MachineInstr *ResMI = nullptr;
@@ -246,8 +307,8 @@ bool AMDGPUPostLegalizerCombinerHelper::matchRcpSqrtToRsq(
return false;
}
-bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
- MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) {
+bool AMDGPUPostLegalizerCombinerImpl::matchCvtF32UByteN(
+ MachineInstr &MI, CvtF32UByteMatchInfo &MatchInfo) const {
Register SrcReg = MI.getOperand(1).getReg();
// Look through G_ZEXT.
@@ -274,8 +335,8 @@ bool AMDGPUPostLegalizerCombinerHelper::matchCvtF32UByteN(
return false;
}
-void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
- MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) {
+void AMDGPUPostLegalizerCombinerImpl::applyCvtF32UByteN(
+ MachineInstr &MI, const CvtF32UByteMatchInfo &MatchInfo) const {
B.setInstrAndDebugLoc(MI);
unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
@@ -292,57 +353,66 @@ void AMDGPUPostLegalizerCombinerHelper::applyCvtF32UByteN(
MI.eraseFromParent();
}
-bool AMDGPUPostLegalizerCombinerHelper::matchRemoveFcanonicalize(
- MachineInstr &MI, Register &Reg) {
+bool AMDGPUPostLegalizerCombinerImpl::matchRemoveFcanonicalize(
+ MachineInstr &MI, Register &Reg) const {
const SITargetLowering *TLI = static_cast<const SITargetLowering *>(
MF.getSubtarget().getTargetLowering());
Reg = MI.getOperand(1).getReg();
return TLI->isCanonicalized(Reg, MF);
}
-class AMDGPUPostLegalizerCombinerHelperState {
-protected:
- AMDGPUCombinerHelper &Helper;
- AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper;
+// The buffer_load_{i8, i16} intrinsics are initially lowered as buffer_load_{u8,
+// u16} instructions. Here, the buffer_load_{u8, u16} instructions are combined
+// with sign extension instructions in order to generate buffer_load_{i8, i16}
+// instructions.
- // Note: pointer is necessary because Target Predicates use
- // "Subtarget->"
- const GCNSubtarget *Subtarget;
+// Identify buffer_load_{u8, u16}.
+bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
+ MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
+ Register Op0Reg = MI.getOperand(1).getReg();
+ SubwordBufferLoad = MRI.getVRegDef(Op0Reg);
-public:
- AMDGPUPostLegalizerCombinerHelperState(
- AMDGPUCombinerHelper &Helper,
- AMDGPUPostLegalizerCombinerHelper &PostLegalizerHelper,
- const GCNSubtarget &Subtarget)
- : Helper(Helper), PostLegalizerHelper(PostLegalizerHelper),
- Subtarget(&Subtarget) {}
-};
+ if (!MRI.hasOneNonDBGUse(Op0Reg))
+ return false;
-#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
-#include "AMDGPUGenPostLegalizeGICombiner.inc"
-#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+ // Check if the first operand of the sign extension is a subword buffer load
+ // instruction.
+ return SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE ||
+ SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+}
-namespace {
-#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
-#include "AMDGPUGenPostLegalizeGICombiner.inc"
-#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+// Combine buffer_load_{u8, u16} and the sign extension instruction to generate
+// buffer_load_{i8, i16}.
+void AMDGPUPostLegalizerCombinerImpl::applyCombineSignExtendInReg(
+ MachineInstr &MI, MachineInstr *&SubwordBufferLoad) const {
+ // Modify the opcode and the destination of buffer_load_{u8, u16}:
+ // Replace the opcode.
+ unsigned Opc =
+ SubwordBufferLoad->getOpcode() == AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE
+ ? AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE
+ : AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT;
+ SubwordBufferLoad->setDesc(TII.get(Opc));
+ // Update the destination register of SubwordBufferLoad with the destination
+ // register of the sign extension.
+ Register SignExtendInsnDst = MI.getOperand(0).getReg();
+ SubwordBufferLoad->getOperand(0).setReg(SignExtendInsnDst);
+ // Remove the sign extension.
+ MI.eraseFromParent();
+}
class AMDGPUPostLegalizerCombinerInfo final : public CombinerInfo {
GISelKnownBits *KB;
MachineDominatorTree *MDT;
- const GCNSubtarget &Subtarget;
+ AMDGPUPostLegalizerCombinerImplRuleConfig RuleConfig;
public:
- AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
-
- AMDGPUPostLegalizerCombinerInfo(const GCNSubtarget &Subtarget, bool EnableOpt,
- bool OptSize, bool MinSize,
+ AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
const AMDGPULegalizerInfo *LI,
GISelKnownBits *KB, MachineDominatorTree *MDT)
: CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
/*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
- KB(KB), MDT(MDT), Subtarget(Subtarget) {
- if (!GeneratedRuleCfg.parseCommandLineOption())
+ KB(KB), MDT(MDT) {
+ if (!RuleConfig.parseCommandLineOption())
report_fatal_error("Invalid rule identifier");
}
@@ -355,11 +425,11 @@ bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineIRBuilder &B) const {
AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ false, KB, MDT,
LInfo);
- AMDGPUPostLegalizerCombinerHelper PostLegalizerHelper(B, Helper);
- AMDGPUGenPostLegalizerCombinerHelper Generated(
- GeneratedRuleCfg, Helper, PostLegalizerHelper, Subtarget);
+ // TODO: Do not re-create the Impl on every inst, it should be per function.
+ AMDGPUPostLegalizerCombinerImpl Impl(RuleConfig, B, Helper, Observer);
+ Impl.setupMF(*MI.getMF(), KB);
- if (Generated.tryCombineAll(Observer, MI, B))
+ if (Impl.tryCombineAll(MI))
return true;
switch (MI.getOpcode()) {
@@ -375,10 +445,6 @@ bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
return false;
}
-#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
-#include "AMDGPUGenPostLegalizeGICombiner.inc"
-#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
-
// Pass boilerplate
// ================
@@ -414,7 +480,7 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
}
AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
- : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
@@ -428,13 +494,13 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const AMDGPULegalizerInfo *LI
- = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
+ const AMDGPULegalizerInfo *LI =
+ static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
MachineDominatorTree *MDT =
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
- AMDGPUPostLegalizerCombinerInfo PCInfo(ST, EnableOpt, F.hasOptSize(),
+ AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
F.hasMinSize(), LI, KB, MDT);
Combiner C(PCInfo, TPC);
return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
@@ -442,8 +508,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
char AMDGPUPostLegalizerCombiner::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
- "Combine AMDGPU machine instrs after legalization",
- false, false)
+ "Combine AMDGPU machine instrs after legalization", false,
+ false)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index a02d2cd302fb..936ca54fcf2e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -20,28 +20,48 @@
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/Target/TargetMachine.h"
+#define GET_GICOMBINER_DEPS
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_DEPS
+
#define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
using namespace llvm;
using namespace MIPatternMatch;
+namespace {
+
+#define GET_GICOMBINER_TYPES
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_TYPES
-class AMDGPUPreLegalizerCombinerHelper {
+class AMDGPUPreLegalizerCombinerImpl : public GIMatchTableExecutor {
protected:
+ const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig;
+ const GCNSubtarget &STI;
+
+ GISelChangeObserver &Observer;
MachineIRBuilder &B;
MachineFunction &MF;
MachineRegisterInfo &MRI;
AMDGPUCombinerHelper &Helper;
public:
- AMDGPUPreLegalizerCombinerHelper(MachineIRBuilder &B,
- AMDGPUCombinerHelper &Helper)
- : B(B), MF(B.getMF()), MRI(*B.getMRI()), Helper(Helper){};
+ AMDGPUPreLegalizerCombinerImpl(
+ const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
+ const GCNSubtarget &STI, GISelChangeObserver &Observer,
+ MachineIRBuilder &B, AMDGPUCombinerHelper &Helper);
+
+ static const char *getName() { return "AMDGPUPreLegalizerCombinerImpl"; }
+
+ bool tryCombineAll(MachineInstr &I) const;
struct ClampI64ToI16MatchInfo {
int64_t Cmp1 = 0;
@@ -49,17 +69,42 @@ public:
Register Origin;
};
- bool matchClampI64ToI16(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineFunction &MF,
- ClampI64ToI16MatchInfo &MatchInfo);
+ bool matchClampI64ToI16(MachineInstr &MI, const MachineRegisterInfo &MRI,
+ const MachineFunction &MF,
+ ClampI64ToI16MatchInfo &MatchInfo) const;
void applyClampI64ToI16(MachineInstr &MI,
- const ClampI64ToI16MatchInfo &MatchInfo);
+ const ClampI64ToI16MatchInfo &MatchInfo) const;
+
+private:
+#define GET_GICOMBINER_CLASS_MEMBERS
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_CLASS_MEMBERS
+#undef AMDGPUSubtarget
};
-bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
- MachineInstr &MI, MachineRegisterInfo &MRI, MachineFunction &MF,
- ClampI64ToI16MatchInfo &MatchInfo) {
+#define GET_GICOMBINER_IMPL
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef AMDGPUSubtarget
+#undef GET_GICOMBINER_IMPL
+
+AMDGPUPreLegalizerCombinerImpl::AMDGPUPreLegalizerCombinerImpl(
+ const AMDGPUPreLegalizerCombinerImplRuleConfig &RuleConfig,
+ const GCNSubtarget &STI, GISelChangeObserver &Observer, MachineIRBuilder &B,
+ AMDGPUCombinerHelper &Helper)
+ : RuleConfig(RuleConfig), STI(STI), Observer(Observer), B(B), MF(B.getMF()),
+ MRI(*B.getMRI()), Helper(Helper),
+#define GET_GICOMBINER_CONSTRUCTOR_INITS
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_CONSTRUCTOR_INITS
+{
+}
+
+bool AMDGPUPreLegalizerCombinerImpl::matchClampI64ToI16(
+ MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineFunction &MF,
+ ClampI64ToI16MatchInfo &MatchInfo) const {
assert(MI.getOpcode() == TargetOpcode::G_TRUNC && "Invalid instruction!");
// Try to find a pattern where an i64 value should get clamped to short.
@@ -118,8 +163,8 @@ bool AMDGPUPreLegalizerCombinerHelper::matchClampI64ToI16(
// This can be efficiently written as following:
// v_cvt_pk_i16_i32 v0, v0, v1
// v_med3_i32 v0, Clamp_Min, v0, Clamp_Max
-void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
- MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) {
+void AMDGPUPreLegalizerCombinerImpl::applyClampI64ToI16(
+ MachineInstr &MI, const ClampI64ToI16MatchInfo &MatchInfo) const {
Register Src = MatchInfo.Origin;
assert(MI.getParent()->getParent()->getRegInfo().getType(Src) ==
@@ -154,40 +199,18 @@ void AMDGPUPreLegalizerCombinerHelper::applyClampI64ToI16(
MI.eraseFromParent();
}
-class AMDGPUPreLegalizerCombinerHelperState {
-protected:
- AMDGPUCombinerHelper &Helper;
- AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper;
-
-public:
- AMDGPUPreLegalizerCombinerHelperState(
- AMDGPUCombinerHelper &Helper,
- AMDGPUPreLegalizerCombinerHelper &PreLegalizerHelper)
- : Helper(Helper), PreLegalizerHelper(PreLegalizerHelper) {}
-};
-
-#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
-#include "AMDGPUGenPreLegalizeGICombiner.inc"
-#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
-
-namespace {
-#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
-#include "AMDGPUGenPreLegalizeGICombiner.inc"
-#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
-
class AMDGPUPreLegalizerCombinerInfo final : public CombinerInfo {
GISelKnownBits *KB;
MachineDominatorTree *MDT;
+ AMDGPUPreLegalizerCombinerImplRuleConfig RuleConfig;
public:
- AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
-
AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
- GISelKnownBits *KB, MachineDominatorTree *MDT)
+ GISelKnownBits *KB, MachineDominatorTree *MDT)
: CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
/*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
KB(KB), MDT(MDT) {
- if (!GeneratedRuleCfg.parseCommandLineOption())
+ if (!RuleConfig.parseCommandLineOption())
report_fatal_error("Invalid rule identifier");
}
@@ -196,15 +219,17 @@ public:
};
bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
- MachineInstr &MI,
- MachineIRBuilder &B) const {
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
const auto *LI = MI.getMF()->getSubtarget().getLegalizerInfo();
AMDGPUCombinerHelper Helper(Observer, B, /*IsPreLegalize*/ true, KB, MDT, LI);
- AMDGPUPreLegalizerCombinerHelper PreLegalizerHelper(B, Helper);
- AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper,
- PreLegalizerHelper);
- if (Generated.tryCombineAll(Observer, MI, B))
+ const GCNSubtarget &STI = MI.getMF()->getSubtarget<GCNSubtarget>();
+ // TODO: Do not re-create the Impl on every inst, it should be per function.
+ AMDGPUPreLegalizerCombinerImpl Impl(RuleConfig, STI, Observer, B, Helper);
+ Impl.setupMF(*MI.getMF(), KB);
+
+ if (Impl.tryCombineAll(MI))
return true;
switch (MI.getOpcode()) {
@@ -217,10 +242,6 @@ bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
return false;
}
-#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
-#include "AMDGPUGenPreLegalizeGICombiner.inc"
-#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
-
// Pass boilerplate
// ================
@@ -237,6 +258,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
+
private:
bool IsOptNone;
};
@@ -259,7 +281,7 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
}
AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
- : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index b24300923780..13f83e298cf4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -19,9 +19,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
@@ -29,6 +27,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/DataExtractor.h"
+#include "llvm/TargetParser/Triple.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
@@ -46,19 +45,11 @@ public:
private:
bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- }
};
class AMDGPUPrintfRuntimeBindingImpl {
public:
- AMDGPUPrintfRuntimeBindingImpl(
- function_ref<const DominatorTree &(Function &)> GetDT,
- function_ref<const TargetLibraryInfo &(Function &)> GetTLI)
- : GetDT(GetDT), GetTLI(GetTLI) {}
+ AMDGPUPrintfRuntimeBindingImpl() {}
bool run(Module &M);
private:
@@ -67,14 +58,7 @@ private:
bool lowerPrintfForGpu(Module &M);
- Value *simplify(Instruction *I, const TargetLibraryInfo *TLI,
- const DominatorTree *DT) {
- return simplifyInstruction(I, {*TD, TLI, DT});
- }
-
const DataLayout *TD;
- function_ref<const DominatorTree &(Function &)> GetDT;
- function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
SmallVector<CallInst *, 32> Printfs;
};
} // namespace
@@ -175,23 +159,6 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
SmallString<16> OpConvSpecifiers;
Value *Op = CI->getArgOperand(0);
- if (auto LI = dyn_cast<LoadInst>(Op)) {
- Op = LI->getPointerOperand();
- for (auto *Use : Op->users()) {
- if (auto SI = dyn_cast<StoreInst>(Use)) {
- Op = SI->getValueOperand();
- break;
- }
- }
- }
-
- if (auto I = dyn_cast<Instruction>(Op)) {
- Value *Op_simplified =
- simplify(I, &GetTLI(*I->getFunction()), &GetDT(*I->getFunction()));
- if (Op_simplified)
- Op = Op_simplified;
- }
-
StringRef FormatStr;
if (!getConstantStringInfo(Op, FormatStr)) {
Value *Stripped = Op->stripPointerCasts();
@@ -438,20 +405,15 @@ bool AMDGPUPrintfRuntimeBindingImpl::lowerPrintfForGpu(Module &M) {
for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) {
Value *TheBtCast = WhatToStore[I];
unsigned ArgSize = TD->getTypeAllocSize(TheBtCast->getType());
- SmallVector<Value *, 1> BuffOffset;
- BuffOffset.push_back(ConstantInt::get(I32Ty, ArgSize));
-
- Type *ArgPointer = PointerType::get(TheBtCast->getType(), 1);
- Value *CastedGEP =
- new BitCastInst(BufferIdx, ArgPointer, "PrintBuffPtrCast", Brnch);
- StoreInst *StBuff = new StoreInst(TheBtCast, CastedGEP, Brnch);
+ StoreInst *StBuff = new StoreInst(TheBtCast, BufferIdx, Brnch);
LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n"
<< *StBuff << '\n');
(void)StBuff;
if (I + 1 == E && ArgCount + 1 == CI->arg_size())
break;
- BufferIdx = GetElementPtrInst::Create(I8Ty, BufferIdx, BuffOffset,
- "PrintBuffNextPtr", Brnch);
+ BufferIdx = GetElementPtrInst::Create(
+ I8Ty, BufferIdx, {ConstantInt::get(I32Ty, ArgSize)},
+ "PrintBuffNextPtr", Brnch);
LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
<< *BufferIdx << '\n');
}
@@ -491,26 +453,11 @@ bool AMDGPUPrintfRuntimeBindingImpl::run(Module &M) {
}
bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) {
- auto GetDT = [this](Function &F) -> DominatorTree & {
- return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- };
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
-
- return AMDGPUPrintfRuntimeBindingImpl(GetDT, GetTLI).run(M);
+ return AMDGPUPrintfRuntimeBindingImpl().run(M);
}
PreservedAnalyses
AMDGPUPrintfRuntimeBindingPass::run(Module &M, ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetDT = [&FAM](Function &F) -> DominatorTree & {
- return FAM.getResult<DominatorTreeAnalysis>(F);
- };
- auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- bool Changed = AMDGPUPrintfRuntimeBindingImpl(GetDT, GetTLI).run(M);
+ bool Changed = AMDGPUPrintfRuntimeBindingImpl().run(M);
return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index a7da4005e867..1d69f0434b58 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -6,23 +6,42 @@
//
//===----------------------------------------------------------------------===//
//
-// This pass eliminates allocas by either converting them into vectors or
-// by migrating them to local address space.
+// Eliminates allocas by either converting them into vectors or by migrating
+// them to local address space.
+//
+// Two passes are exposed by this file:
+// - "promote-alloca-to-vector", which runs early in the pipeline and only
+// promotes to vector. Promotion to vector is almost always profitable
+// except when the alloca is too big and the promotion would result in
+// very high register pressure.
+// - "promote-alloca", which does both promotion to vector and LDS and runs
+// much later in the pipeline. This runs after SROA because promoting to
+// LDS is of course less profitable than getting rid of the alloca or
+// vectorizing it, thus we only want to do it when the only alternative is
+// lowering the alloca to stack.
+//
+// Note that both of them exist for the old and new PMs. The new PM passes are
+// declared in AMDGPU.h and the legacy PM ones are declared here.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/InstSimplifyFolder.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
#define DEBUG_TYPE "amdgpu-promote-alloca"
@@ -30,40 +49,22 @@ using namespace llvm;
namespace {
-static cl::opt<bool> DisablePromoteAllocaToVector(
- "disable-promote-alloca-to-vector",
- cl::desc("Disable promote alloca to vector"),
- cl::init(false));
+static cl::opt<bool>
+ DisablePromoteAllocaToVector("disable-promote-alloca-to-vector",
+ cl::desc("Disable promote alloca to vector"),
+ cl::init(false));
-static cl::opt<bool> DisablePromoteAllocaToLDS(
- "disable-promote-alloca-to-lds",
- cl::desc("Disable promote alloca to LDS"),
- cl::init(false));
+static cl::opt<bool>
+ DisablePromoteAllocaToLDS("disable-promote-alloca-to-lds",
+ cl::desc("Disable promote alloca to LDS"),
+ cl::init(false));
static cl::opt<unsigned> PromoteAllocaToVectorLimit(
- "amdgpu-promote-alloca-to-vector-limit",
- cl::desc("Maximum byte size to consider promote alloca to vector"),
- cl::init(0));
-
-// FIXME: This can create globals so should be a module pass.
-class AMDGPUPromoteAlloca : public FunctionPass {
-public:
- static char ID;
-
- AMDGPUPromoteAlloca() : FunctionPass(ID) {}
-
- bool runOnFunction(Function &F) override;
-
- StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
-
- bool handleAlloca(AllocaInst &I, bool SufficientLDS);
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- FunctionPass::getAnalysisUsage(AU);
- }
-};
+ "amdgpu-promote-alloca-to-vector-limit",
+ cl::desc("Maximum byte size to consider promote alloca to vector"),
+ cl::init(0));
+// Shared implementation which can do both promotion to vector and to LDS.
class AMDGPUPromoteAllocaImpl {
private:
const TargetMachine &TM;
@@ -83,26 +84,55 @@ private:
/// BaseAlloca is the alloca root the search started from.
/// Val may be that alloca or a recursive user of it.
- bool collectUsesWithPtrTypes(Value *BaseAlloca,
- Value *Val,
- std::vector<Value*> &WorkList) const;
+ bool collectUsesWithPtrTypes(Value *BaseAlloca, Value *Val,
+ std::vector<Value *> &WorkList) const;
/// Val is a derived pointer from Alloca. OpIdx0/OpIdx1 are the operand
/// indices to an instruction with 2 pointer inputs (e.g. select, icmp).
/// Returns true if both operands are derived from the same alloca. Val should
/// be the same value as one of the input operands of UseInst.
bool binaryOpIsDerivedFromSameAlloca(Value *Alloca, Value *Val,
- Instruction *UseInst,
- int OpIdx0, int OpIdx1) const;
+ Instruction *UseInst, int OpIdx0,
+ int OpIdx1) const;
/// Check whether we have enough local memory for promotion.
bool hasSufficientLocalMem(const Function &F);
- bool handleAlloca(AllocaInst &I, bool SufficientLDS);
+ bool tryPromoteAllocaToVector(AllocaInst &I);
+ bool tryPromoteAllocaToLDS(AllocaInst &I, bool SufficientLDS);
public:
- AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {}
- bool run(Function &F);
+ AMDGPUPromoteAllocaImpl(TargetMachine &TM) : TM(TM) {
+ const Triple &TT = TM.getTargetTriple();
+ IsAMDGCN = TT.getArch() == Triple::amdgcn;
+ IsAMDHSA = TT.getOS() == Triple::AMDHSA;
+ }
+
+ bool run(Function &F, bool PromoteToLDS);
+};
+
+// FIXME: This can create globals so should be a module pass.
+class AMDGPUPromoteAlloca : public FunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPromoteAlloca() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+ .run(F, /*PromoteToLDS*/ true);
+ return false;
+ }
+
+ StringRef getPassName() const override { return "AMDGPU Promote Alloca"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
};
class AMDGPUPromoteAllocaToVector : public FunctionPass {
@@ -111,7 +141,14 @@ public:
AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
- bool runOnFunction(Function &F) override;
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>())
+ .run(F, /*PromoteToLDS*/ false);
+ return false;
+ }
StringRef getPassName() const override {
return "AMDGPU Promote Alloca to vector";
@@ -123,6 +160,22 @@ public:
}
};
+unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) {
+ if (!TM.getTargetTriple().isAMDGCN())
+ return 128;
+
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+
+ // A non-entry function has only 32 caller preserved registers.
+ // Do not promote alloca which will force spilling unless we know the function
+ // will be inlined.
+ if (!F.hasFnAttribute(Attribute::AlwaysInline) &&
+ !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
+ MaxVGPRs = std::min(MaxVGPRs, 32u);
+ return MaxVGPRs;
+}
+
} // end anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
@@ -142,19 +195,20 @@ INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
-bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
- return AMDGPUPromoteAllocaImpl(TPC->getTM<TargetMachine>()).run(F);
+PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ true);
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
}
- return false;
+ return PreservedAnalyses::all();
}
-PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F);
+PreservedAnalyses
+AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
+ bool Changed = AMDGPUPromoteAllocaImpl(TM).run(F, /*PromoteToLDS*/ false);
if (Changed) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -163,175 +217,72 @@ PreservedAnalyses AMDGPUPromoteAllocaPass::run(Function &F,
return PreservedAnalyses::all();
}
-bool AMDGPUPromoteAllocaImpl::run(Function &F) {
+FunctionPass *llvm::createAMDGPUPromoteAlloca() {
+ return new AMDGPUPromoteAlloca();
+}
+
+FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
+ return new AMDGPUPromoteAllocaToVector();
+}
+
+bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
Mod = F.getParent();
DL = &Mod->getDataLayout();
- const Triple &TT = TM.getTargetTriple();
- IsAMDGCN = TT.getArch() == Triple::amdgcn;
- IsAMDHSA = TT.getOS() == Triple::AMDHSA;
-
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
if (!ST.isPromoteAllocaEnabled())
return false;
- if (IsAMDGCN) {
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
- // A non-entry function has only 32 caller preserved registers.
- // Do not promote alloca which will force spilling.
- if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
- MaxVGPRs = std::min(MaxVGPRs, 32u);
- } else {
- MaxVGPRs = 128;
- }
+ MaxVGPRs = getMaxVGPRs(TM, F);
- bool SufficientLDS = hasSufficientLocalMem(F);
- bool Changed = false;
- BasicBlock &EntryBB = *F.begin();
+ bool SufficientLDS = PromoteToLDS ? hasSufficientLocalMem(F) : false;
SmallVector<AllocaInst *, 16> Allocas;
- for (Instruction &I : EntryBB) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ for (Instruction &I : F.getEntryBlock()) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+ // Array allocations are probably not worth handling, since an allocation
+ // of the array type is the canonical form.
+ if (!AI->isStaticAlloca() || AI->isArrayAllocation())
+ continue;
Allocas.push_back(AI);
+ }
}
+ bool Changed = false;
for (AllocaInst *AI : Allocas) {
- if (handleAlloca(*AI, SufficientLDS))
+ if (tryPromoteAllocaToVector(*AI))
+ Changed = true;
+ else if (PromoteToLDS && tryPromoteAllocaToLDS(*AI, SufficientLDS))
Changed = true;
}
+ // NOTE: tryPromoteAllocaToVector removes the alloca, so Allocas contains
+ // dangling pointers. If we want to reuse it past this point, the loop above
+ // would need to be updated to remove successfully promoted allocas.
+
return Changed;
}
-std::pair<Value *, Value *>
-AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
- Function &F = *Builder.GetInsertBlock()->getParent();
- const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
-
- if (!IsAMDHSA) {
- Function *LocalSizeYFn
- = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
- Function *LocalSizeZFn
- = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
-
- CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
- CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
-
- ST.makeLIDRangeMetadata(LocalSizeY);
- ST.makeLIDRangeMetadata(LocalSizeZ);
-
- return std::pair(LocalSizeY, LocalSizeZ);
- }
-
- // We must read the size out of the dispatch pointer.
- assert(IsAMDGCN);
+struct MemTransferInfo {
+ ConstantInt *SrcIndex = nullptr;
+ ConstantInt *DestIndex = nullptr;
+};
- // We are indexing into this struct, and want to extract the workgroup_size_*
- // fields.
- //
- // typedef struct hsa_kernel_dispatch_packet_s {
- // uint16_t header;
- // uint16_t setup;
- // uint16_t workgroup_size_x ;
- // uint16_t workgroup_size_y;
- // uint16_t workgroup_size_z;
- // uint16_t reserved0;
- // uint32_t grid_size_x ;
- // uint32_t grid_size_y ;
- // uint32_t grid_size_z;
- //
- // uint32_t private_segment_size;
- // uint32_t group_segment_size;
- // uint64_t kernel_object;
+// Checks if the instruction I is a memset user of the alloca AI that we can
+// deal with. Currently, only non-volatile memsets that affect the whole alloca
+// are handled.
+static bool isSupportedMemset(MemSetInst *I, AllocaInst *AI,
+ const DataLayout &DL) {
+ using namespace PatternMatch;
+ // For now we only care about non-volatile memsets that affect the whole type
+ // (start at index 0 and fill the whole alloca).
//
- // #ifdef HSA_LARGE_MODEL
- // void *kernarg_address;
- // #elif defined HSA_LITTLE_ENDIAN
- // void *kernarg_address;
- // uint32_t reserved1;
- // #else
- // uint32_t reserved1;
- // void *kernarg_address;
- // #endif
- // uint64_t reserved2;
- // hsa_signal_t completion_signal; // uint64_t wrapper
- // } hsa_kernel_dispatch_packet_t
- //
- Function *DispatchPtrFn
- = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
-
- CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
- DispatchPtr->addRetAttr(Attribute::NoAlias);
- DispatchPtr->addRetAttr(Attribute::NonNull);
- F.removeFnAttr("amdgpu-no-dispatch-ptr");
-
- // Size of the dispatch packet struct.
- DispatchPtr->addDereferenceableRetAttr(64);
-
- Type *I32Ty = Type::getInt32Ty(Mod->getContext());
- Value *CastDispatchPtr = Builder.CreateBitCast(
- DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
-
- // We could do a single 64-bit load here, but it's likely that the basic
- // 32-bit and extract sequence is already present, and it is probably easier
- // to CSE this. The loads should be mergeable later anyway.
- Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
- LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
-
- Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
- LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
-
- MDNode *MD = MDNode::get(Mod->getContext(), std::nullopt);
- LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
- LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
- ST.makeLIDRangeMetadata(LoadZU);
-
- // Extract y component. Upper half of LoadZU should be zero already.
- Value *Y = Builder.CreateLShr(LoadXY, 16);
-
- return std::pair(Y, LoadZU);
-}
-
-Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
- unsigned N) {
- Function *F = Builder.GetInsertBlock()->getParent();
- const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
- Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
- StringRef AttrName;
-
- switch (N) {
- case 0:
- IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
- : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
- AttrName = "amdgpu-no-workitem-id-x";
- break;
- case 1:
- IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
- : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
- AttrName = "amdgpu-no-workitem-id-y";
- break;
-
- case 2:
- IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
- : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
- AttrName = "amdgpu-no-workitem-id-z";
- break;
- default:
- llvm_unreachable("invalid dimension");
- }
-
- Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
- CallInst *CI = Builder.CreateCall(WorkitemIdFn);
- ST.makeLIDRangeMetadata(CI);
- F->removeFnAttr(AttrName);
-
- return CI;
-}
-
-static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
- return FixedVectorType::get(ArrayTy->getElementType(),
- ArrayTy->getNumElements());
+ // TODO: Now that we moved to PromoteAlloca we could handle any memsets
+ // (except maybe volatile ones?) - we just need to use shufflevector if it
+ // only affects a subset of the vector.
+ const unsigned Size = DL.getTypeStoreSize(AI->getAllocatedType());
+ return I->getOperand(0) == AI &&
+ match(I->getOperand(2), m_SpecificInt(Size)) && !I->isVolatile();
}
static Value *
@@ -379,60 +330,336 @@ static Value *GEPToVectorIndex(GetElementPtrInst *GEP, AllocaInst *Alloca,
return ConstantInt::get(GEP->getContext(), Quot);
}
-struct MemTransferInfo {
- ConstantInt *SrcIndex = nullptr;
- ConstantInt *DestIndex = nullptr;
-};
+/// Promotes a single user of the alloca to a vector form.
+///
+/// \param Inst Instruction to be promoted.
+/// \param DL Module Data Layout.
+/// \param VectorTy Vectorized Type.
+/// \param VecStoreSize Size of \p VectorTy in bytes.
+/// \param ElementSize Size of \p VectorTy element type in bytes.
+/// \param TransferInfo MemTransferInst info map.
+/// \param GEPVectorIdx GEP -> VectorIdx cache.
+/// \param CurVal Current value of the vector (e.g. last stored value)
+/// \param[out] DeferredLoads \p Inst is added to this vector if it can't
+/// be promoted now. This happens when promoting requires \p
+/// CurVal, but \p CurVal is nullptr.
+/// \return the stored value if \p Inst would have written to the alloca, or
+/// nullptr otherwise.
+static Value *promoteAllocaUserToVector(
+ Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
+ unsigned VecStoreSize, unsigned ElementSize,
+ DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
+ std::map<GetElementPtrInst *, Value *> &GEPVectorIdx, Value *CurVal,
+ SmallVectorImpl<LoadInst *> &DeferredLoads) {
+ // Note: we use InstSimplifyFolder because it can leverage the DataLayout
+ // to do more folding, especially in the case of vector splats.
+ IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
+ InstSimplifyFolder(DL));
+ Builder.SetInsertPoint(Inst);
+
+ const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
+ if (CurVal)
+ return CurVal;
+
+ // If the current value is not known, insert a dummy load and lower it on
+ // the second pass.
+ LoadInst *Dummy =
+ Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
+ "promotealloca.dummyload");
+ DeferredLoads.push_back(Dummy);
+ return Dummy;
+ };
+
+ const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
+ Type *PtrTy) -> Value * {
+ assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
+ const unsigned Size = DL.getTypeStoreSizeInBits(PtrTy);
+ if (!PtrTy->isVectorTy())
+ return Builder.CreateBitOrPointerCast(Val, Builder.getIntNTy(Size));
+ const unsigned NumPtrElts = cast<FixedVectorType>(PtrTy)->getNumElements();
+ // If we want to cast to cast, e.g. a <2 x ptr> into a <4 x i32>, we need to
+ // first cast the ptr vector to <2 x i64>.
+ assert((Size % NumPtrElts == 0) && "Vector size not divisble");
+ Type *EltTy = Builder.getIntNTy(Size / NumPtrElts);
+ return Builder.CreateBitOrPointerCast(
+ Val, FixedVectorType::get(EltTy, NumPtrElts));
+ };
+
+ Type *VecEltTy = VectorTy->getElementType();
+ switch (Inst->getOpcode()) {
+ case Instruction::Load: {
+ // Loads can only be lowered if the value is known.
+ if (!CurVal) {
+ DeferredLoads.push_back(cast<LoadInst>(Inst));
+ return nullptr;
+ }
+
+ Value *Index = calculateVectorIndex(
+ cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
+
+ // We're loading the full vector.
+ Type *AccessTy = Inst->getType();
+ TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
+ if (AccessSize == VecStoreSize && cast<Constant>(Index)->isZeroValue()) {
+ if (AccessTy->isPtrOrPtrVectorTy())
+ CurVal = CreateTempPtrIntCast(CurVal, AccessTy);
+ else if (CurVal->getType()->isPtrOrPtrVectorTy())
+ CurVal = CreateTempPtrIntCast(CurVal, CurVal->getType());
+ Value *NewVal = Builder.CreateBitOrPointerCast(CurVal, AccessTy);
+ Inst->replaceAllUsesWith(NewVal);
+ return nullptr;
+ }
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
- unsigned MaxVGPRs) {
+ // Loading a subvector.
+ if (isa<FixedVectorType>(AccessTy)) {
+ assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
+ const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
+ auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts);
+ assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
+
+ unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue();
+ Value *SubVec = PoisonValue::get(SubVecTy);
+ for (unsigned K = 0; K < NumElts; ++K) {
+ SubVec = Builder.CreateInsertElement(
+ SubVec, Builder.CreateExtractElement(CurVal, IndexVal + K), K);
+ }
+
+ if (AccessTy->isPtrOrPtrVectorTy())
+ SubVec = CreateTempPtrIntCast(SubVec, AccessTy);
+ else if (SubVecTy->isPtrOrPtrVectorTy())
+ SubVec = CreateTempPtrIntCast(SubVec, SubVecTy);
+
+ SubVec = Builder.CreateBitOrPointerCast(SubVec, AccessTy);
+ Inst->replaceAllUsesWith(SubVec);
+ return nullptr;
+ }
+
+ // We're loading one element.
+ Value *ExtractElement = Builder.CreateExtractElement(CurVal, Index);
+ if (AccessTy != VecEltTy)
+ ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, AccessTy);
+
+ Inst->replaceAllUsesWith(ExtractElement);
+ return nullptr;
+ }
+ case Instruction::Store: {
+ // For stores, it's a bit trickier and it depends on whether we're storing
+ // the full vector or not. If we're storing the full vector, we don't need
+ // to know the current value. If this is a store of a single element, we
+ // need to know the value.
+ StoreInst *SI = cast<StoreInst>(Inst);
+ Value *Index = calculateVectorIndex(SI->getPointerOperand(), GEPVectorIdx);
+ Value *Val = SI->getValueOperand();
+
+ // We're storing the full vector, we can handle this without knowing CurVal.
+ Type *AccessTy = Val->getType();
+ TypeSize AccessSize = DL.getTypeStoreSize(AccessTy);
+ if (AccessSize == VecStoreSize && cast<Constant>(Index)->isZeroValue()) {
+ if (AccessTy->isPtrOrPtrVectorTy())
+ Val = CreateTempPtrIntCast(Val, AccessTy);
+ else if (VectorTy->isPtrOrPtrVectorTy())
+ Val = CreateTempPtrIntCast(Val, VectorTy);
+ return Builder.CreateBitOrPointerCast(Val, VectorTy);
+ }
+
+ // Storing a subvector.
+ if (isa<FixedVectorType>(AccessTy)) {
+ assert(AccessSize.isKnownMultipleOf(DL.getTypeStoreSize(VecEltTy)));
+ const unsigned NumElts = AccessSize / DL.getTypeStoreSize(VecEltTy);
+ auto *SubVecTy = FixedVectorType::get(VecEltTy, NumElts);
+ assert(DL.getTypeStoreSize(SubVecTy) == DL.getTypeStoreSize(AccessTy));
+
+ if (SubVecTy->isPtrOrPtrVectorTy())
+ Val = CreateTempPtrIntCast(Val, SubVecTy);
+ else if (AccessTy->isPtrOrPtrVectorTy())
+ Val = CreateTempPtrIntCast(Val, AccessTy);
+
+ Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
+
+ unsigned IndexVal = cast<ConstantInt>(Index)->getZExtValue();
+ Value *CurVec = GetOrLoadCurrentVectorValue();
+ for (unsigned K = 0; (IndexVal + K) < NumElts; ++K) {
+ CurVec = Builder.CreateInsertElement(
+ CurVec, Builder.CreateExtractElement(Val, K), IndexVal + K);
+ }
+ return CurVec;
+ }
+
+ if (Val->getType() != VecEltTy)
+ Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
+ return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
+ Index);
+ }
+ case Instruction::Call: {
+ if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
+ // For memcpy, we need to know curval.
+ ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
+ unsigned NumCopied = Length->getZExtValue() / ElementSize;
+ MemTransferInfo *TI = &TransferInfo[MTI];
+ unsigned SrcBegin = TI->SrcIndex->getZExtValue();
+ unsigned DestBegin = TI->DestIndex->getZExtValue();
+
+ SmallVector<int> Mask;
+ for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
+ if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
+ Mask.push_back(SrcBegin++);
+ } else {
+ Mask.push_back(Idx);
+ }
+ }
+
+ return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
+ }
+
+ if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
+ // For memset, we don't need to know the previous value because we
+ // currently only allow memsets that cover the whole alloca.
+ Value *Elt = MSI->getOperand(1);
+ if (DL.getTypeStoreSize(VecEltTy) > 1) {
+ Value *EltBytes =
+ Builder.CreateVectorSplat(DL.getTypeStoreSize(VecEltTy), Elt);
+ Elt = Builder.CreateBitCast(EltBytes, VecEltTy);
+ }
+
+ return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
+ }
+
+ llvm_unreachable("Unsupported call when promoting alloca to vector");
+ }
+
+ default:
+ llvm_unreachable("Inconsistency in instructions promotable to vector");
+ }
+
+ llvm_unreachable("Did not return after promoting instruction!");
+}
+
+static bool isSupportedAccessType(FixedVectorType *VecTy, Type *AccessTy,
+ const DataLayout &DL) {
+ // Access as a vector type can work if the size of the access vector is a
+ // multiple of the size of the alloca's vector element type.
+ //
+ // Examples:
+ // - VecTy = <8 x float>, AccessTy = <4 x float> -> OK
+ // - VecTy = <4 x double>, AccessTy = <2 x float> -> OK
+ // - VecTy = <4 x double>, AccessTy = <3 x float> -> NOT OK
+ // - 3*32 is not a multiple of 64
+ //
+ // We could handle more complicated cases, but it'd make things a lot more
+ // complicated.
+ if (isa<FixedVectorType>(AccessTy)) {
+ TypeSize AccTS = DL.getTypeStoreSize(AccessTy);
+ TypeSize VecTS = DL.getTypeStoreSize(VecTy->getElementType());
+ return AccTS.isKnownMultipleOf(VecTS);
+ }
+
+ return CastInst::isBitOrNoopPointerCastable(VecTy->getElementType(), AccessTy,
+ DL);
+}
+
+/// Iterates over an instruction worklist that may contain multiple instructions
+/// from the same basic block, but in a different order.
+template <typename InstContainer>
+static void forEachWorkListItem(const InstContainer &WorkList,
+ std::function<void(Instruction *)> Fn) {
+ // Bucket up uses of the alloca by the block they occur in.
+ // This is important because we have to handle multiple defs/uses in a block
+ // ourselves: SSAUpdater is purely for cross-block references.
+ DenseMap<BasicBlock *, SmallDenseSet<Instruction *>> UsesByBlock;
+ for (Instruction *User : WorkList)
+ UsesByBlock[User->getParent()].insert(User);
+
+ for (Instruction *User : WorkList) {
+ BasicBlock *BB = User->getParent();
+ auto &BlockUses = UsesByBlock[BB];
+
+ // Already processed, skip.
+ if (BlockUses.empty())
+ continue;
+
+ // Only user in the block, directly process it.
+ if (BlockUses.size() == 1) {
+ Fn(User);
+ continue;
+ }
+
+ // Multiple users in the block, do a linear scan to see users in order.
+ for (Instruction &Inst : *BB) {
+ if (!BlockUses.contains(&Inst))
+ continue;
+
+ Fn(&Inst);
+ }
+
+ // Clear the block so we know it's been processed.
+ BlockUses.clear();
+ }
+}
+
+// FIXME: Should try to pick the most likely to be profitable allocas first.
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
+ LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n');
if (DisablePromoteAllocaToVector) {
- LLVM_DEBUG(dbgs() << " Promotion alloca to vector is disabled\n");
+ LLVM_DEBUG(dbgs() << " Promote alloca to vector is disabled\n");
return false;
}
- Type *AllocaTy = Alloca->getAllocatedType();
+ Type *AllocaTy = Alloca.getAllocatedType();
auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
ArrayTy->getNumElements() > 0)
- VectorTy = arrayTypeToVecType(ArrayTy);
+ VectorTy = FixedVectorType::get(ArrayTy->getElementType(),
+ ArrayTy->getNumElements());
}
// Use up to 1/4 of available register budget for vectorization.
unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
: (MaxVGPRs * 32);
- if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
- LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with "
- << MaxVGPRs << " registers available\n");
+ if (DL->getTypeSizeInBits(AllocaTy) * 4 > Limit) {
+ LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with " << MaxVGPRs
+ << " registers available\n");
return false;
}
- LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
-
// FIXME: There is no reason why we can't support larger arrays, we
// are just being conservative for now.
- // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
- // could also be promoted but we don't currently handle this case
- if (!VectorTy || VectorTy->getNumElements() > 16 ||
- VectorTy->getNumElements() < 2) {
+ // FIXME: We also reject alloca's of the form [ 2 x [ 2 x i32 ]] or
+ // equivalent. Potentially these could also be promoted but we don't currently
+ // handle this case
+ if (!VectorTy) {
LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
- std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
+ if (VectorTy->getNumElements() > 16 || VectorTy->getNumElements() < 2) {
+ LLVM_DEBUG(dbgs() << " " << *VectorTy
+ << " has an unsupported number of elements\n");
+ return false;
+ }
+
+ std::map<GetElementPtrInst *, Value *> GEPVectorIdx;
SmallVector<Instruction *> WorkList;
+ SmallVector<Instruction *> UsersToRemove;
SmallVector<Instruction *> DeferredInsts;
SmallVector<Use *, 8> Uses;
DenseMap<MemTransferInst *, MemTransferInfo> TransferInfo;
- for (Use &U : Alloca->uses())
+ const auto RejectUser = [&](Instruction *Inst, Twine Msg) {
+ LLVM_DEBUG(dbgs() << " Cannot promote alloca to vector: " << Msg << "\n"
+ << " " << *Inst << "\n");
+ return false;
+ };
+
+ for (Use &U : Alloca.uses())
Uses.push_back(&U);
+ LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n");
+
Type *VecEltTy = VectorTy->getElementType();
- unsigned ElementSize = DL.getTypeSizeInBits(VecEltTy) / 8;
+ unsigned ElementSize = DL->getTypeSizeInBits(VecEltTy) / 8;
while (!Uses.empty()) {
Use *U = Uses.pop_back_val();
Instruction *Inst = cast<Instruction>(U->getUser());
@@ -441,22 +668,29 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
// This is a store of the pointer, not to the pointer.
if (isa<StoreInst>(Inst) &&
U->getOperandNo() != StoreInst::getPointerOperandIndex())
- return false;
+ return RejectUser(Inst, "pointer is being stored");
Type *AccessTy = getLoadStoreType(Inst);
+ if (AccessTy->isAggregateType())
+ return RejectUser(Inst, "unsupported load/store as aggregate");
+ assert(!AccessTy->isAggregateType() || AccessTy->isArrayTy());
+
Ptr = Ptr->stripPointerCasts();
- // Alloca already accessed as vector, leave alone.
- if (Ptr == Alloca && DL.getTypeStoreSize(Alloca->getAllocatedType()) ==
- DL.getTypeStoreSize(AccessTy))
+ // Alloca already accessed as vector.
+ if (Ptr == &Alloca && DL->getTypeStoreSize(Alloca.getAllocatedType()) ==
+ DL->getTypeStoreSize(AccessTy)) {
+ WorkList.push_back(Inst);
continue;
+ }
// Check that this is a simple access of a vector element.
bool IsSimple = isa<LoadInst>(Inst) ? cast<LoadInst>(Inst)->isSimple()
: cast<StoreInst>(Inst)->isSimple();
- if (!IsSimple ||
- !CastInst::isBitOrNoopPointerCastable(VecEltTy, AccessTy, DL))
- return false;
+ if (!IsSimple)
+ return RejectUser(Inst, "not a simple load or store");
+ if (!isSupportedAccessType(VectorTy, AccessTy, *DL))
+ return RejectUser(Inst, "not a supported access type");
WorkList.push_back(Inst);
continue;
@@ -466,32 +700,38 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
// Look through bitcasts.
for (Use &U : Inst->uses())
Uses.push_back(&U);
+ UsersToRemove.push_back(Inst);
continue;
}
if (auto *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
// If we can't compute a vector index from this GEP, then we can't
// promote this alloca to vector.
- Value *Index = GEPToVectorIndex(GEP, Alloca, VecEltTy, DL);
- if (!Index) {
- LLVM_DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP
- << '\n');
- return false;
- }
+ Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL);
+ if (!Index)
+ return RejectUser(Inst, "cannot compute vector index for GEP");
GEPVectorIdx[GEP] = Index;
for (Use &U : Inst->uses())
Uses.push_back(&U);
+ UsersToRemove.push_back(Inst);
+ continue;
+ }
+
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst);
+ MSI && isSupportedMemset(MSI, &Alloca, *DL)) {
+ WorkList.push_back(Inst);
continue;
}
if (MemTransferInst *TransferInst = dyn_cast<MemTransferInst>(Inst)) {
if (TransferInst->isVolatile())
- return false;
+ return RejectUser(Inst, "mem transfer inst is volatile");
ConstantInt *Len = dyn_cast<ConstantInt>(TransferInst->getLength());
- if (!Len || !!(Len->getZExtValue() % ElementSize))
- return false;
+ if (!Len || (Len->getZExtValue() % ElementSize))
+ return RejectUser(Inst, "mem transfer inst length is non-constant or "
+ "not a multiple of the vector element size");
if (!TransferInfo.count(TransferInst)) {
DeferredInsts.push_back(Inst);
@@ -501,7 +741,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
auto getPointerIndexOfAlloca = [&](Value *Ptr) -> ConstantInt * {
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
- if (Ptr != Alloca && !GEPVectorIdx.count(GEP))
+ if (Ptr != &Alloca && !GEPVectorIdx.count(GEP))
return nullptr;
return dyn_cast<ConstantInt>(calculateVectorIndex(Ptr, GEPVectorIdx));
@@ -513,30 +753,33 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
Value *Dest = TransferInst->getDest();
ConstantInt *Index = getPointerIndexOfAlloca(Dest);
if (!Index)
- return false;
+ return RejectUser(Inst, "could not calculate constant dest index");
TI->DestIndex = Index;
} else {
assert(OpNum == 1);
Value *Src = TransferInst->getSource();
ConstantInt *Index = getPointerIndexOfAlloca(Src);
if (!Index)
- return false;
+ return RejectUser(Inst, "could not calculate constant src index");
TI->SrcIndex = Index;
}
continue;
}
// Ignore assume-like intrinsics and comparisons used in assumes.
- if (isAssumeLikeIntrinsic(Inst))
+ if (isAssumeLikeIntrinsic(Inst)) {
+ UsersToRemove.push_back(Inst);
continue;
+ }
if (isa<ICmpInst>(Inst) && all_of(Inst->users(), [](User *U) {
return isAssumeLikeIntrinsic(cast<Instruction>(U));
- }))
+ })) {
+ UsersToRemove.push_back(Inst);
continue;
+ }
- // Unknown user.
- return false;
+ return RejectUser(Inst, "unhandled alloca user");
}
while (!DeferredInsts.empty()) {
@@ -546,82 +789,194 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
// from different address spaces.
MemTransferInfo &Info = TransferInfo[TransferInst];
if (!Info.SrcIndex || !Info.DestIndex)
- return false;
+ return RejectUser(
+ Inst, "mem transfer inst is missing constant src and/or dst index");
}
LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
<< *VectorTy << '\n');
+ const unsigned VecStoreSize = DL->getTypeStoreSize(VectorTy);
- for (Instruction *Inst : WorkList) {
- IRBuilder<> Builder(Inst);
- switch (Inst->getOpcode()) {
- case Instruction::Load: {
- Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
- Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
- Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
- Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
- Value *VecValue =
- Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca->getAlign());
- Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
- if (Inst->getType() != VecEltTy)
- ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
- Inst->replaceAllUsesWith(ExtractElement);
- Inst->eraseFromParent();
- break;
- }
- case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(Inst);
- Value *Ptr = SI->getPointerOperand();
- Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
- Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
- Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
- Value *VecValue =
- Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca->getAlign());
- Value *Elt = SI->getValueOperand();
- if (Elt->getType() != VecEltTy)
- Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
- Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
- Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign());
- Inst->eraseFromParent();
- break;
- }
- case Instruction::Call: {
- if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(Inst)) {
- ConstantInt *Length = cast<ConstantInt>(MTI->getLength());
- unsigned NumCopied = Length->getZExtValue() / ElementSize;
- MemTransferInfo *TI = &TransferInfo[cast<MemTransferInst>(Inst)];
- unsigned SrcBegin = TI->SrcIndex->getZExtValue();
- unsigned DestBegin = TI->DestIndex->getZExtValue();
+ // Alloca is uninitialized memory. Imitate that by making the first value
+ // undef.
+ SSAUpdater Updater;
+ Updater.Initialize(VectorTy, "promotealloca");
+ Updater.AddAvailableValue(Alloca.getParent(), UndefValue::get(VectorTy));
- SmallVector<int> Mask;
- for (unsigned Idx = 0; Idx < VectorTy->getNumElements(); ++Idx) {
- if (Idx >= DestBegin && Idx < DestBegin + NumCopied) {
- Mask.push_back(SrcBegin++);
- } else {
- Mask.push_back(Idx);
- }
- }
- Type *VecPtrTy = VectorTy->getPointerTo(Alloca->getAddressSpace());
- Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
- Value *VecValue =
- Builder.CreateAlignedLoad(VectorTy, BitCast, Alloca->getAlign());
- Value *NewVecValue = Builder.CreateShuffleVector(VecValue, Mask);
- Builder.CreateAlignedStore(NewVecValue, BitCast, Alloca->getAlign());
+ // First handle the initial worklist.
+ SmallVector<LoadInst *, 4> DeferredLoads;
+ forEachWorkListItem(WorkList, [&](Instruction *I) {
+ BasicBlock *BB = I->getParent();
+ // On the first pass, we only take values that are trivially known, i.e.
+ // where AddAvailableValue was already called in this block.
+ Value *Result = promoteAllocaUserToVector(
+ I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
+ Updater.FindValueForBlock(BB), DeferredLoads);
+ if (Result)
+ Updater.AddAvailableValue(BB, Result);
+ });
- Inst->eraseFromParent();
- } else {
- llvm_unreachable("Unsupported call when promoting alloca to vector");
- }
- break;
- }
+ // Then handle deferred loads.
+ forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
+ SmallVector<LoadInst *, 0> NewDLs;
+ BasicBlock *BB = I->getParent();
+ // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
+ // get a value, inserting PHIs as needed.
+ Value *Result = promoteAllocaUserToVector(
+ I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
+ Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
+ if (Result)
+ Updater.AddAvailableValue(BB, Result);
+ assert(NewDLs.empty() && "No more deferred loads should be queued!");
+ });
- default:
- llvm_unreachable("Inconsistency in instructions promotable to vector");
- }
+ // Delete all instructions. On the first pass, new dummy loads may have been
+ // added so we need to collect them too.
+ DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
+ InstsToDelete.insert(DeferredLoads.begin(), DeferredLoads.end());
+ for (Instruction *I : InstsToDelete) {
+ assert(I->use_empty());
+ I->eraseFromParent();
+ }
+
+ // Delete all the users that are known to be removeable.
+ for (Instruction *I : reverse(UsersToRemove)) {
+ I->dropDroppableUses();
+ assert(I->use_empty());
+ I->eraseFromParent();
}
+
+ // Alloca should now be dead too.
+ assert(Alloca.use_empty());
+ Alloca.eraseFromParent();
return true;
}
+std::pair<Value *, Value *>
+AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
+ Function &F = *Builder.GetInsertBlock()->getParent();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
+
+ if (!IsAMDHSA) {
+ Function *LocalSizeYFn =
+ Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
+ Function *LocalSizeZFn =
+ Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_z);
+
+ CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
+ CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
+
+ ST.makeLIDRangeMetadata(LocalSizeY);
+ ST.makeLIDRangeMetadata(LocalSizeZ);
+
+ return std::pair(LocalSizeY, LocalSizeZ);
+ }
+
+ // We must read the size out of the dispatch pointer.
+ assert(IsAMDGCN);
+
+ // We are indexing into this struct, and want to extract the workgroup_size_*
+ // fields.
+ //
+ // typedef struct hsa_kernel_dispatch_packet_s {
+ // uint16_t header;
+ // uint16_t setup;
+ // uint16_t workgroup_size_x ;
+ // uint16_t workgroup_size_y;
+ // uint16_t workgroup_size_z;
+ // uint16_t reserved0;
+ // uint32_t grid_size_x ;
+ // uint32_t grid_size_y ;
+ // uint32_t grid_size_z;
+ //
+ // uint32_t private_segment_size;
+ // uint32_t group_segment_size;
+ // uint64_t kernel_object;
+ //
+ // #ifdef HSA_LARGE_MODEL
+ // void *kernarg_address;
+ // #elif defined HSA_LITTLE_ENDIAN
+ // void *kernarg_address;
+ // uint32_t reserved1;
+ // #else
+ // uint32_t reserved1;
+ // void *kernarg_address;
+ // #endif
+ // uint64_t reserved2;
+ // hsa_signal_t completion_signal; // uint64_t wrapper
+ // } hsa_kernel_dispatch_packet_t
+ //
+ Function *DispatchPtrFn =
+ Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
+
+ CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
+ DispatchPtr->addRetAttr(Attribute::NoAlias);
+ DispatchPtr->addRetAttr(Attribute::NonNull);
+ F.removeFnAttr("amdgpu-no-dispatch-ptr");
+
+ // Size of the dispatch packet struct.
+ DispatchPtr->addDereferenceableRetAttr(64);
+
+ Type *I32Ty = Type::getInt32Ty(Mod->getContext());
+ Value *CastDispatchPtr = Builder.CreateBitCast(
+ DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
+
+ // We could do a single 64-bit load here, but it's likely that the basic
+ // 32-bit and extract sequence is already present, and it is probably easier
+ // to CSE this. The loads should be mergeable later anyway.
+ Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
+ LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
+
+ Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
+ LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
+
+ MDNode *MD = MDNode::get(Mod->getContext(), std::nullopt);
+ LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
+ LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
+ ST.makeLIDRangeMetadata(LoadZU);
+
+ // Extract y component. Upper half of LoadZU should be zero already.
+ Value *Y = Builder.CreateLShr(LoadXY, 16);
+
+ return std::pair(Y, LoadZU);
+}
+
+Value *AMDGPUPromoteAllocaImpl::getWorkitemID(IRBuilder<> &Builder,
+ unsigned N) {
+ Function *F = Builder.GetInsertBlock()->getParent();
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, *F);
+ Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
+ StringRef AttrName;
+
+ switch (N) {
+ case 0:
+ IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_x
+ : (Intrinsic::ID)Intrinsic::r600_read_tidig_x;
+ AttrName = "amdgpu-no-workitem-id-x";
+ break;
+ case 1:
+ IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_y
+ : (Intrinsic::ID)Intrinsic::r600_read_tidig_y;
+ AttrName = "amdgpu-no-workitem-id-y";
+ break;
+
+ case 2:
+ IntrID = IsAMDGCN ? (Intrinsic::ID)Intrinsic::amdgcn_workitem_id_z
+ : (Intrinsic::ID)Intrinsic::r600_read_tidig_z;
+ AttrName = "amdgpu-no-workitem-id-z";
+ break;
+ default:
+ llvm_unreachable("invalid dimension");
+ }
+
+ Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
+ CallInst *CI = Builder.CreateCall(WorkitemIdFn);
+ ST.makeLIDRangeMetadata(CI);
+ F->removeFnAttr(AttrName);
+
+ return CI;
+}
+
static bool isCallPromotable(CallInst *CI) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
if (!II)
@@ -883,8 +1238,8 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
CurrentLocalMemUsage += Alloc.first;
}
- unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
- F);
+ unsigned MaxOccupancy =
+ ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage, F);
// Restrict local memory usage so that we don't drastically reduce occupancy,
// unless it is already significantly reduced.
@@ -902,10 +1257,9 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
// usage.
MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
-
// Round up to the next tier of usage.
- unsigned MaxSizeWithWaveCount
- = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
+ unsigned MaxSizeWithWaveCount =
+ ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
// Program is possibly broken by using more local mem than available.
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
@@ -924,26 +1278,18 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
}
// FIXME: Should try to pick the most likely to be profitable allocas first.
-bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
- // Array allocations are probably not worth handling, since an allocation of
- // the array type is the canonical form.
- if (!I.isStaticAlloca() || I.isArrayAllocation())
+bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
+ bool SufficientLDS) {
+ LLVM_DEBUG(dbgs() << "Trying to promote to LDS: " << I << '\n');
+
+ if (DisablePromoteAllocaToLDS) {
+ LLVM_DEBUG(dbgs() << " Promote alloca to LDS is disabled\n");
return false;
+ }
const DataLayout &DL = Mod->getDataLayout();
IRBuilder<> Builder(&I);
- // First try to replace the alloca with a vector
- Type *AllocaTy = I.getAllocatedType();
-
- LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
-
- if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
- return true; // Promoted to vector.
-
- if (DisablePromoteAllocaToLDS)
- return false;
-
const Function &ContainingFunction = *I.getParent()->getParent();
CallingConv::ID CC = ContainingFunction.getCallingConv();
@@ -978,7 +1324,8 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// could end up using more than the maximum due to alignment padding.
uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
- uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
+ uint32_t AllocSize =
+ WorkGroupSize * DL.getTypeAllocSize(I.getAllocatedType());
NewSize += AllocSize;
if (NewSize > LocalMemLimit) {
@@ -989,7 +1336,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
CurrentLocalMemUsage = NewSize;
- std::vector<Value*> WorkList;
+ std::vector<Value *> WorkList;
if (!collectUsesWithPtrTypes(&I, &I, WorkList)) {
LLVM_DEBUG(dbgs() << " Do not know how to convert all uses\n");
@@ -1021,10 +1368,8 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
TID = Builder.CreateAdd(TID, TIdZ);
- Value *Indices[] = {
- Constant::getNullValue(Type::getInt32Ty(Mod->getContext())),
- TID
- };
+ LLVMContext &Context = Mod->getContext();
+ Value *Indices[] = {Constant::getNullValue(Type::getInt32Ty(Context)), TID};
Value *Offset = Builder.CreateInBoundsGEP(GVTy, GV, Indices);
I.mutateType(Offset->getType());
@@ -1037,9 +1382,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
CallInst *Call = dyn_cast<CallInst>(V);
if (!Call) {
if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
- Value *Src0 = CI->getOperand(0);
- PointerType *NewTy = PointerType::getWithSamePointeeType(
- cast<PointerType>(Src0->getType()), AMDGPUAS::LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
if (isa<ConstantPointerNull>(CI->getOperand(0)))
CI->setOperand(0, ConstantPointerNull::get(NewTy));
@@ -1055,8 +1398,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
if (isa<AddrSpaceCastInst>(V))
continue;
- PointerType *NewTy = PointerType::getWithSamePointeeType(
- cast<PointerType>(V->getType()), AMDGPUAS::LOCAL_ADDRESS);
+ PointerType *NewTy = PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS);
// FIXME: It doesn't really make sense to try to do this for all
// instructions.
@@ -1116,8 +1458,7 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
Function *ObjectSize = Intrinsic::getDeclaration(
Mod, Intrinsic::objectsize,
{Intr->getType(),
- PointerType::getWithSamePointeeType(
- cast<PointerType>(Src->getType()), AMDGPUAS::LOCAL_ADDRESS)});
+ PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)});
CallInst *NewCall = Builder.CreateCall(
ObjectSize,
@@ -1138,10 +1479,9 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
assert(ID == Intrinsic::memcpy || ID == Intrinsic::memmove);
MemTransferInst *MI = cast<MemTransferInst>(Intr);
- auto *B =
- Builder.CreateMemTransferInst(ID, MI->getRawDest(), MI->getDestAlign(),
- MI->getRawSource(), MI->getSourceAlign(),
- MI->getLength(), MI->isVolatile());
+ auto *B = Builder.CreateMemTransferInst(
+ ID, MI->getRawDest(), MI->getDestAlign(), MI->getRawSource(),
+ MI->getSourceAlign(), MI->getLength(), MI->isVolatile());
for (unsigned I = 0; I != 2; ++I) {
if (uint64_t Bytes = Intr->getParamDereferenceableBytes(I)) {
@@ -1154,80 +1494,3 @@ bool AMDGPUPromoteAllocaImpl::handleAlloca(AllocaInst &I, bool SufficientLDS) {
return true;
}
-
-bool handlePromoteAllocaToVector(AllocaInst &I, unsigned MaxVGPRs) {
- // Array allocations are probably not worth handling, since an allocation of
- // the array type is the canonical form.
- if (!I.isStaticAlloca() || I.isArrayAllocation())
- return false;
-
- LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
-
- Module *Mod = I.getParent()->getParent()->getParent();
- return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
-}
-
-bool promoteAllocasToVector(Function &F, TargetMachine &TM) {
- if (DisablePromoteAllocaToVector)
- return false;
-
- const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
- if (!ST.isPromoteAllocaEnabled())
- return false;
-
- unsigned MaxVGPRs;
- if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
- // A non-entry function has only 32 caller preserved registers.
- // Do not promote alloca which will force spilling.
- if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
- MaxVGPRs = std::min(MaxVGPRs, 32u);
- } else {
- MaxVGPRs = 128;
- }
-
- bool Changed = false;
- BasicBlock &EntryBB = *F.begin();
-
- SmallVector<AllocaInst *, 16> Allocas;
- for (Instruction &I : EntryBB) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
- Allocas.push_back(AI);
- }
-
- for (AllocaInst *AI : Allocas) {
- if (handlePromoteAllocaToVector(*AI, MaxVGPRs))
- Changed = true;
- }
-
- return Changed;
-}
-
-bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
- if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
- return promoteAllocasToVector(F, TPC->getTM<TargetMachine>());
- }
- return false;
-}
-
-PreservedAnalyses
-AMDGPUPromoteAllocaToVectorPass::run(Function &F, FunctionAnalysisManager &AM) {
- bool Changed = promoteAllocasToVector(F, TM);
- if (Changed) {
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
- }
- return PreservedAnalyses::all();
-}
-
-FunctionPass *llvm::createAMDGPUPromoteAlloca() {
- return new AMDGPUPromoteAlloca();
-}
-
-FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
- return new AMDGPUPromoteAllocaToVector();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
index ed450f59e4b3..9b654a2bba7f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteKernelArguments.cpp
@@ -116,7 +116,7 @@ bool AMDGPUPromoteKernelArguments::promotePointer(Value *Ptr) {
// Cast pointer to global address space and back to flat and let
// Infer Address Spaces pass to do all necessary rewriting.
PointerType *NewPT =
- PointerType::getWithSamePointeeType(PT, AMDGPUAS::GLOBAL_ADDRESS);
+ PointerType::get(PT->getContext(), AMDGPUAS::GLOBAL_ADDRESS);
Value *Cast =
B.CreateAddrSpaceCast(Ptr, NewPT, Twine(Ptr->getName(), ".global"));
Value *CastBack =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
deleted file mode 100644
index 5a4ab467731e..000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ /dev/null
@@ -1,426 +0,0 @@
-//===--- AMDGPUPropagateAttributes.cpp --------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// \brief This pass propagates attributes from kernels to the non-entry
-/// functions. Most of the library functions were not compiled for specific ABI,
-/// yet will be correctly compiled if proper attributes are propagated from the
-/// caller.
-///
-/// The pass analyzes call graph and propagates ABI target features through the
-/// call graph.
-///
-/// It can run in two modes: as a function or module pass. A function pass
-/// simply propagates attributes. A module pass clones functions if there are
-/// callers with different ABI. If a function is cloned all call sites will
-/// be updated to use a correct clone.
-///
-/// A function pass is limited in functionality but can run early in the
-/// pipeline. A module pass is more powerful but has to run late, so misses
-/// library folding opportunities.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-
-#define DEBUG_TYPE "amdgpu-propagate-attributes"
-
-using namespace llvm;
-
-namespace llvm {
-extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1];
-}
-
-namespace {
-
-// Target features to propagate.
-static constexpr const FeatureBitset TargetFeatures = {
- AMDGPU::FeatureWavefrontSize16,
- AMDGPU::FeatureWavefrontSize32,
- AMDGPU::FeatureWavefrontSize64
-};
-
-// Attributes to propagate.
-// TODO: Support conservative min/max merging instead of cloning.
-static constexpr const char *AttributeNames[] = {"amdgpu-waves-per-eu"};
-
-static constexpr unsigned NumAttr = std::size(AttributeNames);
-
-class AMDGPUPropagateAttributes {
-
- class FnProperties {
- private:
- explicit FnProperties(const FeatureBitset &&FB) : Features(FB) {}
-
- public:
- explicit FnProperties(const TargetMachine &TM, const Function &F) {
- Features = TM.getSubtargetImpl(F)->getFeatureBits();
-
- for (unsigned I = 0; I < NumAttr; ++I)
- if (F.hasFnAttribute(AttributeNames[I]))
- Attributes[I] = F.getFnAttribute(AttributeNames[I]);
- }
-
- bool operator == (const FnProperties &Other) const {
- if ((Features & TargetFeatures) != (Other.Features & TargetFeatures))
- return false;
- for (unsigned I = 0; I < NumAttr; ++I)
- if (Attributes[I] != Other.Attributes[I])
- return false;
- return true;
- }
-
- FnProperties adjustToCaller(const FnProperties &CallerProps) const {
- FnProperties New((Features & ~TargetFeatures) | CallerProps.Features);
- for (unsigned I = 0; I < NumAttr; ++I)
- New.Attributes[I] = CallerProps.Attributes[I];
- return New;
- }
-
- FeatureBitset Features;
- std::optional<Attribute> Attributes[NumAttr];
- };
-
- class Clone {
- public:
- Clone(const FnProperties &Props, Function *OrigF, Function *NewF) :
- Properties(Props), OrigF(OrigF), NewF(NewF) {}
-
- FnProperties Properties;
- Function *OrigF;
- Function *NewF;
- };
-
- const TargetMachine *TM;
-
- // Clone functions as needed or just set attributes.
- bool AllowClone;
-
- // Option propagation roots.
- SmallSet<Function *, 32> Roots;
-
- // Clones of functions with their attributes.
- SmallVector<Clone, 32> Clones;
-
- // Find a clone with required features.
- Function *findFunction(const FnProperties &PropsNeeded,
- Function *OrigF);
-
- // Clone function \p F and set \p NewProps on the clone.
- // Cole takes the name of original function.
- Function *cloneWithProperties(Function &F, const FnProperties &NewProps);
-
- // Set new function's features in place.
- void setFeatures(Function &F, const FeatureBitset &NewFeatures);
-
- // Set new function's attributes in place.
- void setAttributes(Function &F,
- const ArrayRef<std::optional<Attribute>> NewAttrs);
-
- std::string getFeatureString(const FeatureBitset &Features) const;
-
- // Propagate attributes from Roots.
- bool process();
-
-public:
- AMDGPUPropagateAttributes(const TargetMachine *TM, bool AllowClone) :
- TM(TM), AllowClone(AllowClone) {}
-
- // Use F as a root and propagate its attributes.
- bool process(Function &F);
-
- // Propagate attributes starting from kernel functions.
- bool process(Module &M);
-};
-
-// Allows to propagate attributes early, but no cloning is allowed as it must
-// be a function pass to run before any optimizations.
-// TODO: We shall only need a one instance of module pass, but that needs to be
-// in the linker pipeline which is currently not possible.
-class AMDGPUPropagateAttributesEarly : public FunctionPass {
- const TargetMachine *TM;
-
-public:
- static char ID; // Pass identification
-
- AMDGPUPropagateAttributesEarly(const TargetMachine *TM = nullptr) :
- FunctionPass(ID), TM(TM) {
- initializeAMDGPUPropagateAttributesEarlyPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-};
-
-// Allows to propagate attributes with cloning but does that late in the
-// pipeline.
-class AMDGPUPropagateAttributesLate : public ModulePass {
- const TargetMachine *TM;
-
-public:
- static char ID; // Pass identification
-
- AMDGPUPropagateAttributesLate(const TargetMachine *TM = nullptr) :
- ModulePass(ID), TM(TM) {
- initializeAMDGPUPropagateAttributesLatePass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-};
-
-} // end anonymous namespace.
-
-char AMDGPUPropagateAttributesEarly::ID = 0;
-char AMDGPUPropagateAttributesLate::ID = 0;
-
-INITIALIZE_PASS(AMDGPUPropagateAttributesEarly,
- "amdgpu-propagate-attributes-early",
- "Early propagate attributes from kernels to functions",
- false, false)
-INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
- "amdgpu-propagate-attributes-late",
- "Late propagate attributes from kernels to functions",
- false, false)
-
-Function *
-AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
- Function *OrigF) {
- // TODO: search for clone's clones.
- for (Clone &C : Clones)
- if (C.OrigF == OrigF && PropsNeeded == C.Properties)
- return C.NewF;
-
- return nullptr;
-}
-
-bool AMDGPUPropagateAttributes::process(Module &M) {
- for (auto &F : M.functions())
- if (AMDGPU::isKernel(F.getCallingConv()))
- Roots.insert(&F);
-
- return Roots.empty() ? false : process();
-}
-
-bool AMDGPUPropagateAttributes::process(Function &F) {
- Roots.insert(&F);
- return process();
-}
-
-bool AMDGPUPropagateAttributes::process() {
- bool Changed = false;
- SmallSet<Function *, 32> NewRoots;
- SmallSet<Function *, 32> Replaced;
-
- assert(!Roots.empty());
- Module &M = *(*Roots.begin())->getParent();
-
- do {
- Roots.insert(NewRoots.begin(), NewRoots.end());
- NewRoots.clear();
-
- for (auto &F : M.functions()) {
- if (F.isDeclaration())
- continue;
-
- const FnProperties CalleeProps(*TM, F);
- SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
- SmallSet<CallBase *, 32> Visited;
-
- for (User *U : F.users()) {
- Instruction *I = dyn_cast<Instruction>(U);
- if (!I)
- continue;
- CallBase *CI = dyn_cast<CallBase>(I);
- // Only propagate attributes if F is the called function. Specifically,
- // do not propagate attributes if F is passed as an argument.
- // FIXME: handle bitcasted callee, e.g.
- // %retval = call i8* bitcast (i32* ()* @f to i8* ()*)()
- if (!CI || CI->getCalledOperand() != &F)
- continue;
- Function *Caller = CI->getCaller();
- if (!Caller || !Visited.insert(CI).second)
- continue;
- if (!Roots.count(Caller) && !NewRoots.count(Caller))
- continue;
-
- const FnProperties CallerProps(*TM, *Caller);
-
- if (CalleeProps == CallerProps) {
- if (!Roots.count(&F))
- NewRoots.insert(&F);
- continue;
- }
-
- Function *NewF = findFunction(CallerProps, &F);
- if (!NewF) {
- const FnProperties NewProps = CalleeProps.adjustToCaller(CallerProps);
- if (!AllowClone) {
- // This may set different features on different iterations if
- // there is a contradiction in callers' attributes. In this case
- // we rely on a second pass running on Module, which is allowed
- // to clone.
- setFeatures(F, NewProps.Features);
- setAttributes(F, NewProps.Attributes);
- NewRoots.insert(&F);
- Changed = true;
- break;
- }
-
- NewF = cloneWithProperties(F, NewProps);
- Clones.push_back(Clone(CallerProps, &F, NewF));
- NewRoots.insert(NewF);
- }
-
- ToReplace.push_back(std::pair(CI, NewF));
- Replaced.insert(&F);
-
- Changed = true;
- }
-
- while (!ToReplace.empty()) {
- auto R = ToReplace.pop_back_val();
- R.first->setCalledFunction(R.second);
- }
- }
- } while (!NewRoots.empty());
-
- for (Function *F : Replaced) {
- if (F->use_empty())
- F->eraseFromParent();
- }
-
- Roots.clear();
- Clones.clear();
-
- return Changed;
-}
-
-Function *
-AMDGPUPropagateAttributes::cloneWithProperties(Function &F,
- const FnProperties &NewProps) {
- LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n');
-
- ValueToValueMapTy dummy;
- Function *NewF = CloneFunction(&F, dummy);
- setFeatures(*NewF, NewProps.Features);
- setAttributes(*NewF, NewProps.Attributes);
- NewF->setVisibility(GlobalValue::DefaultVisibility);
- NewF->setLinkage(GlobalValue::InternalLinkage);
-
- // Swap names. If that is the only clone it will retain the name of now
- // dead value. Preserve original name for externally visible functions.
- if (F.hasName() && F.hasLocalLinkage()) {
- std::string NewName = std::string(NewF->getName());
- NewF->takeName(&F);
- F.setName(NewName);
- }
-
- return NewF;
-}
-
-void AMDGPUPropagateAttributes::setFeatures(Function &F,
- const FeatureBitset &NewFeatures) {
- std::string NewFeatureStr = getFeatureString(NewFeatures);
-
- LLVM_DEBUG(dbgs() << "Set features "
- << getFeatureString(NewFeatures & TargetFeatures)
- << " on " << F.getName() << '\n');
-
- F.removeFnAttr("target-features");
- F.addFnAttr("target-features", NewFeatureStr);
-}
-
-void AMDGPUPropagateAttributes::setAttributes(
- Function &F, const ArrayRef<std::optional<Attribute>> NewAttrs) {
- LLVM_DEBUG(dbgs() << "Set attributes on " << F.getName() << ":\n");
- for (unsigned I = 0; I < NumAttr; ++I) {
- F.removeFnAttr(AttributeNames[I]);
- if (NewAttrs[I]) {
- LLVM_DEBUG(dbgs() << '\t' << NewAttrs[I]->getAsString() << '\n');
- F.addFnAttr(*NewAttrs[I]);
- }
- }
-}
-
-std::string
-AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
-{
- std::string Ret;
- for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV) {
- if (Features[KV.Value])
- Ret += (StringRef("+") + KV.Key + ",").str();
- else if (TargetFeatures[KV.Value])
- Ret += (StringRef("-") + KV.Key + ",").str();
- }
- Ret.pop_back(); // Remove last comma.
- return Ret;
-}
-
-bool AMDGPUPropagateAttributesEarly::runOnFunction(Function &F) {
- if (!TM) {
- auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
- if (!TPC)
- return false;
-
- TM = &TPC->getTM<TargetMachine>();
- }
-
- if (!AMDGPU::isKernel(F.getCallingConv()))
- return false;
-
- return AMDGPUPropagateAttributes(TM, false).process(F);
-}
-
-bool AMDGPUPropagateAttributesLate::runOnModule(Module &M) {
- if (!TM) {
- auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
- if (!TPC)
- return false;
-
- TM = &TPC->getTM<TargetMachine>();
- }
-
- return AMDGPUPropagateAttributes(TM, true).process(M);
-}
-
-FunctionPass
-*llvm::createAMDGPUPropagateAttributesEarlyPass(const TargetMachine *TM) {
- return new AMDGPUPropagateAttributesEarly(TM);
-}
-
-ModulePass
-*llvm::createAMDGPUPropagateAttributesLatePass(const TargetMachine *TM) {
- return new AMDGPUPropagateAttributesLate(TM);
-}
-
-PreservedAnalyses
-AMDGPUPropagateAttributesEarlyPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- if (!AMDGPU::isEntryFunctionCC(F.getCallingConv()))
- return PreservedAnalyses::all();
-
- return AMDGPUPropagateAttributes(&TM, false).process(F)
- ? PreservedAnalyses::none()
- : PreservedAnalyses::all();
-}
-
-PreservedAnalyses
-AMDGPUPropagateAttributesLatePass::run(Module &M, ModuleAnalysisManager &AM) {
- return AMDGPUPropagateAttributes(&TM, true).process(M)
- ? PreservedAnalyses::none()
- : PreservedAnalyses::all();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index b4315950b225..c935e384da8e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -20,37 +20,55 @@
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutor.h"
+#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/Target/TargetMachine.h"
+
+#define GET_GICOMBINER_DEPS
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef GET_GICOMBINER_DEPS
+
#define DEBUG_TYPE "amdgpu-regbank-combiner"
using namespace llvm;
using namespace MIPatternMatch;
-class AMDGPURegBankCombinerHelper {
+namespace {
+#define GET_GICOMBINER_TYPES
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef GET_GICOMBINER_TYPES
+
+class AMDGPURegBankCombinerImpl : public GIMatchTableExecutor {
protected:
+ const AMDGPURegBankCombinerImplRuleConfig &RuleConfig;
+
MachineIRBuilder &B;
MachineFunction &MF;
MachineRegisterInfo &MRI;
- const GCNSubtarget &Subtarget;
+ const GCNSubtarget &STI;
const RegisterBankInfo &RBI;
const TargetRegisterInfo &TRI;
const SIInstrInfo &TII;
CombinerHelper &Helper;
+ GISelChangeObserver &Observer;
public:
- AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper)
- : B(B), MF(B.getMF()), MRI(*B.getMRI()),
- Subtarget(MF.getSubtarget<GCNSubtarget>()),
- RBI(*Subtarget.getRegBankInfo()), TRI(*Subtarget.getRegisterInfo()),
- TII(*Subtarget.getInstrInfo()), Helper(Helper){};
+ AMDGPURegBankCombinerImpl(
+ const AMDGPURegBankCombinerImplRuleConfig &RuleConfig,
+ MachineIRBuilder &B, CombinerHelper &Helper,
+ GISelChangeObserver &Observer);
+
+ static const char *getName() { return "AMDGPURegBankCombinerImpl"; }
+
+ bool tryCombineAll(MachineInstr &I) const;
- bool isVgprRegBank(Register Reg);
- Register getAsVgpr(Register Reg);
+ bool isVgprRegBank(Register Reg) const;
+ Register getAsVgpr(Register Reg) const;
struct MinMaxMedOpc {
unsigned Min, Max, Med;
@@ -61,33 +79,58 @@ public:
Register Val0, Val1, Val2;
};
- MinMaxMedOpc getMinMaxPair(unsigned Opc);
+ MinMaxMedOpc getMinMaxPair(unsigned Opc) const;
template <class m_Cst, typename CstTy>
bool matchMed(MachineInstr &MI, MachineRegisterInfo &MRI, MinMaxMedOpc MMMOpc,
- Register &Val, CstTy &K0, CstTy &K1);
+ Register &Val, CstTy &K0, CstTy &K1) const;
- bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
- bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
- bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg);
- bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg);
- void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo);
- void applyClamp(MachineInstr &MI, Register &Reg);
+ bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
+ bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
+ bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg) const;
+ bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg) const;
+ void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) const;
+ void applyClamp(MachineInstr &MI, Register &Reg) const;
private:
- AMDGPU::SIModeRegisterDefaults getMode();
- bool getIEEE();
- bool getDX10Clamp();
- bool isFminnumIeee(const MachineInstr &MI);
- bool isFCst(MachineInstr *MI);
- bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1);
+ SIModeRegisterDefaults getMode() const;
+ bool getIEEE() const;
+ bool getDX10Clamp() const;
+ bool isFminnumIeee(const MachineInstr &MI) const;
+ bool isFCst(MachineInstr *MI) const;
+ bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1) const;
+
+#define GET_GICOMBINER_CLASS_MEMBERS
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef GET_GICOMBINER_CLASS_MEMBERS
+#undef AMDGPUSubtarget
};
-bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) {
+#define GET_GICOMBINER_IMPL
+#define AMDGPUSubtarget GCNSubtarget
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef AMDGPUSubtarget
+#undef GET_GICOMBINER_IMPL
+
+AMDGPURegBankCombinerImpl::AMDGPURegBankCombinerImpl(
+ const AMDGPURegBankCombinerImplRuleConfig &RuleConfig, MachineIRBuilder &B,
+ CombinerHelper &Helper, GISelChangeObserver &Observer)
+ : RuleConfig(RuleConfig), B(B), MF(B.getMF()), MRI(*B.getMRI()),
+ STI(MF.getSubtarget<GCNSubtarget>()), RBI(*STI.getRegBankInfo()),
+ TRI(*STI.getRegisterInfo()), TII(*STI.getInstrInfo()), Helper(Helper),
+ Observer(Observer),
+#define GET_GICOMBINER_CONSTRUCTOR_INITS
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef GET_GICOMBINER_CONSTRUCTOR_INITS
+{
+}
+
+bool AMDGPURegBankCombinerImpl::isVgprRegBank(Register Reg) const {
return RBI.getRegBank(Reg, MRI, TRI)->getID() == AMDGPU::VGPRRegBankID;
}
-Register AMDGPURegBankCombinerHelper::getAsVgpr(Register Reg) {
+Register AMDGPURegBankCombinerImpl::getAsVgpr(Register Reg) const {
if (isVgprRegBank(Reg))
return Reg;
@@ -104,8 +147,8 @@ Register AMDGPURegBankCombinerHelper::getAsVgpr(Register Reg) {
return VgprReg;
}
-AMDGPURegBankCombinerHelper::MinMaxMedOpc
-AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) {
+AMDGPURegBankCombinerImpl::MinMaxMedOpc
+AMDGPURegBankCombinerImpl::getMinMaxPair(unsigned Opc) const {
switch (Opc) {
default:
llvm_unreachable("Unsupported opcode");
@@ -126,10 +169,10 @@ AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) {
}
template <class m_Cst, typename CstTy>
-bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MinMaxMedOpc MMMOpc, Register &Val,
- CstTy &K0, CstTy &K1) {
+bool AMDGPURegBankCombinerImpl::matchMed(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MinMaxMedOpc MMMOpc, Register &Val,
+ CstTy &K0, CstTy &K1) const {
// 4 operand commutes of: min(max(Val, K0), K1).
// Find K1 from outer instr: min(max(...), K1) or min(K1, max(...)).
// Find K0 and Val from inner instr: max(K0, Val) or max(Val, K0).
@@ -147,16 +190,15 @@ bool AMDGPURegBankCombinerHelper::matchMed(MachineInstr &MI,
m_Cst(K0))));
}
-bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3(
- MachineInstr &MI, Med3MatchInfo &MatchInfo) {
+bool AMDGPURegBankCombinerImpl::matchIntMinMaxToMed3(
+ MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
Register Dst = MI.getOperand(0).getReg();
if (!isVgprRegBank(Dst))
return false;
// med3 for i16 is only available on gfx9+, and not available for v2i16.
LLT Ty = MRI.getType(Dst);
- if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) &&
- Ty != LLT::scalar(32))
+ if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32))
return false;
MinMaxMedOpc OpcodeTriple = getMinMaxPair(MI.getOpcode());
@@ -193,14 +235,13 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3(
// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0
// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true)
// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0
-bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3(
- MachineInstr &MI, Med3MatchInfo &MatchInfo) {
+bool AMDGPURegBankCombinerImpl::matchFPMinMaxToMed3(
+ MachineInstr &MI, Med3MatchInfo &MatchInfo) const {
Register Dst = MI.getOperand(0).getReg();
LLT Ty = MRI.getType(Dst);
// med3 for f16 is only available on gfx9+, and not available for v2f16.
- if ((Ty != LLT::scalar(16) || !Subtarget.hasMed3_16()) &&
- Ty != LLT::scalar(32))
+ if ((Ty != LLT::scalar(16) || !STI.hasMed3_16()) && Ty != LLT::scalar(32))
return false;
auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
@@ -233,8 +274,8 @@ bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3(
return false;
}
-bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI,
- Register &Reg) {
+bool AMDGPURegBankCombinerImpl::matchFPMinMaxToClamp(MachineInstr &MI,
+ Register &Reg) const {
// Clamp is available on all types after regbankselect (f16, f32, f64, v2f16).
auto OpcodeTriple = getMinMaxPair(MI.getOpcode());
Register Val;
@@ -269,16 +310,13 @@ bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI,
// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0
// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0
// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0
-bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI,
- Register &Reg) {
- if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3)
- return false;
-
+bool AMDGPURegBankCombinerImpl::matchFPMed3ToClamp(MachineInstr &MI,
+ Register &Reg) const {
// In llvm-ir, clamp is often represented as an intrinsic call to
// @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders.
- MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
- MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI);
- MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI);
+ MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(1).getReg(), MRI);
+ MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI);
+ MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI);
if (isFCst(Src0) && !isFCst(Src1))
std::swap(Src0, Src1);
@@ -311,15 +349,16 @@ bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI,
return false;
}
-void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr &MI, Register &Reg) {
+void AMDGPURegBankCombinerImpl::applyClamp(MachineInstr &MI,
+ Register &Reg) const {
B.setInstrAndDebugLoc(MI);
B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg},
MI.getFlags());
MI.eraseFromParent();
}
-void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI,
- Med3MatchInfo &MatchInfo) {
+void AMDGPURegBankCombinerImpl::applyMed3(MachineInstr &MI,
+ Med3MatchInfo &MatchInfo) const {
B.setInstrAndDebugLoc(MI);
B.buildInstr(MatchInfo.Opc, {MI.getOperand(0)},
{getAsVgpr(MatchInfo.Val0), getAsVgpr(MatchInfo.Val1),
@@ -328,24 +367,26 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI,
MI.eraseFromParent();
}
-AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() {
+SIModeRegisterDefaults AMDGPURegBankCombinerImpl::getMode() const {
return MF.getInfo<SIMachineFunctionInfo>()->getMode();
}
-bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; }
+bool AMDGPURegBankCombinerImpl::getIEEE() const { return getMode().IEEE; }
-bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp; }
+bool AMDGPURegBankCombinerImpl::getDX10Clamp() const {
+ return getMode().DX10Clamp;
+}
-bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) {
+bool AMDGPURegBankCombinerImpl::isFminnumIeee(const MachineInstr &MI) const {
return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE;
}
-bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr *MI) {
+bool AMDGPURegBankCombinerImpl::isFCst(MachineInstr *MI) const {
return MI->getOpcode() == AMDGPU::G_FCONSTANT;
}
-bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0,
- MachineInstr *K1) {
+bool AMDGPURegBankCombinerImpl::isClampZeroToOne(MachineInstr *K0,
+ MachineInstr *K1) const {
if (isFCst(K0) && isFCst(K1)) {
const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm();
const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm();
@@ -355,40 +396,19 @@ bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0,
return false;
}
-class AMDGPURegBankCombinerHelperState {
-protected:
- CombinerHelper &Helper;
- AMDGPURegBankCombinerHelper &RegBankHelper;
-
-public:
- AMDGPURegBankCombinerHelperState(CombinerHelper &Helper,
- AMDGPURegBankCombinerHelper &RegBankHelper)
- : Helper(Helper), RegBankHelper(RegBankHelper) {}
-};
-
-#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
-#include "AMDGPUGenRegBankGICombiner.inc"
-#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
-
-namespace {
-#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
-#include "AMDGPUGenRegBankGICombiner.inc"
-#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
-
class AMDGPURegBankCombinerInfo final : public CombinerInfo {
GISelKnownBits *KB;
MachineDominatorTree *MDT;
+ AMDGPURegBankCombinerImplRuleConfig RuleConfig;
public:
- AMDGPUGenRegBankCombinerHelperRuleConfig GeneratedRuleCfg;
-
AMDGPURegBankCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
- const AMDGPULegalizerInfo *LI,
- GISelKnownBits *KB, MachineDominatorTree *MDT)
+ const AMDGPULegalizerInfo *LI, GISelKnownBits *KB,
+ MachineDominatorTree *MDT)
: CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
/*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
KB(KB), MDT(MDT) {
- if (!GeneratedRuleCfg.parseCommandLineOption())
+ if (!RuleConfig.parseCommandLineOption())
report_fatal_error("Invalid rule identifier");
}
@@ -397,23 +417,15 @@ public:
};
bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer,
- MachineInstr &MI,
- MachineIRBuilder &B) const {
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
CombinerHelper Helper(Observer, B, /* IsPreLegalize*/ false, KB, MDT);
- AMDGPURegBankCombinerHelper RegBankHelper(B, Helper);
- AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg, Helper,
- RegBankHelper);
-
- if (Generated.tryCombineAll(Observer, MI, B))
- return true;
-
- return false;
+ // TODO: Do not re-create the Impl on every inst, it should be per function.
+ AMDGPURegBankCombinerImpl Impl(RuleConfig, B, Helper, Observer);
+ Impl.setupMF(*MI.getMF(), KB);
+ return Impl.tryCombineAll(MI);
}
-#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP
-#include "AMDGPUGenRegBankGICombiner.inc"
-#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP
-
// Pass boilerplate
// ================
@@ -423,9 +435,7 @@ public:
AMDGPURegBankCombiner(bool IsOptNone = false);
- StringRef getPassName() const override {
- return "AMDGPURegBankCombiner";
- }
+ StringRef getPassName() const override { return "AMDGPURegBankCombiner"; }
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -449,7 +459,7 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
}
AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone)
- : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry());
}
@@ -463,14 +473,14 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const AMDGPULegalizerInfo *LI
- = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
+ const AMDGPULegalizerInfo *LI =
+ static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
MachineDominatorTree *MDT =
IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
- AMDGPURegBankCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
- F.hasMinSize(), LI, KB, MDT);
+ AMDGPURegBankCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), F.hasMinSize(),
+ LI, KB, MDT);
Combiner C(PCInfo, TPC);
return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
new file mode 100644
index 000000000000..2ea03ddb1fcc
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
@@ -0,0 +1,77 @@
+//===- AMDGPURegBankSelect.cpp -----------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Use MachineUniformityAnalysis as the primary basis for making SGPR vs. VGPR
+// register bank selection. Use/def analysis as in the default RegBankSelect can
+// be useful in narrower circumstances (e.g. choosing AGPR vs. VGPR for gfx908).
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPURegBankSelect.h"
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/MachineUniformityAnalysis.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "regbankselect"
+
+using namespace llvm;
+
+AMDGPURegBankSelect::AMDGPURegBankSelect(Mode RunningMode)
+ : RegBankSelect(AMDGPURegBankSelect::ID, RunningMode) {}
+
+char AMDGPURegBankSelect::ID = 0;
+
+StringRef AMDGPURegBankSelect::getPassName() const {
+ return "AMDGPURegBankSelect";
+}
+
+void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineCycleInfoWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
+ // TODO: Preserve DomTree
+ RegBankSelect::getAnalysisUsage(AU);
+}
+
+INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE,
+ "AMDGPU Register Bank Select", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE,
+ "AMDGPU Register Bank Select", false, false)
+
+bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
+ // If the ISel pipeline failed, do not bother running that pass.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Assign register banks for: " << MF.getName() << '\n');
+ const Function &F = MF.getFunction();
+ Mode SaveOptMode = OptMode;
+ if (F.hasOptNone())
+ OptMode = Mode::Fast;
+ init(MF);
+
+ assert(checkFunctionIsLegal(MF));
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ MachineCycleInfo &CycleInfo =
+ getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
+ MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();
+
+ MachineUniformityInfo Uniformity =
+ computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
+ !ST.isSingleLaneExecution(F));
+ (void)Uniformity; // TODO: Use this
+
+ assignRegisterBanks(MF);
+
+ OptMode = SaveOptMode;
+ return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.h
new file mode 100644
index 000000000000..83e4a6b41da1
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.h
@@ -0,0 +1,29 @@
+//===- AMDGPURegBankSelect.h -------------------------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKSELECT_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGBANKSELECT_H
+
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+
+namespace llvm {
+
+class AMDGPURegBankSelect final : public RegBankSelect {
+public:
+ static char ID;
+
+ AMDGPURegBankSelect(Mode RunningMode = Fast);
+
+ StringRef getPassName() const override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // namespace llvm
+#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5e16a405f375..0203af32e389 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -215,6 +215,10 @@ static bool isVectorRegisterBank(const RegisterBank &Bank) {
return BankID == AMDGPU::VGPRRegBankID || BankID == AMDGPU::AGPRRegBankID;
}
+bool AMDGPURegisterBankInfo::isDivergentRegBank(const RegisterBank *RB) const {
+ return RB != &AMDGPU::SGPRRegBank;
+}
+
unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
const RegisterBank &Src,
unsigned Size) const {
@@ -846,10 +850,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
- for (MachineOperand &Op : MI.uses()) {
- if (!Op.isReg() || Op.isDef())
- continue;
-
+ for (MachineOperand &Op : MI.all_uses()) {
Register OldReg = Op.getReg();
if (!SGPROperandRegs.count(OldReg))
continue;
@@ -1233,31 +1234,18 @@ bool AMDGPURegisterBankInfo::applyMappingImage(
return true;
}
-static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
- Register Reg) {
- MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
- if (!Def)
- return Reg;
-
- // TODO: Guard against this being an implicit def
- return Def->getOperand(0).getReg();
-}
-
// Analyze a combined offset from an llvm.amdgcn.s.buffer intrinsic and store
// the three offsets (voffset, soffset and instoffset)
-static unsigned setBufferOffsets(MachineIRBuilder &B,
- const AMDGPURegisterBankInfo &RBI,
- Register CombinedOffset, Register &VOffsetReg,
- Register &SOffsetReg, int64_t &InstOffsetVal,
- Align Alignment) {
+unsigned AMDGPURegisterBankInfo::setBufferOffsets(
+ MachineIRBuilder &B, Register CombinedOffset, Register &VOffsetReg,
+ Register &SOffsetReg, int64_t &InstOffsetVal, Align Alignment) const {
const LLT S32 = LLT::scalar(32);
MachineRegisterInfo *MRI = B.getMRI();
if (std::optional<int64_t> Imm =
getIConstantVRegSExtVal(CombinedOffset, *MRI)) {
uint32_t SOffset, ImmOffset;
- if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
- Alignment)) {
+ if (TII->splitMUBUFOffset(*Imm, SOffset, ImmOffset, Alignment)) {
VOffsetReg = B.buildConstant(S32, 0).getReg(0);
SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
InstOffsetVal = ImmOffset;
@@ -1275,9 +1263,9 @@ static unsigned setBufferOffsets(MachineIRBuilder &B,
AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
uint32_t SOffset, ImmOffset;
- if ((int)Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
- &RBI.Subtarget, Alignment)) {
- if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
+ if ((int)Offset > 0 &&
+ TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
+ if (getRegBank(Base, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
VOffsetReg = Base;
SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
@@ -1298,11 +1286,11 @@ static unsigned setBufferOffsets(MachineIRBuilder &B,
// Handle the variable sgpr + vgpr case.
MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI);
if (Add && (int)Offset >= 0) {
- Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
- Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
+ Register Src0 = getSrcRegIgnoringCopies(Add->getOperand(1).getReg(), *MRI);
+ Register Src1 = getSrcRegIgnoringCopies(Add->getOperand(2).getReg(), *MRI);
- const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
- const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
+ const RegisterBank *Src0Bank = getRegBank(Src0, *MRI, *TRI);
+ const RegisterBank *Src1Bank = getRegBank(Src1, *MRI, *TRI);
if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
VOffsetReg = Src0;
@@ -1319,7 +1307,7 @@ static unsigned setBufferOffsets(MachineIRBuilder &B,
// Ensure we have a VGPR for the combined offset. This could be an issue if we
// have an SGPR offset and a VGPR resource.
- if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
+ if (getRegBank(CombinedOffset, *MRI, *TRI) == &AMDGPU::VGPRRegBank) {
VOffsetReg = CombinedOffset;
} else {
VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
@@ -1369,8 +1357,8 @@ bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
Register VOffset;
int64_t ImmOffset = 0;
- unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
- VOffset, SOffset, ImmOffset, Alignment);
+ unsigned MMOOffset = setBufferOffsets(B, MI.getOperand(2).getReg(), VOffset,
+ SOffset, ImmOffset, Alignment);
// TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
// can, but we need to track an MMO for that.
@@ -1804,7 +1792,7 @@ getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
std::pair<Register, unsigned>
AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
Register OrigOffset) const {
- const unsigned MaxImm = 4095;
+ const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
Register BaseReg;
unsigned ImmOffset;
const LLT S32 = LLT::scalar(32);
@@ -1815,13 +1803,14 @@ AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
unsigned C1 = 0;
if (ImmOffset != 0) {
- // If the immediate value is too big for the immoffset field, put the value
- // and -4096 into the immoffset field so that the value that is copied/added
- // for the voffset field is a multiple of 4096, and it stands more chance
- // of being CSEd with the copy/add for another similar load/store.
- // However, do not do that rounding down to a multiple of 4096 if that is a
- // negative number, as it appears to be illegal to have a negative offset
- // in the vgpr, even if adding the immediate offset makes it positive.
+ // If the immediate value is too big for the immoffset field, put only bits
+ // that would normally fit in the immoffset field. The remaining value that
+ // is copied/added for the voffset field is a large power of 2, and it
+ // stands more chance of being CSEd with the copy/add for another similar
+ // load/store.
+ // However, do not do that rounding down if that is a negative
+ // number, as it appears to be illegal to have a negative offset in the
+ // vgpr, even if adding the immediate offset makes it positive.
unsigned Overflow = ImmOffset & ~MaxImm;
ImmOffset -= Overflow;
if ((int32_t)Overflow < 0) {
@@ -3016,6 +3005,10 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case Intrinsic::amdgcn_ubfe:
applyMappingBFE(OpdMapper, false);
return;
+ case Intrinsic::amdgcn_inverse_ballot:
+ applyDefaultMapping(OpdMapper);
+ constrainOpWithReadfirstlane(MI, MRI, 2); // Mask
+ return;
case Intrinsic::amdgcn_ballot:
// Use default handling and insert copy to vcc source.
break;
@@ -3082,14 +3075,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, 2);
return;
}
- case Intrinsic::amdgcn_raw_buffer_load_lds: {
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
constrainOpWithReadfirstlane(MI, MRI, 5); // soffset
return;
}
- case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ case Intrinsic::amdgcn_struct_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
applyDefaultMapping(OpdMapper);
constrainOpWithReadfirstlane(MI, MRI, 1); // rsrc
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
@@ -3745,6 +3740,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FPEXT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2:
+ case AMDGPU::G_FLDEXP:
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
@@ -3755,6 +3751,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_STRICT_FSUB:
case AMDGPU::G_STRICT_FMUL:
case AMDGPU::G_STRICT_FMA:
+ case AMDGPU::G_STRICT_FLDEXP:
case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
case AMDGPU::G_FSHR: // TODO: Expand for scalar
case AMDGPU::G_AMDGPU_FMIN_LEGACY:
@@ -3766,6 +3763,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
case AMDGPU::G_AMDGPU_CVT_PK_I16_I32:
case AMDGPU::G_AMDGPU_SMED3:
+ case AMDGPU::G_AMDGPU_FMED3:
return getDefaultMappingVOP(MI);
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
@@ -4209,6 +4207,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_sin:
case Intrinsic::amdgcn_cos:
case Intrinsic::amdgcn_log_clamp:
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_exp2:
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_sqrt:
@@ -4217,7 +4217,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_rsq_clamp:
case Intrinsic::amdgcn_fmul_legacy:
case Intrinsic::amdgcn_fma_legacy:
- case Intrinsic::amdgcn_ldexp:
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp:
case Intrinsic::amdgcn_fract:
@@ -4506,6 +4505,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
break;
}
+ case Intrinsic::amdgcn_inverse_ballot: {
+ // This must be an SGPR, but accept a VGPR.
+ Register MaskReg = MI.getOperand(2).getReg();
+ unsigned MaskSize = MRI.getType(MaskReg).getSizeInBits();
+ unsigned MaskBank = getRegBankID(MaskReg, MRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ OpdsMapping[2] = AMDGPU::getValueMapping(MaskBank, MaskSize);
+ break;
+ }
+ case Intrinsic::amdgcn_wave_reduce_umin:
+ case Intrinsic::amdgcn_wave_reduce_umax: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
+ unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ auto regBankID =
+ isSALUMapping(MI) ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+ OpdsMapping[2] = AMDGPU::getValueMapping(regBankID, OpSize);
+ break;
+ }
}
break;
}
@@ -4636,7 +4654,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_raw_buffer_load:
- case Intrinsic::amdgcn_raw_tbuffer_load: {
+ case Intrinsic::amdgcn_raw_ptr_buffer_load:
+ case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
// FIXME: Should make intrinsic ID the last operand of the instruction,
// then this would be the same as store
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
@@ -4645,7 +4665,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
- case Intrinsic::amdgcn_raw_buffer_load_lds: {
+ case Intrinsic::amdgcn_raw_buffer_load_lds:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
@@ -4653,8 +4674,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_raw_buffer_store:
+ case Intrinsic::amdgcn_raw_ptr_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format:
- case Intrinsic::amdgcn_raw_tbuffer_store: {
+ case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
+ case Intrinsic::amdgcn_raw_tbuffer_store:
+ case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
@@ -4662,7 +4686,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_struct_buffer_load:
- case Intrinsic::amdgcn_struct_tbuffer_load: {
+ case Intrinsic::amdgcn_struct_ptr_buffer_load:
+ case Intrinsic::amdgcn_struct_tbuffer_load:
+ case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
@@ -4670,7 +4696,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
break;
}
- case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ case Intrinsic::amdgcn_struct_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
@@ -4679,7 +4706,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case Intrinsic::amdgcn_struct_buffer_store:
- case Intrinsic::amdgcn_struct_tbuffer_store: {
+ case Intrinsic::amdgcn_struct_ptr_buffer_store:
+ case Intrinsic::amdgcn_struct_tbuffer_store:
+ case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
@@ -4828,9 +4857,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_UMAX:
case AMDGPU::G_ATOMICRMW_UMIN:
case AMDGPU::G_ATOMICRMW_FADD:
+ case AMDGPU::G_ATOMICRMW_UINC_WRAP:
+ case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
- case AMDGPU::G_AMDGPU_ATOMIC_INC:
- case AMDGPU::G_AMDGPU_ATOMIC_DEC:
case AMDGPU::G_AMDGPU_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_ATOMIC_FMAX: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index c9741c2202e6..78214d7a1058 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -82,6 +82,9 @@ public:
applyMappingImage(MachineInstr &MI,
const OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI, int RSrcIdx) const;
+ unsigned setBufferOffsets(MachineIRBuilder &B, Register CombinedOffset,
+ Register &VOffsetReg, Register &SOffsetReg,
+ int64_t &InstOffsetVal, Align Alignment) const;
bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
bool applyMappingBFE(const OperandsMapper &OpdMapper, bool Signed) const;
@@ -165,6 +168,8 @@ public:
public:
AMDGPURegisterBankInfo(const GCNSubtarget &STI);
+ bool isDivergentRegBank(const RegisterBank *RB) const override;
+
unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
unsigned Size) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp
deleted file mode 100644
index b7521540c020..000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUReleaseVGPRs.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===- AMDGPUReleaseVGPRs.cpp - Automatically release vgprs on GFX11+ -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Insert S_SENDMSG instructions to release vgprs on GFX11+.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPUSubtarget.h"
-#include "GCNSubtarget.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIDefines.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include <optional>
-using namespace llvm;
-
-#define DEBUG_TYPE "release-vgprs"
-
-namespace {
-
-class AMDGPUReleaseVGPRs : public MachineFunctionPass {
-public:
- static char ID;
-
- AMDGPUReleaseVGPRs() : MachineFunctionPass(ID) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- // Track if the last instruction referencing a vgpr in a MBB is a VMEM
- // store. Because this pass is late in the pipeline, it is expected that the
- // last vgpr use will likely be one of vmem store, ds, exp.
- // Loads and others vgpr operations would have been
- // deleted by this point, except for complex control flow involving loops.
- // This is why we are just testing the type of instructions rather
- // than the operands.
- class LastVGPRUseIsVMEMStore {
- BitVector BlockVMEMStore;
-
- static std::optional<bool>
- lastVGPRUseIsStore(const MachineBasicBlock &MBB) {
- for (auto &MI : reverse(MBB.instrs())) {
- // If it's a VMEM store, a VGPR will be used, return true.
- if ((SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI)) &&
- MI.mayStore())
- return true;
-
- // If it's referencing a VGPR but is not a VMEM store, return false.
- if (SIInstrInfo::isDS(MI) || SIInstrInfo::isEXP(MI) ||
- SIInstrInfo::isVMEM(MI) || SIInstrInfo::isFLAT(MI) ||
- SIInstrInfo::isVALU(MI))
- return false;
- }
- // Wait until the values are propagated from the predecessors
- return std::nullopt;
- }
-
- public:
- LastVGPRUseIsVMEMStore(const MachineFunction &MF)
- : BlockVMEMStore(MF.getNumBlockIDs()) {
-
- df_iterator_default_set<const MachineBasicBlock *> Visited;
- SmallVector<const MachineBasicBlock *> EndWithVMEMStoreBlocks;
-
- for (const auto &MBB : MF) {
- auto LastUseIsStore = lastVGPRUseIsStore(MBB);
- if (!LastUseIsStore.has_value())
- continue;
-
- if (*LastUseIsStore) {
- EndWithVMEMStoreBlocks.push_back(&MBB);
- } else {
- Visited.insert(&MBB);
- }
- }
-
- for (const auto *MBB : EndWithVMEMStoreBlocks) {
- for (const auto *Succ : depth_first_ext(MBB, Visited)) {
- BlockVMEMStore[Succ->getNumber()] = true;
- }
- }
- }
-
- // Return true if the last instruction referencing a vgpr in this MBB
- // is a VMEM store, otherwise return false.
- bool isLastVGPRUseVMEMStore(const MachineBasicBlock &MBB) const {
- return BlockVMEMStore[MBB.getNumber()];
- }
- };
-
- static bool
- runOnMachineBasicBlock(MachineBasicBlock &MBB, const SIInstrInfo *SII,
- const LastVGPRUseIsVMEMStore &BlockVMEMStore) {
-
- bool Changed = false;
-
- for (auto &MI : MBB.terminators()) {
- // Look for S_ENDPGM instructions
- if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
- MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
- // If the last instruction using a VGPR in the block is a VMEM store,
- // release VGPRs. The VGPRs release will be placed just before ending
- // the program
- if (BlockVMEMStore.isLastVGPRUseVMEMStore(MBB)) {
- BuildMI(MBB, MI, DebugLoc(), SII->get(AMDGPU::S_SENDMSG))
- .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
- Changed = true;
- }
- }
- }
-
- return Changed;
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override {
- Function &F = MF.getFunction();
- if (skipFunction(F) || !AMDGPU::isEntryFunctionCC(F.getCallingConv()))
- return false;
-
- // This pass only runs on GFX11+
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (ST.getGeneration() < AMDGPUSubtarget::GFX11)
- return false;
-
- LLVM_DEBUG(dbgs() << "AMDGPUReleaseVGPRs running on " << MF.getName()
- << "\n");
-
- const SIInstrInfo *SII = ST.getInstrInfo();
- LastVGPRUseIsVMEMStore BlockVMEMStore(MF);
-
- bool Changed = false;
- for (auto &MBB : MF) {
- Changed |= runOnMachineBasicBlock(MBB, SII, BlockVMEMStore);
- }
-
- return Changed;
- }
-};
-
-} // namespace
-
-char AMDGPUReleaseVGPRs::ID = 0;
-
-char &llvm::AMDGPUReleaseVGPRsID = AMDGPUReleaseVGPRs::ID;
-
-INITIALIZE_PASS(AMDGPUReleaseVGPRs, DEBUG_TYPE, "Release VGPRs", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
new file mode 100644
index 000000000000..580352fb8cf4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
@@ -0,0 +1,186 @@
+//===-- AMDGPURemoveIncompatibleFunctions.cpp -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass replaces all uses of functions that use GPU features
+/// incompatible with the current GPU with null then deletes the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+
+#define DEBUG_TYPE "amdgpu-remove-incompatible-functions"
+
+using namespace llvm;
+
+namespace llvm {
+extern const SubtargetFeatureKV
+ AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures - 1];
+}
+
+namespace {
+
+using Generation = AMDGPUSubtarget::Generation;
+
+class AMDGPURemoveIncompatibleFunctions : public ModulePass {
+public:
+ static char ID;
+
+ AMDGPURemoveIncompatibleFunctions(const TargetMachine *TM = nullptr)
+ : ModulePass(ID), TM(TM) {
+ assert(TM && "No TargetMachine!");
+ }
+
+ StringRef getPassName() const override {
+ return "AMDGPU Remove Incompatible Functions";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {}
+
+ /// Checks a single function, returns true if the function must be deleted.
+ bool checkFunction(Function &F);
+
+ bool runOnModule(Module &M) override {
+ assert(TM->getTargetTriple().isAMDGCN());
+
+ SmallVector<Function *, 4> FnsToDelete;
+ for (Function &F : M) {
+ if (checkFunction(F))
+ FnsToDelete.push_back(&F);
+ }
+
+ for (Function *F : FnsToDelete) {
+ F->replaceAllUsesWith(ConstantPointerNull::get(F->getType()));
+ F->eraseFromParent();
+ }
+ return !FnsToDelete.empty();
+ }
+
+private:
+ const TargetMachine *TM = nullptr;
+};
+
+StringRef getFeatureName(unsigned Feature) {
+ for (const SubtargetFeatureKV &KV : AMDGPUFeatureKV)
+ if (Feature == KV.Value)
+ return KV.Key;
+
+ llvm_unreachable("Unknown Target feature");
+}
+
+const SubtargetSubTypeKV *getGPUInfo(const GCNSubtarget &ST,
+ StringRef GPUName) {
+ for (const SubtargetSubTypeKV &KV : ST.getAllProcessorDescriptions())
+ if (StringRef(KV.Key) == GPUName)
+ return &KV;
+
+ return nullptr;
+}
+
+constexpr unsigned FeaturesToCheck[] = {
+ AMDGPU::FeatureGFX11Insts, AMDGPU::FeatureGFX10Insts,
+ AMDGPU::FeatureGFX9Insts, AMDGPU::FeatureGFX8Insts,
+ AMDGPU::FeatureDPP, AMDGPU::Feature16BitInsts,
+ AMDGPU::FeatureDot1Insts, AMDGPU::FeatureDot2Insts,
+ AMDGPU::FeatureDot3Insts, AMDGPU::FeatureDot4Insts,
+ AMDGPU::FeatureDot5Insts, AMDGPU::FeatureDot6Insts,
+ AMDGPU::FeatureDot7Insts, AMDGPU::FeatureDot8Insts,
+};
+
+FeatureBitset expandImpliedFeatures(const FeatureBitset &Features) {
+ FeatureBitset Result = Features;
+ for (const SubtargetFeatureKV &FE : AMDGPUFeatureKV) {
+ if (Features.test(FE.Value) && FE.Implies.any())
+ Result |= expandImpliedFeatures(FE.Implies.getAsBitset());
+ }
+ return Result;
+}
+
+void reportFunctionRemoved(Function &F, unsigned Feature) {
+ OptimizationRemarkEmitter ORE(&F);
+ ORE.emit([&]() {
+ // Note: we print the function name as part of the diagnostic because if
+ // debug info is not present, users get "<unknown>:0:0" as the debug
+ // loc. If we didn't print the function name there would be no way to
+ // tell which function got removed.
+ return OptimizationRemark(DEBUG_TYPE, "AMDGPUIncompatibleFnRemoved", &F)
+ << "removing function '" << F.getName() << "': +"
+ << getFeatureName(Feature)
+ << " is not supported on the current target";
+ });
+ return;
+}
+} // end anonymous namespace
+
+bool AMDGPURemoveIncompatibleFunctions::checkFunction(Function &F) {
+ if (F.isDeclaration())
+ return false;
+
+ const GCNSubtarget *ST =
+ static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F));
+
+ // Check the GPU isn't generic. Generic is used for testing only
+ // and we don't want this pass to interfere with it.
+ StringRef GPUName = ST->getCPU();
+ if (GPUName.empty() || GPUName.contains("generic"))
+ return false;
+
+ // Try to fetch the GPU's info. If we can't, it's likely an unknown processor
+ // so just bail out.
+ const SubtargetSubTypeKV *GPUInfo = getGPUInfo(*ST, GPUName);
+ if (!GPUInfo)
+ return false;
+
+ // Get all the features implied by the current GPU, and recursively expand
+ // the features that imply other features.
+ //
+ // e.g. GFX90A implies FeatureGFX9, and FeatureGFX9 implies a whole set of
+ // other features.
+ const FeatureBitset GPUFeatureBits =
+ expandImpliedFeatures(GPUInfo->Implies.getAsBitset());
+
+ // Now that the have a FeatureBitset containing all possible features for
+ // the chosen GPU, check our list of "suspicious" features.
+
+ // Check that the user didn't enable any features that aren't part of that
+ // GPU's feature set. We only check a predetermined set of features.
+ for (unsigned Feature : FeaturesToCheck) {
+ if (ST->hasFeature(Feature) && !GPUFeatureBits.test(Feature)) {
+ reportFunctionRemoved(F, Feature);
+ return true;
+ }
+ }
+
+ // Delete FeatureWavefrontSize32 functions for
+ // gfx9 and below targets that don't support the mode.
+ // gfx10+ is implied to support both wave32 and 64 features.
+ // They are not in the feature set. So, we need a separate check
+ if (ST->getGeneration() < AMDGPUSubtarget::GFX10 &&
+ ST->hasFeature(AMDGPU::FeatureWavefrontSize32)) {
+ reportFunctionRemoved(F, AMDGPU::FeatureWavefrontSize32);
+ return true;
+ }
+ return false;
+}
+
+INITIALIZE_PASS(AMDGPURemoveIncompatibleFunctions, DEBUG_TYPE,
+ "AMDGPU Remove Incompatible Functions", false, false)
+
+char AMDGPURemoveIncompatibleFunctions::ID = 0;
+
+ModulePass *
+llvm::createAMDGPURemoveIncompatibleFunctionsPass(const TargetMachine *TM) {
+ return new AMDGPURemoveIncompatibleFunctions(TM);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
deleted file mode 100644
index 299ac106ebee..000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPUReplaceLDSUseWithPointer.cpp
+++ /dev/null
@@ -1,648 +0,0 @@
-//===-- AMDGPUReplaceLDSUseWithPointer.cpp --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass replaces all the uses of LDS within non-kernel functions by
-// corresponding pointer counter-parts.
-//
-// The main motivation behind this pass is - to *avoid* subsequent LDS lowering
-// pass from directly packing LDS (assume large LDS) into a struct type which
-// would otherwise cause allocating huge memory for struct instance within every
-// kernel.
-//
-// Brief sketch of the algorithm implemented in this pass is as below:
-//
-// 1. Collect all the LDS defined in the module which qualify for pointer
-// replacement, say it is, LDSGlobals set.
-//
-// 2. Collect all the reachable callees for each kernel defined in the module,
-// say it is, KernelToCallees map.
-//
-// 3. FOR (each global GV from LDSGlobals set) DO
-// LDSUsedNonKernels = Collect all non-kernel functions which use GV.
-// FOR (each kernel K in KernelToCallees map) DO
-// ReachableCallees = KernelToCallees[K]
-// ReachableAndLDSUsedCallees =
-// SetIntersect(LDSUsedNonKernels, ReachableCallees)
-// IF (ReachableAndLDSUsedCallees is not empty) THEN
-// Pointer = Create a pointer to point-to GV if not created.
-// Initialize Pointer to point-to GV within kernel K.
-// ENDIF
-// ENDFOR
-// Replace all uses of GV within non kernel functions by Pointer.
-// ENFOR
-//
-// LLVM IR example:
-//
-// Input IR:
-//
-// @lds = internal addrspace(3) global [4 x i32] undef, align 16
-//
-// define internal void @f0() {
-// entry:
-// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @lds,
-// i32 0, i32 0
-// ret void
-// }
-//
-// define protected amdgpu_kernel void @k0() {
-// entry:
-// call void @f0()
-// ret void
-// }
-//
-// Output IR:
-//
-// @lds = internal addrspace(3) global [4 x i32] undef, align 16
-// @lds.ptr = internal unnamed_addr addrspace(3) global i16 undef, align 2
-//
-// define internal void @f0() {
-// entry:
-// %0 = load i16, i16 addrspace(3)* @lds.ptr, align 2
-// %1 = getelementptr i8, i8 addrspace(3)* null, i16 %0
-// %2 = bitcast i8 addrspace(3)* %1 to [4 x i32] addrspace(3)*
-// %gep = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* %2,
-// i32 0, i32 0
-// ret void
-// }
-//
-// define protected amdgpu_kernel void @k0() {
-// entry:
-// store i16 ptrtoint ([4 x i32] addrspace(3)* @lds to i16),
-// i16 addrspace(3)* @lds.ptr, align 2
-// call void @f0()
-// ret void
-// }
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "GCNSubtarget.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "Utils/AMDGPUMemoryUtils.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/ReplaceConstant.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
-#include <vector>
-
-#define DEBUG_TYPE "amdgpu-replace-lds-use-with-pointer"
-
-using namespace llvm;
-
-namespace {
-
-namespace AMDGPU {
-/// Collect all the instructions where user \p U belongs to. \p U could be
-/// instruction itself or it could be a constant expression which is used within
-/// an instruction. If \p CollectKernelInsts is true, collect instructions only
-/// from kernels, otherwise collect instructions only from non-kernel functions.
-DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
-getFunctionToInstsMap(User *U, bool CollectKernelInsts);
-
-SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV);
-
-} // namespace AMDGPU
-
-class ReplaceLDSUseImpl {
- Module &M;
- LLVMContext &Ctx;
- const DataLayout &DL;
- Constant *LDSMemBaseAddr;
-
- DenseMap<GlobalVariable *, GlobalVariable *> LDSToPointer;
- DenseMap<GlobalVariable *, SmallPtrSet<Function *, 8>> LDSToNonKernels;
- DenseMap<Function *, SmallPtrSet<Function *, 8>> KernelToCallees;
- DenseMap<Function *, SmallPtrSet<GlobalVariable *, 8>> KernelToLDSPointers;
- DenseMap<Function *, BasicBlock *> KernelToInitBB;
- DenseMap<Function *, DenseMap<GlobalVariable *, Value *>>
- FunctionToLDSToReplaceInst;
-
- // Collect LDS which requires their uses to be replaced by pointer.
- std::vector<GlobalVariable *> collectLDSRequiringPointerReplace() {
- // Collect LDS which requires module lowering.
- std::vector<GlobalVariable *> LDSGlobals =
- llvm::AMDGPU::findLDSVariablesToLower(M, nullptr);
-
- // Remove LDS which don't qualify for replacement.
- llvm::erase_if(LDSGlobals, [&](GlobalVariable *GV) {
- return shouldIgnorePointerReplacement(GV);
- });
-
- return LDSGlobals;
- }
-
- // Returns true if uses of given LDS global within non-kernel functions should
- // be keep as it is without pointer replacement.
- bool shouldIgnorePointerReplacement(GlobalVariable *GV) {
- // LDS whose size is very small and doesn't exceed pointer size is not worth
- // replacing.
- if (DL.getTypeAllocSize(GV->getValueType()) <= 2)
- return true;
-
- // LDS which is not used from non-kernel function scope or it is used from
- // global scope does not qualify for replacement.
- LDSToNonKernels[GV] = AMDGPU::collectNonKernelAccessorsOfLDS(GV);
- return LDSToNonKernels[GV].empty();
-
- // FIXME: When GV is used within all (or within most of the kernels), then
- // it does not make sense to create a pointer for it.
- }
-
- // Insert new global LDS pointer which points to LDS.
- GlobalVariable *createLDSPointer(GlobalVariable *GV) {
- // LDS pointer which points to LDS is already created? Return it.
- auto PointerEntry = LDSToPointer.insert(std::pair(GV, nullptr));
- if (!PointerEntry.second)
- return PointerEntry.first->second;
-
- // We need to create new LDS pointer which points to LDS.
- //
- // Each CU owns at max 64K of LDS memory, so LDS address ranges from 0 to
- // 2^16 - 1. Hence 16 bit pointer is enough to hold the LDS address.
- auto *I16Ty = Type::getInt16Ty(Ctx);
- GlobalVariable *LDSPointer = new GlobalVariable(
- M, I16Ty, false, GlobalValue::InternalLinkage, UndefValue::get(I16Ty),
- GV->getName() + Twine(".ptr"), nullptr, GlobalVariable::NotThreadLocal,
- AMDGPUAS::LOCAL_ADDRESS);
-
- LDSPointer->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- LDSPointer->setAlignment(llvm::AMDGPU::getAlign(DL, LDSPointer));
-
- // Mark that an associated LDS pointer is created for LDS.
- LDSToPointer[GV] = LDSPointer;
-
- return LDSPointer;
- }
-
- // Split entry basic block in such a way that only lane 0 of each wave does
- // the LDS pointer initialization, and return newly created basic block.
- BasicBlock *activateLaneZero(Function *K) {
- // If the entry basic block of kernel K is already split, then return
- // newly created basic block.
- auto BasicBlockEntry = KernelToInitBB.insert(std::pair(K, nullptr));
- if (!BasicBlockEntry.second)
- return BasicBlockEntry.first->second;
-
- // Split entry basic block of kernel K.
- auto *EI = &(*(K->getEntryBlock().getFirstInsertionPt()));
- IRBuilder<> Builder(EI);
-
- Value *Mbcnt =
- Builder.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
- {Builder.getInt32(-1), Builder.getInt32(0)});
- Value *Cond = Builder.CreateICmpEQ(Mbcnt, Builder.getInt32(0));
- Instruction *WB = cast<Instruction>(
- Builder.CreateIntrinsic(Intrinsic::amdgcn_wave_barrier, {}, {}));
-
- BasicBlock *NBB = SplitBlockAndInsertIfThen(Cond, WB, false)->getParent();
-
- // Mark that the entry basic block of kernel K is split.
- KernelToInitBB[K] = NBB;
-
- return NBB;
- }
-
- // Within given kernel, initialize given LDS pointer to point to given LDS.
- void initializeLDSPointer(Function *K, GlobalVariable *GV,
- GlobalVariable *LDSPointer) {
- // If LDS pointer is already initialized within K, then nothing to do.
- auto PointerEntry = KernelToLDSPointers.insert(
- std::pair(K, SmallPtrSet<GlobalVariable *, 8>()));
- if (!PointerEntry.second)
- if (PointerEntry.first->second.contains(LDSPointer))
- return;
-
- // Insert instructions at EI which initialize LDS pointer to point-to LDS
- // within kernel K.
- //
- // That is, convert pointer type of GV to i16, and then store this converted
- // i16 value within LDSPointer which is of type i16*.
- auto *EI = &(*(activateLaneZero(K)->getFirstInsertionPt()));
- IRBuilder<> Builder(EI);
- Builder.CreateStore(Builder.CreatePtrToInt(GV, Type::getInt16Ty(Ctx)),
- LDSPointer);
-
- // Mark that LDS pointer is initialized within kernel K.
- KernelToLDSPointers[K].insert(LDSPointer);
- }
-
- // We have created an LDS pointer for LDS, and initialized it to point-to LDS
- // within all relevant kernels. Now replace all the uses of LDS within
- // non-kernel functions by LDS pointer.
- void replaceLDSUseByPointer(GlobalVariable *GV, GlobalVariable *LDSPointer) {
- SmallVector<User *, 8> LDSUsers(GV->users());
- for (auto *U : LDSUsers) {
- // When `U` is a constant expression, it is possible that same constant
- // expression exists within multiple instructions, and within multiple
- // non-kernel functions. Collect all those non-kernel functions and all
- // those instructions within which `U` exist.
- auto FunctionToInsts =
- AMDGPU::getFunctionToInstsMap(U, false /*=CollectKernelInsts*/);
-
- for (const auto &FunctionToInst : FunctionToInsts) {
- Function *F = FunctionToInst.first;
- auto &Insts = FunctionToInst.second;
- for (auto *I : Insts) {
- // If `U` is a constant expression, then we need to break the
- // associated instruction into a set of separate instructions by
- // converting constant expressions into instructions.
- SmallPtrSet<Instruction *, 8> UserInsts;
-
- if (U == I) {
- // `U` is an instruction, conversion from constant expression to
- // set of instructions is *not* required.
- UserInsts.insert(I);
- } else {
- // `U` is a constant expression, convert it into corresponding set
- // of instructions.
- auto *CE = cast<ConstantExpr>(U);
- convertConstantExprsToInstructions(I, CE, &UserInsts);
- }
-
- // Go through all the user instructions, if LDS exist within them as
- // an operand, then replace it by replace instruction.
- for (auto *II : UserInsts) {
- auto *ReplaceInst = getReplacementInst(F, GV, LDSPointer);
- II->replaceUsesOfWith(GV, ReplaceInst);
- }
- }
- }
- }
- }
-
- // Create a set of replacement instructions which together replace LDS within
- // non-kernel function F by accessing LDS indirectly using LDS pointer.
- Value *getReplacementInst(Function *F, GlobalVariable *GV,
- GlobalVariable *LDSPointer) {
- // If the instruction which replaces LDS within F is already created, then
- // return it.
- auto LDSEntry = FunctionToLDSToReplaceInst.insert(
- std::pair(F, DenseMap<GlobalVariable *, Value *>()));
- if (!LDSEntry.second) {
- auto ReplaceInstEntry =
- LDSEntry.first->second.insert(std::pair(GV, nullptr));
- if (!ReplaceInstEntry.second)
- return ReplaceInstEntry.first->second;
- }
-
- // Get the instruction insertion point within the beginning of the entry
- // block of current non-kernel function.
- auto *EI = &(*(F->getEntryBlock().getFirstInsertionPt()));
- IRBuilder<> Builder(EI);
-
- // Insert required set of instructions which replace LDS within F.
- auto *V = Builder.CreateBitCast(
- Builder.CreateGEP(
- Builder.getInt8Ty(), LDSMemBaseAddr,
- Builder.CreateLoad(LDSPointer->getValueType(), LDSPointer)),
- GV->getType());
-
- // Mark that the replacement instruction which replace LDS within F is
- // created.
- FunctionToLDSToReplaceInst[F][GV] = V;
-
- return V;
- }
-
-public:
- ReplaceLDSUseImpl(Module &M)
- : M(M), Ctx(M.getContext()), DL(M.getDataLayout()) {
- LDSMemBaseAddr = Constant::getIntegerValue(
- PointerType::get(Type::getInt8Ty(M.getContext()),
- AMDGPUAS::LOCAL_ADDRESS),
- APInt(32, 0));
- }
-
- // Entry-point function which interface ReplaceLDSUseImpl with outside of the
- // class.
- bool replaceLDSUse();
-
-private:
- // For a given LDS from collected LDS globals set, replace its non-kernel
- // function scope uses by pointer.
- bool replaceLDSUse(GlobalVariable *GV);
-};
-
-// For given LDS from collected LDS globals set, replace its non-kernel function
-// scope uses by pointer.
-bool ReplaceLDSUseImpl::replaceLDSUse(GlobalVariable *GV) {
- // Holds all those non-kernel functions within which LDS is being accessed.
- SmallPtrSet<Function *, 8> &LDSAccessors = LDSToNonKernels[GV];
-
- // The LDS pointer which points to LDS and replaces all the uses of LDS.
- GlobalVariable *LDSPointer = nullptr;
-
- // Traverse through each kernel K, check and if required, initialize the
- // LDS pointer to point to LDS within K.
- for (const auto &KernelToCallee : KernelToCallees) {
- Function *K = KernelToCallee.first;
- SmallPtrSet<Function *, 8> Callees = KernelToCallee.second;
-
- // Compute reachable and LDS used callees for kernel K.
- set_intersect(Callees, LDSAccessors);
-
- // None of the LDS accessing non-kernel functions are reachable from
- // kernel K. Hence, no need to initialize LDS pointer within kernel K.
- if (Callees.empty())
- continue;
-
- // We have found reachable and LDS used callees for kernel K, and we need to
- // initialize LDS pointer within kernel K, and we need to replace LDS use
- // within those callees by LDS pointer.
- //
- // But, first check if LDS pointer is already created, if not create one.
- LDSPointer = createLDSPointer(GV);
-
- // Initialize LDS pointer to point to LDS within kernel K.
- initializeLDSPointer(K, GV, LDSPointer);
- }
-
- // We have not found reachable and LDS used callees for any of the kernels,
- // and hence we have not created LDS pointer.
- if (!LDSPointer)
- return false;
-
- // We have created an LDS pointer for LDS, and initialized it to point-to LDS
- // within all relevant kernels. Now replace all the uses of LDS within
- // non-kernel functions by LDS pointer.
- replaceLDSUseByPointer(GV, LDSPointer);
-
- return true;
-}
-
-namespace AMDGPU {
-
-// An helper class for collecting all reachable callees for each kernel defined
-// within the module.
-class CollectReachableCallees {
- Module &M;
- CallGraph CG;
- SmallPtrSet<CallGraphNode *, 8> AddressTakenFunctions;
-
- // Collect all address taken functions within the module.
- void collectAddressTakenFunctions() {
- auto *ECNode = CG.getExternalCallingNode();
-
- for (const auto &GI : *ECNode) {
- auto *CGN = GI.second;
- auto *F = CGN->getFunction();
- if (!F || F->isDeclaration() || llvm::AMDGPU::isKernelCC(F))
- continue;
- AddressTakenFunctions.insert(CGN);
- }
- }
-
- // For given kernel, collect all its reachable non-kernel functions.
- SmallPtrSet<Function *, 8> collectReachableCallees(Function *K) {
- SmallPtrSet<Function *, 8> ReachableCallees;
-
- // Call graph node which represents this kernel.
- auto *KCGN = CG[K];
-
- // Go through all call graph nodes reachable from the node representing this
- // kernel, visit all their call sites, if the call site is direct, add
- // corresponding callee to reachable callee set, if it is indirect, resolve
- // the indirect call site to potential reachable callees, add them to
- // reachable callee set, and repeat the process for the newly added
- // potential callee nodes.
- //
- // FIXME: Need to handle bit-casted function pointers.
- //
- SmallVector<CallGraphNode *, 8> CGNStack(depth_first(KCGN));
- SmallPtrSet<CallGraphNode *, 8> VisitedCGNodes;
- while (!CGNStack.empty()) {
- auto *CGN = CGNStack.pop_back_val();
-
- if (!VisitedCGNodes.insert(CGN).second)
- continue;
-
- // Ignore call graph node which does not have associated function or
- // associated function is not a definition.
- if (!CGN->getFunction() || CGN->getFunction()->isDeclaration())
- continue;
-
- for (const auto &GI : *CGN) {
- auto *RCB = cast<CallBase>(*GI.first);
- auto *RCGN = GI.second;
-
- if (auto *DCallee = RCGN->getFunction()) {
- ReachableCallees.insert(DCallee);
- } else if (RCB->isIndirectCall()) {
- auto *RCBFTy = RCB->getFunctionType();
- for (auto *ACGN : AddressTakenFunctions) {
- auto *ACallee = ACGN->getFunction();
- if (ACallee->getFunctionType() == RCBFTy) {
- ReachableCallees.insert(ACallee);
- CGNStack.append(df_begin(ACGN), df_end(ACGN));
- }
- }
- }
- }
- }
-
- return ReachableCallees;
- }
-
-public:
- explicit CollectReachableCallees(Module &M) : M(M), CG(CallGraph(M)) {
- // Collect address taken functions.
- collectAddressTakenFunctions();
- }
-
- void collectReachableCallees(
- DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
- // Collect reachable callee set for each kernel defined in the module.
- for (Function &F : M.functions()) {
- if (!llvm::AMDGPU::isKernelCC(&F))
- continue;
- Function *K = &F;
- KernelToCallees[K] = collectReachableCallees(K);
- }
- }
-};
-
-/// Collect reachable callees for each kernel defined in the module \p M and
-/// return collected callees at \p KernelToCallees.
-void collectReachableCallees(
- Module &M,
- DenseMap<Function *, SmallPtrSet<Function *, 8>> &KernelToCallees) {
- CollectReachableCallees CRC{M};
- CRC.collectReachableCallees(KernelToCallees);
-}
-
-/// For the given LDS global \p GV, visit all its users and collect all
-/// non-kernel functions within which \p GV is used and return collected list of
-/// such non-kernel functions.
-SmallPtrSet<Function *, 8> collectNonKernelAccessorsOfLDS(GlobalVariable *GV) {
- SmallPtrSet<Function *, 8> LDSAccessors;
- SmallVector<User *, 8> UserStack(GV->users());
- SmallPtrSet<User *, 8> VisitedUsers;
-
- while (!UserStack.empty()) {
- auto *U = UserStack.pop_back_val();
-
- // `U` is already visited? continue to next one.
- if (!VisitedUsers.insert(U).second)
- continue;
-
- // `U` is a global variable which is initialized with LDS. Ignore LDS.
- if (isa<GlobalValue>(U))
- return SmallPtrSet<Function *, 8>();
-
- // Recursively explore constant users.
- if (isa<Constant>(U)) {
- append_range(UserStack, U->users());
- continue;
- }
-
- // `U` should be an instruction, if it belongs to a non-kernel function F,
- // then collect F.
- Function *F = cast<Instruction>(U)->getFunction();
- if (!llvm::AMDGPU::isKernelCC(F))
- LDSAccessors.insert(F);
- }
-
- return LDSAccessors;
-}
-
-DenseMap<Function *, SmallPtrSet<Instruction *, 8>>
-getFunctionToInstsMap(User *U, bool CollectKernelInsts) {
- DenseMap<Function *, SmallPtrSet<Instruction *, 8>> FunctionToInsts;
- SmallVector<User *, 8> UserStack;
- SmallPtrSet<User *, 8> VisitedUsers;
-
- UserStack.push_back(U);
-
- while (!UserStack.empty()) {
- auto *UU = UserStack.pop_back_val();
-
- if (!VisitedUsers.insert(UU).second)
- continue;
-
- if (isa<GlobalValue>(UU))
- continue;
-
- if (isa<Constant>(UU)) {
- append_range(UserStack, UU->users());
- continue;
- }
-
- auto *I = cast<Instruction>(UU);
- Function *F = I->getFunction();
- if (CollectKernelInsts) {
- if (!llvm::AMDGPU::isKernelCC(F)) {
- continue;
- }
- } else {
- if (llvm::AMDGPU::isKernelCC(F)) {
- continue;
- }
- }
-
- FunctionToInsts.insert(std::pair(F, SmallPtrSet<Instruction *, 8>()));
- FunctionToInsts[F].insert(I);
- }
-
- return FunctionToInsts;
-}
-
-} // namespace AMDGPU
-
-// Entry-point function which interface ReplaceLDSUseImpl with outside of the
-// class.
-bool ReplaceLDSUseImpl::replaceLDSUse() {
- // Collect LDS which requires their uses to be replaced by pointer.
- std::vector<GlobalVariable *> LDSGlobals =
- collectLDSRequiringPointerReplace();
-
- // No LDS to pointer-replace. Nothing to do.
- if (LDSGlobals.empty())
- return false;
-
- // Collect reachable callee set for each kernel defined in the module.
- AMDGPU::collectReachableCallees(M, KernelToCallees);
-
- if (KernelToCallees.empty()) {
- // Either module does not have any kernel definitions, or none of the kernel
- // has a call to non-kernel functions, or we could not resolve any of the
- // call sites to proper non-kernel functions, because of the situations like
- // inline asm calls. Nothing to replace.
- return false;
- }
-
- // For every LDS from collected LDS globals set, replace its non-kernel
- // function scope use by pointer.
- bool Changed = false;
- for (auto *GV : LDSGlobals)
- Changed |= replaceLDSUse(GV);
-
- return Changed;
-}
-
-class AMDGPUReplaceLDSUseWithPointer : public ModulePass {
-public:
- static char ID;
-
- AMDGPUReplaceLDSUseWithPointer() : ModulePass(ID) {
- initializeAMDGPUReplaceLDSUseWithPointerPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetPassConfig>();
- }
-};
-
-} // namespace
-
-char AMDGPUReplaceLDSUseWithPointer::ID = 0;
-char &llvm::AMDGPUReplaceLDSUseWithPointerID =
- AMDGPUReplaceLDSUseWithPointer::ID;
-
-INITIALIZE_PASS_BEGIN(
- AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
- "Replace within non-kernel function use of LDS with pointer",
- false /*only look at the cfg*/, false /*analysis pass*/)
-INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
-INITIALIZE_PASS_END(
- AMDGPUReplaceLDSUseWithPointer, DEBUG_TYPE,
- "Replace within non-kernel function use of LDS with pointer",
- false /*only look at the cfg*/, false /*analysis pass*/)
-
-bool AMDGPUReplaceLDSUseWithPointer::runOnModule(Module &M) {
- ReplaceLDSUseImpl LDSUseReplacer{M};
- return LDSUseReplacer.replaceLDSUse();
-}
-
-ModulePass *llvm::createAMDGPUReplaceLDSUseWithPointerPass() {
- return new AMDGPUReplaceLDSUseWithPointer();
-}
-
-PreservedAnalyses
-AMDGPUReplaceLDSUseWithPointerPass::run(Module &M, ModuleAnalysisManager &AM) {
- ReplaceLDSUseImpl LDSUseReplacer{M};
- LDSUseReplacer.replaceLDSUse();
- return PreservedAnalyses::all();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index 31e134d42e23..804bf503e4f9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -104,6 +104,7 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
MachineModuleInfo &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
const TargetMachine &TM = TPC->getTM<TargetMachine>();
+ const MCSubtargetInfo &STI = *TM.getMCSubtargetInfo();
bool HasIndirectCall = false;
CallGraph CG = CallGraph(M);
@@ -111,7 +112,8 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
// By default, for code object v5 and later, track only the minimum scratch
// size
- if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) {
+ if (AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
+ STI.getTargetTriple().getOS() == Triple::AMDPAL) {
if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
AssumedStackSizeForDynamicSizeObjects = 0;
if (!AssumedStackSizeForExternalCall.getNumOccurrences())
@@ -338,11 +340,9 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
break;
}
- if (AMDGPU::SReg_32RegClass.contains(Reg) ||
- AMDGPU::SReg_LO16RegClass.contains(Reg) ||
+ if (AMDGPU::SGPR_32RegClass.contains(Reg) ||
+ AMDGPU::SGPR_LO16RegClass.contains(Reg) ||
AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
- "trap handler registers should not be used");
IsSGPR = true;
Width = 1;
} else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
@@ -355,9 +355,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
IsSGPR = false;
IsAGPR = true;
Width = 1;
- } else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
- "trap handler registers should not be used");
+ } else if (AMDGPU::SGPR_64RegClass.contains(Reg)) {
IsSGPR = true;
Width = 2;
} else if (AMDGPU::VReg_64RegClass.contains(Reg)) {
@@ -377,9 +375,7 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
IsSGPR = false;
IsAGPR = true;
Width = 3;
- } else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
- "trap handler registers should not be used");
+ } else if (AMDGPU::SGPR_128RegClass.contains(Reg)) {
IsSGPR = true;
Width = 4;
} else if (AMDGPU::VReg_128RegClass.contains(Reg)) {
@@ -420,8 +416,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
IsAGPR = true;
Width = 7;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
- "trap handler registers should not be used");
IsSGPR = true;
Width = 8;
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
@@ -472,8 +466,6 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
IsAGPR = true;
Width = 12;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
- assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
- "trap handler registers should not be used");
IsSGPR = true;
Width = 16;
} else if (AMDGPU::VReg_512RegClass.contains(Reg)) {
@@ -494,7 +486,15 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage(
IsAGPR = true;
Width = 32;
} else {
- llvm_unreachable("Unknown register class");
+ // We only expect TTMP registers or registers that do not belong to
+ // any RC.
+ assert((AMDGPU::TTMP_32RegClass.contains(Reg) ||
+ AMDGPU::TTMP_64RegClass.contains(Reg) ||
+ AMDGPU::TTMP_128RegClass.contains(Reg) ||
+ AMDGPU::TTMP_256RegClass.contains(Reg) ||
+ AMDGPU::TTMP_512RegClass.contains(Reg) ||
+ !TRI.getPhysRegBaseClass(Reg)) &&
+ "Unknown register class");
}
unsigned HWReg = TRI.getHWRegIndex(Reg);
int MaxUsed = HWReg + Width - 1;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 3ff3546f4f92..2fde7afc0c14 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -46,6 +46,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/IR/AttributeMask.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/InitializePasses.h"
@@ -377,19 +378,12 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
if (!OutArgIndexes.count(Arg.getArgNo()))
continue;
- PointerType *ArgType = cast<PointerType>(Arg.getType());
-
Type *EltTy = OutArgIndexes[Arg.getArgNo()];
const auto Align =
DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy);
Value *Val = B.CreateExtractValue(StubCall, RetIdx++);
- Type *PtrTy = Val->getType()->getPointerTo(ArgType->getAddressSpace());
-
- // We can peek through bitcasts, so the type may not match.
- Value *PtrVal = B.CreateBitCast(&Arg, PtrTy);
-
- B.CreateAlignedStore(Val, PtrVal, Align);
+ B.CreateAlignedStore(Val, &Arg, Align);
}
if (!RetTy->isVoidTy()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
index ff34726fdf02..9c07851243c9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteUndefForPHI.cpp
@@ -10,7 +10,7 @@
// StructurizedCFG pass, and this pass has some additional limitation that make
// it can only run after SIAnnotateControlFlow.
//
-// To achieve optimal code generation for AMDGPU, we assume that divergence
+// To achieve optimal code generation for AMDGPU, we assume that uniformity
// analysis reports the PHI in join block of divergent branch as uniform if
// it has one unique uniform value plus additional undefined/poisoned incoming
// value. That is to say the later compiler pipeline will ensure such PHI always
@@ -56,7 +56,7 @@
// \---
#include "AMDGPU.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
@@ -81,11 +81,11 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addRequired<UniformityInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LegacyDivergenceAnalysis>();
+ AU.addPreserved<UniformityInfoWrapperPass>();
AU.setPreservesCFG();
}
};
@@ -95,17 +95,17 @@ char AMDGPURewriteUndefForPHI::ID = 0;
INITIALIZE_PASS_BEGIN(AMDGPURewriteUndefForPHI, DEBUG_TYPE,
"Rewrite undef for PHI", false, false)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(AMDGPURewriteUndefForPHI, DEBUG_TYPE,
"Rewrite undef for PHI", false, false)
-bool rewritePHIs(Function &F, LegacyDivergenceAnalysis *DA, DominatorTree *DT) {
+bool rewritePHIs(Function &F, UniformityInfo &UA, DominatorTree *DT) {
bool Changed = false;
SmallVector<PHINode *> ToBeDeleted;
for (auto &BB : F) {
for (auto &PHI : BB.phis()) {
- if (DA->isDivergent(&PHI))
+ if (UA.isDivergent(&PHI))
continue;
// The unique incoming value except undef/poison for the PHI node.
@@ -147,7 +147,7 @@ bool rewritePHIs(Function &F, LegacyDivergenceAnalysis *DA, DominatorTree *DT) {
// TODO: We should still be able to replace undef value if the unique
// value is a Constant.
if (!UniqueDefinedIncoming || Undefs.empty() ||
- !DA->isDivergent(DominateBB->getTerminator()))
+ !UA.isDivergent(DominateBB->getTerminator()))
continue;
// We only replace the undef when DominateBB truly dominates all the
@@ -171,9 +171,10 @@ bool rewritePHIs(Function &F, LegacyDivergenceAnalysis *DA, DominatorTree *DT) {
}
bool AMDGPURewriteUndefForPHI::runOnFunction(Function &F) {
- LegacyDivergenceAnalysis *DA = &getAnalysis<LegacyDivergenceAnalysis>();
+ UniformityInfo &UA =
+ getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return rewritePHIs(F, DA, DT);
+ return rewritePHIs(F, UA, DT);
}
FunctionPass *llvm::createAMDGPURewriteUndefForPHIPass() {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index ca714baffe3e..317f3f21d240 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
def : SourceOfDivergence<int_r600_read_tidig_y>;
def : SourceOfDivergence<int_r600_read_tidig_z>;
-def : SourceOfDivergence<int_amdgcn_atomic_inc>;
-def : SourceOfDivergence<int_amdgcn_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
@@ -279,6 +277,22 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_swap>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_add>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_sub>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_smin>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_umin>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_smax>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_umax>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_and>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_or>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>;
@@ -295,6 +309,22 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_swap>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_add>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_sub>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_smin>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_umin>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_smax>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_umax>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_and>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_or>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_live_mask>;
@@ -376,6 +406,26 @@ def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
def : SourceOfDivergence<int_amdgcn_if>;
def : SourceOfDivergence<int_amdgcn_else>;
def : SourceOfDivergence<int_amdgcn_loop>;
+def : SourceOfDivergence<int_amdgcn_inverse_ballot>;
foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
+
+class AlwaysUniform<Intrinsic intr> {
+ Intrinsic Intr = intr;
+}
+
+def UniformIntrinsics : GenericTable {
+ let FilterClass = "AlwaysUniform";
+ let Fields = ["Intr"];
+
+ let PrimaryKey = ["Intr"];
+ let PrimaryKeyName = "lookupAlwaysUniform";
+}
+
+def : AlwaysUniform<int_amdgcn_readfirstlane>;
+def : AlwaysUniform<int_amdgcn_readlane>;
+def : AlwaysUniform<int_amdgcn_icmp>;
+def : AlwaysUniform<int_amdgcn_fcmp>;
+def : AlwaysUniform<int_amdgcn_ballot>;
+def : AlwaysUniform<int_amdgcn_if_break>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 03ccd563975f..9b50f4fa53ac 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -416,8 +416,9 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
return Requested;
}
-std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
- const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
+ std::pair<unsigned, unsigned> Requested,
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
// Default minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
@@ -429,10 +430,6 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
Default.first = MinImpliedByFlatWorkGroupSize;
- // Requested minimum/maximum number of waves per execution unit.
- std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
- F, "amdgpu-waves-per-eu", Default, true);
-
// Make sure requested minimum is less than requested maximum.
if (Requested.second && Requested.first > Requested.second)
return Default;
@@ -450,6 +447,17 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
return Requested;
}
+std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
+ const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
+ // Default minimum/maximum number of waves per execution unit.
+ std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
+
+ // Requested minimum/maximum number of waves per execution unit.
+ std::pair<unsigned, unsigned> Requested =
+ AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
+ return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
+}
+
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {
auto Node = Kernel.getMetadata("reqd_work_group_size");
if (Node && Node->getNumOperands() == 3)
@@ -469,6 +477,15 @@ unsigned AMDGPUSubtarget::getMaxWorkitemID(const Function &Kernel,
return getFlatWorkGroupSizes(Kernel).second - 1;
}
+bool AMDGPUSubtarget::isSingleLaneExecution(const Function &Func) const {
+ for (int I = 0; I < 3; ++I) {
+ if (getMaxWorkitemID(Func, I) > 0)
+ return false;
+ }
+
+ return true;
+}
+
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
Function *Kernel = I->getParent()->getParent();
unsigned MinSize = 0;
@@ -543,7 +560,9 @@ unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
return 16;
// Assume all implicit inputs are used by default
- unsigned NBytes = (AMDGPU::getAmdhsaCodeObjectVersion() >= 5) ? 256 : 56;
+ const Module *M = F.getParent();
+ unsigned NBytes =
+ AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
NBytes);
}
@@ -572,9 +591,13 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
Align &MaxAlign) const {
+ if (F.getCallingConv() != CallingConv::AMDGPU_KERNEL &&
+ F.getCallingConv() != CallingConv::SPIR_KERNEL)
+ return 0;
+
uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
- unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
+ unsigned ExplicitOffset = getExplicitKernelArgOffset();
uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
unsigned ImplicitBytes = getImplicitArgNumBytes(F);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 972f996ad85a..10ce00fe68ca 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -14,9 +14,9 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUSUBTARGET_H
-#include "llvm/ADT/Triple.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/Support/Alignment.h"
+#include "llvm/TargetParser/Triple.h"
namespace llvm {
@@ -61,6 +61,7 @@ protected:
bool HasFminFmaxLegacy = true;
bool EnablePromoteAlloca = false;
bool HasTrigReducedRange = false;
+ bool FastFMAF32 = false;
unsigned EUsPerCU = 4;
unsigned MaxWavesPerEU = 10;
unsigned LocalMemorySize = 0;
@@ -107,6 +108,9 @@ public:
std::pair<unsigned, unsigned>
getWavesPerEU(const Function &F,
std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
+ std::pair<unsigned, unsigned> getEffectiveWavesPerEU(
+ std::pair<unsigned, unsigned> WavesPerEU,
+ std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
/// Return the amount of LDS that can be used that will not restrict the
/// occupancy lower than WaveCount.
@@ -195,6 +199,10 @@ public:
return HasTrigReducedRange;
}
+ bool hasFastFMAF32() const {
+ return FastFMAF32;
+ }
+
bool isPromoteAllocaEnabled() const {
return EnablePromoteAlloca;
}
@@ -226,7 +234,7 @@ public:
/// Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
- unsigned getExplicitKernelArgOffset(const Function &F) const {
+ unsigned getExplicitKernelArgOffset() const {
switch (TargetTriple.getOS()) {
case Triple::AMDHSA:
case Triple::AMDPAL:
@@ -269,6 +277,9 @@ public:
/// 2) dimension.
unsigned getMaxWorkitemID(const Function &Kernel, unsigned Dimension) const;
+ /// Return true if only a single workitem can be active in a wave.
+ bool isSingleLaneExecution(const Function &Kernel) const;
+
/// Creates value range metadata on an workitemid.* intrinsic call or load.
bool makeLIDRangeMetadata(Instruction *I) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 5694acf40527..f90c8e4bdddd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -19,8 +19,10 @@
#include "AMDGPUExportClustering.h"
#include "AMDGPUIGroupLP.h"
#include "AMDGPUMacroFusion.h"
+#include "AMDGPURegBankSelect.h"
#include "AMDGPUTargetObjectFile.h"
#include "AMDGPUTargetTransformInfo.h"
+#include "AMDGPUUnifyDivergentExitNodes.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
#include "GCNVOPDUtils.h"
@@ -43,7 +45,6 @@
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
@@ -58,7 +59,7 @@
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
-#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include <optional>
using namespace llvm;
@@ -188,6 +189,11 @@ OptExecMaskPreRA("amdgpu-opt-exec-mask-pre-ra", cl::Hidden,
cl::desc("Run pre-RA exec mask optimizations"),
cl::init(true));
+static cl::opt<bool>
+ LowerCtorDtor("amdgpu-lower-global-ctor-dtor",
+ cl::desc("Lower GPU ctor / dtors to globals on the device."),
+ cl::init(true), cl::Hidden);
+
// Option to disable vectorizer for tests.
static cl::opt<bool> EnableLoadStoreVectorizer(
"amdgpu-load-store-vectorizer",
@@ -216,6 +222,12 @@ static cl::opt<bool> EarlyInlineAll(
cl::init(false),
cl::Hidden);
+static cl::opt<bool> RemoveIncompatibleFunctions(
+ "amdgpu-enable-remove-incompatible-functions", cl::Hidden,
    cl::desc("Enable removal of functions when they "
             "use features not supported by the target GPU"),
+ cl::init(true));
+
static cl::opt<bool> EnableSDWAPeephole(
"amdgpu-sdwa-peephole",
cl::desc("Enable SDWA peepholer"),
@@ -262,12 +274,15 @@ static cl::opt<bool> OptVGPRLiveRange(
cl::desc("Enable VGPR liverange optimizations for if-else structure"),
cl::init(true), cl::Hidden);
-// Enable atomic optimization
-static cl::opt<bool> EnableAtomicOptimizations(
- "amdgpu-atomic-optimizations",
- cl::desc("Enable atomic optimizations"),
- cl::init(false),
- cl::Hidden);
+static cl::opt<ScanOptions> AMDGPUAtomicOptimizerStrategy(
+ "amdgpu-atomic-optimizer-strategy",
+ cl::desc("Select DPP or Iterative strategy for scan"),
+ cl::init(ScanOptions::Iterative),
+ cl::values(
+ clEnumValN(ScanOptions::DPP, "DPP", "Use DPP operations for scan"),
+ clEnumValN(ScanOptions::Iterative, "Iterative",
+ "Use Iterative approach for scan"),
+ clEnumValN(ScanOptions::None, "None", "Disable atomic optimizer")));
// Enable Mode register optimization
static cl::opt<bool> EnableSIModeRegisterPass(
@@ -309,11 +324,6 @@ static cl::opt<bool> EnableStructurizerWorkarounds(
cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
cl::Hidden);
-static cl::opt<bool> EnableLDSReplaceWithPointer(
- "amdgpu-enable-lds-replace-with-pointer",
- cl::desc("Enable LDS replace with pointer pass"), cl::init(false),
- cl::Hidden);
-
static cl::opt<bool, true> EnableLowerModuleLDS(
"amdgpu-enable-lower-module-lds", cl::desc("Enable lower module lds pass"),
cl::location(AMDGPUTargetMachine::EnableLowerModuleLDS), cl::init(true),
@@ -334,9 +344,14 @@ static cl::opt<bool> EnableMaxIlpSchedStrategy(
cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
cl::Hidden, cl::init(false));
+static cl::opt<bool> EnableRewritePartialRegUses(
+ "amdgpu-enable-rewrite-partial-reg-uses",
+ cl::desc("Enable rewrite partial reg uses pass"), cl::init(false),
+ cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
- RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
+ RegisterTargetMachine<R600TargetMachine> X(getTheR600Target());
RegisterTargetMachine<GCNTargetMachine> Y(getTheGCNTarget());
PassRegistry *PR = PassRegistry::getPassRegistry();
@@ -349,6 +364,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUDAGToDAGISelPass(*PR);
initializeGCNDPPCombinePass(*PR);
initializeSILowerI1CopiesPass(*PR);
+ initializeSILowerWWMCopiesPass(*PR);
initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
@@ -368,24 +384,21 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPUPromoteKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
- initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
initializeAMDGPUPostLegalizerCombinerPass(*PR);
initializeAMDGPUPreLegalizerCombinerPass(*PR);
initializeAMDGPURegBankCombinerPass(*PR);
+ initializeAMDGPURegBankSelectPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPULateCodeGenPreparePass(*PR);
- initializeAMDGPUPropagateAttributesEarlyPass(*PR);
- initializeAMDGPUPropagateAttributesLatePass(*PR);
- initializeAMDGPUReplaceLDSUseWithPointerPass(*PR);
+ initializeAMDGPURemoveIncompatibleFunctionsPass(*PR);
initializeAMDGPULowerModuleLDSPass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPURewriteUndefForPHIPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
- initializeAMDGPUReleaseVGPRsPass(*PR);
initializeAMDGPUInsertDelayAluPass(*PR);
initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
@@ -409,6 +422,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUResourceUsageAnalysisPass(*PR);
initializeGCNNSAReassignPass(*PR);
initializeGCNPreRAOptimizationsPass(*PR);
+ initializeGCNPreRALongBranchRegPass(*PR);
+ initializeGCNRewritePartialRegUsesPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -505,11 +520,15 @@ static StringRef computeDataLayout(const Triple &TT) {
}
// 32-bit private, local, and region pointers. 64-bit global, constant and
- // flat, non-integral buffer fat pointers.
+ // flat. 160-bit non-integral fat buffer pointers that include a 128-bit
+ // buffer descriptor and a 32-bit offset, which are indexed by 32-bit values
 // (address space 7), and 128-bit non-integral buffer resources (address
 // space 8) which cannot be non-trivially accessed by LLVM memory operations
+ // like getelementptr.
return "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32"
- "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
- "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1"
- "-ni:7";
+ "-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:"
+ "128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-"
+ "G1-ni:7:8";
}
LLVM_READNONE
@@ -584,12 +603,8 @@ void AMDGPUTargetMachine::registerDefaultAliasAnalyses(AAManager &AAM) {
void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerPipelineParsingCallback(
- [this](StringRef PassName, ModulePassManager &PM,
- ArrayRef<PassBuilder::PipelineElement>) {
- if (PassName == "amdgpu-propagate-attributes-late") {
- PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
- return true;
- }
+ [](StringRef PassName, ModulePassManager &PM,
+ ArrayRef<PassBuilder::PipelineElement>) {
if (PassName == "amdgpu-unify-metadata") {
PM.addPass(AMDGPUUnifyMetadataPass());
return true;
@@ -602,10 +617,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(AMDGPUAlwaysInlinePass());
return true;
}
- if (PassName == "amdgpu-replace-lds-use-with-pointer") {
- PM.addPass(AMDGPUReplaceLDSUseWithPointerPass());
- return true;
- }
if (PassName == "amdgpu-lower-module-lds") {
PM.addPass(AMDGPULowerModuleLDSPass());
return true;
@@ -639,14 +650,23 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(AMDGPULowerKernelAttributesPass());
return true;
}
- if (PassName == "amdgpu-propagate-attributes-early") {
- PM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
- return true;
- }
if (PassName == "amdgpu-promote-kernel-arguments") {
PM.addPass(AMDGPUPromoteKernelArgumentsPass());
return true;
}
+ if (PassName == "amdgpu-unify-divergent-exit-nodes") {
+ PM.addPass(AMDGPUUnifyDivergentExitNodesPass());
+ return true;
+ }
+ if (PassName == "amdgpu-atomic-optimizer") {
+ PM.addPass(
+ AMDGPUAtomicOptimizerPass(*this, AMDGPUAtomicOptimizerStrategy));
+ return true;
+ }
+ if (PassName == "amdgpu-codegenprepare") {
+ PM.addPass(AMDGPUCodeGenPreparePass(*this));
+ return true;
+ }
return false;
});
@@ -665,7 +685,6 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PB.registerPipelineStartEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
FunctionPassManager FPM;
- FPM.addPass(AMDGPUPropagateAttributesEarlyPass(*this));
FPM.addPass(AMDGPUUseNativeCallsPass());
if (EnableLibCallSimplify && Level != OptimizationLevel::O0)
FPM.addPass(AMDGPUSimplifyLibCallsPass(*this));
@@ -673,20 +692,19 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
});
PB.registerPipelineEarlySimplificationEPCallback(
- [this](ModulePassManager &PM, OptimizationLevel Level) {
+ [](ModulePassManager &PM, OptimizationLevel Level) {
+ PM.addPass(AMDGPUPrintfRuntimeBindingPass());
+
if (Level == OptimizationLevel::O0)
return;
PM.addPass(AMDGPUUnifyMetadataPass());
- PM.addPass(AMDGPUPrintfRuntimeBindingPass());
if (InternalizeSymbols) {
PM.addPass(InternalizePass(mustPreserveGV));
- }
- PM.addPass(AMDGPUPropagateAttributesLatePass(*this));
- if (InternalizeSymbols) {
PM.addPass(GlobalDCEPass());
}
+
if (EarlyInlineAll && !EnableFunctionCalls)
PM.addPass(AMDGPUAlwaysInlinePass());
});
@@ -932,7 +950,6 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
}
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
- addPass(createLICMPass());
addPass(createSeparateConstOffsetFromGEPPass());
// ReassociateGEPs exposes more opportunities for SLSR. See
// the example in reassociate-geps-and-slsr.ll.
@@ -956,22 +973,12 @@ void AMDGPUPassConfig::addIRPasses() {
disablePass(&PatchableFunctionID);
addPass(createAMDGPUPrintfRuntimeBinding());
- addPass(createAMDGPUCtorDtorLoweringLegacyPass());
-
- // A call to propagate attributes pass in the backend in case opt was not run.
- addPass(createAMDGPUPropagateAttributesEarlyPass(&TM));
-
- addPass(createAMDGPULowerIntrinsicsPass());
+ if (LowerCtorDtor)
+ addPass(createAMDGPUCtorDtorLoweringLegacyPass());
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
addPass(createAlwaysInlinerLegacyPass());
- // We need to add the barrier noop pass, otherwise adding the function
- // inlining pass will cause all of the PassConfigs passes to be run
- // one function at a time, which means if we have a module with two
- // functions, then we will generate code for the first function
- // without ever running any passes on the second.
- addPass(createBarrierNoopPass());
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
if (TM.getTargetTriple().getArch() == Triple::r600)
@@ -980,17 +987,16 @@ void AMDGPUPassConfig::addIRPasses() {
// Replace OpenCL enqueued block function pointers with global variables.
addPass(createAMDGPUOpenCLEnqueuedBlockLoweringPass());
- // Can increase LDS used by kernel so runs before PromoteAlloca
+ // Runs before PromoteAlloca so the latter can account for function uses
if (EnableLowerModuleLDS) {
- // The pass "amdgpu-replace-lds-use-with-pointer" need to be run before the
- // pass "amdgpu-lower-module-lds", and also it required to be run only if
- // "amdgpu-lower-module-lds" pass is enabled.
- if (EnableLDSReplaceWithPointer)
- addPass(createAMDGPUReplaceLDSUseWithPointerPass());
-
addPass(createAMDGPULowerModuleLDSPass());
}
+ // AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
+ // after their introduction
+ if (TM.getOptLevel() > CodeGenOpt::None)
+ addPass(createAMDGPUAttributorPass());
+
if (TM.getOptLevel() > CodeGenOpt::None)
addPass(createInferAddressSpacesPass());
@@ -1017,6 +1023,11 @@ void AMDGPUPassConfig::addIRPasses() {
// TODO: May want to move later or split into an early and late one.
addPass(createAMDGPUCodeGenPreparePass());
}
+
+ // Try to hoist loop invariant parts of divisions AMDGPUCodeGenPrepare may
+ // have expanded.
+ if (TM.getOptLevel() > CodeGenOpt::Less)
+ addPass(createLICMPass());
}
TargetPassConfig::addIRPasses();
@@ -1039,7 +1050,8 @@ void AMDGPUPassConfig::addIRPasses() {
void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
- addPass(createAMDGPUAttributorPass());
+ if (RemoveIncompatibleFunctions)
+ addPass(createAMDGPURemoveIncompatibleFunctionsPass(TM));
// FIXME: This pass adds 2 hacky attributes that can be replaced with an
// analysis, and should be removed.
@@ -1117,8 +1129,9 @@ bool GCNPassConfig::addPreISel() {
if (TM->getOptLevel() > CodeGenOpt::None)
addPass(createAMDGPULateCodeGenPreparePass());
- if (isPassEnabled(EnableAtomicOptimizations, CodeGenOpt::Less)) {
- addPass(createAMDGPUAtomicOptimizerPass());
+ if ((TM->getOptLevel() >= CodeGenOpt::Less) &&
+ (AMDGPUAtomicOptimizerStrategy != ScanOptions::None)) {
+ addPass(createAMDGPUAtomicOptimizerPass(AMDGPUAtomicOptimizerStrategy));
}
if (TM->getOptLevel() > CodeGenOpt::None)
@@ -1211,7 +1224,7 @@ void GCNPassConfig::addPreRegBankSelect() {
}
bool GCNPassConfig::addRegBankSelect() {
- addPass(new RegBankSelect());
+ addPass(new AMDGPURegBankSelect());
return false;
}
@@ -1255,6 +1268,9 @@ void GCNPassConfig::addOptimizedRegAlloc() {
if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
+ if (EnableRewritePartialRegUses)
+ insertPass(&RenameIndependentSubregsID, &GCNRewritePartialRegUsesID);
+
if (isPassEnabled(EnablePreRAOptimizations))
insertPass(&RenameIndependentSubregsID, &GCNPreRAOptimizationsID);
@@ -1281,6 +1297,7 @@ void GCNPassConfig::addOptimizedRegAlloc() {
}
bool GCNPassConfig::addPreRewrite() {
+ addPass(&SILowerWWMCopiesID);
if (EnableRegReassign)
addPass(&GCNNSAReassignID);
return true;
@@ -1327,12 +1344,16 @@ bool GCNPassConfig::addRegAssignAndRewriteFast() {
if (!usingDefaultRegAlloc())
report_fatal_error(RegAllocOptNotSupportedMessage);
+ addPass(&GCNPreRALongBranchRegID);
+
addPass(createSGPRAllocPass(false));
// Equivalent of PEI for SGPRs.
addPass(&SILowerSGPRSpillsID);
addPass(createVGPRAllocPass(false));
+
+ addPass(&SILowerWWMCopiesID);
return true;
}
@@ -1340,6 +1361,8 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
if (!usingDefaultRegAlloc())
report_fatal_error(RegAllocOptNotSupportedMessage);
+ addPass(&GCNPreRALongBranchRegID);
+
addPass(createSGPRAllocPass(true));
// Commit allocated register changes. This is mostly necessary because too
@@ -1398,9 +1421,6 @@ void GCNPassConfig::addPreEmitPass() {
// cases.
addPass(&PostRAHazardRecognizerID);
- if (getOptLevel() > CodeGenOpt::Less)
- addPass(&AMDGPUReleaseVGPRsID);
-
if (isPassEnabled(EnableInsertDelayAlu, CodeGenOpt::Less))
addPass(&AMDGPUInsertDelayAluID);
@@ -1411,6 +1431,12 @@ TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
return new GCNPassConfig(*this, PM);
}
+void GCNTargetMachine::registerMachineRegisterInfoCallback(
+ MachineFunction &MF) const {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MF.getRegInfo().addDelegate(MFI);
+}
+
MachineFunctionInfo *GCNTargetMachine::createMachineFunctionInfo(
BumpPtrAllocator &Allocator, const Function &F,
const TargetSubtargetInfo *STI) const {
@@ -1465,6 +1491,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
if (parseOptionalRegister(YamlMFI.VGPRForAGPRCopy, MFI->VGPRForAGPRCopy))
return true;
+ if (parseOptionalRegister(YamlMFI.SGPRForEXECCopy, MFI->SGPRForEXECCopy))
+ return true;
+
+ if (parseOptionalRegister(YamlMFI.LongBranchReservedReg,
+ MFI->LongBranchReservedReg))
+ return true;
+
auto diagnoseRegisterClass = [&](const yaml::StringValue &RegName) {
// Create a diagnostic for a the register string literal.
const MemoryBuffer &Buffer =
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index ce93704b78f4..2426be405a65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -92,6 +92,8 @@ public:
return true;
}
+ void registerMachineRegisterInfoCallback(MachineFunction &MF) const override;
+
MachineFunctionInfo *
createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
const TargetSubtargetInfo *STI) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 0c3324f84b25..81d083c1c88a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -17,8 +17,11 @@
#include "AMDGPUTargetTransformInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIModeRegisterDefaults.h"
+#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Analysis.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
@@ -49,11 +52,6 @@ static cl::opt<bool> UnrollRuntimeLocal(
cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
cl::init(true), cl::Hidden);
-static cl::opt<bool> UseLegacyDA(
- "amdgpu-use-legacy-divergence-analysis",
- cl::desc("Enable legacy divergence analysis for AMDGPU"),
- cl::init(false), cl::Hidden);
-
static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
"amdgpu-unroll-max-block-to-analyze",
cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
@@ -115,6 +113,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
// manipulations in average.
UP.BEInsns += 3;
+ // We want to run unroll even for the loops which have been vectorized.
+ UP.UnrollVectorizedLoop = true;
+
// TODO: Do we want runtime unrolling?
// Maximum alloca size than can fit registers. Reserve 16 registers.
@@ -266,6 +267,10 @@ void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getPeelingPreferences(L, SE, PP);
}
+int64_t AMDGPUTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
+ return 1024;
+}
+
const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
// Codegen control options which don't matter.
AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
@@ -291,9 +296,14 @@ GCNTTIImpl::GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()), CommonTTI(TM, F),
IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
- AMDGPU::SIModeRegisterDefaults Mode(F);
- HasFP32Denormals = Mode.allFP32Denormals();
- HasFP64FP16Denormals = Mode.allFP64FP16Denormals();
+ SIModeRegisterDefaults Mode(F);
+ HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
+ HasFP64FP16Denormals =
+ Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
+}
+
+bool GCNTTIImpl::hasBranchDivergence(const Function *F) const {
+ return !F || !ST->isSingleLaneExecution(*F);
}
unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
@@ -357,7 +367,8 @@ unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
- AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER) {
+ AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
+ AddrSpace == AMDGPUAS::BUFFER_RESOURCE) {
return 512;
}
@@ -393,6 +404,10 @@ bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
+int64_t GCNTTIImpl::getMaxMemIntrinsicInlineSizeThreshold() const {
+ return 1024;
+}
+
// FIXME: Really we would like to issue multiple 128-bit loads and stores per
// iteration. Should we report a larger size and let it legalize?
//
@@ -472,10 +487,10 @@ void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
}
}
-unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+unsigned GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
// Disable unrolling if the loop is not vectorized.
// TODO: Enable this again.
- if (VF == 1)
+ if (VF.isScalar())
return 1;
return 8;
@@ -484,8 +499,6 @@ unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
MemIntrinsicInfo &Info) const {
switch (Inst->getIntrinsicID()) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
@@ -775,15 +788,15 @@ GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
}
InstructionCost
-GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
- bool IsUnsigned,
+GCNTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
TTI::TargetCostKind CostKind) {
EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions(which support
// 16-bit types only).
if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
- return BaseT::getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
+ return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
return LT.first * getHalfRateInstrCost(CostKind);
@@ -857,11 +870,6 @@ bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
return false;
}
-/// \returns true if the new GPU divergence analysis is enabled.
-bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
- return !UseLegacyDA;
-}
-
bool GCNTTIImpl::isReadRegisterSourceOfDivergence(
const IntrinsicInst *ReadReg) const {
Metadata *MD =
@@ -928,19 +936,8 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
}
bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
- if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
- switch (Intrinsic->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::amdgcn_readfirstlane:
- case Intrinsic::amdgcn_readlane:
- case Intrinsic::amdgcn_icmp:
- case Intrinsic::amdgcn_fcmp:
- case Intrinsic::amdgcn_ballot:
- case Intrinsic::amdgcn_if_break:
- return true;
- }
- }
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
+ return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
if (const CallInst *CI = dyn_cast<CallInst>(V)) {
if (CI->isInlineAsm())
@@ -1012,8 +1009,6 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const {
switch (IID) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
@@ -1034,8 +1029,6 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
@@ -1099,9 +1092,12 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin: {
- Module *M = II->getParent()->getParent()->getParent();
Type *DestTy = II->getType();
Type *SrcTy = NewV->getType();
+ unsigned NewAS = SrcTy->getPointerAddressSpace();
+ if (!AMDGPU::isExtendedGlobalAddrSpace(NewAS))
+ return nullptr;
+ Module *M = II->getModule();
Function *NewDecl = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
{DestTy, SrcTy, DestTy});
II->setArgOperand(0, NewV);
@@ -1157,8 +1153,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
// FIXME: dx10_clamp can just take the caller setting, but there seems to be
// no way to support merge for backend defined attributes.
- AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
- AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
+ SIModeRegisterDefaults CallerMode(*Caller);
+ SIModeRegisterDefaults CalleeMode(*Callee);
if (!CallerMode.isInlineCompatible(CalleeMode))
return false;
@@ -1178,34 +1174,129 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
return true;
}
-unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
- // If we have a pointer to private array passed into a function
+static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB,
+ const SITargetLowering *TLI,
+ const GCNTTIImpl *TTIImpl) {
+ const int NrOfSGPRUntilSpill = 26;
+ const int NrOfVGPRUntilSpill = 32;
+
+ const DataLayout &DL = TTIImpl->getDataLayout();
+
+ unsigned adjustThreshold = 0;
+ int SGPRsInUse = 0;
+ int VGPRsInUse = 0;
+ for (const Use &A : CB->args()) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
+ for (auto ArgVT : ValueVTs) {
+ unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
+ CB->getContext(), CB->getCallingConv(), ArgVT);
+ if (AMDGPU::isArgPassedInSGPR(CB, CB->getArgOperandNo(&A)))
+ SGPRsInUse += CCRegNum;
+ else
+ VGPRsInUse += CCRegNum;
+ }
+ }
+
+ // The cost of passing function arguments through the stack:
+ // 1 instruction to put a function argument on the stack in the caller.
+ // 1 instruction to take a function argument from the stack in callee.
+  // 1 instruction to explicitly take care of data dependencies in the callee
+ // function.
+ InstructionCost ArgStackCost(1);
+ ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
+ Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
+ AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
+ ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
+ Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
+ AMDGPUAS::PRIVATE_ADDRESS, TTI::TCK_SizeAndLatency);
+
+ // The penalty cost is computed relative to the cost of instructions and does
+ // not model any storage costs.
+ adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
+ *ArgStackCost.getValue() * InlineConstants::getInstrCost();
+ adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
+ *ArgStackCost.getValue() * InlineConstants::getInstrCost();
+ return adjustThreshold;
+}
+
+static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
+ const DataLayout &DL) {
+ // If we have a pointer to a private array passed into a function
// it will not be optimized out, leaving scratch usage.
- // Increase the inline threshold to allow inlining in this case.
- uint64_t AllocaSize = 0;
+ // This function calculates the total size in bytes of the memory that would
+ // end in scratch if the call was not inlined.
+ unsigned AllocaSize = 0;
SmallPtrSet<const AllocaInst *, 8> AIVisited;
for (Value *PtrArg : CB->args()) {
PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
- if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
- Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
+ if (!Ty)
continue;
- PtrArg = getUnderlyingObject(PtrArg);
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(PtrArg)) {
- if (!AI->isStaticAlloca() || !AIVisited.insert(AI).second)
- continue;
- AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
- // If the amount of stack memory is excessive we will not be able
- // to get rid of the scratch anyway, bail out.
- if (AllocaSize > ArgAllocaCutoff) {
- AllocaSize = 0;
- break;
- }
- }
+ unsigned AddrSpace = Ty->getAddressSpace();
+ if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
+ AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
+ continue;
+
+ const AllocaInst *AI = dyn_cast<AllocaInst>(getUnderlyingObject(PtrArg));
+ if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
+ continue;
+
+ AllocaSize += DL.getTypeAllocSize(AI->getAllocatedType());
}
- if (AllocaSize)
- return ArgAllocaCost;
- return 0;
+ return AllocaSize;
+}
+
+unsigned GCNTTIImpl::adjustInliningThreshold(const CallBase *CB) const {
+ unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
+
+ // Private object passed as arguments may end up in scratch usage if the call
+ // is not inlined. Increase the inline threshold to promote inlining.
+ unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
+ if (AllocaSize > 0)
+ Threshold += ArgAllocaCost;
+ return Threshold;
+}
+
+unsigned GCNTTIImpl::getCallerAllocaCost(const CallBase *CB,
+ const AllocaInst *AI) const {
+
+ // Below the cutoff, assume that the private memory objects would be
+ // optimized
+ auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
+ if (AllocaSize <= ArgAllocaCutoff)
+ return 0;
+
+ // Above the cutoff, we give a cost to each private memory object
+  // depending on its size. If the array can be optimized by SROA this cost is not
+ // added to the total-cost in the inliner cost analysis.
+ //
+ // We choose the total cost of the alloca such that their sum cancels the
+ // bonus given in the threshold (ArgAllocaCost).
+ //
+ // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
+ //
+ // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
+ // the single-bb bonus and the vector-bonus.
+ //
+ // We compensate the first two multipliers, by repeating logic from the
+ // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
+ static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
+ unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
+
+ bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
+ return BB.getTerminator()->getNumSuccessors() > 1;
+ });
+ if (SingleBB) {
+ Threshold += Threshold / 2;
+ }
+
+ auto ArgAllocaSize = DL.getTypeAllocSize(AI->getAllocatedType());
+
+ // Attribute the bonus proportionally to the alloca size
+ unsigned AllocaThresholdBonus = (Threshold * ArgAllocaSize) / AllocaSize;
+
+ return AllocaThresholdBonus;
}
void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 7862f21cfc35..1e6c5bbfc0d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -55,6 +55,8 @@ public:
void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
TTI::PeelingPreferences &PP);
+
+ int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
};
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
@@ -69,6 +71,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
bool IsGraphics;
bool HasFP32Denormals;
bool HasFP64FP16Denormals;
+ static constexpr bool InlinerVectorBonusPercent = 0;
static const FeatureBitset InlineFeatureIgnoreList;
@@ -100,8 +103,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
public:
explicit GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F);
- bool hasBranchDivergence() { return true; }
- bool useGPUDivergenceAnalysis() const;
+ bool hasBranchDivergence(const Function *F = nullptr) const;
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP,
@@ -133,6 +135,8 @@ public:
unsigned AddrSpace) const;
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
+
+ int64_t getMaxMemIntrinsicInlineSizeThreshold() const;
Type *getMemcpyLoopLoweringType(
LLVMContext & Context, Value * Length, unsigned SrcAddrSpace,
unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign,
@@ -143,7 +147,7 @@ public:
unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
unsigned SrcAlign, unsigned DestAlign,
std::optional<uint32_t> AtomicCpySize) const;
- unsigned getMaxInterleaveFactor(unsigned VF);
+ unsigned getMaxInterleaveFactor(ElementCount VF);
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
@@ -169,6 +173,32 @@ public:
bool isSourceOfDivergence(const Value *V) const;
bool isAlwaysUniform(const Value *V) const;
+ bool isValidAddrSpaceCast(unsigned FromAS, unsigned ToAS) const {
+ if (ToAS == AMDGPUAS::FLAT_ADDRESS) {
+ switch (FromAS) {
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ case AMDGPUAS::LOCAL_ADDRESS:
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ return true;
+ default:
+ break;
+ }
+ return false;
+ }
+ if ((FromAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+ ToAS == AMDGPUAS::CONSTANT_ADDRESS) ||
+ (FromAS == AMDGPUAS::CONSTANT_ADDRESS &&
+ ToAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT))
+ return true;
+ return false;
+ }
+
+ bool addrspacesMayAlias(unsigned AS0, unsigned AS1) const {
+ return AMDGPU::addrspacesMayAlias(AS0, AS1);
+ }
+
unsigned getFlatAddressSpace() const {
// Don't bother running InferAddressSpaces pass on graphics shaders which
// don't use flat addressing.
@@ -188,8 +218,8 @@ public:
Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
Value *NewV) const;
- bool canSimplifyLegacyMulToMul(const Value *Op0, const Value *Op1,
- InstCombiner &IC) const;
+ bool canSimplifyLegacyMulToMul(const Instruction &I, const Value *Op0,
+ const Value *Op1, InstCombiner &IC) const;
std::optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
IntrinsicInst &II) const;
std::optional<Value *> simplifyDemandedVectorEltsIntrinsic(
@@ -209,10 +239,11 @@ public:
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
- unsigned getInliningThresholdMultiplier() { return 11; }
+ unsigned getInliningThresholdMultiplier() const { return 11; }
unsigned adjustInliningThreshold(const CallBase *CB) const;
+ unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const;
- int getInlinerVectorBonusPercent() { return 0; }
+ int getInlinerVectorBonusPercent() const { return InlinerVectorBonusPercent; }
InstructionCost getArithmeticReductionCost(
unsigned Opcode, VectorType *Ty, std::optional<FastMathFlags> FMF,
@@ -220,9 +251,9 @@ public:
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
- InstructionCost getMinMaxReductionCost(
- VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
- TTI::TargetCostKind CostKind);
+ InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
+ FastMathFlags FMF,
+ TTI::TargetCostKind CostKind);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index c27e69a0bcbb..9ad841c3c8a5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -19,6 +19,7 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPUUnifyDivergentExitNodes.h"
#include "AMDGPU.h"
#include "SIDefines.h"
#include "llvm/ADT/ArrayRef.h"
@@ -26,9 +27,9 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constants.h"
@@ -53,25 +54,33 @@ using namespace llvm;
namespace {
-class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+class AMDGPUUnifyDivergentExitNodesImpl {
private:
const TargetTransformInfo *TTI = nullptr;
public:
- static char ID; // Pass identification, replacement for typeid
-
- AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
- initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
- }
+ AMDGPUUnifyDivergentExitNodesImpl() = delete;
+ AMDGPUUnifyDivergentExitNodesImpl(const TargetTransformInfo *TTI)
+ : TTI(TTI) {}
// We can preserve non-critical-edgeness when we unify function exit nodes
- void getAnalysisUsage(AnalysisUsage &AU) const override;
BasicBlock *unifyReturnBlockSet(Function &F, DomTreeUpdater &DTU,
ArrayRef<BasicBlock *> ReturningBlocks,
StringRef Name);
- bool runOnFunction(Function &F) override;
+ bool run(Function &F, DominatorTree *DT, const PostDominatorTree &PDT,
+ const UniformityInfo &UA);
};
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+public:
+ static char ID;
+ AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+ initializeAMDGPUUnifyDivergentExitNodesPass(
+ *PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
} // end anonymous namespace
char AMDGPUUnifyDivergentExitNodes::ID = 0;
@@ -79,20 +88,20 @@ char AMDGPUUnifyDivergentExitNodes::ID = 0;
char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
- "Unify divergent function exit nodes", false, false)
+ "Unify divergent function exit nodes", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
"Unify divergent function exit nodes", false, false)
-void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const {
if (RequireAndPreserveDomTree)
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addRequired<UniformityInfoWrapperPass>();
if (RequireAndPreserveDomTree) {
AU.addPreserved<DominatorTreeWrapperPass>();
@@ -100,7 +109,7 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
}
// No divergent values are changed, only blocks and branch edges.
- AU.addPreserved<LegacyDivergenceAnalysis>();
+ AU.addPreserved<UniformityInfoWrapperPass>();
// We preserve the non-critical-edgeness property
AU.addPreservedID(BreakCriticalEdgesID);
@@ -114,14 +123,13 @@ void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
/// \returns true if \p BB is reachable through only uniform branches.
/// XXX - Is there a more efficient way to find this?
-static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
- BasicBlock &BB) {
+static bool isUniformlyReached(const UniformityInfo &UA, BasicBlock &BB) {
SmallVector<BasicBlock *, 8> Stack(predecessors(&BB));
SmallPtrSet<BasicBlock *, 8> Visited;
while (!Stack.empty()) {
BasicBlock *Top = Stack.pop_back_val();
- if (!DA.isUniform(Top->getTerminator()))
+ if (!UA.isUniform(Top->getTerminator()))
return false;
for (BasicBlock *Pred : predecessors(Top)) {
@@ -133,7 +141,7 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
return true;
}
-BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
+BasicBlock *AMDGPUUnifyDivergentExitNodesImpl::unifyReturnBlockSet(
Function &F, DomTreeUpdater &DTU, ArrayRef<BasicBlock *> ReturningBlocks,
StringRef Name) {
// Otherwise, we need to insert a new basic block into the function, add a PHI
@@ -181,20 +189,14 @@ BasicBlock *AMDGPUUnifyDivergentExitNodes::unifyReturnBlockSet(
return NewRetBlock;
}
-bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
- DominatorTree *DT = nullptr;
- if (RequireAndPreserveDomTree)
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+bool AMDGPUUnifyDivergentExitNodesImpl::run(Function &F, DominatorTree *DT,
+ const PostDominatorTree &PDT,
+ const UniformityInfo &UA) {
if (PDT.root_size() == 0 ||
(PDT.root_size() == 1 &&
!isa<BranchInst>(PDT.getRoot()->getTerminator())))
return false;
- LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
- TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
SmallVector<BasicBlock *, 4> ReturningBlocks;
@@ -213,7 +215,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
// exits, we should only unify UnreachableBlocks that are not uniformly
// reachable.
bool HasDivergentExitBlock = llvm::any_of(
- PDT.roots(), [&](auto BB) { return !isUniformlyReached(DA, *BB); });
+ PDT.roots(), [&](auto BB) { return !isUniformlyReached(UA, *BB); });
for (BasicBlock *BB : PDT.roots()) {
if (isa<ReturnInst>(BB->getTerminator())) {
@@ -327,3 +329,30 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
unifyReturnBlockSet(F, DTU, ReturningBlocks, "UnifiedReturnBlock");
return true;
}
+
+bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+ DominatorTree *DT = nullptr;
+ if (RequireAndPreserveDomTree)
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const auto &PDT =
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ const auto &UA = getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+ const auto *TranformInfo =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return AMDGPUUnifyDivergentExitNodesImpl(TranformInfo).run(F, DT, PDT, UA);
+}
+
+PreservedAnalyses
+AMDGPUUnifyDivergentExitNodesPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ DominatorTree *DT = nullptr;
+ if (RequireAndPreserveDomTree)
+ DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+ const auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ const auto &UA = AM.getResult<UniformityInfoAnalysis>(F);
+ const auto *TransformInfo = &AM.getResult<TargetIRAnalysis>(F);
+ return AMDGPUUnifyDivergentExitNodesImpl(TransformInfo).run(F, DT, PDT, UA)
+ ? PreservedAnalyses::none()
+ : PreservedAnalyses::all();
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h
new file mode 100644
index 000000000000..2fd98a2ee1a9
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.h
@@ -0,0 +1,36 @@
+//===- AMDGPUUnifyDivergentExitNodes.h ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there is
+// at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, so unifying those to the same return block as divergent
+// branches inhibits use of scalar branching. It still can't deal with the case
+// where one branch goes to return, and one unreachable. Replace unreachable in
+// this case with a return.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H
+
+#include "AMDGPU.h"
+
+namespace llvm {
+class AMDGPUUnifyDivergentExitNodesPass
+ : public PassInfoMixin<AMDGPUUnifyDivergentExitNodesPass> {
+public:
+ PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUUNIFYDIVERGENTEXITNODES_H
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 671d263a41a4..b9443559132f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -21,6 +21,7 @@
#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -35,9 +36,8 @@
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/TargetParser/TargetParser.h"
#include <optional>
using namespace llvm;
@@ -119,16 +119,16 @@ public:
ImmTyInstOffset,
ImmTyOffset0,
ImmTyOffset1,
+ ImmTySMEMOffsetMod,
ImmTyCPol,
- ImmTySWZ,
ImmTyTFE,
ImmTyD16,
ImmTyClampSI,
ImmTyOModSI,
- ImmTySdwaDstSel,
- ImmTySdwaSrc0Sel,
- ImmTySdwaSrc1Sel,
- ImmTySdwaDstUnused,
+ ImmTySDWADstSel,
+ ImmTySDWASrc0Sel,
+ ImmTySDWASrc1Sel,
+ ImmTySDWADstUnused,
ImmTyDMask,
ImmTyDim,
ImmTyUNorm,
@@ -145,7 +145,7 @@ public:
ImmTySendMsg,
ImmTyInterpSlot,
ImmTyInterpAttr,
- ImmTyAttrChan,
+ ImmTyInterpAttrChan,
ImmTyOpSel,
ImmTyOpSelHi,
ImmTyNegLo,
@@ -155,7 +155,7 @@ public:
ImmTyDppRowMask,
ImmTyDppBankMask,
ImmTyDppBoundCtrl,
- ImmTyDppFi,
+ ImmTyDppFI,
ImmTySwizzle,
ImmTyGprIdxMode,
ImmTyHigh,
@@ -347,6 +347,8 @@ public:
return isImm() && Imm.Type == ImmT;
}
+ bool isImmLiteral() const { return isImmTy(ImmTyNone); }
+
bool isImmModifier() const {
return isImm() && Imm.Type != ImmTyNone;
}
@@ -370,26 +372,25 @@ public:
bool isOffset() const { return isImmTy(ImmTyOffset) && isUInt<16>(getImm()); }
bool isOffset0() const { return isImmTy(ImmTyOffset0) && isUInt<8>(getImm()); }
bool isOffset1() const { return isImmTy(ImmTyOffset1) && isUInt<8>(getImm()); }
-
+ bool isSMEMOffsetMod() const { return isImmTy(ImmTySMEMOffsetMod); }
bool isFlatOffset() const { return isImmTy(ImmTyOffset) || isImmTy(ImmTyInstOffset); }
bool isGDS() const { return isImmTy(ImmTyGDS); }
bool isLDS() const { return isImmTy(ImmTyLDS); }
bool isCPol() const { return isImmTy(ImmTyCPol); }
- bool isSWZ() const { return isImmTy(ImmTySWZ); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isD16() const { return isImmTy(ImmTyD16); }
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); }
- bool isBankMask() const { return isImmTy(ImmTyDppBankMask); }
- bool isRowMask() const { return isImmTy(ImmTyDppRowMask); }
+ bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); }
+ bool isDppRowMask() const { return isImmTy(ImmTyDppRowMask); }
bool isDppBoundCtrl() const { return isImmTy(ImmTyDppBoundCtrl); }
- bool isFI() const { return isImmTy(ImmTyDppFi); }
- bool isSDWADstSel() const { return isImmTy(ImmTySdwaDstSel); }
- bool isSDWASrc0Sel() const { return isImmTy(ImmTySdwaSrc0Sel); }
- bool isSDWASrc1Sel() const { return isImmTy(ImmTySdwaSrc1Sel); }
- bool isSDWADstUnused() const { return isImmTy(ImmTySdwaDstUnused); }
+ bool isDppFI() const { return isImmTy(ImmTyDppFI); }
+ bool isSDWADstSel() const { return isImmTy(ImmTySDWADstSel); }
+ bool isSDWASrc0Sel() const { return isImmTy(ImmTySDWASrc0Sel); }
+ bool isSDWASrc1Sel() const { return isImmTy(ImmTySDWASrc1Sel); }
+ bool isSDWADstUnused() const { return isImmTy(ImmTySDWADstUnused); }
bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); }
bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); }
- bool isAttrChan() const { return isImmTy(ImmTyAttrChan); }
+ bool isInterpAttrChan() const { return isImmTy(ImmTyInterpAttrChan); }
bool isOpSel() const { return isImmTy(ImmTyOpSel); }
bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
bool isNegLo() const { return isImmTy(ImmTyNegLo); }
@@ -855,13 +856,11 @@ public:
return Kind == Expression;
}
- bool isSoppBrTarget() const {
- return isExpr() || isImm();
- }
+ bool isSOPPBrTarget() const { return isExpr() || isImm(); }
bool isSWaitCnt() const;
bool isDepCtr() const;
- bool isSDelayAlu() const;
+ bool isSDelayALU() const;
bool isHwreg() const;
bool isSendMsg() const;
bool isSwizzle() const;
@@ -948,28 +947,11 @@ public:
void addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const;
- template <unsigned Bitwidth>
- void addKImmFPOperands(MCInst &Inst, unsigned N) const;
-
- void addKImmFP16Operands(MCInst &Inst, unsigned N) const {
- addKImmFPOperands<16>(Inst, N);
- }
-
- void addKImmFP32Operands(MCInst &Inst, unsigned N) const {
- addKImmFPOperands<32>(Inst, N);
- }
-
void addRegOperands(MCInst &Inst, unsigned N) const;
- void addBoolRegOperands(MCInst &Inst, unsigned N) const {
- addRegOperands(Inst, N);
- }
-
void addRegOrImmOperands(MCInst &Inst, unsigned N) const {
if (isRegKind())
addRegOperands(Inst, N);
- else if (isExpr())
- Inst.addOperand(MCOperand::createExpr(Expr));
else
addImmOperands(Inst, N);
}
@@ -1011,15 +993,6 @@ public:
addRegWithInputModsOperands(Inst, N);
}
- void addSoppBrTargetOperands(MCInst &Inst, unsigned N) const {
- if (isImm())
- addImmOperands(Inst, N);
- else {
- assert(isExpr());
- Inst.addOperand(MCOperand::createExpr(Expr));
- }
- }
-
static void printImmTy(raw_ostream& OS, ImmTy Type) {
switch (Type) {
case ImmTyNone: OS << "None"; break;
@@ -1032,8 +1005,8 @@ public:
case ImmTyInstOffset: OS << "InstOffset"; break;
case ImmTyOffset0: OS << "Offset0"; break;
case ImmTyOffset1: OS << "Offset1"; break;
+ case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break;
case ImmTyCPol: OS << "CPol"; break;
- case ImmTySWZ: OS << "SWZ"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
@@ -1044,11 +1017,11 @@ public:
case ImmTyDppRowMask: OS << "DppRowMask"; break;
case ImmTyDppBankMask: OS << "DppBankMask"; break;
case ImmTyDppBoundCtrl: OS << "DppBoundCtrl"; break;
- case ImmTyDppFi: OS << "FI"; break;
- case ImmTySdwaDstSel: OS << "SdwaDstSel"; break;
- case ImmTySdwaSrc0Sel: OS << "SdwaSrc0Sel"; break;
- case ImmTySdwaSrc1Sel: OS << "SdwaSrc1Sel"; break;
- case ImmTySdwaDstUnused: OS << "SdwaDstUnused"; break;
+ case ImmTyDppFI: OS << "DppFI"; break;
+ case ImmTySDWADstSel: OS << "SDWADstSel"; break;
+ case ImmTySDWASrc0Sel: OS << "SDWASrc0Sel"; break;
+ case ImmTySDWASrc1Sel: OS << "SDWASrc1Sel"; break;
+ case ImmTySDWADstUnused: OS << "SDWADstUnused"; break;
case ImmTyDMask: OS << "DMask"; break;
case ImmTyDim: OS << "Dim"; break;
case ImmTyUNorm: OS << "UNorm"; break;
@@ -1064,7 +1037,7 @@ public:
case ImmTySendMsg: OS << "SendMsg"; break;
case ImmTyInterpSlot: OS << "InterpSlot"; break;
case ImmTyInterpAttr: OS << "InterpAttr"; break;
- case ImmTyAttrChan: OS << "AttrChan"; break;
+ case ImmTyInterpAttrChan: OS << "InterpAttrChan"; break;
case ImmTyOpSel: OS << "OpSel"; break;
case ImmTyOpSelHi: OS << "OpSelHi"; break;
case ImmTyNegLo: OS << "NegLo"; break;
@@ -1339,8 +1312,6 @@ private:
unsigned RegWidth);
void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands,
bool IsAtomic);
- void cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
- bool IsGdsHardcoded);
public:
enum AMDGPUMatchResultTy {
@@ -1481,6 +1452,14 @@ public:
return getFeatureBits()[AMDGPU::FeatureIntClamp];
}
+ bool hasPartialNSAEncoding() const {
+ return getFeatureBits()[AMDGPU::FeaturePartialNSAEncoding];
+ }
+
+ unsigned getNSAMaxSize() const {
+ return AMDGPU::getNSAMaxSize(getSTI());
+ }
+
AMDGPUTargetStreamer &getTargetStreamer() {
MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
return static_cast<AMDGPUTargetStreamer &>(TS);
@@ -1526,36 +1505,34 @@ public:
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
bool ParseDirective(AsmToken DirectiveID) override;
- OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Mnemonic,
- OperandMode Mode = OperandMode_Default);
+ ParseStatus parseOperand(OperandVector &Operands, StringRef Mnemonic,
+ OperandMode Mode = OperandMode_Default);
StringRef parseMnemonicSuffix(StringRef Name);
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
//bool ProcessInstruction(MCInst &Inst);
- OperandMatchResultTy parseTokenOp(StringRef Name, OperandVector &Operands);
+ ParseStatus parseTokenOp(StringRef Name, OperandVector &Operands);
- OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int);
+ ParseStatus parseIntWithPrefix(const char *Prefix, int64_t &Int);
- OperandMatchResultTy
+ ParseStatus
parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
- bool (*ConvertResult)(int64_t &) = nullptr);
+ std::function<bool(int64_t &)> ConvertResult = nullptr);
- OperandMatchResultTy
- parseOperandArrayWithPrefix(const char *Prefix,
- OperandVector &Operands,
- AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
- bool (*ConvertResult)(int64_t&) = nullptr);
+ ParseStatus parseOperandArrayWithPrefix(
+ const char *Prefix, OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+ bool (*ConvertResult)(int64_t &) = nullptr);
- OperandMatchResultTy
+ ParseStatus
parseNamedBit(StringRef Name, OperandVector &Operands,
AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
unsigned getCPolKind(StringRef Id, StringRef Mnemo, bool &Disabling) const;
- OperandMatchResultTy parseCPol(OperandVector &Operands);
- OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
- StringRef &Value,
- SMLoc &StringLoc);
+ ParseStatus parseCPol(OperandVector &Operands);
+ ParseStatus parseStringWithPrefix(StringRef Prefix, StringRef &Value,
+ SMLoc &StringLoc);
bool isModifier();
bool isOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
@@ -1563,42 +1540,44 @@ public:
bool isNamedOperandModifier(const AsmToken &Token, const AsmToken &NextToken) const;
bool isOpcodeModifierWithVal(const AsmToken &Token, const AsmToken &NextToken) const;
bool parseSP3NegModifier();
- OperandMatchResultTy parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false);
- OperandMatchResultTy parseReg(OperandVector &Operands);
- OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false);
- OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true);
- OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true);
- OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
- OperandMatchResultTy parseRegWithIntInputMods(OperandVector &Operands);
- OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
- OperandMatchResultTy parseDfmtNfmt(int64_t &Format);
- OperandMatchResultTy parseUfmt(int64_t &Format);
- OperandMatchResultTy parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, int64_t &Format);
- OperandMatchResultTy parseSymbolicUnifiedFormat(StringRef FormatStr, SMLoc Loc, int64_t &Format);
- OperandMatchResultTy parseFORMAT(OperandVector &Operands);
- OperandMatchResultTy parseSymbolicOrNumericFormat(int64_t &Format);
- OperandMatchResultTy parseNumericFormat(int64_t &Format);
- OperandMatchResultTy parseFlatOffset(OperandVector &Operands);
- OperandMatchResultTy parseR128A16(OperandVector &Operands);
+ ParseStatus parseImm(OperandVector &Operands, bool HasSP3AbsModifier = false);
+ ParseStatus parseReg(OperandVector &Operands);
+ ParseStatus parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod = false);
+ ParseStatus parseRegOrImmWithFPInputMods(OperandVector &Operands,
+ bool AllowImm = true);
+ ParseStatus parseRegOrImmWithIntInputMods(OperandVector &Operands,
+ bool AllowImm = true);
+ ParseStatus parseRegWithFPInputMods(OperandVector &Operands);
+ ParseStatus parseRegWithIntInputMods(OperandVector &Operands);
+ ParseStatus parseVReg32OrOff(OperandVector &Operands);
+ ParseStatus parseDfmtNfmt(int64_t &Format);
+ ParseStatus parseUfmt(int64_t &Format);
+ ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc,
+ int64_t &Format);
+ ParseStatus parseSymbolicUnifiedFormat(StringRef FormatStr, SMLoc Loc,
+ int64_t &Format);
+ ParseStatus parseFORMAT(OperandVector &Operands);
+ ParseStatus parseSymbolicOrNumericFormat(int64_t &Format);
+ ParseStatus parseNumericFormat(int64_t &Format);
+ ParseStatus parseFlatOffset(OperandVector &Operands);
+ ParseStatus parseR128A16(OperandVector &Operands);
+ ParseStatus parseBLGP(OperandVector &Operands);
bool tryParseFmt(const char *Pref, int64_t MaxVal, int64_t &Val);
bool matchDfmtNfmt(int64_t &Dfmt, int64_t &Nfmt, StringRef FormatStr, SMLoc Loc);
- void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
- void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); }
- void cvtDSGds(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, true); }
void cvtExp(MCInst &Inst, const OperandVector &Operands);
bool parseCnt(int64_t &IntVal);
- OperandMatchResultTy parseSWaitCntOps(OperandVector &Operands);
+ ParseStatus parseSWaitCnt(OperandVector &Operands);
bool parseDepCtr(int64_t &IntVal, unsigned &Mask);
void depCtrError(SMLoc Loc, int ErrorId, StringRef DepCtrName);
- OperandMatchResultTy parseDepCtrOps(OperandVector &Operands);
+ ParseStatus parseDepCtr(OperandVector &Operands);
bool parseDelay(int64_t &Delay);
- OperandMatchResultTy parseSDelayAluOps(OperandVector &Operands);
+ ParseStatus parseSDelayALU(OperandVector &Operands);
- OperandMatchResultTy parseHwreg(OperandVector &Operands);
+ ParseStatus parseHwreg(OperandVector &Operands);
private:
struct OperandInfoTy {
@@ -1648,7 +1627,7 @@ private:
bool validateMIMGGatherDMask(const MCInst &Inst);
bool validateMovrels(const MCInst &Inst, const OperandVector &Operands);
bool validateMIMGDataSize(const MCInst &Inst, const SMLoc &IDLoc);
- bool validateMIMGAddrSize(const MCInst &Inst);
+ bool validateMIMGAddrSize(const MCInst &Inst, const SMLoc &IDLoc);
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGMSAA(const MCInst &Inst);
bool validateOpSel(const MCInst &Inst);
@@ -1706,15 +1685,14 @@ private:
public:
void onBeginOfFile() override;
- OperandMatchResultTy parseCustomOperand(OperandVector &Operands,
- unsigned MCK);
+ ParseStatus parseCustomOperand(OperandVector &Operands, unsigned MCK);
- OperandMatchResultTy parseExpTgt(OperandVector &Operands);
- OperandMatchResultTy parseSendMsgOp(OperandVector &Operands);
- OperandMatchResultTy parseInterpSlot(OperandVector &Operands);
- OperandMatchResultTy parseInterpAttr(OperandVector &Operands);
- OperandMatchResultTy parseSOppBrTarget(OperandVector &Operands);
- OperandMatchResultTy parseBoolReg(OperandVector &Operands);
+ ParseStatus parseExpTgt(OperandVector &Operands);
+ ParseStatus parseSendMsg(OperandVector &Operands);
+ ParseStatus parseInterpSlot(OperandVector &Operands);
+ ParseStatus parseInterpAttr(OperandVector &Operands);
+ ParseStatus parseSOPPBrTarget(OperandVector &Operands);
+ ParseStatus parseBoolReg(OperandVector &Operands);
bool parseSwizzleOperand(int64_t &Op,
const unsigned MinVal,
@@ -1725,7 +1703,7 @@ public:
const unsigned MinVal,
const unsigned MaxVal,
const StringRef ErrMsg);
- OperandMatchResultTy parseSwizzleOp(OperandVector &Operands);
+ ParseStatus parseSwizzle(OperandVector &Operands);
bool parseSwizzleOffset(int64_t &Imm);
bool parseSwizzleMacro(int64_t &Imm);
bool parseSwizzleQuadPerm(int64_t &Imm);
@@ -1734,21 +1712,13 @@ public:
bool parseSwizzleSwap(int64_t &Imm);
bool parseSwizzleReverse(int64_t &Imm);
- OperandMatchResultTy parseGPRIdxMode(OperandVector &Operands);
+ ParseStatus parseGPRIdxMode(OperandVector &Operands);
int64_t parseGPRIdxMacro();
void cvtMubuf(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, false); }
void cvtMubufAtomic(MCInst &Inst, const OperandVector &Operands) { cvtMubufImpl(Inst, Operands, true); }
- void cvtMtbuf(MCInst &Inst, const OperandVector &Operands);
-
- AMDGPUOperand::Ptr defaultCPol() const;
- AMDGPUOperand::Ptr defaultSMRDOffset8() const;
- AMDGPUOperand::Ptr defaultSMEMOffset() const;
- AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
- AMDGPUOperand::Ptr defaultFlatOffset() const;
-
- OperandMatchResultTy parseOModOperand(OperandVector &Operands);
+ ParseStatus parseOModSI(OperandVector &Operands);
void cvtVOP3(MCInst &Inst, const OperandVector &Operands,
OptionalImmIndexMap &OptionalIdx);
@@ -1763,25 +1733,16 @@ public:
void cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands);
void cvtVINTERP(MCInst &Inst, const OperandVector &Operands);
-
- void cvtMIMG(MCInst &Inst, const OperandVector &Operands,
- bool IsAtomic = false);
- void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
- void cvtIntersectRay(MCInst &Inst, const OperandVector &Operands);
-
void cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands);
bool parseDimId(unsigned &Encoding);
- OperandMatchResultTy parseDim(OperandVector &Operands);
- OperandMatchResultTy parseDPP8(OperandVector &Operands);
- OperandMatchResultTy parseDPPCtrl(OperandVector &Operands);
+ ParseStatus parseDim(OperandVector &Operands);
+ bool convertDppBoundCtrl(int64_t &BoundCtrl);
+ ParseStatus parseDPP8(OperandVector &Operands);
+ ParseStatus parseDPPCtrl(OperandVector &Operands);
bool isSupportedDPPCtrl(StringRef Ctrl, const OperandVector &Operands);
int64_t parseDPPCtrlSel(StringRef Ctrl);
int64_t parseDPPCtrlPerm();
- AMDGPUOperand::Ptr defaultRowMask() const;
- AMDGPUOperand::Ptr defaultBankMask() const;
- AMDGPUOperand::Ptr defaultDppBoundCtrl() const;
- AMDGPUOperand::Ptr defaultFI() const;
void cvtDPP(MCInst &Inst, const OperandVector &Operands, bool IsDPP8 = false);
void cvtDPP8(MCInst &Inst, const OperandVector &Operands) {
cvtDPP(Inst, Operands, true);
@@ -1792,9 +1753,9 @@ public:
cvtVOP3DPP(Inst, Operands, true);
}
- OperandMatchResultTy parseSDWASel(OperandVector &Operands, StringRef Prefix,
- AMDGPUOperand::ImmTy Type);
- OperandMatchResultTy parseSDWADstUnused(OperandVector &Operands);
+ ParseStatus parseSDWASel(OperandVector &Operands, StringRef Prefix,
+ AMDGPUOperand::ImmTy Type);
+ ParseStatus parseSDWADstUnused(OperandVector &Operands);
void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands);
@@ -1805,16 +1766,9 @@ public:
bool SkipDstVcc = false,
bool SkipSrcVcc = false);
- AMDGPUOperand::Ptr defaultBLGP() const;
- AMDGPUOperand::Ptr defaultCBSZ() const;
- AMDGPUOperand::Ptr defaultABID() const;
-
- OperandMatchResultTy parseEndpgmOp(OperandVector &Operands);
- AMDGPUOperand::Ptr defaultEndpgmImmOperands() const;
+ ParseStatus parseEndpgm(OperandVector &Operands);
- AMDGPUOperand::Ptr defaultWaitVDST() const;
- AMDGPUOperand::Ptr defaultWaitEXP() const;
- OperandMatchResultTy parseVOPD(OperandVector &Operands);
+ ParseStatus parseVOPD(OperandVector &Operands);
};
} // end anonymous namespace
@@ -2089,6 +2043,11 @@ uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
}
void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
+ if (isExpr()) {
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ return;
+ }
+
if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()),
Inst.getNumOperands())) {
addLiteralImmOperand(Inst, Imm.Val,
@@ -2285,24 +2244,6 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyMo
}
}
-template <unsigned Bitwidth>
-void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const {
- APInt Literal(64, Imm.Val);
- setImmKindMandatoryLiteral();
-
- if (!Imm.IsFPImm) {
- // We got int literal token.
- Inst.addOperand(MCOperand::createImm(Literal.getLoBits(Bitwidth).getZExtValue()));
- return;
- }
-
- bool Lost;
- APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
- FPLiteral.convert(*getFltSemantics(Bitwidth / 8),
- APFloat::rmNearestTiesToEven, &Lost);
- Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
-}
-
void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const {
Inst.addOperand(MCOperand::createReg(AMDGPU::getMCReg(getReg(), AsmParser->getSTI())));
}
@@ -2922,12 +2863,12 @@ AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) {
return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc);
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) {
+ParseStatus AMDGPUAsmParser::parseImm(OperandVector &Operands,
+ bool HasSP3AbsModifier) {
// TODO: add syntactic sugar for 1/(2*PI)
if (isRegister())
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
assert(!isModifier());
const auto& Tok = getToken();
@@ -2952,9 +2893,8 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) {
APFloat RealVal(APFloat::IEEEdouble());
auto roundMode = APFloat::rmNearestTiesToEven;
- if (errorToBool(RealVal.convertFromString(Num, roundMode).takeError())) {
- return MatchOperand_ParseFail;
- }
+ if (errorToBool(RealVal.convertFromString(Num, roundMode).takeError()))
+ return ParseStatus::Failure;
if (Negate)
RealVal.changeSign();
@@ -2962,7 +2902,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) {
AMDGPUOperand::CreateImm(this, RealVal.bitcastToAPInt().getZExtValue(), S,
AMDGPUOperand::ImmTyNone, true));
- return MatchOperand_Success;
+ return ParseStatus::Success;
} else {
int64_t IntVal;
@@ -2979,10 +2919,10 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) {
// MC expressions (due to the trailing '|').
SMLoc EndLoc;
if (getParser().parsePrimaryExpr(Expr, EndLoc, nullptr))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
} else {
if (Parser.parseExpression(Expr))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
if (Expr->evaluateAsAbsolute(IntVal)) {
@@ -2991,35 +2931,32 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) {
Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
}
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseReg(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseReg(OperandVector &Operands) {
if (!isRegister())
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
if (auto R = parseRegister()) {
assert(R->isReg());
Operands.push_back(std::move(R));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool HasSP3AbsMod) {
- auto res = parseReg(Operands);
- if (res != MatchOperand_NoMatch) {
- return res;
- } else if (isModifier()) {
- return MatchOperand_NoMatch;
- } else {
- return parseImm(Operands, HasSP3AbsMod);
- }
+ParseStatus AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands,
+ bool HasSP3AbsMod) {
+ ParseStatus Res = parseReg(Operands);
+ if (!Res.isNoMatch())
+ return Res;
+ if (isModifier())
+ return ParseStatus::NoMatch;
+ return parseImm(Operands, HasSP3AbsMod);
}
bool
@@ -3110,7 +3047,7 @@ AMDGPUAsmParser::parseSP3NegModifier() {
return false;
}
-OperandMatchResultTy
+ParseStatus
AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
bool AllowImm) {
bool Neg, SP3Neg;
@@ -3118,49 +3055,42 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
SMLoc Loc;
// Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead.
- if (isToken(AsmToken::Minus) && peekToken().is(AsmToken::Minus)) {
- Error(getLoc(), "invalid syntax, expected 'neg' modifier");
- return MatchOperand_ParseFail;
- }
+ if (isToken(AsmToken::Minus) && peekToken().is(AsmToken::Minus))
+ return Error(getLoc(), "invalid syntax, expected 'neg' modifier");
SP3Neg = parseSP3NegModifier();
Loc = getLoc();
Neg = trySkipId("neg");
- if (Neg && SP3Neg) {
- Error(Loc, "expected register or immediate");
- return MatchOperand_ParseFail;
- }
+ if (Neg && SP3Neg)
+ return Error(Loc, "expected register or immediate");
if (Neg && !skipToken(AsmToken::LParen, "expected left paren after neg"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
Abs = trySkipId("abs");
if (Abs && !skipToken(AsmToken::LParen, "expected left paren after abs"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
Loc = getLoc();
SP3Abs = trySkipToken(AsmToken::Pipe);
- if (Abs && SP3Abs) {
- Error(Loc, "expected register or immediate");
- return MatchOperand_ParseFail;
- }
+ if (Abs && SP3Abs)
+ return Error(Loc, "expected register or immediate");
- OperandMatchResultTy Res;
+ ParseStatus Res;
if (AllowImm) {
Res = parseRegOrImm(Operands, SP3Abs);
} else {
Res = parseReg(Operands);
}
- if (Res != MatchOperand_Success) {
- return (SP3Neg || Neg || SP3Abs || Abs)? MatchOperand_ParseFail : Res;
- }
+ if (!Res.isSuccess())
+ return (SP3Neg || Neg || SP3Abs || Abs) ? ParseStatus::Failure : Res;
if (SP3Abs && !skipToken(AsmToken::Pipe, "expected vertical bar"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
if (Abs && !skipToken(AsmToken::RParen, "expected closing parentheses"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
if (Neg && !skipToken(AsmToken::RParen, "expected closing parentheses"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
AMDGPUOperand::Modifiers Mods;
Mods.Abs = Abs || SP3Abs;
@@ -3168,79 +3098,71 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
if (Mods.hasFPModifiers()) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
- if (Op.isExpr()) {
- Error(Op.getStartLoc(), "expected an absolute expression");
- return MatchOperand_ParseFail;
- }
+ if (Op.isExpr())
+ return Error(Op.getStartLoc(), "expected an absolute expression");
Op.setModifiers(Mods);
}
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy
+ParseStatus
AMDGPUAsmParser::parseRegOrImmWithIntInputMods(OperandVector &Operands,
bool AllowImm) {
bool Sext = trySkipId("sext");
if (Sext && !skipToken(AsmToken::LParen, "expected left paren after sext"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
- OperandMatchResultTy Res;
+ ParseStatus Res;
if (AllowImm) {
Res = parseRegOrImm(Operands);
} else {
Res = parseReg(Operands);
}
- if (Res != MatchOperand_Success) {
- return Sext? MatchOperand_ParseFail : Res;
- }
+ if (!Res.isSuccess())
+ return Sext ? ParseStatus::Failure : Res;
if (Sext && !skipToken(AsmToken::RParen, "expected closing parentheses"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
AMDGPUOperand::Modifiers Mods;
Mods.Sext = Sext;
if (Mods.hasIntModifiers()) {
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
- if (Op.isExpr()) {
- Error(Op.getStartLoc(), "expected an absolute expression");
- return MatchOperand_ParseFail;
- }
+ if (Op.isExpr())
+ return Error(Op.getStartLoc(), "expected an absolute expression");
Op.setModifiers(Mods);
}
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseRegWithFPInputMods(OperandVector &Operands) {
return parseRegOrImmWithFPInputMods(Operands, false);
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseRegWithIntInputMods(OperandVector &Operands) {
return parseRegOrImmWithIntInputMods(Operands, false);
}
-OperandMatchResultTy AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseVReg32OrOff(OperandVector &Operands) {
auto Loc = getLoc();
if (trySkipId("off")) {
Operands.push_back(AMDGPUOperand::CreateImm(this, 0, Loc,
AMDGPUOperand::ImmTyOff, false));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
if (!isRegister())
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
std::unique_ptr<AMDGPUOperand> Reg = parseRegister();
if (Reg) {
Operands.push_back(std::move(Reg));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
- return MatchOperand_ParseFail;
-
+ return ParseStatus::Failure;
}
unsigned AMDGPUAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
@@ -3647,7 +3569,8 @@ bool AMDGPUAsmParser::validateMIMGDataSize(const MCInst &Inst,
return false;
}
-bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
+bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst,
+ const SMLoc &IDLoc) {
const unsigned Opc = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opc);
@@ -3667,8 +3590,13 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
assert(SrsrcIdx != -1);
assert(SrsrcIdx > VAddr0Idx);
- if (DimIdx == -1)
- return true; // intersect_ray
+ bool IsA16 = Inst.getOperand(A16Idx).getImm();
+ if (BaseOpcode->BVH) {
+ if (IsA16 == BaseOpcode->A16)
+ return true;
+ Error(IDLoc, "image address size does not match a16");
+ return false;
+ }
unsigned Dim = Inst.getOperand(DimIdx).getImm();
const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfoByEncoding(Dim);
@@ -3676,12 +3604,19 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
unsigned ActualAddrSize =
IsNSA ? SrsrcIdx - VAddr0Idx
: AMDGPU::getRegOperandSize(getMRI(), Desc, VAddr0Idx) / 4;
- bool IsA16 = (A16Idx != -1 && Inst.getOperand(A16Idx).getImm());
unsigned ExpectedAddrSize =
AMDGPU::getAddrSizeMIMGOp(BaseOpcode, DimInfo, IsA16, hasG16());
- if (!IsNSA) {
+ if (IsNSA) {
+ if (hasPartialNSAEncoding() && ExpectedAddrSize > getNSAMaxSize()) {
+ int VAddrLastIdx = SrsrcIdx - 1;
+ unsigned VAddrLastSize =
+ AMDGPU::getRegOperandSize(getMRI(), Desc, VAddrLastIdx) / 4;
+
+ ActualAddrSize = VAddrLastIdx - VAddr0Idx + VAddrLastSize;
+ }
+ } else {
if (ExpectedAddrSize > 12)
ExpectedAddrSize = 16;
@@ -3692,7 +3627,11 @@ bool AMDGPUAsmParser::validateMIMGAddrSize(const MCInst &Inst) {
return true;
}
- return ActualAddrSize == ExpectedAddrSize;
+ if (ActualAddrSize == ExpectedAddrSize)
+ return true;
+
+ Error(IDLoc, "image address size does not match dim and a16");
+ return false;
}
bool AMDGPUAsmParser::validateMIMGAtomicDMask(const MCInst &Inst) {
@@ -4136,7 +4075,7 @@ SMLoc AMDGPUAsmParser::getSMEMOffsetLoc(const OperandVector &Operands) const {
// Start with second operand because SMEM Offset cannot be dst or src0.
for (unsigned i = 2, e = Operands.size(); i != e; ++i) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
- if (Op.isSMEMOffset())
+ if (Op.isSMEMOffset() || Op.isSMEMOffsetMod())
return Op.getStartLoc();
}
return getLoc();
@@ -4628,11 +4567,8 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateMIMGDataSize(Inst, IDLoc)) {
return false;
}
- if (!validateMIMGAddrSize(Inst)) {
- Error(IDLoc,
- "image address size does not match dim and a16");
+ if (!validateMIMGAddrSize(Inst, IDLoc))
return false;
- }
if (!validateMIMGAtomicDMask(Inst)) {
Error(getImmLoc(AMDGPUOperand::ImmTyDMask, Operands),
"invalid atomic image dmask");
@@ -5242,10 +5178,10 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
#undef PARSE_BITS_ENTRY
}
- if (Seen.find(".amdhsa_next_free_vgpr") == Seen.end())
+ if (!Seen.contains(".amdhsa_next_free_vgpr"))
return TokError(".amdhsa_next_free_vgpr directive is required");
- if (Seen.find(".amdhsa_next_free_sgpr") == Seen.end())
+ if (!Seen.contains(".amdhsa_next_free_sgpr"))
return TokError(".amdhsa_next_free_sgpr directive is required");
unsigned VGPRBlocks;
@@ -5283,7 +5219,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
UserSGPRCount);
if (isGFX90A()) {
- if (Seen.find(".amdhsa_accum_offset") == Seen.end())
+ if (!Seen.contains(".amdhsa_accum_offset"))
return TokError(".amdhsa_accum_offset directive is required");
if (AccumOffset < 4 || AccumOffset > 256 || (AccumOffset & 3))
return TokError("accum_offset should be in range [4..256] in "
@@ -5294,9 +5230,9 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
(AccumOffset / 4 - 1));
}
- if (IVersion.Major == 10) {
+ if (IVersion.Major >= 10) {
// SharedVGPRCount < 16 checked by PARSE_ENTRY_BITS
- if (SharedVGPRCount && EnableWavefrontSize32) {
+ if (SharedVGPRCount && EnableWavefrontSize32 && *EnableWavefrontSize32) {
return TokError("shared_vgpr_count directive not valid on "
"wavefront size 32");
}
@@ -5309,7 +5245,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
getTargetStreamer().EmitAmdhsaKernelDescriptor(
getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC,
- ReserveFlatScr);
+ ReserveFlatScr, AMDGPU::getAmdhsaCodeObjectVersion());
return false;
}
@@ -5487,10 +5423,10 @@ bool AMDGPUAsmParser::ParseDirectiveHSAMetadata() {
const char *AssemblerDirectiveEnd;
std::tie(AssemblerDirectiveBegin, AssemblerDirectiveEnd) =
isHsaAbiVersion3AndAbove(&getSTI())
- ? std::tuple(HSAMD::V3::AssemblerDirectiveBegin,
- HSAMD::V3::AssemblerDirectiveEnd)
- : std::tuple(HSAMD::AssemblerDirectiveBegin,
- HSAMD::AssemblerDirectiveEnd);
+ ? std::pair(HSAMD::V3::AssemblerDirectiveBegin,
+ HSAMD::V3::AssemblerDirectiveEnd)
+ : std::pair(HSAMD::AssemblerDirectiveBegin,
+ HSAMD::AssemblerDirectiveEnd);
if (getSTI().getTargetTriple().getOS() != Triple::AMDHSA) {
return Error(getLoc(),
@@ -5609,7 +5545,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
return TokError("expected identifier in directive");
MCSymbol *Symbol = getContext().getOrCreateSymbol(Name);
- if (parseToken(AsmToken::Comma, "expected ','"))
+ if (getParser().parseComma())
return true;
unsigned LocalMemorySize = AMDGPU::IsaInfo::getLocalMemorySize(&getSTI());
@@ -5758,16 +5694,15 @@ bool AMDGPUAsmParser::subtargetHasRegister(const MCRegisterInfo &MRI,
return true;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
- OperandMode Mode) {
- OperandMatchResultTy ResTy = parseVOPD(Operands);
- if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail ||
- isToken(AsmToken::EndOfStatement))
- return ResTy;
+ParseStatus AMDGPUAsmParser::parseOperand(OperandVector &Operands,
+ StringRef Mnemonic,
+ OperandMode Mode) {
+ ParseStatus Res = parseVOPD(Operands);
+ if (Res.isSuccess() || Res.isFailure() || isToken(AsmToken::EndOfStatement))
+ return Res;
// Try to parse with a custom parser
- ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+ Res = MatchOperandParserImpl(Operands, Mnemonic);
// If we successfully parsed the operand or if there as an error parsing,
// we are done.
@@ -5775,9 +5710,8 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
// If we are parsing after we reach EndOfStatement then this means we
// are appending default values to the Operands list. This is only done
// by custom parser, so we shouldn't continue on to the generic parsing.
- if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail ||
- isToken(AsmToken::EndOfStatement))
- return ResTy;
+ if (Res.isSuccess() || Res.isFailure() || isToken(AsmToken::EndOfStatement))
+ return Res;
SMLoc RBraceLoc;
SMLoc LBraceLoc = getLoc();
@@ -5786,20 +5720,19 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
for (;;) {
auto Loc = getLoc();
- ResTy = parseReg(Operands);
- if (ResTy == MatchOperand_NoMatch)
+ Res = parseReg(Operands);
+ if (Res.isNoMatch())
Error(Loc, "expected a register");
- if (ResTy != MatchOperand_Success)
- return MatchOperand_ParseFail;
+ if (!Res.isSuccess())
+ return ParseStatus::Failure;
RBraceLoc = getLoc();
if (trySkipToken(AsmToken::RBrac))
break;
if (!skipToken(AsmToken::Comma,
- "expected a comma or a closing square bracket")) {
- return MatchOperand_ParseFail;
- }
+ "expected a comma or a closing square bracket"))
+ return ParseStatus::Failure;
}
if (Operands.size() - Prefix > 1) {
@@ -5808,7 +5741,7 @@ AMDGPUAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic,
Operands.push_back(AMDGPUOperand::CreateToken(this, "]", RBraceLoc));
}
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
return parseRegOrImm(Operands);
@@ -5862,15 +5795,14 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
OperandMode Mode = OperandMode_Default;
if (IsMIMG && isGFX10Plus() && Operands.size() == 2)
Mode = OperandMode_NSA;
- OperandMatchResultTy Res = parseOperand(Operands, Name, Mode);
+ ParseStatus Res = parseOperand(Operands, Name, Mode);
- if (Res != MatchOperand_Success) {
+ if (!Res.isSuccess()) {
checkUnsupportedInstruction(Name, NameLoc);
if (!Parser.hasPendingError()) {
// FIXME: use real operand location rather than the current location.
- StringRef Msg =
- (Res == MatchOperand_ParseFail) ? "failed parsing operand." :
- "not a valid operand.";
+ StringRef Msg = Res.isFailure() ? "failed parsing operand."
+ : "not a valid operand.";
Error(getLoc(), Msg);
}
while (!trySkipToken(AsmToken::EndOfStatement)) {
@@ -5890,34 +5822,33 @@ bool AMDGPUAsmParser::ParseInstruction(ParseInstructionInfo &Info,
// Utility functions
//===----------------------------------------------------------------------===//
-OperandMatchResultTy AMDGPUAsmParser::parseTokenOp(StringRef Name,
- OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseTokenOp(StringRef Name,
+ OperandVector &Operands) {
SMLoc S = getLoc();
if (!trySkipId(Name))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
Operands.push_back(AMDGPUOperand::CreateToken(this, Name, S));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &IntVal) {
+ParseStatus AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix,
+ int64_t &IntVal) {
if (!trySkipId(Prefix, AsmToken::Colon))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
- return parseExpr(IntVal) ? MatchOperand_Success : MatchOperand_ParseFail;
+ return parseExpr(IntVal) ? ParseStatus::Success : ParseStatus::Failure;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
- AMDGPUOperand::ImmTy ImmTy,
- bool (*ConvertResult)(int64_t&)) {
+ParseStatus AMDGPUAsmParser::parseIntWithPrefix(
+ const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy,
+ std::function<bool(int64_t &)> ConvertResult) {
SMLoc S = getLoc();
int64_t Value = 0;
- OperandMatchResultTy Res = parseIntWithPrefix(Prefix, Value);
- if (Res != MatchOperand_Success)
+ ParseStatus Res = parseIntWithPrefix(Prefix, Value);
+ if (!Res.isSuccess())
return Res;
if (ConvertResult && !ConvertResult(Value)) {
@@ -5925,20 +5856,18 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
}
Operands.push_back(AMDGPUOperand::CreateImm(this, Value, S, ImmTy));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseOperandArrayWithPrefix(const char *Prefix,
- OperandVector &Operands,
- AMDGPUOperand::ImmTy ImmTy,
- bool (*ConvertResult)(int64_t&)) {
+ParseStatus AMDGPUAsmParser::parseOperandArrayWithPrefix(
+ const char *Prefix, OperandVector &Operands, AMDGPUOperand::ImmTy ImmTy,
+ bool (*ConvertResult)(int64_t &)) {
SMLoc S = getLoc();
if (!trySkipId(Prefix, AsmToken::Colon))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
if (!skipToken(AsmToken::LBrac, "expected a left square bracket"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
unsigned Val = 0;
const unsigned MaxSize = 4;
@@ -5949,34 +5878,30 @@ AMDGPUAsmParser::parseOperandArrayWithPrefix(const char *Prefix,
int64_t Op;
SMLoc Loc = getLoc();
if (!parseExpr(Op))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
- if (Op != 0 && Op != 1) {
- Error(Loc, "invalid " + StringRef(Prefix) + " value.");
- return MatchOperand_ParseFail;
- }
+ if (Op != 0 && Op != 1)
+ return Error(Loc, "invalid " + StringRef(Prefix) + " value.");
Val |= (Op << I);
if (trySkipToken(AsmToken::RBrac))
break;
- if (I + 1 == MaxSize) {
- Error(getLoc(), "expected a closing square bracket");
- return MatchOperand_ParseFail;
- }
+ if (I + 1 == MaxSize)
+ return Error(getLoc(), "expected a closing square bracket");
if (!skipToken(AsmToken::Comma, "expected a comma"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseNamedBit(StringRef Name, OperandVector &Operands,
- AMDGPUOperand::ImmTy ImmTy) {
+ParseStatus AMDGPUAsmParser::parseNamedBit(StringRef Name,
+ OperandVector &Operands,
+ AMDGPUOperand::ImmTy ImmTy) {
int64_t Bit;
SMLoc S = getLoc();
@@ -5985,54 +5910,42 @@ AMDGPUAsmParser::parseNamedBit(StringRef Name, OperandVector &Operands,
} else if (trySkipId("no", Name)) {
Bit = 0;
} else {
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
}
- if (Name == "r128" && !hasMIMG_R128()) {
- Error(S, "r128 modifier is not supported on this GPU");
- return MatchOperand_ParseFail;
- }
- if (Name == "a16" && !hasA16()) {
- Error(S, "a16 modifier is not supported on this GPU");
- return MatchOperand_ParseFail;
- }
+ if (Name == "r128" && !hasMIMG_R128())
+ return Error(S, "r128 modifier is not supported on this GPU");
+ if (Name == "a16" && !hasA16())
+ return Error(S, "a16 modifier is not supported on this GPU");
if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16)
ImmTy = AMDGPUOperand::ImmTyR128A16;
Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
unsigned AMDGPUAsmParser::getCPolKind(StringRef Id, StringRef Mnemo,
bool &Disabling) const {
- Disabling = Id.startswith("no");
+ Disabling = Id.consume_front("no");
if (isGFX940() && !Mnemo.startswith("s_")) {
return StringSwitch<unsigned>(Id)
.Case("nt", AMDGPU::CPol::NT)
- .Case("nont", AMDGPU::CPol::NT)
.Case("sc0", AMDGPU::CPol::SC0)
- .Case("nosc0", AMDGPU::CPol::SC0)
.Case("sc1", AMDGPU::CPol::SC1)
- .Case("nosc1", AMDGPU::CPol::SC1)
.Default(0);
}
return StringSwitch<unsigned>(Id)
.Case("dlc", AMDGPU::CPol::DLC)
- .Case("nodlc", AMDGPU::CPol::DLC)
.Case("glc", AMDGPU::CPol::GLC)
- .Case("noglc", AMDGPU::CPol::GLC)
.Case("scc", AMDGPU::CPol::SCC)
- .Case("noscc", AMDGPU::CPol::SCC)
.Case("slc", AMDGPU::CPol::SLC)
- .Case("noslc", AMDGPU::CPol::SLC)
.Default(0);
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
StringRef Mnemo = ((AMDGPUOperand &)*Operands[0]).getToken();
SMLoc OpLoc = getLoc();
unsigned Enabled = 0, Seen = 0;
@@ -6045,20 +5958,14 @@ AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
lex();
- if (!isGFX10Plus() && CPol == AMDGPU::CPol::DLC) {
- Error(S, "dlc modifier is not supported on this GPU");
- return MatchOperand_ParseFail;
- }
+ if (!isGFX10Plus() && CPol == AMDGPU::CPol::DLC)
+ return Error(S, "dlc modifier is not supported on this GPU");
- if (!isGFX90A() && CPol == AMDGPU::CPol::SCC) {
- Error(S, "scc modifier is not supported on this GPU");
- return MatchOperand_ParseFail;
- }
+ if (!isGFX90A() && CPol == AMDGPU::CPol::SCC)
+ return Error(S, "scc modifier is not supported on this GPU");
- if (Seen & CPol) {
- Error(S, "duplicate cache policy modifier");
- return MatchOperand_ParseFail;
- }
+ if (Seen & CPol)
+ return Error(S, "duplicate cache policy modifier");
if (!Disabling)
Enabled |= CPol;
@@ -6067,11 +5974,11 @@ AMDGPUAsmParser::parseCPol(OperandVector &Operands) {
}
if (!Seen)
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
Operands.push_back(
AMDGPUOperand::CreateImm(this, Enabled, OpLoc, AMDGPUOperand::ImmTyCPol));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
static void addOptionalImmOperand(
@@ -6088,16 +5995,15 @@ static void addOptionalImmOperand(
}
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix,
- StringRef &Value,
- SMLoc &StringLoc) {
+ParseStatus AMDGPUAsmParser::parseStringWithPrefix(StringRef Prefix,
+ StringRef &Value,
+ SMLoc &StringLoc) {
if (!trySkipId(Prefix, AsmToken::Colon))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
StringLoc = getLoc();
- return parseId(Value, "expected an identifier") ? MatchOperand_Success
- : MatchOperand_ParseFail;
+ return parseId(Value, "expected an identifier") ? ParseStatus::Success
+ : ParseStatus::Failure;
}
//===----------------------------------------------------------------------===//
@@ -6111,9 +6017,9 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref,
SMLoc Loc = getLoc();
auto Res = parseIntWithPrefix(Pref, Val);
- if (Res == MatchOperand_ParseFail)
+ if (Res.isFailure())
return false;
- if (Res == MatchOperand_NoMatch)
+ if (Res.isNoMatch())
return true;
if (Val < 0 || Val > MaxVal) {
@@ -6127,8 +6033,7 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref,
// dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their
// values to live in a joint format operand in the MCInst encoding.
-OperandMatchResultTy
-AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
+ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
using namespace llvm::AMDGPU::MTBUFFormat;
int64_t Dfmt = DFMT_UNDEF;
@@ -6137,11 +6042,11 @@ AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
// dfmt and nfmt can appear in either order, and each is optional.
for (int I = 0; I < 2; ++I) {
if (Dfmt == DFMT_UNDEF && !tryParseFmt("dfmt", DFMT_MAX, Dfmt))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
+
+ if (Nfmt == NFMT_UNDEF && !tryParseFmt("nfmt", NFMT_MAX, Nfmt))
+ return ParseStatus::Failure;
- if (Nfmt == NFMT_UNDEF && !tryParseFmt("nfmt", NFMT_MAX, Nfmt)) {
- return MatchOperand_ParseFail;
- }
// Skip optional comma between dfmt/nfmt
// but guard against 2 commas following each other.
if ((Dfmt == DFMT_UNDEF) != (Nfmt == NFMT_UNDEF) &&
@@ -6151,29 +6056,28 @@ AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) {
}
if (Dfmt == DFMT_UNDEF && Nfmt == NFMT_UNDEF)
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
Dfmt = (Dfmt == DFMT_UNDEF) ? DFMT_DEFAULT : Dfmt;
Nfmt = (Nfmt == NFMT_UNDEF) ? NFMT_DEFAULT : Nfmt;
Format = encodeDfmtNfmt(Dfmt, Nfmt);
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseUfmt(int64_t &Format) {
+ParseStatus AMDGPUAsmParser::parseUfmt(int64_t &Format) {
using namespace llvm::AMDGPU::MTBUFFormat;
int64_t Fmt = UFMT_UNDEF;
if (!tryParseFmt("format", UFMT_MAX, Fmt))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
if (Fmt == UFMT_UNDEF)
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
Format = Fmt;
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
bool AMDGPUAsmParser::matchDfmtNfmt(int64_t &Dfmt,
@@ -6199,31 +6103,26 @@ bool AMDGPUAsmParser::matchDfmtNfmt(int64_t &Dfmt,
return false;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr,
- SMLoc FormatLoc,
- int64_t &Format) {
+ParseStatus AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr,
+ SMLoc FormatLoc,
+ int64_t &Format) {
using namespace llvm::AMDGPU::MTBUFFormat;
int64_t Dfmt = DFMT_UNDEF;
int64_t Nfmt = NFMT_UNDEF;
if (!matchDfmtNfmt(Dfmt, Nfmt, FormatStr, FormatLoc))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
if (trySkipToken(AsmToken::Comma)) {
StringRef Str;
SMLoc Loc = getLoc();
if (!parseId(Str, "expected a format string") ||
- !matchDfmtNfmt(Dfmt, Nfmt, Str, Loc)) {
- return MatchOperand_ParseFail;
- }
- if (Dfmt == DFMT_UNDEF) {
- Error(Loc, "duplicate numeric format");
- return MatchOperand_ParseFail;
- } else if (Nfmt == NFMT_UNDEF) {
- Error(Loc, "duplicate data format");
- return MatchOperand_ParseFail;
- }
+ !matchDfmtNfmt(Dfmt, Nfmt, Str, Loc))
+ return ParseStatus::Failure;
+ if (Dfmt == DFMT_UNDEF)
+ return Error(Loc, "duplicate numeric format");
+ if (Nfmt == NFMT_UNDEF)
+ return Error(Loc, "duplicate data format");
}
Dfmt = (Dfmt == DFMT_UNDEF) ? DFMT_DEFAULT : Dfmt;
@@ -6231,94 +6130,84 @@ AMDGPUAsmParser::parseSymbolicSplitFormat(StringRef FormatStr,
if (isGFX10Plus()) {
auto Ufmt = convertDfmtNfmt2Ufmt(Dfmt, Nfmt, getSTI());
- if (Ufmt == UFMT_UNDEF) {
- Error(FormatLoc, "unsupported format");
- return MatchOperand_ParseFail;
- }
+ if (Ufmt == UFMT_UNDEF)
+ return Error(FormatLoc, "unsupported format");
Format = Ufmt;
} else {
Format = encodeDfmtNfmt(Dfmt, Nfmt);
}
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseSymbolicUnifiedFormat(StringRef FormatStr,
- SMLoc Loc,
- int64_t &Format) {
+ParseStatus AMDGPUAsmParser::parseSymbolicUnifiedFormat(StringRef FormatStr,
+ SMLoc Loc,
+ int64_t &Format) {
using namespace llvm::AMDGPU::MTBUFFormat;
auto Id = getUnifiedFormat(FormatStr, getSTI());
if (Id == UFMT_UNDEF)
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
- if (!isGFX10Plus()) {
- Error(Loc, "unified format is not supported on this GPU");
- return MatchOperand_ParseFail;
- }
+ if (!isGFX10Plus())
+ return Error(Loc, "unified format is not supported on this GPU");
Format = Id;
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseNumericFormat(int64_t &Format) {
+ParseStatus AMDGPUAsmParser::parseNumericFormat(int64_t &Format) {
using namespace llvm::AMDGPU::MTBUFFormat;
SMLoc Loc = getLoc();
if (!parseExpr(Format))
- return MatchOperand_ParseFail;
- if (!isValidFormatEncoding(Format, getSTI())) {
- Error(Loc, "out of range format");
- return MatchOperand_ParseFail;
- }
+ return ParseStatus::Failure;
+ if (!isValidFormatEncoding(Format, getSTI()))
+ return Error(Loc, "out of range format");
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseSymbolicOrNumericFormat(int64_t &Format) {
+ParseStatus AMDGPUAsmParser::parseSymbolicOrNumericFormat(int64_t &Format) {
using namespace llvm::AMDGPU::MTBUFFormat;
if (!trySkipId("format", AsmToken::Colon))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
if (trySkipToken(AsmToken::LBrac)) {
StringRef FormatStr;
SMLoc Loc = getLoc();
if (!parseId(FormatStr, "expected a format string"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
auto Res = parseSymbolicUnifiedFormat(FormatStr, Loc, Format);
- if (Res == MatchOperand_NoMatch)
+ if (Res.isNoMatch())
Res = parseSymbolicSplitFormat(FormatStr, Loc, Format);
- if (Res != MatchOperand_Success)
+ if (!Res.isSuccess())
return Res;
if (!skipToken(AsmToken::RBrac, "expected a closing square bracket"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
return parseNumericFormat(Format);
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseFORMAT(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseFORMAT(OperandVector &Operands) {
using namespace llvm::AMDGPU::MTBUFFormat;
int64_t Format = getDefaultFormatEncoding(getSTI());
- OperandMatchResultTy Res;
+ ParseStatus Res;
SMLoc Loc = getLoc();
// Parse legacy format syntax.
Res = isGFX10Plus() ? parseUfmt(Format) : parseDfmtNfmt(Format);
- if (Res == MatchOperand_ParseFail)
+ if (Res.isFailure())
return Res;
- bool FormatFound = (Res == MatchOperand_Success);
+ bool FormatFound = Res.isSuccess();
Operands.push_back(
AMDGPUOperand::CreateImm(this, Format, Loc, AMDGPUOperand::ImmTyFORMAT));
@@ -6329,124 +6218,65 @@ AMDGPUAsmParser::parseFORMAT(OperandVector &Operands) {
if (isToken(AsmToken::EndOfStatement)) {
// We are expecting an soffset operand,
// but let matcher handle the error.
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
// Parse soffset.
Res = parseRegOrImm(Operands);
- if (Res != MatchOperand_Success)
+ if (!Res.isSuccess())
return Res;
trySkipToken(AsmToken::Comma);
if (!FormatFound) {
Res = parseSymbolicOrNumericFormat(Format);
- if (Res == MatchOperand_ParseFail)
+ if (Res.isFailure())
return Res;
- if (Res == MatchOperand_Success) {
+ if (Res.isSuccess()) {
auto Size = Operands.size();
AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands[Size - 2]);
assert(Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyFORMAT);
Op.setImm(Format);
}
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
- if (isId("format") && peekToken().is(AsmToken::Colon)) {
- Error(getLoc(), "duplicate format");
- return MatchOperand_ParseFail;
- }
- return MatchOperand_Success;
+ if (isId("format") && peekToken().is(AsmToken::Colon))
+ return Error(getLoc(), "duplicate format");
+ return ParseStatus::Success;
}
-OperandMatchResultTy AMDGPUAsmParser::parseFlatOffset(OperandVector &Operands) {
- OperandMatchResultTy Res =
+ParseStatus AMDGPUAsmParser::parseFlatOffset(OperandVector &Operands) {
+ ParseStatus Res =
parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset);
- if (Res == MatchOperand_NoMatch) {
+ if (Res.isNoMatch()) {
Res = parseIntWithPrefix("inst_offset", Operands,
AMDGPUOperand::ImmTyInstOffset);
}
return Res;
}
-OperandMatchResultTy AMDGPUAsmParser::parseR128A16(OperandVector &Operands) {
- OperandMatchResultTy Res =
+ParseStatus AMDGPUAsmParser::parseR128A16(OperandVector &Operands) {
+ ParseStatus Res =
parseNamedBit("r128", Operands, AMDGPUOperand::ImmTyR128A16);
- if (Res == MatchOperand_NoMatch)
+ if (Res.isNoMatch())
Res = parseNamedBit("a16", Operands, AMDGPUOperand::ImmTyA16);
return Res;
}
-//===----------------------------------------------------------------------===//
-// ds
-//===----------------------------------------------------------------------===//
-
-void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
- const OperandVector &Operands) {
- OptionalImmIndexMap OptionalIdx;
-
- for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
-
- // Add the register arguments
- if (Op.isReg()) {
- Op.addRegOperands(Inst, 1);
- continue;
- }
-
- // Handle optional arguments
- OptionalIdx[Op.getImmTy()] = i;
+ParseStatus AMDGPUAsmParser::parseBLGP(OperandVector &Operands) {
+ ParseStatus Res =
+ parseIntWithPrefix("blgp", Operands, AMDGPUOperand::ImmTyBLGP);
+ if (Res.isNoMatch()) {
+ Res =
+ parseOperandArrayWithPrefix("neg", Operands, AMDGPUOperand::ImmTyBLGP);
}
-
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset0);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset1);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
-
- Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
+ return Res;
}
-void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
- bool IsGdsHardcoded) {
- OptionalImmIndexMap OptionalIdx;
- const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
- AMDGPUOperand::ImmTy OffsetType = AMDGPUOperand::ImmTyOffset;
-
- for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
-
- auto TiedTo =
- Desc.getOperandConstraint(Inst.getNumOperands(), MCOI::TIED_TO);
-
- if (TiedTo != -1) {
- assert((unsigned)TiedTo < Inst.getNumOperands());
- Inst.addOperand(Inst.getOperand(TiedTo));
- }
-
- // Add the register arguments
- if (Op.isReg()) {
- Op.addRegOperands(Inst, 1);
- continue;
- }
-
- if (Op.isToken() && Op.getToken() == "gds") {
- IsGdsHardcoded = true;
- continue;
- }
-
- // Handle optional arguments
- OptionalIdx[Op.getImmTy()] = i;
-
- if (Op.getImmTy() == AMDGPUOperand::ImmTySwizzle)
- OffsetType = AMDGPUOperand::ImmTySwizzle;
- }
-
- addOptionalImmOperand(Inst, Operands, OptionalIdx, OffsetType);
-
- if (!IsGdsHardcoded) {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
- }
- Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
-}
+//===----------------------------------------------------------------------===//
+// Exp
+//===----------------------------------------------------------------------===//
void AMDGPUAsmParser::cvtExp(MCInst &Inst, const OperandVector &Operands) {
OptionalImmIndexMap OptionalIdx;
@@ -6583,8 +6413,7 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
return true;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseSWaitCnt(OperandVector &Operands) {
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
int64_t Waitcnt = getWaitcntBitMask(ISA);
SMLoc S = getLoc();
@@ -6592,15 +6421,15 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) {
while (!isToken(AsmToken::EndOfStatement)) {
if (!parseCnt(Waitcnt))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
} else {
if (!parseExpr(Waitcnt))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
Operands.push_back(AMDGPUOperand::CreateImm(this, Waitcnt, S));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
bool AMDGPUAsmParser::parseDelay(int64_t &Delay) {
@@ -6665,23 +6494,22 @@ bool AMDGPUAsmParser::parseDelay(int64_t &Delay) {
return true;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseSDelayAluOps(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseSDelayALU(OperandVector &Operands) {
int64_t Delay = 0;
SMLoc S = getLoc();
if (isToken(AsmToken::Identifier) && peekToken().is(AsmToken::LParen)) {
do {
if (!parseDelay(Delay))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
} while (trySkipToken(AsmToken::Pipe));
} else {
if (!parseExpr(Delay))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
Operands.push_back(AMDGPUOperand::CreateImm(this, Delay, S));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
bool
@@ -6689,7 +6517,7 @@ AMDGPUOperand::isSWaitCnt() const {
return isImm();
}
-bool AMDGPUOperand::isSDelayAlu() const { return isImm(); }
+bool AMDGPUOperand::isSDelayALU() const { return isImm(); }
//===----------------------------------------------------------------------===//
// DepCtr
@@ -6753,7 +6581,7 @@ bool AMDGPUAsmParser::parseDepCtr(int64_t &DepCtr, unsigned &UsedOprMask) {
return true;
}
-OperandMatchResultTy AMDGPUAsmParser::parseDepCtrOps(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseDepCtr(OperandVector &Operands) {
using namespace llvm::AMDGPU::DepCtr;
int64_t DepCtr = getDefaultDepCtrEncoding(getSTI());
@@ -6763,15 +6591,15 @@ OperandMatchResultTy AMDGPUAsmParser::parseDepCtrOps(OperandVector &Operands) {
unsigned UsedOprMask = 0;
while (!isToken(AsmToken::EndOfStatement)) {
if (!parseDepCtr(DepCtr, UsedOprMask))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
} else {
if (!parseExpr(DepCtr))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
Operands.push_back(AMDGPUOperand::CreateImm(this, DepCtr, Loc));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
bool AMDGPUOperand::isDepCtr() const { return isS16Imm(); }
@@ -6847,8 +6675,7 @@ AMDGPUAsmParser::validateHwreg(const OperandInfoTy &HwReg,
return true;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
using namespace llvm::AMDGPU::Hwreg;
int64_t ImmVal = 0;
@@ -6862,19 +6689,17 @@ AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
validateHwreg(HwReg, Offset, Width)) {
ImmVal = encodeHwreg(HwReg.Id, Offset.Id, Width.Id);
} else {
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
} else if (parseExpr(ImmVal, "a hwreg macro")) {
- if (ImmVal < 0 || !isUInt<16>(ImmVal)) {
- Error(Loc, "invalid immediate: only 16-bit values are legal");
- return MatchOperand_ParseFail;
- }
+ if (ImmVal < 0 || !isUInt<16>(ImmVal))
+ return Error(Loc, "invalid immediate: only 16-bit values are legal");
} else {
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTyHwreg));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
bool AMDGPUOperand::isHwreg() const {
@@ -6967,8 +6792,7 @@ AMDGPUAsmParser::validateSendMsg(const OperandInfoTy &Msg,
return true;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseSendMsg(OperandVector &Operands) {
using namespace llvm::AMDGPU::SendMsg;
int64_t ImmVal = 0;
@@ -6982,19 +6806,17 @@ AMDGPUAsmParser::parseSendMsgOp(OperandVector &Operands) {
validateSendMsg(Msg, Op, Stream)) {
ImmVal = encodeMsg(Msg.Id, Op.Id, Stream.Id);
} else {
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
} else if (parseExpr(ImmVal, "a sendmsg macro")) {
- if (ImmVal < 0 || !isUInt<16>(ImmVal)) {
- Error(Loc, "invalid immediate: only 16-bit values are legal");
- return MatchOperand_ParseFail;
- }
+ if (ImmVal < 0 || !isUInt<16>(ImmVal))
+ return Error(Loc, "invalid immediate: only 16-bit values are legal");
} else {
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
}
Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, AMDGPUOperand::ImmTySendMsg));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
bool AMDGPUOperand::isSendMsg() const {
@@ -7005,12 +6827,12 @@ bool AMDGPUOperand::isSendMsg() const {
// v_interp
//===----------------------------------------------------------------------===//
-OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) {
StringRef Str;
SMLoc S = getLoc();
if (!parseId(Str))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
int Slot = StringSwitch<int>(Str)
.Case("p10", 0)
@@ -7018,27 +6840,23 @@ OperandMatchResultTy AMDGPUAsmParser::parseInterpSlot(OperandVector &Operands) {
.Case("p0", 2)
.Default(-1);
- if (Slot == -1) {
- Error(S, "invalid interpolation slot");
- return MatchOperand_ParseFail;
- }
+ if (Slot == -1)
+ return Error(S, "invalid interpolation slot");
Operands.push_back(AMDGPUOperand::CreateImm(this, Slot, S,
AMDGPUOperand::ImmTyInterpSlot));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
StringRef Str;
SMLoc S = getLoc();
if (!parseId(Str))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
- if (!Str.startswith("attr")) {
- Error(S, "invalid interpolation attribute");
- return MatchOperand_ParseFail;
- }
+ if (!Str.startswith("attr"))
+ return Error(S, "invalid interpolation attribute");
StringRef Chan = Str.take_back(2);
int AttrChan = StringSwitch<int>(Chan)
@@ -7047,57 +6865,49 @@ OperandMatchResultTy AMDGPUAsmParser::parseInterpAttr(OperandVector &Operands) {
.Case(".z", 2)
.Case(".w", 3)
.Default(-1);
- if (AttrChan == -1) {
- Error(S, "invalid or missing interpolation attribute channel");
- return MatchOperand_ParseFail;
- }
+ if (AttrChan == -1)
+ return Error(S, "invalid or missing interpolation attribute channel");
Str = Str.drop_back(2).drop_front(4);
uint8_t Attr;
- if (Str.getAsInteger(10, Attr)) {
- Error(S, "invalid or missing interpolation attribute number");
- return MatchOperand_ParseFail;
- }
+ if (Str.getAsInteger(10, Attr))
+ return Error(S, "invalid or missing interpolation attribute number");
- if (Attr > 63) {
- Error(S, "out of bounds interpolation attribute number");
- return MatchOperand_ParseFail;
- }
+ if (Attr > 32)
+ return Error(S, "out of bounds interpolation attribute number");
SMLoc SChan = SMLoc::getFromPointer(Chan.data());
Operands.push_back(AMDGPUOperand::CreateImm(this, Attr, S,
AMDGPUOperand::ImmTyInterpAttr));
- Operands.push_back(AMDGPUOperand::CreateImm(this, AttrChan, SChan,
- AMDGPUOperand::ImmTyAttrChan));
- return MatchOperand_Success;
+ Operands.push_back(AMDGPUOperand::CreateImm(
+ this, AttrChan, SChan, AMDGPUOperand::ImmTyInterpAttrChan));
+ return ParseStatus::Success;
}
//===----------------------------------------------------------------------===//
// exp
//===----------------------------------------------------------------------===//
-OperandMatchResultTy AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseExpTgt(OperandVector &Operands) {
using namespace llvm::AMDGPU::Exp;
StringRef Str;
SMLoc S = getLoc();
if (!parseId(Str))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
unsigned Id = getTgtId(Str);
- if (Id == ET_INVALID || !isSupportedTgtId(Id, getSTI())) {
- Error(S, (Id == ET_INVALID) ?
- "invalid exp target" :
- "exp target is not supported on this GPU");
- return MatchOperand_ParseFail;
- }
+ if (Id == ET_INVALID || !isSupportedTgtId(Id, getSTI()))
+ return Error(S, (Id == ET_INVALID)
+ ? "invalid exp target"
+ : "exp target is not supported on this GPU");
Operands.push_back(AMDGPUOperand::CreateImm(this, Id, S,
AMDGPUOperand::ImmTyExpTgt));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
//===----------------------------------------------------------------------===//
@@ -7562,8 +7372,7 @@ AMDGPUAsmParser::parseSwizzleMacro(int64_t &Imm) {
return false;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseSwizzle(OperandVector &Operands) {
SMLoc S = getLoc();
int64_t Imm = 0;
@@ -7580,9 +7389,9 @@ AMDGPUAsmParser::parseSwizzleOp(OperandVector &Operands) {
Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTySwizzle));
- return Ok ? MatchOperand_Success : MatchOperand_ParseFail;
+ return Ok ? ParseStatus::Success : ParseStatus::Failure;
}
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
}
bool
@@ -7638,8 +7447,7 @@ int64_t AMDGPUAsmParser::parseGPRIdxMacro() {
return Imm;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) {
using namespace llvm::AMDGPU::VGPRIndexMode;
@@ -7649,19 +7457,17 @@ AMDGPUAsmParser::parseGPRIdxMode(OperandVector &Operands) {
if (trySkipId("gpr_idx", AsmToken::LParen)) {
Imm = parseGPRIdxMacro();
if (Imm == UNDEF)
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
} else {
if (getParser().parseAbsoluteExpression(Imm))
- return MatchOperand_ParseFail;
- if (Imm < 0 || !isUInt<4>(Imm)) {
- Error(S, "invalid immediate: only 4-bit values are legal");
- return MatchOperand_ParseFail;
- }
+ return ParseStatus::Failure;
+ if (Imm < 0 || !isUInt<4>(Imm))
+ return Error(S, "invalid immediate: only 4-bit values are legal");
}
Operands.push_back(
AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyGprIdxMode));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
bool AMDGPUOperand::isGPRIdxMode() const {
@@ -7672,17 +7478,16 @@ bool AMDGPUOperand::isGPRIdxMode() const {
// sopp branch targets
//===----------------------------------------------------------------------===//
-OperandMatchResultTy
-AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseSOPPBrTarget(OperandVector &Operands) {
// Make sure we are not parsing something
// that looks like a label or an expression but is not.
// This will improve error messages.
if (isRegister() || isModifier())
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
if (!parseExpr(Operands))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
AMDGPUOperand &Opr = ((AMDGPUOperand &)*Operands[Operands.size() - 1]);
assert(Opr.isImm() || Opr.isExpr());
@@ -7696,15 +7501,14 @@ AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
Error(Loc, "expected a 16-bit signed jump offset");
}
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
//===----------------------------------------------------------------------===//
// Boolean holding registers
//===----------------------------------------------------------------------===//
-OperandMatchResultTy
-AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) {
return parseReg(Operands);
}
@@ -7712,10 +7516,6 @@ AMDGPUAsmParser::parseBoolReg(OperandVector &Operands) {
// mubuf
//===----------------------------------------------------------------------===//
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCPol() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCPol);
-}
-
void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
const OperandVector &Operands,
bool IsAtomic) {
@@ -7775,100 +7575,12 @@ void AMDGPUAsmParser::cvtMubufImpl(MCInst &Inst,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ);
-}
-
-void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) {
- OptionalImmIndexMap OptionalIdx;
-
- for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
-
- // Add the register arguments
- if (Op.isReg()) {
- Op.addRegOperands(Inst, 1);
- continue;
- }
-
- // Handle the case where soffset is an immediate
- if (Op.isImm() && Op.getImmTy() == AMDGPUOperand::ImmTyNone) {
- Op.addImmOperands(Inst, 1);
- continue;
- }
-
- // Handle tokens like 'offen' which are sometimes hard-coded into the
- // asm string. There are no MCInst operands for these.
- if (Op.isToken()) {
- continue;
- }
- assert(Op.isImm());
-
- // Handle optional arguments
- OptionalIdx[Op.getImmTy()] = i;
- }
-
- addOptionalImmOperand(Inst, Operands, OptionalIdx,
- AMDGPUOperand::ImmTyOffset);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyFORMAT);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySWZ);
}
//===----------------------------------------------------------------------===//
-// mimg
+// SMEM
//===----------------------------------------------------------------------===//
-void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
- bool IsAtomic) {
- unsigned I = 1;
- const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
- for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
- ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
- }
-
- if (IsAtomic) {
- // Add src, same as dst
- assert(Desc.getNumDefs() == 1);
- ((AMDGPUOperand &)*Operands[I - 1]).addRegOperands(Inst, 1);
- }
-
- OptionalImmIndexMap OptionalIdx;
-
- for (unsigned E = Operands.size(); I != E; ++I) {
- AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
-
- // Add the register arguments
- if (Op.isReg()) {
- Op.addRegOperands(Inst, 1);
- } else if (Op.isImmModifier()) {
- OptionalIdx[Op.getImmTy()] = I;
- } else if (!Op.isToken()) {
- llvm_unreachable("unexpected operand type");
- }
- }
-
- bool IsGFX10Plus = isGFX10Plus();
-
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask);
- if (IsGFX10Plus)
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDim, -1);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
- if (IsGFX10Plus)
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16);
- if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::tfe))
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
- if (!IsGFX10Plus)
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyD16);
-}
-
-void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) {
- cvtMIMG(Inst, Operands, true);
-}
-
void AMDGPUAsmParser::cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands) {
OptionalImmIndexMap OptionalIdx;
bool IsAtomicReturn = false;
@@ -7920,54 +7632,28 @@ void AMDGPUAsmParser::cvtSMEMAtomic(MCInst &Inst, const OperandVector &Operands)
if ((int)Inst.getNumOperands() <=
AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::offset))
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTySMEMOffsetMod);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyCPol, 0);
}
-void AMDGPUAsmParser::cvtIntersectRay(MCInst &Inst,
- const OperandVector &Operands) {
- for (unsigned I = 1; I < Operands.size(); ++I) {
- auto &Operand = (AMDGPUOperand &)*Operands[I];
- if (Operand.isReg())
- Operand.addRegOperands(Inst, 1);
- }
-
- Inst.addOperand(MCOperand::createImm(1)); // a16
-}
-
//===----------------------------------------------------------------------===//
// smrd
//===----------------------------------------------------------------------===//
bool AMDGPUOperand::isSMRDOffset8() const {
- return isImm() && isUInt<8>(getImm());
+ return isImmLiteral() && isUInt<8>(getImm());
}
bool AMDGPUOperand::isSMEMOffset() const {
- return isImmTy(ImmTyNone) ||
- isImmTy(ImmTyOffset); // Offset range is checked later by validator.
+ // Offset range is checked later by validator.
+ return isImmLiteral();
}
bool AMDGPUOperand::isSMRDLiteralOffset() const {
// 32-bit literals are only supported on CI and we only want to use them
// when the offset is > 8-bits.
- return isImm() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset8() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMEMOffset() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDLiteralOffset() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFlatOffset() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
+ return isImmLiteral() && !isUInt<8>(getImm()) && isUInt<32>(getImm());
}
//===----------------------------------------------------------------------===//
@@ -7996,12 +7682,13 @@ static bool ConvertOmodDiv(int64_t &Div) {
return false;
}
-// Both bound_ctrl:0 and bound_ctrl:1 are encoded as 1.
+// For pre-gfx11 targets, both bound_ctrl:0 and bound_ctrl:1 are encoded as 1.
// This is intentional and ensures compatibility with sp3.
// See bug 35397 for details.
-static bool ConvertDppBoundCtrl(int64_t &BoundCtrl) {
+bool AMDGPUAsmParser::convertDppBoundCtrl(int64_t &BoundCtrl) {
if (BoundCtrl == 0 || BoundCtrl == 1) {
- BoundCtrl = 1;
+ if (!isGFX11Plus())
+ BoundCtrl = 1;
return true;
}
return false;
@@ -8013,13 +7700,15 @@ void AMDGPUAsmParser::onBeginOfFile() {
return;
if (!getTargetStreamer().getTargetID())
- getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString());
+ getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString(),
+ // TODO: Should try to check code object version from directive???
+ AMDGPU::getAmdhsaCodeObjectVersion());
if (isHsaAbiVersion3AndAbove(&getSTI()))
getTargetStreamer().EmitDirectiveAMDGCNTarget();
}
-OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseOModSI(OperandVector &Operands) {
StringRef Name = getTokenStr();
if (Name == "mul") {
return parseIntWithPrefix("mul", Operands,
@@ -8031,7 +7720,7 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands)
AMDGPUOperand::ImmTyOModSI, ConvertOmodDiv);
}
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
}
// Determines which bit DST_OP_SEL occupies in the op_sel operand according to
@@ -8100,9 +7789,8 @@ void AMDGPUAsmParser::cvtVOP3Interp(MCInst &Inst, const OperandVector &Operands)
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
- } else if (Op.isInterpSlot() ||
- Op.isInterpAttr() ||
- Op.isAttrChan()) {
+ } else if (Op.isInterpSlot() || Op.isInterpAttr() ||
+ Op.isInterpAttrChan()) {
Inst.addOperand(MCOperand::createImm(Op.getImm()));
} else if (Op.isImmModifier()) {
OptionalIdx[Op.getImmTy()] = I;
@@ -8335,9 +8023,9 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
// VOPD
//===----------------------------------------------------------------------===//
-OperandMatchResultTy AMDGPUAsmParser::parseVOPD(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseVOPD(OperandVector &Operands) {
if (!hasVOPD(getSTI()))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
if (isToken(AsmToken::Colon) && peekToken(false).is(AsmToken::Colon)) {
SMLoc S = getLoc();
@@ -8348,12 +8036,11 @@ OperandMatchResultTy AMDGPUAsmParser::parseVOPD(OperandVector &Operands) {
StringRef OpYName;
if (isToken(AsmToken::Identifier) && !Parser.parseIdentifier(OpYName)) {
Operands.push_back(AMDGPUOperand::CreateToken(this, OpYName, OpYLoc));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
- Error(OpYLoc, "expected a VOPDY instruction after ::");
- return MatchOperand_ParseFail;
+ return Error(OpYLoc, "expected a VOPDY instruction after ::");
}
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
}
// Create VOPD MCInst operands using parsed assembler operands.
@@ -8439,11 +8126,11 @@ bool AMDGPUOperand::isABID() const {
}
bool AMDGPUOperand::isS16Imm() const {
- return isImm() && (isInt<16>(getImm()) || isUInt<16>(getImm()));
+ return isImmLiteral() && (isInt<16>(getImm()) || isUInt<16>(getImm()));
}
bool AMDGPUOperand::isU16Imm() const {
- return isImm() && isUInt<16>(getImm());
+ return isImmLiteral() && isUInt<16>(getImm());
}
//===----------------------------------------------------------------------===//
@@ -8479,66 +8166,62 @@ bool AMDGPUAsmParser::parseDimId(unsigned &Encoding) {
return true;
}
-OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseDim(OperandVector &Operands) {
if (!isGFX10Plus())
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
SMLoc S = getLoc();
if (!trySkipId("dim", AsmToken::Colon))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
unsigned Encoding;
SMLoc Loc = getLoc();
- if (!parseDimId(Encoding)) {
- Error(Loc, "invalid dim value");
- return MatchOperand_ParseFail;
- }
+ if (!parseDimId(Encoding))
+ return Error(Loc, "invalid dim value");
Operands.push_back(AMDGPUOperand::CreateImm(this, Encoding, S,
AMDGPUOperand::ImmTyDim));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
//===----------------------------------------------------------------------===//
// dpp
//===----------------------------------------------------------------------===//
-OperandMatchResultTy AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseDPP8(OperandVector &Operands) {
SMLoc S = getLoc();
if (!isGFX10Plus() || !trySkipId("dpp8", AsmToken::Colon))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
// dpp8:[%d,%d,%d,%d,%d,%d,%d,%d]
int64_t Sels[8];
if (!skipToken(AsmToken::LBrac, "expected an opening square bracket"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
for (size_t i = 0; i < 8; ++i) {
if (i > 0 && !skipToken(AsmToken::Comma, "expected a comma"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
SMLoc Loc = getLoc();
if (getParser().parseAbsoluteExpression(Sels[i]))
- return MatchOperand_ParseFail;
- if (0 > Sels[i] || 7 < Sels[i]) {
- Error(Loc, "expected a 3-bit value");
- return MatchOperand_ParseFail;
- }
+ return ParseStatus::Failure;
+ if (0 > Sels[i] || 7 < Sels[i])
+ return Error(Loc, "expected a 3-bit value");
}
if (!skipToken(AsmToken::RBrac, "expected a closing square bracket"))
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
unsigned DPP8 = 0;
for (size_t i = 0; i < 8; ++i)
DPP8 |= (Sels[i] << (i * 3));
Operands.push_back(AMDGPUOperand::CreateImm(this, DPP8, S, AMDGPUOperand::ImmTyDPP8));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
bool
@@ -8644,13 +8327,12 @@ AMDGPUAsmParser::parseDPPCtrlSel(StringRef Ctrl) {
return Val;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
using namespace AMDGPU::DPP;
if (!isToken(AsmToken::Identifier) ||
!isSupportedDPPCtrl(getTokenStr(), Operands))
- return MatchOperand_NoMatch;
+ return ParseStatus::NoMatch;
SMLoc S = getLoc();
int64_t Val = -1;
@@ -8673,31 +8355,11 @@ AMDGPUAsmParser::parseDPPCtrl(OperandVector &Operands) {
}
if (Val == -1)
- return MatchOperand_ParseFail;
+ return ParseStatus::Failure;
Operands.push_back(
AMDGPUOperand::CreateImm(this, Val, S, AMDGPUOperand::ImmTyDppCtrl));
- return MatchOperand_Success;
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultRowMask() const {
- return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppRowMask);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultEndpgmImmOperands() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyEndpgm);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBankMask() const {
- return AMDGPUOperand::CreateImm(this, 0xf, SMLoc(), AMDGPUOperand::ImmTyDppBankMask);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDppBoundCtrl() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppBoundCtrl);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultFI() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyDppFi);
+ return ParseStatus::Success;
}
void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
@@ -8744,7 +8406,7 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
}
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
// Add the register arguments
- if (IsDPP8 && Op.isFI()) {
+ if (IsDPP8 && Op.isDppFI()) {
Fi = Op.getImm();
} else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
@@ -8786,7 +8448,7 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::fi))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
- AMDGPUOperand::ImmTyDppFi);
+ AMDGPUOperand::ImmTyDppFI);
}
}
@@ -8821,7 +8483,7 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I
Op.addImmOperands(Inst, 1);
} else if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
Op.addRegWithFPInputModsOperands(Inst, 2);
- } else if (Op.isFI()) {
+ } else if (Op.isDppFI()) {
Fi = Op.getImm();
} else if (Op.isReg()) {
Op.addRegOperands(Inst, 1);
@@ -8852,7 +8514,8 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBankMask, 0xf);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppBoundCtrl);
if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::fi)) {
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDppFi);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx,
+ AMDGPUOperand::ImmTyDppFI);
}
}
}
@@ -8861,20 +8524,18 @@ void AMDGPUAsmParser::cvtDPP(MCInst &Inst, const OperandVector &Operands, bool I
// sdwa
//===----------------------------------------------------------------------===//
-OperandMatchResultTy
-AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
- AMDGPUOperand::ImmTy Type) {
+ParseStatus AMDGPUAsmParser::parseSDWASel(OperandVector &Operands,
+ StringRef Prefix,
+ AMDGPUOperand::ImmTy Type) {
using namespace llvm::AMDGPU::SDWA;
SMLoc S = getLoc();
StringRef Value;
- OperandMatchResultTy res;
SMLoc StringLoc;
- res = parseStringWithPrefix(Prefix, Value, StringLoc);
- if (res != MatchOperand_Success) {
- return res;
- }
+ ParseStatus Res = parseStringWithPrefix(Prefix, Value, StringLoc);
+ if (!Res.isSuccess())
+ return Res;
int64_t Int;
Int = StringSwitch<int64_t>(Value)
@@ -8887,28 +8548,23 @@ AMDGPUAsmParser::parseSDWASel(OperandVector &Operands, StringRef Prefix,
.Case("DWORD", SdwaSel::DWORD)
.Default(0xffffffff);
- if (Int == 0xffffffff) {
- Error(StringLoc, "invalid " + Twine(Prefix) + " value");
- return MatchOperand_ParseFail;
- }
+ if (Int == 0xffffffff)
+ return Error(StringLoc, "invalid " + Twine(Prefix) + " value");
Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, Type));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
-OperandMatchResultTy
-AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
using namespace llvm::AMDGPU::SDWA;
SMLoc S = getLoc();
StringRef Value;
- OperandMatchResultTy res;
SMLoc StringLoc;
- res = parseStringWithPrefix("dst_unused", Value, StringLoc);
- if (res != MatchOperand_Success) {
- return res;
- }
+ ParseStatus Res = parseStringWithPrefix("dst_unused", Value, StringLoc);
+ if (!Res.isSuccess())
+ return Res;
int64_t Int;
Int = StringSwitch<int64_t>(Value)
@@ -8917,13 +8573,11 @@ AMDGPUAsmParser::parseSDWADstUnused(OperandVector &Operands) {
.Case("UNUSED_PRESERVE", DstUnused::UNUSED_PRESERVE)
.Default(0xffffffff);
- if (Int == 0xffffffff) {
- Error(StringLoc, "invalid dst_unused value");
- return MatchOperand_ParseFail;
- }
+ if (Int == 0xffffffff)
+ return Error(StringLoc, "invalid dst_unused value");
- Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTySdwaDstUnused));
- return MatchOperand_Success;
+ Operands.push_back(AMDGPUOperand::CreateImm(this, Int, S, AMDGPUOperand::ImmTySDWADstUnused));
+ return ParseStatus::Success;
}
void AMDGPUAsmParser::cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands) {
@@ -9009,14 +8663,14 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::dst_sel))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
- AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
+ AMDGPUOperand::ImmTySDWADstSel, SdwaSel::DWORD);
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::dst_unused))
addOptionalImmOperand(Inst, Operands, OptionalIdx,
- AMDGPUOperand::ImmTySdwaDstUnused,
+ AMDGPUOperand::ImmTySDWADstUnused,
DstUnused::UNUSED_PRESERVE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWASrc0Sel, SdwaSel::DWORD);
break;
case SIInstrFlags::VOP2:
@@ -9025,17 +8679,17 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::omod))
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI, 0);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstSel, SdwaSel::DWORD);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaDstUnused, DstUnused::UNUSED_PRESERVE);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWADstSel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWADstUnused, DstUnused::UNUSED_PRESERVE);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWASrc0Sel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWASrc1Sel, SdwaSel::DWORD);
break;
case SIInstrFlags::VOPC:
if (AMDGPU::hasNamedOperand(Inst.getOpcode(), AMDGPU::OpName::clamp))
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI, 0);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc0Sel, SdwaSel::DWORD);
- addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySdwaSrc1Sel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWASrc0Sel, SdwaSel::DWORD);
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySDWASrc1Sel, SdwaSel::DWORD);
break;
default:
@@ -9054,25 +8708,9 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
}
}
-//===----------------------------------------------------------------------===//
-// mAI
-//===----------------------------------------------------------------------===//
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultBLGP() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyBLGP);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultCBSZ() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyCBSZ);
-}
-
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultABID() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyABID);
-}
-
/// Force static initialization.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() {
- RegisterMCAsmParser<AMDGPUAsmParser> A(getTheAMDGPUTarget());
+ RegisterMCAsmParser<AMDGPUAsmParser> A(getTheR600Target());
RegisterMCAsmParser<AMDGPUAsmParser> B(getTheGCNTarget());
}
@@ -9082,8 +8720,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUAsmParser() {
#define GET_MNEMONIC_CHECKER
#include "AMDGPUGenAsmMatcher.inc"
-OperandMatchResultTy
-AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands, unsigned MCK) {
+ParseStatus AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands,
+ unsigned MCK) {
switch (MCK) {
case MCK_addr64:
return parseTokenOp("addr64", Operands);
@@ -9099,55 +8737,8 @@ AMDGPUAsmParser::parseCustomOperand(OperandVector &Operands, unsigned MCK) {
return parseTokenOp("off", Operands);
case MCK_row_95_en:
return parseTokenOp("row_en", Operands);
- case MCK_ImmABID:
- return parseIntWithPrefix("abid", Operands, AMDGPUOperand::ImmTyABID);
- case MCK_ImmBankMask:
- return parseIntWithPrefix("bank_mask", Operands,
- AMDGPUOperand::ImmTyDppBankMask);
- case MCK_ImmBLGP: {
- OperandMatchResultTy Res =
- parseIntWithPrefix("blgp", Operands, AMDGPUOperand::ImmTyBLGP);
- if (Res == MatchOperand_NoMatch) {
- Res = parseOperandArrayWithPrefix("neg", Operands,
- AMDGPUOperand::ImmTyBLGP);
- }
- return Res;
- }
- case MCK_ImmCBSZ:
- return parseIntWithPrefix("cbsz", Operands, AMDGPUOperand::ImmTyCBSZ);
- case MCK_ImmCPol:
- return parseCPol(Operands);
- case MCK_ImmFI:
- return parseIntWithPrefix("fi", Operands, AMDGPUOperand::ImmTyDppFi);
case MCK_gds:
return parseNamedBit("gds", Operands, AMDGPUOperand::ImmTyGDS);
- case MCK_ImmNegHi:
- return parseOperandArrayWithPrefix("neg_hi", Operands,
- AMDGPUOperand::ImmTyNegHi);
- case MCK_ImmNegLo:
- return parseOperandArrayWithPrefix("neg_lo", Operands,
- AMDGPUOperand::ImmTyNegLo);
- case MCK_ImmSMEMOffset:
- return parseIntWithPrefix("offset", Operands, AMDGPUOperand::ImmTyOffset);
- case MCK_ImmOModSI:
- return parseOModOperand(Operands);
- case MCK_ImmOpSel:
- return parseOperandArrayWithPrefix("op_sel", Operands,
- AMDGPUOperand::ImmTyOpSel);
- case MCK_ImmOpSelHi:
- return parseOperandArrayWithPrefix("op_sel_hi", Operands,
- AMDGPUOperand::ImmTyOpSelHi);
- case MCK_ImmRowMask:
- return parseIntWithPrefix("row_mask", Operands,
- AMDGPUOperand::ImmTyDppRowMask);
- case MCK_ImmSDWADstSel:
- return parseSDWASel(Operands, "dst_sel", AMDGPUOperand::ImmTySdwaDstSel);
- case MCK_ImmSDWADstUnused:
- return parseSDWADstUnused(Operands);
- case MCK_ImmSDWASrc0Sel:
- return parseSDWASel(Operands, "src0_sel", AMDGPUOperand::ImmTySdwaSrc0Sel);
- case MCK_ImmSDWASrc1Sel:
- return parseSDWASel(Operands, "src1_sel", AMDGPUOperand::ImmTySdwaSrc1Sel);
case MCK_tfe:
return parseNamedBit("tfe", Operands, AMDGPUOperand::ImmTyTFE);
}
@@ -9186,18 +8777,16 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isSSrcB32() ? Match_Success : Match_InvalidOperand;
case MCK_SSrcF32:
return Operand.isSSrcF32() ? Match_Success : Match_InvalidOperand;
- case MCK_SoppBrTarget:
- return Operand.isSoppBrTarget() ? Match_Success : Match_InvalidOperand;
+ case MCK_SOPPBrTarget:
+ return Operand.isSOPPBrTarget() ? Match_Success : Match_InvalidOperand;
case MCK_VReg32OrOff:
return Operand.isVReg32OrOff() ? Match_Success : Match_InvalidOperand;
case MCK_InterpSlot:
return Operand.isInterpSlot() ? Match_Success : Match_InvalidOperand;
- case MCK_Attr:
+ case MCK_InterpAttr:
return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand;
- case MCK_AttrChan:
- return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand;
- case MCK_ImmSMEMOffset:
- return Operand.isSMEMOffset() ? Match_Success : Match_InvalidOperand;
+ case MCK_InterpAttrChan:
+ return Operand.isInterpAttrChan() ? Match_Success : Match_InvalidOperand;
case MCK_SReg_64:
case MCK_SReg_64_XEXEC:
// Null is defined as a 32-bit register but
@@ -9215,7 +8804,7 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
// endpgm
//===----------------------------------------------------------------------===//
-OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) {
+ParseStatus AMDGPUAsmParser::parseEndpgm(OperandVector &Operands) {
SMLoc S = getLoc();
int64_t Imm = 0;
@@ -9224,14 +8813,12 @@ OperandMatchResultTy AMDGPUAsmParser::parseEndpgmOp(OperandVector &Operands) {
Imm = 0;
}
- if (!isUInt<16>(Imm)) {
- Error(S, "expected a 16-bit value");
- return MatchOperand_ParseFail;
- }
+ if (!isUInt<16>(Imm))
+ return Error(S, "expected a 16-bit value");
Operands.push_back(
AMDGPUOperand::CreateImm(this, Imm, S, AMDGPUOperand::ImmTyEndpgm));
- return MatchOperand_Success;
+ return ParseStatus::Success;
}
bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); }
@@ -9240,10 +8827,6 @@ bool AMDGPUOperand::isEndpgm() const { return isImmTy(ImmTyEndpgm); }
// LDSDIR
//===----------------------------------------------------------------------===//
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultWaitVDST() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyWaitVDST);
-}
-
bool AMDGPUOperand::isWaitVDST() const {
return isImmTy(ImmTyWaitVDST) && isUInt<4>(getImm());
}
@@ -9252,10 +8835,6 @@ bool AMDGPUOperand::isWaitVDST() const {
// VINTERP
//===----------------------------------------------------------------------===//
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultWaitEXP() const {
- return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyWaitEXP);
-}
-
bool AMDGPUOperand::isWaitEXP() const {
return isImmTy(ImmTyWaitEXP) && isUInt<3>(getImm());
}
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index bd7f088c76e3..ea1578e30ae8 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -110,7 +110,6 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
Instruction BaseOpcode = !cast<Instruction>(MTBUFGetBaseOpcode<NAME>.ret);
let MTBUF = 1;
- let AsmMatchConverter = "cvtMtbuf";
}
class MTBUF_Real <MTBUF_Pseudo ps, string real_name = ps.Mnemonic> :
@@ -158,7 +157,7 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
RegisterOperand vdata_op = getLdStRegisterOperand<vdataClass>.ret;
- dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol, SWZ:$swz);
+ dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, CPol:$cpol, i1imm:$swz);
dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs));
dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs));
}
@@ -186,7 +185,7 @@ class getMTBUFAsmOps<int addrKind> {
!if(!eq(addrKind, BUFAddrKind.Addr64),
"$vaddr, $srsrc,$format $soffset addr64",
"")))));
- string ret = " $vdata, " # Pfx # "$offset$cpol$swz";
+ string ret = " $vdata, " # Pfx # "$offset$cpol";
}
class MTBUF_SetupAddr<int addrKind> {
@@ -387,7 +386,7 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdataClass, isTFE>.ret;
- dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol, SWZ_0:$swz);
+ dag NonVaddrInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol_0:$cpol, i1imm_0:$swz);
dag Inputs = !if(!empty(vaddrList), NonVaddrInputs, !con((ins vaddrClass:$vaddr), NonVaddrInputs));
dag ret = !if(!empty(vdataList), Inputs, !con((ins vdata_op:$vdata), Inputs));
}
@@ -421,7 +420,7 @@ class getMUBUFIns<int addrKind, list<RegisterClass> vdataList, bit isTFE> {
(ins))))));
}
-class getMUBUFAsmOps<int addrKind, bit noVdata = 0, bit isLds = 0, bit isTFE = 0, bit isSwz = 0> {
+class getMUBUFAsmOps<int addrKind, bit noVdata = 0, bit isLds = 0, bit isTFE = 0> {
string Vdata = !if(noVdata, " ", " $vdata, ");
string Lds = !if(isLds, " lds", "");
string TFE = !if(isTFE, " tfe", "");
@@ -434,9 +433,8 @@ class getMUBUFAsmOps<int addrKind, bit noVdata = 0, bit isLds = 0, bit isTFE = 0
"")))));
string Offset = "$offset";
string OtherArgs = "$cpol";
- string Swz = !if(isSwz, "$swz", "");
- string ret = Vdata # MainArgs # Offset # OtherArgs # Lds # TFE # Swz;
+ string ret = Vdata # MainArgs # Offset # OtherArgs # Lds # TFE;
}
class MUBUF_SetupAddr<int addrKind> {
@@ -467,7 +465,7 @@ class MUBUF_Load_Pseudo <string opName,
!if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)),
!con(getMUBUFIns<addrKindCopy, [], isTFE>.ret,
!if(HasTiedDest, (ins vdata_op:$vdata_in), (ins))),
- getMUBUFAsmOps<addrKindCopy, !or(isLds, isLdsOpc), isLds, isTFE, 1>.ret,
+ getMUBUFAsmOps<addrKindCopy, !or(isLds, isLdsOpc), isLds, isTFE>.ret,
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # !if(isLds, "_lds", "") # !if(isTFE, "_tfe", "") #
@@ -488,15 +486,15 @@ class MUBUF_Load_Pseudo <string opName,
}
class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat <
- (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))),
- (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset))
+ (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset))),
+ (load_vt (inst v4i32:$srsrc, i32:$soffset, i32:$offset))
>;
class MUBUF_Addr64_Load_Pat <Instruction inst,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag> : Pat <
- (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))),
- (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset))
+ (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset))),
+ (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i32:$offset))
>;
multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> {
@@ -562,7 +560,7 @@ class MUBUF_Store_Pseudo <string opName,
: MUBUF_Pseudo<opName,
(outs),
getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret], isTFE>.ret,
- getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE, 1>.ret,
+ getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE>.ret,
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # !if(isTFE, "_tfe", "") #
@@ -580,12 +578,12 @@ multiclass MUBUF_Pseudo_Stores_Helper<string opName, ValueType store_vt,
def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, legal_store_vt, isTFE,
[(st legal_store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset))]>,
+ i32:$offset))]>,
MUBUFAddr64Table<0, NAME>;
def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, legal_store_vt, isTFE,
[(st legal_store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset))]>,
+ i32:$offset))]>,
MUBUFAddr64Table<1, NAME>;
def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, legal_store_vt, isTFE>;
@@ -609,8 +607,8 @@ multiclass MUBUF_Pseudo_Stores<string opName, ValueType store_vt = i32,
class MUBUF_Pseudo_Store_Lds<string opName>
: MUBUF_Pseudo<opName,
(outs),
- (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol:$cpol, SWZ:$swz),
- " $srsrc, $soffset$offset lds$cpol$swz"> {
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, CPol:$cpol, i1imm:$swz),
+ " $srsrc, $soffset$offset lds$cpol"> {
let LGKM_CNT = 1;
let mayLoad = 1;
let mayStore = 1;
@@ -635,7 +633,7 @@ class getMUBUFAtomicInsDA<RegisterClass vdataClass, bit vdata_in,
dag MainInputs = (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset);
dag CPol = !if(vdata_in, (ins CPol_GLC1:$cpol), (ins CPol_0:$cpol));
- dag ret = !con(Data, !con(MainInputs, CPol));
+ dag ret = !con(Data, MainInputs, CPol);
}
class getMUBUFAtomicIns<int addrKind,
@@ -724,23 +722,15 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
bit isFP = isFloatType<vdataType>.ret> {
- let FPAtomic = isFP in
- def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
- MUBUFAddr64Table <0, NAME>;
-
- let FPAtomic = isFP in
- def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
- MUBUFAddr64Table <1, NAME>;
-
- let FPAtomic = isFP in
- def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
-
- let FPAtomic = isFP in
-
- def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
-
- let FPAtomic = isFP in
- def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ let FPAtomic = isFP in {
+ def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
+ MUBUFAddr64Table <0, NAME>;
+ def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass>,
+ MUBUFAddr64Table <1, NAME>;
+ def _OFFEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
}
multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
@@ -748,28 +738,23 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
ValueType vdataType,
SDPatternOperator atomic,
bit isFP = isFloatType<vdataType>.ret> {
- let FPAtomic = isFP in
- def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
- [(set vdataType:$vdata,
- (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <0, NAME # "_RTN">;
-
- let FPAtomic = isFP in
- def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
- [(set vdataType:$vdata,
- (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <1, NAME # "_RTN">;
-
- let FPAtomic = isFP in
- def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ let FPAtomic = isFP in {
+ def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ [(set vdataType:$vdata,
+ (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
+ vdataType:$vdata_in))]>,
+ MUBUFAddr64Table <0, NAME # "_RTN">;
- let FPAtomic = isFP in
- def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ [(set vdataType:$vdata,
+ (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
+ vdataType:$vdata_in))]>,
+ MUBUFAddr64Table <1, NAME # "_RTN">;
- let FPAtomic = isFP in
- def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
+ def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
+ def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ }
}
multiclass MUBUF_Pseudo_Atomics <string opName,
@@ -1124,7 +1109,7 @@ defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN<
"buffer_atomic_add_f32", VGPR_32, f32
>;
-let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16
>;
@@ -1134,7 +1119,7 @@ defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN<
"buffer_atomic_add_f32", VGPR_32, f32, null_frag
>;
-let OtherPredicates = [isGFX90APlus] in
+let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag
>;
@@ -1233,21 +1218,21 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
def : GCNPat<
(vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
def : GCNPat<
(vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
@@ -1256,7 +1241,7 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
}
@@ -1320,7 +1305,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
@@ -1328,14 +1313,14 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_cpol $auxiliary), (extract_swz $auxiliary))
+ timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
def : GCNPat<
(st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (extract_cpol $auxiliary), (extract_swz $auxiliary))
+ timm:$offset, (extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
def : GCNPat<
@@ -1344,7 +1329,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
getVregSrcForVT<vt>.ret:$vdata,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_cpol $auxiliary),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary),
(extract_swz $auxiliary))
>;
}
@@ -1408,13 +1393,13 @@ multiclass BufferAtomicPat<string OpPrefix, ValueType vt, string Inst, bit isInt
let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
def : GCNPat<
- (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), vt:$vdata_in)),
+ (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), vt:$vdata_in)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in,
SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset)
>;
def : GCNPat<
- (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset),
+ (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
vt:$vdata_in)),
(!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix) getVregSrcForVT<vt>.ret:$vdata_in,
VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset)
@@ -1441,7 +1426,7 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst>
getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset,
offset:$offset);
def : GCNPat<
- (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset), data_vt:$vdata_in)),
+ (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), data_vt:$vdata_in)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, getVregSrcForVT<data_vt>.ret)),
!if(!eq(vt, i32), sub0, sub0_sub1)),
@@ -1452,7 +1437,7 @@ multiclass BufferAtomicCmpSwapPat<ValueType vt, ValueType data_vt, string Inst>
getVregSrcForVT<data_vt>.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset);
def : GCNPat<
- (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset),
+ (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
data_vt:$vdata_in)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, getVregSrcForVT<data_vt>.ret)),
@@ -1478,8 +1463,8 @@ defm : BufferAtomicPat<"atomic_load_umax_global", Ty, "BUFFER_ATOMIC_UMAX" # Suf
defm : BufferAtomicPat<"atomic_load_and_global", Ty, "BUFFER_ATOMIC_AND" # Suffix>;
defm : BufferAtomicPat<"atomic_load_or_global", Ty, "BUFFER_ATOMIC_OR" # Suffix>;
defm : BufferAtomicPat<"atomic_load_xor_global", Ty, "BUFFER_ATOMIC_XOR" # Suffix>;
-defm : BufferAtomicPat<"atomic_inc_global", Ty, "BUFFER_ATOMIC_INC" # Suffix>;
-defm : BufferAtomicPat<"atomic_dec_global", Ty, "BUFFER_ATOMIC_DEC" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_uinc_wrap_global", Ty, "BUFFER_ATOMIC_INC" # Suffix>;
+defm : BufferAtomicPat<"atomic_load_udec_wrap_global", Ty, "BUFFER_ATOMIC_DEC" # Suffix>;
} // end foreach Ty
@@ -1503,7 +1488,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
timm:$offset, timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), CachePolicy)
+ timm:$offset, CachePolicy)
>;
def : GCNPat<
@@ -1511,7 +1496,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
timm:$offset, timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc,
- SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy)
+ SCSrc_b32:$soffset, timm:$offset, CachePolicy)
>;
def : GCNPat<
@@ -1519,7 +1504,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
i32:$soffset, timm:$offset, timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc,
- SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy)
+ SCSrc_b32:$soffset, timm:$offset, CachePolicy)
>;
def : GCNPat<
@@ -1528,7 +1513,7 @@ multiclass SIBufferAtomicPat<string OpPrefix, ValueType vt, string Inst,
(!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy)
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy)
>;
} // end let AddedComplexity
@@ -1584,7 +1569,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), timm:$cachepolicy)
+ timm:$offset, timm:$cachepolicy)
>;
def : GCNPat<
@@ -1592,7 +1577,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
0, i32:$soffset, timm:$offset,
timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), timm:$cachepolicy)
+ timm:$offset, timm:$cachepolicy)
>;
def : GCNPat<
@@ -1600,7 +1585,7 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
i32:$voffset, i32:$soffset, timm:$offset,
timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), timm:$cachepolicy)
+ timm:$offset, timm:$cachepolicy)
>;
def : GCNPat<
@@ -1610,22 +1595,23 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), timm:$cachepolicy)
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy)
>;
}
let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>;
-let SubtargetPredicate = HasAtomicPkFaddNoRtnInsts in
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>;
let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>;
-let SubtargetPredicate = isGFX90APlus in {
- defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
+defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
+let SubtargetPredicate = isGFX90APlus in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f64, "BUFFER_ATOMIC_ADD_F64">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f64, "BUFFER_ATOMIC_MIN_F64">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f64, "BUFFER_ATOMIC_MAX_F64">;
@@ -1641,7 +1627,7 @@ defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
defvar OffsetResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFSET" # InstSuffix)
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy);
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy);
def : GCNPat<
(Op
i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset,
@@ -1653,7 +1639,7 @@ def : GCNPat<
defvar IdxenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_IDXEN" # InstSuffix)
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
CachePolicy);
def : GCNPat<
(Op
@@ -1667,7 +1653,7 @@ def : GCNPat<
defvar OffenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_OFFEN" # InstSuffix)
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
- VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
CachePolicy);
def : GCNPat<
(Op
@@ -1682,7 +1668,7 @@ def : GCNPat<
defvar BothenResDag = (!cast<MUBUF_Pseudo>("BUFFER_ATOMIC_CMPSWAP_BOTHEN" # InstSuffix)
(REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), CachePolicy);
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, CachePolicy);
def : GCNPat<
(Op
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
@@ -1698,19 +1684,19 @@ def : GCNPat<
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
PatFrag constant_ld> : GCNPat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset))),
+ i32:$offset))),
(Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset)
>;
multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_ld> {
def : GCNPat <
- (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset))),
+ (vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset))),
(Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset)
>;
def : GCNPat <
- (vt (atomic_ld (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset))),
+ (vt (atomic_ld (MUBUFOffset v4i32:$rsrc, i32:$soffset, i32:$offset))),
(Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset))
>;
}
@@ -1731,7 +1717,7 @@ multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
PatFrag ld> {
def : GCNPat <
- (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset))),
+ (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset))),
(Instr_OFFSET $srsrc, $soffset, $offset)
>;
}
@@ -1754,12 +1740,12 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
ValueType vt, PatFrag ld> {
def : GCNPat <
(vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset))),
+ i32:$soffset, i32:$offset))),
(InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0)
>;
def : GCNPat <
- (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
+ (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset))),
(InstrOffset $srsrc, $soffset, $offset, 0, 0)
>;
}
@@ -1769,12 +1755,12 @@ multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
MUBUF_Pseudo InstrOffset,
ValueType vt, PatFrag ld_frag> {
def : GCNPat <
- (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
+ (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, i32:$offset), vt:$in),
(InstrOffen $vaddr, $srsrc, $soffset, $offset, $in)
>;
def : GCNPat <
- (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
+ (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, i32:$offset), vt:$in),
(InstrOffset $srsrc, $soffset, $offset, $in)
>;
}
@@ -1820,12 +1806,12 @@ multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo In
ValueType vt, PatFrag atomic_st> {
// Store follows atomic op convention so address is first
def : GCNPat <
- (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset), vt:$val),
+ (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), vt:$val),
(Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset)
>;
def : GCNPat <
- (atomic_st (MUBUFOffset v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
+ (atomic_st (MUBUFOffset v4i32:$rsrc, i32:$soffset, i32:$offset), vt:$val),
(Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset))
>;
}
@@ -1843,7 +1829,7 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
PatFrag st> {
def : GCNPat <
- (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset)),
+ (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset)),
(Instr_OFFSET $vdata, $srsrc, $soffset, $offset)
>;
}
@@ -1857,13 +1843,13 @@ multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
RegisterClass rc = VGPR_32> {
def : GCNPat <
(st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
- i32:$soffset, u16imm:$offset)),
+ i32:$soffset, i32:$offset)),
(InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0)
>;
def : GCNPat <
(st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
- u16imm:$offset)),
+ i32:$offset)),
(InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0)
>;
}
@@ -1908,7 +1894,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (st v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
- (!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(as_i8timm $format),
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
@@ -1916,7 +1902,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (st v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm)),
- (!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(as_i8timm $format),
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
@@ -1924,7 +1910,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (st v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
- (!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(as_i8timm $format),
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
@@ -1934,7 +1920,7 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset,
(as_i8timm $format),
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
@@ -1973,7 +1959,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(st vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (as_i8timm $format),
+ timm:$offset, (as_i8timm $format),
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
@@ -1981,7 +1967,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(st vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (as_i8timm $format),
+ timm:$offset, (as_i8timm $format),
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
@@ -1989,7 +1975,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(st vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- (as_i16timm $offset), (as_i8timm $format),
+ timm:$offset, (as_i8timm $format),
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
@@ -1999,7 +1985,7 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
getVregSrcForVT<vt>.ret:$vdata,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (as_i8timm $format),
(extract_cpol $auxiliary), (extract_swz $auxiliary))
>;
}
@@ -2710,11 +2696,11 @@ multiclass MUBUF_Real_vi_gfx90a<bits<7> op, MUBUF_Pseudo ps, bit isTFE = 0> {
def _vi : MUBUF_Real_vi<op, ps>;
if !not(isTFE) then {
- foreach _ = BoolToList<!not(ps.FPAtomic)>.ret in
+ if !not(ps.FPAtomic) then
def _gfx90a : MUBUF_Real_gfx90a<op, ps>;
}
- foreach _ = BoolToList<ps.FPAtomic>.ret in {
+ if ps.FPAtomic then {
def _gfx90a : MUBUF_Real_gfx90a<op, ps, 0> {
let SubtargetPredicate = isGFX90AOnly;
let AssemblerPredicate = isGFX90AOnly;
@@ -2897,11 +2883,11 @@ def BUFFER_WBINVL1_vi : MUBUF_Real_vi <0x3e, BUFFER_WBINVL1>;
def BUFFER_WBINVL1_VOL_vi : MUBUF_Real_vi <0x3f, BUFFER_WBINVL1_VOL>;
} // End AssemblerPredicate = isGFX8GFX9
-let SubtargetPredicate = HasAtomicFaddNoRtnInsts in {
-defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_vi <0x4e>;
+let SubtargetPredicate = HasAtomicFaddNoRtnInsts in {
+defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Real_Atomic_vi <0x4d>;
} // End SubtargetPredicate = HasAtomicFaddNoRtnInsts
let SubtargetPredicate = isGFX90APlus in {
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 26f3537ff095..85a3f763cd5a 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -26,8 +26,6 @@ class DS_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
let isPseudo = 1;
let isCodeGenOnly = 1;
- let AsmMatchConverter = "cvtDS";
-
string Mnemonic = opName;
string AsmOperands = asmOps;
@@ -65,7 +63,6 @@ class DS_Real <DS_Pseudo ps, string opName = ps.Mnemonic> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let OtherPredicates = ps.OtherPredicates;
- let AsmMatchConverter = ps.AsmMatchConverter;
let SchedRW = ps.SchedRW;
let mayLoad = ps.mayLoad;
let mayStore = ps.mayStore;
@@ -164,7 +161,6 @@ class DS_1A2D_Off8_NORET <string opName, RegisterClass rc = VGPR_32,
let has_vdst = 0;
let has_offset = 0;
- let AsmMatchConverter = "cvtDSOffset01";
}
multiclass DS_1A2D_Off8_NORET_mc <string opName, RegisterClass rc = VGPR_32> {
@@ -187,7 +183,6 @@ class DS_0A1D_RET_GDS<string opName, RegisterClass rc = VGPR_32, RegisterClass s
let has_data1 = 0;
let has_gds = 0;
let gdsValue = 1;
- let AsmMatchConverter = "cvtDSGds";
let hasSideEffects = 1;
}
@@ -220,7 +215,7 @@ multiclass DS_1A1D_RET_mc_gfx9 <string opName, RegisterClass rc = VGPR_32,
let has_m0_read = 0 in {
def "" : DS_1A1D_RET<opName, rc>,
AtomicNoRet<!if(!eq(NoRetOp, ""), "", NoRetOp),
- !if(!eq(NoRetOp, ""), 0, 1)>;
+ !ne(NoRetOp, "")>;
}
}
@@ -262,8 +257,6 @@ class DS_1A2D_Off8_RET<string opName,
" $vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
let has_offset = 0;
- let AsmMatchConverter = "cvtDSOffset01";
-
let hasPostISelHook = 1;
}
@@ -325,7 +318,6 @@ class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
let has_offset = 0;
let has_data0 = 0;
let has_data1 = 0;
- let AsmMatchConverter = "cvtDSOffset01";
}
multiclass DS_1A_Off8_RET_mc <string opName, RegisterClass rc = VGPR_32> {
@@ -345,7 +337,6 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
let has_data1 = 0;
let has_gds = 0;
let gdsValue = 1;
- let AsmMatchConverter = "cvtDSGds";
}
class DS_0A_RET <string opName> : DS_Pseudo<opName,
@@ -393,7 +384,6 @@ class DS_GWS <string opName, dag ins, string asmOps>
let has_gds = 0;
let gdsValue = 1;
- let AsmMatchConverter = "cvtDSGds";
}
class DS_GWS_0D <string opName>
@@ -417,7 +407,6 @@ class DS_VOID <string opName> : DS_Pseudo<opName,
let mayStore = 0;
let hasSideEffects = 1;
let UseNamedOperandTable = 0;
- let AsmMatchConverter = "";
let has_vdst = 0;
let has_addr = 0;
@@ -436,7 +425,7 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
(ins VGPR_32:$addr, data_op:$data0, offset:$offset),
" $vdst, $addr, $data0$offset",
[(set i32:$vdst,
- (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
+ (node (DS1Addr1Offset i32:$addr, i32:$offset), i32:$data0))] > {
let mayLoad = 0;
let mayStore = 0;
@@ -494,12 +483,12 @@ let SubtargetPredicate = isGFX90APlus in {
defm DS_ADD_RTN_F64 : DS_1A1D_RET_mc_gfx9<"ds_add_rtn_f64", VReg_64, "ds_add_f64">;
} // End SubtargetPredicate = isGFX90APlus
-let SubtargetPredicate = isGFX940Plus in {
+let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">;
defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">;
defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">;
defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">;
-} // End SubtargetPredicate = isGFX940Plus
+} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">;
defm DS_CMPSTORE_F32 : DS_1A2D_NORET_mc<"ds_cmpstore_f32">;
@@ -631,7 +620,7 @@ def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
} // End SubtargetPredicate = HasDsSrc2Insts
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>;
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, Swizzle>;
}
let mayStore = 0 in {
@@ -740,7 +729,7 @@ def : GCNPat <
>;
class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
- (vt (frag (DS1Addr1Offset i32:$ptr, i16:$offset))),
+ (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
(inst $ptr, offset:$offset, (i1 gds))
>;
@@ -756,7 +745,7 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$in),
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
(inst $ptr, offset:$offset, (i1 0), $in)
>;
@@ -800,7 +789,7 @@ def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
}
class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
- (frag vt:$value, (DS1Addr1Offset i32:$ptr, i16:$offset)),
+ (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
(inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
>;
@@ -817,7 +806,7 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
// Irritatingly, atomic_store reverses the order of operands from a
// normal store.
class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
(inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 0))
>;
@@ -965,7 +954,7 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
} // End AddedComplexity = 100
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
- bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
+ bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
(inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
let AddedComplexity = complexity;
}
@@ -1014,7 +1003,7 @@ let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag,
int complexity = 0, bit gds=0> : GCNPat<
- (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
(inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))> {
let AddedComplexity = complexity;
}
@@ -1046,7 +1035,7 @@ let SubtargetPredicate = isGFX11Plus in {
// The order of src and cmp agrees with the BUFFER_ATOMIC_CMPSWAP opcode.
class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag,
int complexity = 0, bit gds=0> : GCNPat<
- (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
+ (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
(inst $ptr, getVregSrcForVT<vt>.ret:$swap, getVregSrcForVT<vt>.ret:$cmp, offset:$offset, (i1 gds))> {
let AddedComplexity = complexity;
}
@@ -1069,8 +1058,8 @@ multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, DS_Pseudo noRetInst, ValueType vt,
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B32, i32, "atomic_swap">;
defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U32, DS_ADD_U32, i32, "atomic_load_add">;
defm : DSAtomicRetNoRetPat_mc<DS_SUB_RTN_U32, DS_SUB_U32, i32, "atomic_load_sub">;
-defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U32, DS_INC_U32, i32, "atomic_inc">;
-defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U32, DS_DEC_U32, i32, "atomic_dec">;
+defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U32, DS_INC_U32, i32, "atomic_load_uinc_wrap">;
+defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U32, DS_DEC_U32, i32, "atomic_load_udec_wrap">;
defm : DSAtomicRetNoRetPat_mc<DS_AND_RTN_B32, DS_AND_B32, i32, "atomic_load_and">;
defm : DSAtomicRetNoRetPat_mc<DS_OR_RTN_B32, DS_OR_B32, i32, "atomic_load_or">;
defm : DSAtomicRetNoRetPat_mc<DS_XOR_RTN_B32, DS_XOR_B32, i32, "atomic_load_xor">;
@@ -1097,8 +1086,8 @@ defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_F32, DS_ADD_F32, f32, "atomic_load_fadd
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">;
defm : DSAtomicRetNoRetPat_mc<DS_ADD_RTN_U64, DS_ADD_U64, i64, "atomic_load_add">;
defm : DSAtomicRetNoRetPat_mc<DS_SUB_RTN_U64, DS_SUB_U64, i64, "atomic_load_sub">;
-defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U64, DS_INC_U64, i64, "atomic_inc">;
-defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U64, DS_DEC_U64, i64, "atomic_dec">;
+defm : DSAtomicRetNoRetPat_mc<DS_INC_RTN_U64, DS_INC_U64, i64, "atomic_load_uinc_wrap">;
+defm : DSAtomicRetNoRetPat_mc<DS_DEC_RTN_U64, DS_DEC_U64, i64, "atomic_load_udec_wrap">;
defm : DSAtomicRetNoRetPat_mc<DS_AND_RTN_B64, DS_AND_B64, i64, "atomic_load_and">;
defm : DSAtomicRetNoRetPat_mc<DS_OR_RTN_B64, DS_OR_B64, i64, "atomic_load_or">;
defm : DSAtomicRetNoRetPat_mc<DS_XOR_RTN_B64, DS_XOR_B64, i64, "atomic_load_xor">;
@@ -1124,7 +1113,7 @@ def : DSAtomicRetPat<DS_ADD_F64, f64, atomic_load_fadd_local_noret_64>;
class DSAtomicRetPatIntrinsic<DS_Pseudo inst, ValueType vt, PatFrag frag,
bit gds=0> : GCNPat <
- (vt (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value)),
+ (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value)),
(inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
}
@@ -1133,7 +1122,7 @@ let AddedComplexity = 1 in
def : DSAtomicRetPatIntrinsic<DS_ADD_F64, f64, int_amdgcn_flat_atomic_fadd_noret_local_addrspace>;
}
-let SubtargetPredicate = isGFX940Plus in {
+let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
def : DSAtomicRetPat<DS_PK_ADD_RTN_F16, v2f16, atomic_load_fadd_v2f16_local_32>;
let AddedComplexity = 1 in
def : DSAtomicRetPat<DS_PK_ADD_F16, v2f16, atomic_load_fadd_v2f16_local_noret_32>;
@@ -1146,7 +1135,7 @@ def : GCNPat <
(v2i16 (int_amdgcn_ds_fadd_v2bf16_noret i32:$ptr, v2i16:$src)),
(DS_PK_ADD_BF16 VGPR_32:$ptr, VGPR_32:$src, 0, 0)
>;
-}
+} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
def : Pat <
(SIds_ordered_count i32:$value, i16:$offset),
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index c4e85210848a..1b05acd5c90a 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -45,13 +45,11 @@ using namespace llvm;
using DecodeStatus = llvm::MCDisassembler::DecodeStatus;
AMDGPUDisassembler::AMDGPUDisassembler(const MCSubtargetInfo &STI,
- MCContext &Ctx,
- MCInstrInfo const *MCII) :
- MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()),
- TargetMaxInstBytes(Ctx.getAsmInfo()->getMaxInstLength(&STI)) {
-
+ MCContext &Ctx, MCInstrInfo const *MCII)
+ : MCDisassembler(STI, Ctx), MCII(MCII), MRI(*Ctx.getRegisterInfo()),
+ MAI(*Ctx.getAsmInfo()), TargetMaxInstBytes(MAI.getMaxInstLength(&STI)) {
// ToDo: AMDGPUDisassembler supports only VI ISA.
- if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding] && !isGFX10Plus())
+ if (!STI.hasFeature(AMDGPU::FeatureGCN3Encoding) && !isGFX10Plus())
report_fatal_error("Disassembly not yet supported for subtarget");
}
@@ -74,7 +72,7 @@ static int insertNamedMCOperand(MCInst &MI, const MCOperand &Op,
return OpIdx;
}
-static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
+static DecodeStatus decodeSOPPBrTarget(MCInst &Inst, unsigned Imm,
uint64_t Addr,
const MCDisassembler *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
@@ -115,181 +113,158 @@ static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val, uint64_t Addr,
return addOperand(Inst, DAsm->DecoderName(Imm)); \
}
-#define DECODE_OPERAND_REG(RegClass) \
-DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass)
+// Decoder for registers, decode directly using RegClassID. Imm(8-bit) is
+// number of register. Used by VGPR only and AGPR only operands.
+#define DECODE_OPERAND_REG_8(RegClass) \
+ static DecodeStatus Decode##RegClass##RegisterClass( \
+ MCInst &Inst, unsigned Imm, uint64_t /*Addr*/, \
+ const MCDisassembler *Decoder) { \
+ assert(Imm < (1 << 8) && "8-bit encoding"); \
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); \
+ return addOperand( \
+ Inst, DAsm->createRegOperand(AMDGPU::RegClass##RegClassID, Imm)); \
+ }
-DECODE_OPERAND_REG(VGPR_32)
-DECODE_OPERAND_REG(VGPR_32_Lo128)
-DECODE_OPERAND_REG(VRegOrLds_32)
-DECODE_OPERAND_REG(VS_32)
-DECODE_OPERAND_REG(VS_64)
-DECODE_OPERAND_REG(VS_128)
+#define DECODE_SrcOp(Name, EncSize, OpWidth, EncImm, MandatoryLiteral, \
+ ImmWidth) \
+ static DecodeStatus Name(MCInst &Inst, unsigned Imm, uint64_t /*Addr*/, \
+ const MCDisassembler *Decoder) { \
+ assert(Imm < (1 << EncSize) && #EncSize "-bit encoding"); \
+ auto DAsm = static_cast<const AMDGPUDisassembler *>(Decoder); \
+ return addOperand(Inst, \
+ DAsm->decodeSrcOp(AMDGPUDisassembler::OpWidth, EncImm, \
+ MandatoryLiteral, ImmWidth)); \
+ }
-DECODE_OPERAND_REG(VReg_64)
-DECODE_OPERAND_REG(VReg_96)
-DECODE_OPERAND_REG(VReg_128)
-DECODE_OPERAND_REG(VReg_256)
-DECODE_OPERAND_REG(VReg_288)
-DECODE_OPERAND_REG(VReg_352)
-DECODE_OPERAND_REG(VReg_384)
-DECODE_OPERAND_REG(VReg_512)
-DECODE_OPERAND_REG(VReg_1024)
+// Decoder for registers. Imm(7-bit) is number of register, uses decodeSrcOp to
+// get register class. Used by SGPR only operands.
+#define DECODE_OPERAND_REG_7(RegClass, OpWidth) \
+ DECODE_SrcOp(Decode##RegClass##RegisterClass, 7, OpWidth, Imm, false, 0)
-DECODE_OPERAND_REG(SReg_32)
-DECODE_OPERAND_REG(SReg_32_XM0_XEXEC)
-DECODE_OPERAND_REG(SReg_32_XEXEC_HI)
-DECODE_OPERAND_REG(SRegOrLds_32)
-DECODE_OPERAND_REG(SReg_64)
-DECODE_OPERAND_REG(SReg_64_XEXEC)
-DECODE_OPERAND_REG(SReg_128)
-DECODE_OPERAND_REG(SReg_256)
-DECODE_OPERAND_REG(SReg_512)
+// Decoder for registers. Imm(10-bit): Imm{7-0} is number of register,
+// Imm{9} is acc(agpr or vgpr) Imm{8} should be 0 (see VOP3Pe_SMFMAC).
+// Set Imm{8} to 1 (IS_VGPR) to decode using 'enum10' from decodeSrcOp.
+// Used by AV_ register classes (AGPR or VGPR only register operands).
+#define DECODE_OPERAND_REG_AV10(RegClass, OpWidth) \
+ DECODE_SrcOp(Decode##RegClass##RegisterClass, 10, OpWidth, \
+ Imm | AMDGPU::EncValues::IS_VGPR, false, 0)
-DECODE_OPERAND_REG(AGPR_32)
-DECODE_OPERAND_REG(AReg_64)
-DECODE_OPERAND_REG(AReg_128)
-DECODE_OPERAND_REG(AReg_256)
-DECODE_OPERAND_REG(AReg_512)
-DECODE_OPERAND_REG(AReg_1024)
-DECODE_OPERAND_REG(AV_32)
-DECODE_OPERAND_REG(AV_64)
-DECODE_OPERAND_REG(AV_128)
-DECODE_OPERAND_REG(AVDst_128)
-DECODE_OPERAND_REG(AVDst_512)
+// Decoder for Src(9-bit encoding) registers only.
+#define DECODE_OPERAND_SRC_REG_9(RegClass, OpWidth) \
+ DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm, false, 0)
-static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
-}
+// Decoder for Src(9-bit encoding) AGPR, register number encoded in 9bits, set
+// Imm{9} to 1 (set acc) and decode using 'enum10' from decodeSrcOp, registers
+// only.
+#define DECODE_OPERAND_SRC_REG_A9(RegClass, OpWidth) \
+ DECODE_SrcOp(decodeOperand_##RegClass, 9, OpWidth, Imm | 512, false, 0)
-static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
-}
+// Decoder for 'enum10' from decodeSrcOp, Imm{0-8} is 9-bit Src encoding
+// Imm{9} is acc, registers only.
+#define DECODE_SRC_OPERAND_REG_AV10(RegClass, OpWidth) \
+ DECODE_SrcOp(decodeOperand_##RegClass, 10, OpWidth, Imm, false, 0)
-static DecodeStatus decodeOperand_VSrcV232(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeOperand_VSrcV232(Imm));
-}
+// Decoder for RegisterOperands using 9-bit Src encoding. Operand can be
+// register from RegClass or immediate. Registers that don't belong to RegClass
+// will be decoded and InstPrinter will report warning. Immediate will be
+// decoded into constant of size ImmWidth, should match width of immediate used
+// by OperandType (important for floating point types).
+#define DECODE_OPERAND_SRC_REG_OR_IMM_9(RegClass, OpWidth, ImmWidth) \
+ DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, Imm, \
+ false, ImmWidth)
-static DecodeStatus decodeOperand_VS_16(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
-}
+// Decoder for Src(9-bit encoding) AGPR or immediate. Set Imm{9} to 1 (set acc)
+// and decode using 'enum10' from decodeSrcOp.
+#define DECODE_OPERAND_SRC_REG_OR_IMM_A9(RegClass, OpWidth, ImmWidth) \
+ DECODE_SrcOp(decodeOperand_##RegClass##_Imm##ImmWidth, 9, OpWidth, \
+ Imm | 512, false, ImmWidth)
-static DecodeStatus decodeOperand_VS_32(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeOperand_VS_32(Imm));
-}
+#define DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(RegClass, OpWidth, ImmWidth) \
+ DECODE_SrcOp(decodeOperand_##RegClass##_Deferred##_Imm##ImmWidth, 9, \
+ OpWidth, Imm, true, ImmWidth)
-static DecodeStatus decodeOperand_AReg_64(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm | 512));
-}
+// Default decoders generated by tablegen: 'Decode<RegClass>RegisterClass'
+// when RegisterClass is used as an operand. Most often used for destination
+// operands.
-static DecodeStatus decodeOperand_AReg_128(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm | 512));
-}
+DECODE_OPERAND_REG_8(VGPR_32)
+DECODE_OPERAND_REG_8(VGPR_32_Lo128)
+DECODE_OPERAND_REG_8(VReg_64)
+DECODE_OPERAND_REG_8(VReg_96)
+DECODE_OPERAND_REG_8(VReg_128)
+DECODE_OPERAND_REG_8(VReg_256)
+DECODE_OPERAND_REG_8(VReg_288)
+DECODE_OPERAND_REG_8(VReg_352)
+DECODE_OPERAND_REG_8(VReg_384)
+DECODE_OPERAND_REG_8(VReg_512)
+DECODE_OPERAND_REG_8(VReg_1024)
-static DecodeStatus decodeOperand_AReg_256(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm | 512));
-}
+DECODE_OPERAND_REG_7(SReg_32, OPW32)
+DECODE_OPERAND_REG_7(SReg_32_XM0_XEXEC, OPW32)
+DECODE_OPERAND_REG_7(SReg_32_XEXEC_HI, OPW32)
+DECODE_OPERAND_REG_7(SReg_64, OPW64)
+DECODE_OPERAND_REG_7(SReg_64_XEXEC, OPW64)
+DECODE_OPERAND_REG_7(SReg_128, OPW128)
+DECODE_OPERAND_REG_7(SReg_256, OPW256)
+DECODE_OPERAND_REG_7(SReg_512, OPW512)
-static DecodeStatus decodeOperand_AReg_512(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm | 512));
-}
+DECODE_OPERAND_REG_8(AGPR_32)
+DECODE_OPERAND_REG_8(AReg_64)
+DECODE_OPERAND_REG_8(AReg_128)
+DECODE_OPERAND_REG_8(AReg_256)
+DECODE_OPERAND_REG_8(AReg_512)
+DECODE_OPERAND_REG_8(AReg_1024)
-static DecodeStatus decodeOperand_AReg_1024(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm | 512));
-}
+DECODE_OPERAND_REG_AV10(AVDst_128, OPW128)
+DECODE_OPERAND_REG_AV10(AVDst_512, OPW512)
-static DecodeStatus decodeOperand_VReg_64(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm));
-}
+// Decoders for register only source RegisterOperands that use use 9-bit Src
+// encoding: 'decodeOperand_<RegClass>'.
-static DecodeStatus decodeOperand_VReg_128(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW128, Imm));
-}
+DECODE_OPERAND_SRC_REG_9(VGPR_32, OPW32)
+DECODE_OPERAND_SRC_REG_9(VReg_64, OPW64)
+DECODE_OPERAND_SRC_REG_9(VReg_128, OPW128)
+DECODE_OPERAND_SRC_REG_9(VReg_256, OPW256)
+DECODE_OPERAND_SRC_REG_9(VRegOrLds_32, OPW32)
-static DecodeStatus decodeOperand_VReg_256(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW256, Imm));
-}
+DECODE_OPERAND_SRC_REG_A9(AGPR_32, OPW32)
-static DecodeStatus decodeOperand_VReg_512(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW512, Imm));
-}
+DECODE_SRC_OPERAND_REG_AV10(AV_32, OPW32)
+DECODE_SRC_OPERAND_REG_AV10(AV_64, OPW64)
+DECODE_SRC_OPERAND_REG_AV10(AV_128, OPW128)
-static DecodeStatus decodeOperand_VReg_1024(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW1024, Imm));
-}
+// Decoders for register or immediate RegisterOperands that use 9-bit Src
+// encoding: 'decodeOperand_<RegClass>_Imm<ImmWidth>'.
-static DecodeStatus decodeOperand_f32kimm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
- return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
-}
+DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_64, OPW64, 64)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(SReg_32, OPW32, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(SRegOrLds_32, OPW32, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32_Lo128, OPW16, 16)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 16)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32)
-static DecodeStatus decodeOperand_f16kimm(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
- return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
-}
+DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_64, OPW64, 64)
+DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_128, OPW128, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_256, OPW256, 64)
+DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_512, OPW512, 32)
+DECODE_OPERAND_SRC_REG_OR_IMM_A9(AReg_1024, OPW1024, 32)
-static DecodeStatus
-decodeOperand_VS_16_Deferred(MCInst &Inst, unsigned Imm, uint64_t Addr,
- const MCDisassembler *Decoder) {
- const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
- return addOperand(
- Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW16, Imm, true));
-}
+DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32_Lo128, OPW16, 16)
+DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW16, 16)
+DECODE_OPERAND_SRC_REG_OR_IMM_DEFERRED_9(VS_32, OPW32, 32)
-static DecodeStatus
-decodeOperand_VS_32_Deferred(MCInst &Inst, unsigned Imm, uint64_t Addr,
- const MCDisassembler *Decoder) {
+static DecodeStatus decodeOperand_KImmFP(MCInst &Inst, unsigned Imm,
+ uint64_t Addr,
+ const MCDisassembler *Decoder) {
const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
- return addOperand(
- Inst, DAsm->decodeSrcOp(llvm::AMDGPUDisassembler::OPW32, Imm, true));
+ return addOperand(Inst, DAsm->decodeMandatoryLiteralConstant(Imm));
}
static DecodeStatus decodeOperandVOPDDstY(MCInst &Inst, unsigned Val,
@@ -381,13 +356,6 @@ DecodeAVLdSt_160RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr,
Decoder);
}
-static DecodeStatus decodeOperand_SReg_32(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const MCDisassembler *Decoder) {
- auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
- return addOperand(Inst, DAsm->decodeOperand_SReg_32(Imm));
-}
-
#define DECODE_SDWA(DecName) \
DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName)
@@ -436,7 +404,6 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes_,
uint64_t Address,
raw_ostream &CS) const {
- CommentStream = &CS;
bool IsSDWA = false;
unsigned MaxInstBytesNum = std::min((size_t)TargetMaxInstBytes, Bytes_.size());
@@ -451,13 +418,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// encodings
if (isGFX11Plus() && Bytes.size() >= 12 ) {
DecoderUInt128 DecW = eat12Bytes(Bytes);
- Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW,
- Address);
+ Res = tryDecodeInst(DecoderTableDPP8GFX1196, MI, DecW, Address, CS);
if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
break;
MI = MCInst(); // clear
- Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW,
- Address);
+ Res = tryDecodeInst(DecoderTableDPPGFX1196, MI, DecW, Address, CS);
if (Res) {
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3P)
convertVOP3PDPPInst(MI);
@@ -469,7 +434,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
break;
}
- Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address);
+ Res = tryDecodeInst(DecoderTableGFX1196, MI, DecW, Address, CS);
if (Res)
break;
}
@@ -479,8 +444,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Bytes.size() >= 8) {
const uint64_t QW = eatBytes<uint64_t>(Bytes);
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
- Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address);
+ if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
+ Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address, CS);
if (Res) {
if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
== -1)
@@ -491,37 +456,37 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
- Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address, CS);
if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
break;
MI = MCInst(); // clear
- Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableDPP8GFX1164, MI, QW, Address, CS);
if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
break;
MI = MCInst(); // clear
- Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableDPP64, MI, QW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableDPPGFX1164, MI, QW, Address, CS);
if (Res) {
if (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOPC)
convertVOPCDPPInst(MI);
break;
}
- Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableSDWA64, MI, QW, Address, CS);
if (Res) { IsSDWA = true; break; }
- Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableSDWA964, MI, QW, Address, CS);
if (Res) { IsSDWA = true; break; }
- Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableSDWA1064, MI, QW, Address, CS);
if (Res) { IsSDWA = true; break; }
- if (STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem]) {
- Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address);
+ if (STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem)) {
+ Res = tryDecodeInst(DecoderTableGFX80_UNPACKED64, MI, QW, Address, CS);
if (Res)
break;
}
@@ -529,8 +494,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Some GFX9 subtargets repurposed the v_mad_mix_f32, v_mad_mixlo_f16 and
// v_mad_mixhi_f16 for FMA variants. Try to decode using this special
// table first so we print the correct name.
- if (STI.getFeatureBits()[AMDGPU::FeatureFmaMixInsts]) {
- Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address);
+ if (STI.hasFeature(AMDGPU::FeatureFmaMixInsts)) {
+ Res = tryDecodeInst(DecoderTableGFX9_DL64, MI, QW, Address, CS);
if (Res)
break;
}
@@ -542,64 +507,64 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
// Try decode 32-bit instruction
if (Bytes.size() < 4) break;
const uint32_t DW = eatBytes<uint32_t>(Bytes);
- Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address);
+ Res = tryDecodeInst(DecoderTableGFX832, MI, DW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address);
+ Res = tryDecodeInst(DecoderTableAMDGPU32, MI, DW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
+ Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address, CS);
if (Res) break;
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) {
- Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address);
+ if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
+ Res = tryDecodeInst(DecoderTableGFX90A32, MI, DW, Address, CS);
if (Res)
break;
}
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
- Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address);
+ if (STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding)) {
+ Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address, CS);
if (Res) break;
}
- Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address);
+ Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address);
+ Res = tryDecodeInst(DecoderTableGFX1132, MI, DW, Address, CS);
if (Res) break;
if (Bytes.size() < 4) break;
const uint64_t QW = ((uint64_t)eatBytes<uint32_t>(Bytes) << 32) | DW;
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX940Insts]) {
- Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address);
+ if (STI.hasFeature(AMDGPU::FeatureGFX940Insts)) {
+ Res = tryDecodeInst(DecoderTableGFX94064, MI, QW, Address, CS);
if (Res)
break;
}
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts]) {
- Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address);
+ if (STI.hasFeature(AMDGPU::FeatureGFX90AInsts)) {
+ Res = tryDecodeInst(DecoderTableGFX90A64, MI, QW, Address, CS);
if (Res)
break;
}
- Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableGFX864, MI, QW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableGFX964, MI, QW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address, CS);
if (Res) break;
- Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableGFX1164, MI, QW, Address, CS);
if (Res)
break;
- Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address);
+ Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS);
} while (false);
if (Res && AMDGPU::isMAC(MI.getOpcode())) {
@@ -627,7 +592,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Res && (MCII->get(MI.getOpcode()).TSFlags &
(SIInstrFlags::MTBUF | SIInstrFlags::MUBUF)) &&
- (STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts])) {
+ (STI.hasFeature(AMDGPU::FeatureGFX90AInsts))) {
// GFX90A lost TFE, its place is occupied by ACC.
int TFEOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::tfe);
@@ -714,7 +679,7 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
DecodeStatus AMDGPUDisassembler::convertEXPInst(MCInst &MI) const {
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX11]) {
+ if (STI.hasFeature(AMDGPU::FeatureGFX11)) {
// The MCInst still has these fields even though they are no longer encoded
// in the GFX11 instruction.
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::vm);
@@ -736,12 +701,12 @@ DecodeStatus AMDGPUDisassembler::convertVINTERPInst(MCInst &MI) const {
}
DecodeStatus AMDGPUDisassembler::convertSDWAInst(MCInst &MI) const {
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
- STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
+ if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
+ STI.hasFeature(AMDGPU::FeatureGFX10)) {
if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::sdst))
// VOPC - insert clamp
insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::clamp);
- } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
+ } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) {
int SDst = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::sdst);
if (SDst != -1) {
// VOPC - insert VCC register as sdst
@@ -883,6 +848,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
AMDGPU::OpName::vdata);
int VAddr0Idx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vaddr0);
+ int RsrcIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
int DMaskIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::dmask);
@@ -898,14 +865,14 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
assert(VDataIdx != -1);
if (BaseOpcode->BVH) {
// Add A16 operand for intersect_ray instructions
- if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::a16))
- addOperand(MI, MCOperand::createImm(1));
+ addOperand(MI, MCOperand::createImm(BaseOpcode->A16));
return MCDisassembler::Success;
}
bool IsAtomic = (VDstIdx != -1);
bool IsGather4 = MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::Gather4;
bool IsNSA = false;
+ bool IsPartialNSA = false;
unsigned AddrSize = Info->VAddrDwords;
if (isGFX10Plus()) {
@@ -927,9 +894,12 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
AddrSize = 16;
} else {
if (AddrSize > Info->VAddrDwords) {
- // The NSA encoding does not contain enough operands for the combination
- // of base opcode / dimension. Should this be an error?
- return MCDisassembler::Success;
+ if (!STI.hasFeature(AMDGPU::FeaturePartialNSAEncoding)) {
+ // The NSA encoding does not contain enough operands for the
+ // combination of base opcode / dimension. Should this be an error?
+ return MCDisassembler::Success;
+ }
+ IsPartialNSA = true;
}
}
}
@@ -972,17 +942,20 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
}
}
- // If not using NSA on GFX10+, widen address register to correct size.
- unsigned NewVAddr0 = AMDGPU::NoRegister;
- if (isGFX10Plus() && !IsNSA && AddrSize != Info->VAddrDwords) {
- unsigned VAddr0 = MI.getOperand(VAddr0Idx).getReg();
- unsigned VAddrSub0 = MRI.getSubReg(VAddr0, AMDGPU::sub0);
- VAddr0 = (VAddrSub0 != 0) ? VAddrSub0 : VAddr0;
+ // If not using NSA on GFX10+, widen vaddr0 address register to correct size.
+ // If using partial NSA on GFX11+ widen last address register.
+ int VAddrSAIdx = IsPartialNSA ? (RsrcIdx - 1) : VAddr0Idx;
+ unsigned NewVAddrSA = AMDGPU::NoRegister;
+ if (STI.hasFeature(AMDGPU::FeatureNSAEncoding) && (!IsNSA || IsPartialNSA) &&
+ AddrSize != Info->VAddrDwords) {
+ unsigned VAddrSA = MI.getOperand(VAddrSAIdx).getReg();
+ unsigned VAddrSubSA = MRI.getSubReg(VAddrSA, AMDGPU::sub0);
+ VAddrSA = VAddrSubSA ? VAddrSubSA : VAddrSA;
- auto AddrRCID = MCII->get(NewOpcode).operands()[VAddr0Idx].RegClass;
- NewVAddr0 = MRI.getMatchingSuperReg(VAddr0, AMDGPU::sub0,
+ auto AddrRCID = MCII->get(NewOpcode).operands()[VAddrSAIdx].RegClass;
+ NewVAddrSA = MRI.getMatchingSuperReg(VAddrSA, AMDGPU::sub0,
&MRI.getRegClass(AddrRCID));
- if (NewVAddr0 == AMDGPU::NoRegister)
+ if (!NewVAddrSA)
return MCDisassembler::Success;
}
@@ -997,8 +970,8 @@ DecodeStatus AMDGPUDisassembler::convertMIMGInst(MCInst &MI) const {
}
}
- if (NewVAddr0 != AMDGPU::NoRegister) {
- MI.getOperand(VAddr0Idx) = MCOperand::createReg(NewVAddr0);
+ if (NewVAddrSA) {
+ MI.getOperand(VAddrSAIdx) = MCOperand::createReg(NewVAddrSA);
} else if (IsNSA) {
assert(AddrSize <= Info->VAddrDwords);
MI.erase(MI.begin() + VAddr0Idx + AddrSize,
@@ -1159,214 +1132,6 @@ MCOperand AMDGPUDisassembler::createSRegOperand(unsigned SRegClassID,
return createRegOperand(SRegClassID, Val >> shift);
}
-MCOperand AMDGPUDisassembler::decodeOperand_VS_32(unsigned Val) const {
- return decodeSrcOp(OPW32, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const {
- return decodeSrcOp(OPW64, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const {
- return decodeSrcOp(OPW128, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
- return decodeSrcOp(OPW16, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const {
- return decodeSrcOp(OPWV216, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VSrcV232(unsigned Val) const {
- return decodeSrcOp(OPWV232, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32_Lo128(unsigned Val) const {
- return createRegOperand(AMDGPU::VGPR_32_Lo128RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
- // Some instructions have operand restrictions beyond what the encoding
- // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
- // high bit.
- Val &= 255;
-
- return createRegOperand(AMDGPU::VGPR_32RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VRegOrLds_32(unsigned Val) const {
- return decodeSrcOp(OPW32, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AGPR_32(unsigned Val) const {
- return createRegOperand(AMDGPU::AGPR_32RegClassID, Val & 255);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AReg_64(unsigned Val) const {
- return createRegOperand(AMDGPU::AReg_64RegClassID, Val & 255);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AReg_128(unsigned Val) const {
- return createRegOperand(AMDGPU::AReg_128RegClassID, Val & 255);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AReg_256(unsigned Val) const {
- return createRegOperand(AMDGPU::AReg_256RegClassID, Val & 255);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AReg_288(unsigned Val) const {
- return createRegOperand(AMDGPU::AReg_288RegClassID, Val & 255);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AReg_320(unsigned Val) const {
- return createRegOperand(AMDGPU::AReg_320RegClassID, Val & 255);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AReg_352(unsigned Val) const {
- return createRegOperand(AMDGPU::AReg_352RegClassID, Val & 255);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AReg_384(unsigned Val) const {
- return createRegOperand(AMDGPU::AReg_384RegClassID, Val & 255);
-}
-
-
-MCOperand AMDGPUDisassembler::decodeOperand_AReg_512(unsigned Val) const {
- return createRegOperand(AMDGPU::AReg_512RegClassID, Val & 255);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AReg_1024(unsigned Val) const {
- return createRegOperand(AMDGPU::AReg_1024RegClassID, Val & 255);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AV_32(unsigned Val) const {
- return decodeSrcOp(OPW32, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AV_64(unsigned Val) const {
- return decodeSrcOp(OPW64, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AV_128(unsigned Val) const {
- return decodeSrcOp(OPW128, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AVDst_128(unsigned Val) const {
- using namespace AMDGPU::EncValues;
- assert((Val & IS_VGPR) == 0); // Val{8} is not encoded but assumed to be 1.
- return decodeSrcOp(OPW128, Val | IS_VGPR);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_AVDst_512(unsigned Val) const {
- using namespace AMDGPU::EncValues;
- assert((Val & IS_VGPR) == 0); // Val{8} is not encoded but assumed to be 1.
- return decodeSrcOp(OPW512, Val | IS_VGPR);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VReg_64(unsigned Val) const {
- return createRegOperand(AMDGPU::VReg_64RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VReg_96(unsigned Val) const {
- return createRegOperand(AMDGPU::VReg_96RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VReg_128(unsigned Val) const {
- return createRegOperand(AMDGPU::VReg_128RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VReg_256(unsigned Val) const {
- return createRegOperand(AMDGPU::VReg_256RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VReg_288(unsigned Val) const {
- return createRegOperand(AMDGPU::VReg_288RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VReg_320(unsigned Val) const {
- return createRegOperand(AMDGPU::VReg_320RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VReg_352(unsigned Val) const {
- return createRegOperand(AMDGPU::VReg_352RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VReg_384(unsigned Val) const {
- return createRegOperand(AMDGPU::VReg_384RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VReg_512(unsigned Val) const {
- return createRegOperand(AMDGPU::VReg_512RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_VReg_1024(unsigned Val) const {
- return createRegOperand(AMDGPU::VReg_1024RegClassID, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_32(unsigned Val) const {
- // table-gen generated disassembler doesn't care about operand types
- // leaving only registry class so SSrc_32 operand turns into SReg_32
- // and therefore we accept immediates and literals here as well
- return decodeSrcOp(OPW32, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XM0_XEXEC(
- unsigned Val) const {
- // SReg_32_XM0 is SReg_32 without M0 or EXEC_LO/EXEC_HI
- return decodeOperand_SReg_32(Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_32_XEXEC_HI(
- unsigned Val) const {
- // SReg_32_XM0 is SReg_32 without EXEC_HI
- return decodeOperand_SReg_32(Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SRegOrLds_32(unsigned Val) const {
- // table-gen generated disassembler doesn't care about operand types
- // leaving only registry class so SSrc_32 operand turns into SReg_32
- // and therefore we accept immediates and literals here as well
- return decodeSrcOp(OPW32, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_64(unsigned Val) const {
- return decodeSrcOp(OPW64, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_64_XEXEC(unsigned Val) const {
- return decodeSrcOp(OPW64, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_128(unsigned Val) const {
- return decodeSrcOp(OPW128, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_256(unsigned Val) const {
- return decodeDstOp(OPW256, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_288(unsigned Val) const {
- return decodeDstOp(OPW288, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_320(unsigned Val) const {
- return decodeDstOp(OPW320, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_352(unsigned Val) const {
- return decodeDstOp(OPW352, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_384(unsigned Val) const {
- return decodeDstOp(OPW384, Val);
-}
-
-MCOperand AMDGPUDisassembler::decodeOperand_SReg_512(unsigned Val) const {
- return decodeDstOp(OPW512, Val);
-}
-
// Decode Literals for insts which always have a literal in the encoding
MCOperand
AMDGPUDisassembler::decodeMandatoryLiteralConstant(unsigned Val) const {
@@ -1410,21 +1175,21 @@ MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
static int64_t getInlineImmVal32(unsigned Imm) {
switch (Imm) {
case 240:
- return FloatToBits(0.5f);
+ return llvm::bit_cast<uint32_t>(0.5f);
case 241:
- return FloatToBits(-0.5f);
+ return llvm::bit_cast<uint32_t>(-0.5f);
case 242:
- return FloatToBits(1.0f);
+ return llvm::bit_cast<uint32_t>(1.0f);
case 243:
- return FloatToBits(-1.0f);
+ return llvm::bit_cast<uint32_t>(-1.0f);
case 244:
- return FloatToBits(2.0f);
+ return llvm::bit_cast<uint32_t>(2.0f);
case 245:
- return FloatToBits(-2.0f);
+ return llvm::bit_cast<uint32_t>(-2.0f);
case 246:
- return FloatToBits(4.0f);
+ return llvm::bit_cast<uint32_t>(4.0f);
case 247:
- return FloatToBits(-4.0f);
+ return llvm::bit_cast<uint32_t>(-4.0f);
case 248: // 1 / (2 * PI)
return 0x3e22f983;
default:
@@ -1435,21 +1200,21 @@ static int64_t getInlineImmVal32(unsigned Imm) {
static int64_t getInlineImmVal64(unsigned Imm) {
switch (Imm) {
case 240:
- return DoubleToBits(0.5);
+ return llvm::bit_cast<uint64_t>(0.5);
case 241:
- return DoubleToBits(-0.5);
+ return llvm::bit_cast<uint64_t>(-0.5);
case 242:
- return DoubleToBits(1.0);
+ return llvm::bit_cast<uint64_t>(1.0);
case 243:
- return DoubleToBits(-1.0);
+ return llvm::bit_cast<uint64_t>(-1.0);
case 244:
- return DoubleToBits(2.0);
+ return llvm::bit_cast<uint64_t>(2.0);
case 245:
- return DoubleToBits(-2.0);
+ return llvm::bit_cast<uint64_t>(-2.0);
case 246:
- return DoubleToBits(4.0);
+ return llvm::bit_cast<uint64_t>(4.0);
case 247:
- return DoubleToBits(-4.0);
+ return llvm::bit_cast<uint64_t>(-4.0);
case 248: // 1 / (2 * PI)
return 0x3fc45f306dc9c882;
default:
@@ -1482,23 +1247,21 @@ static int64_t getInlineImmVal16(unsigned Imm) {
}
}
-MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
+MCOperand AMDGPUDisassembler::decodeFPImmed(unsigned ImmWidth, unsigned Imm) {
assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
&& Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
// ToDo: case 248: 1/(2*PI) - is allowed only on VI
- switch (Width) {
- case OPW32:
- case OPW128: // splat constants
- case OPW512:
- case OPW1024:
- case OPWV232:
+ // ImmWidth 0 is a default case where operand should not allow immediates.
+ // Imm value is still decoded into 32 bit immediate operand, inst printer will
+ // use it to print verbose error message.
+ switch (ImmWidth) {
+ case 0:
+ case 32:
return MCOperand::createImm(getInlineImmVal32(Imm));
- case OPW64:
- case OPW256:
+ case 64:
return MCOperand::createImm(getInlineImmVal64(Imm));
- case OPW16:
- case OPWV216:
+ case 16:
return MCOperand::createImm(getInlineImmVal16(Imm));
default:
llvm_unreachable("implement me");
@@ -1612,7 +1375,8 @@ int AMDGPUDisassembler::getTTmpIdx(unsigned Val) const {
}
MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
- bool MandatoryLiteral) const {
+ bool MandatoryLiteral,
+ unsigned ImmWidth) const {
using namespace AMDGPU::EncValues;
assert(Val < 1024); // enum10
@@ -1639,7 +1403,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
return decodeIntImmed(Val);
if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
- return decodeFPImmed(Width, Val);
+ return decodeFPImmed(ImmWidth, Val);
if (Val == LITERAL_CONST) {
if (MandatoryLiteral)
@@ -1662,26 +1426,6 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val,
}
}
-MCOperand AMDGPUDisassembler::decodeDstOp(const OpWidthTy Width, unsigned Val) const {
- using namespace AMDGPU::EncValues;
-
- assert(Val < 128);
- assert(Width == OPW256 || Width == OPW512);
-
- if (Val <= SGPR_MAX) {
- // "SGPR_MIN <= Val" is always true and causes compilation warning.
- static_assert(SGPR_MIN == 0);
- return createSRegOperand(getSgprClassId(Width), Val - SGPR_MIN);
- }
-
- int TTmpIdx = getTTmpIdx(Val);
- if (TTmpIdx >= 0) {
- return createSRegOperand(getTtmpClassId(Width), TTmpIdx);
- }
-
- llvm_unreachable("unknown dst register");
-}
-
// Bit 0 of DstY isn't stored in the instruction, because it's always the
// opposite of bit 0 of DstX.
MCOperand AMDGPUDisassembler::decodeVOPDDstYOp(MCInst &Inst,
@@ -1764,12 +1508,13 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
}
MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
- const unsigned Val) const {
+ const unsigned Val,
+ unsigned ImmWidth) const {
using namespace AMDGPU::SDWA;
using namespace AMDGPU::EncValues;
- if (STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
- STI.getFeatureBits()[AMDGPU::FeatureGFX10]) {
+ if (STI.hasFeature(AMDGPU::FeatureGFX9) ||
+ STI.hasFeature(AMDGPU::FeatureGFX10)) {
// XXX: cast to int is needed to avoid stupid warning:
// compare with unsigned is always true
if (int(SDWA9EncValues::SRC_VGPR_MIN) <= int(Val) &&
@@ -1795,31 +1540,31 @@ MCOperand AMDGPUDisassembler::decodeSDWASrc(const OpWidthTy Width,
return decodeIntImmed(SVal);
if (INLINE_FLOATING_C_MIN <= SVal && SVal <= INLINE_FLOATING_C_MAX)
- return decodeFPImmed(Width, SVal);
+ return decodeFPImmed(ImmWidth, SVal);
return decodeSpecialReg32(SVal);
- } else if (STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands]) {
+ } else if (STI.hasFeature(AMDGPU::FeatureVolcanicIslands)) {
return createRegOperand(getVgprClassId(Width), Val);
}
llvm_unreachable("unsupported target");
}
MCOperand AMDGPUDisassembler::decodeSDWASrc16(unsigned Val) const {
- return decodeSDWASrc(OPW16, Val);
+ return decodeSDWASrc(OPW16, Val, 16);
}
MCOperand AMDGPUDisassembler::decodeSDWASrc32(unsigned Val) const {
- return decodeSDWASrc(OPW32, Val);
+ return decodeSDWASrc(OPW32, Val, 32);
}
MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
using namespace AMDGPU::SDWA;
- assert((STI.getFeatureBits()[AMDGPU::FeatureGFX9] ||
- STI.getFeatureBits()[AMDGPU::FeatureGFX10]) &&
+ assert((STI.hasFeature(AMDGPU::FeatureGFX9) ||
+ STI.hasFeature(AMDGPU::FeatureGFX10)) &&
"SDWAVopcDst should be present only on GFX9+");
- bool IsWave64 = STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64];
+ bool IsWave64 = STI.hasFeature(AMDGPU::FeatureWavefrontSize64);
if (Val & SDWA9EncValues::VOPC_DST_VCC_MASK) {
Val &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
@@ -1840,18 +1585,19 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
}
MCOperand AMDGPUDisassembler::decodeBoolReg(unsigned Val) const {
- return STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
- decodeOperand_SReg_64(Val) : decodeOperand_SReg_32(Val);
+ return STI.hasFeature(AMDGPU::FeatureWavefrontSize64)
+ ? decodeSrcOp(OPW64, Val)
+ : decodeSrcOp(OPW32, Val);
}
bool AMDGPUDisassembler::isVI() const {
- return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
+ return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
}
bool AMDGPUDisassembler::isGFX9() const { return AMDGPU::isGFX9(STI); }
bool AMDGPUDisassembler::isGFX90A() const {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+ return STI.hasFeature(AMDGPU::FeatureGFX90AInsts);
}
bool AMDGPUDisassembler::isGFX9Plus() const { return AMDGPU::isGFX9Plus(STI); }
@@ -1863,7 +1609,7 @@ bool AMDGPUDisassembler::isGFX10Plus() const {
}
bool AMDGPUDisassembler::isGFX11() const {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX11];
+ return STI.hasFeature(AMDGPU::FeatureGFX11);
}
bool AMDGPUDisassembler::isGFX11Plus() const {
@@ -1872,16 +1618,21 @@ bool AMDGPUDisassembler::isGFX11Plus() const {
bool AMDGPUDisassembler::hasArchitectedFlatScratch() const {
- return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+ return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch);
}
//===----------------------------------------------------------------------===//
// AMDGPU specific symbol handling
//===----------------------------------------------------------------------===//
+#define GET_FIELD(MASK) (AMDHSA_BITS_GET(FourByteBuffer, MASK))
#define PRINT_DIRECTIVE(DIRECTIVE, MASK) \
do { \
- KdStream << Indent << DIRECTIVE " " \
- << ((FourByteBuffer & MASK) >> (MASK##_SHIFT)) << '\n'; \
+ KdStream << Indent << DIRECTIVE " " << GET_FIELD(MASK) << '\n'; \
+ } while (0)
+#define PRINT_PSEUDO_DIRECTIVE_COMMENT(DIRECTIVE, MASK) \
+ do { \
+ KdStream << Indent << MAI.getCommentString() << ' ' << DIRECTIVE " " \
+ << GET_FIELD(MASK) << '\n'; \
} while (0)
// NOLINTNEXTLINE(readability-identifier-naming)
@@ -1896,11 +1647,11 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
// simply calculate the inverse of what the assembler does.
uint32_t GranulatedWorkitemVGPRCount =
- (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT) >>
- COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT_SHIFT;
+ GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WORKITEM_VGPR_COUNT);
- uint32_t NextFreeVGPR = (GranulatedWorkitemVGPRCount + 1) *
- AMDGPU::IsaInfo::getVGPREncodingGranule(&STI);
+ uint32_t NextFreeVGPR =
+ (GranulatedWorkitemVGPRCount + 1) *
+ AMDGPU::IsaInfo::getVGPREncodingGranule(&STI, EnableWavefrontSize32);
KdStream << Indent << ".amdhsa_next_free_vgpr " << NextFreeVGPR << '\n';
@@ -1924,8 +1675,7 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC1(
// The disassembler cannot recover the original values of those 3 directives.
uint32_t GranulatedWavefrontSGPRCount =
- (FourByteBuffer & COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT) >>
- COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT_SHIFT;
+ GET_FIELD(COMPUTE_PGM_RSRC1_GRANULATED_WAVEFRONT_SGPR_COUNT);
if (isGFX10Plus() && GranulatedWavefrontSGPRCount)
return MCDisassembler::Fail;
@@ -2035,7 +1785,46 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC2(
return MCDisassembler::Success;
}
+// NOLINTNEXTLINE(readability-identifier-naming)
+MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeCOMPUTE_PGM_RSRC3(
+ uint32_t FourByteBuffer, raw_string_ostream &KdStream) const {
+ using namespace amdhsa;
+ StringRef Indent = "\t";
+ if (isGFX90A()) {
+ KdStream << Indent << ".amdhsa_accum_offset "
+ << (GET_FIELD(COMPUTE_PGM_RSRC3_GFX90A_ACCUM_OFFSET) + 1) * 4
+ << '\n';
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED0)
+ return MCDisassembler::Fail;
+ PRINT_DIRECTIVE(".amdhsa_tg_split", COMPUTE_PGM_RSRC3_GFX90A_TG_SPLIT);
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX90A_RESERVED1)
+ return MCDisassembler::Fail;
+ } else if (isGFX10Plus()) {
+ if (!EnableWavefrontSize32 || !*EnableWavefrontSize32) {
+ PRINT_DIRECTIVE(".amdhsa_shared_vgpr_count",
+ COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
+ } else {
+ PRINT_PSEUDO_DIRECTIVE_COMMENT(
+ "SHARED_VGPR_COUNT", COMPUTE_PGM_RSRC3_GFX10_PLUS_SHARED_VGPR_COUNT);
+ }
+ PRINT_PSEUDO_DIRECTIVE_COMMENT("INST_PREF_SIZE",
+ COMPUTE_PGM_RSRC3_GFX10_PLUS_INST_PREF_SIZE);
+ PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_START",
+ COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_START);
+ PRINT_PSEUDO_DIRECTIVE_COMMENT("TRAP_ON_END",
+ COMPUTE_PGM_RSRC3_GFX10_PLUS_TRAP_ON_END);
+ if (FourByteBuffer & COMPUTE_PGM_RSRC3_GFX10_PLUS_RESERVED0)
+ return MCDisassembler::Fail;
+  PRINT_PSEUDO_DIRECTIVE_COMMENT("IMAGE_OP",
+                                 COMPUTE_PGM_RSRC3_GFX10_PLUS_IMAGE_OP);
+ } else if (FourByteBuffer) {
+ return MCDisassembler::Fail;
+ }
+ return MCDisassembler::Success;
+}
+#undef PRINT_PSEUDO_DIRECTIVE_COMMENT
#undef PRINT_DIRECTIVE
+#undef GET_FIELD
MCDisassembler::DecodeStatus
AMDGPUDisassembler::decodeKernelDescriptorDirective(
@@ -2103,30 +1892,16 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
return MCDisassembler::Success;
case amdhsa::COMPUTE_PGM_RSRC3_OFFSET:
- // COMPUTE_PGM_RSRC3
- // - Only set for GFX10, GFX6-9 have this to be 0.
- // - Currently no directives directly control this.
FourByteBuffer = DE.getU32(Cursor);
- if (!isGFX10Plus() && FourByteBuffer) {
- return MCDisassembler::Fail;
- }
- return MCDisassembler::Success;
+ return decodeCOMPUTE_PGM_RSRC3(FourByteBuffer, KdStream);
case amdhsa::COMPUTE_PGM_RSRC1_OFFSET:
FourByteBuffer = DE.getU32(Cursor);
- if (decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream) ==
- MCDisassembler::Fail) {
- return MCDisassembler::Fail;
- }
- return MCDisassembler::Success;
+ return decodeCOMPUTE_PGM_RSRC1(FourByteBuffer, KdStream);
case amdhsa::COMPUTE_PGM_RSRC2_OFFSET:
FourByteBuffer = DE.getU32(Cursor);
- if (decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream) ==
- MCDisassembler::Fail) {
- return MCDisassembler::Fail;
- }
- return MCDisassembler::Success;
+ return decodeCOMPUTE_PGM_RSRC2(FourByteBuffer, KdStream);
case amdhsa::KERNEL_CODE_PROPERTIES_OFFSET:
using namespace amdhsa;
@@ -2161,7 +1936,7 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
}
- if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5)
+ if (AMDGPU::getAmdhsaCodeObjectVersion() >= AMDGPU::AMDHSA_COV5)
PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack",
KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
@@ -2192,6 +1967,20 @@ MCDisassembler::DecodeStatus AMDGPUDisassembler::decodeKernelDescriptor(
if (Bytes.size() != 64 || KdAddress % 64 != 0)
return MCDisassembler::Fail;
+ // FIXME: We can't actually decode "in order" as is done below, as e.g. GFX10
+ // requires us to know the setting of .amdhsa_wavefront_size32 in order to
+ // accurately produce .amdhsa_next_free_vgpr, and they appear in the wrong
+ // order. Workaround this by first looking up .amdhsa_wavefront_size32 here
+ // when required.
+ if (isGFX10Plus()) {
+ uint16_t KernelCodeProperties =
+ support::endian::read16(&Bytes[amdhsa::KERNEL_CODE_PROPERTIES_OFFSET],
+ support::endianness::little);
+ EnableWavefrontSize32 =
+ AMDHSA_BITS_GET(KernelCodeProperties,
+ amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
+ }
+
std::string Kd;
raw_string_ostream KdStream(Kd);
KdStream << ".amdhsa_kernel " << KdName << '\n';
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index 870f7b17df20..444312473a5f 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -16,14 +16,16 @@
#define LLVM_LIB_TARGET_AMDGPU_DISASSEMBLER_AMDGPUDISASSEMBLER_H
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
-#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/DataExtractor.h"
#include <memory>
namespace llvm {
+class MCAsmInfo;
class MCInst;
class MCOperand;
class MCSubtargetInfo;
@@ -91,10 +93,12 @@ class AMDGPUDisassembler : public MCDisassembler {
private:
std::unique_ptr<MCInstrInfo const> const MCII;
const MCRegisterInfo &MRI;
+ const MCAsmInfo &MAI;
const unsigned TargetMaxInstBytes;
mutable ArrayRef<uint8_t> Bytes;
mutable uint32_t Literal;
mutable bool HasLiteral;
+ mutable std::optional<bool> EnableWavefrontSize32;
public:
AMDGPUDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
@@ -115,14 +119,25 @@ public:
template <typename InsnType>
DecodeStatus tryDecodeInst(const uint8_t *Table, MCInst &MI, InsnType Inst,
- uint64_t Address) const {
+ uint64_t Address, raw_ostream &Comments) const {
assert(MI.getOpcode() == 0);
assert(MI.getNumOperands() == 0);
MCInst TmpInst;
HasLiteral = false;
const auto SavedBytes = Bytes;
- if (decodeInstruction(Table, TmpInst, Inst, Address, this, STI)) {
+
+ SmallString<64> LocalComments;
+ raw_svector_ostream LocalCommentStream(LocalComments);
+ CommentStream = &LocalCommentStream;
+
+ DecodeStatus Res =
+ decodeInstruction(Table, TmpInst, Inst, Address, this, STI);
+
+ CommentStream = nullptr;
+
+ if (Res != Fail) {
MI = TmpInst;
+ Comments << LocalComments;
return MCDisassembler::Success;
}
Bytes = SavedBytes;
@@ -155,6 +170,13 @@ public:
DecodeStatus decodeCOMPUTE_PGM_RSRC2(uint32_t FourByteBuffer,
raw_string_ostream &KdStream) const;
+ /// Decode as directives that handle COMPUTE_PGM_RSRC3.
+ /// \param FourByteBuffer - Bytes holding contents of COMPUTE_PGM_RSRC3.
+ /// \param KdStream - Stream to write the disassembled directives to.
+ // NOLINTNEXTLINE(readability-identifier-naming)
+ DecodeStatus decodeCOMPUTE_PGM_RSRC3(uint32_t FourByteBuffer,
+ raw_string_ostream &KdStream) const;
+
DecodeStatus convertEXPInst(MCInst &MI) const;
DecodeStatus convertVINTERPInst(MCInst &MI) const;
DecodeStatus convertFMAanyK(MCInst &MI, int ImmLitIdx) const;
@@ -166,58 +188,6 @@ public:
DecodeStatus convertVOPCDPPInst(MCInst &MI) const;
void convertMacDPPInst(MCInst &MI) const;
- MCOperand decodeOperand_VGPR_32(unsigned Val) const;
- MCOperand decodeOperand_VGPR_32_Lo128(unsigned Val) const;
- MCOperand decodeOperand_VRegOrLds_32(unsigned Val) const;
-
- MCOperand decodeOperand_VS_32(unsigned Val) const;
- MCOperand decodeOperand_VS_64(unsigned Val) const;
- MCOperand decodeOperand_VS_128(unsigned Val) const;
- MCOperand decodeOperand_VSrc16(unsigned Val) const;
- MCOperand decodeOperand_VSrcV216(unsigned Val) const;
- MCOperand decodeOperand_VSrcV232(unsigned Val) const;
-
- MCOperand decodeOperand_VReg_64(unsigned Val) const;
- MCOperand decodeOperand_VReg_96(unsigned Val) const;
- MCOperand decodeOperand_VReg_128(unsigned Val) const;
- MCOperand decodeOperand_VReg_256(unsigned Val) const;
- MCOperand decodeOperand_VReg_288(unsigned Val) const;
- MCOperand decodeOperand_VReg_320(unsigned Val) const;
- MCOperand decodeOperand_VReg_352(unsigned Val) const;
- MCOperand decodeOperand_VReg_384(unsigned Val) const;
- MCOperand decodeOperand_VReg_512(unsigned Val) const;
- MCOperand decodeOperand_VReg_1024(unsigned Val) const;
-
- MCOperand decodeOperand_SReg_32(unsigned Val) const;
- MCOperand decodeOperand_SReg_32_XM0_XEXEC(unsigned Val) const;
- MCOperand decodeOperand_SReg_32_XEXEC_HI(unsigned Val) const;
- MCOperand decodeOperand_SRegOrLds_32(unsigned Val) const;
- MCOperand decodeOperand_SReg_64(unsigned Val) const;
- MCOperand decodeOperand_SReg_64_XEXEC(unsigned Val) const;
- MCOperand decodeOperand_SReg_128(unsigned Val) const;
- MCOperand decodeOperand_SReg_256(unsigned Val) const;
- MCOperand decodeOperand_SReg_288(unsigned Val) const;
- MCOperand decodeOperand_SReg_320(unsigned Val) const;
- MCOperand decodeOperand_SReg_352(unsigned Val) const;
- MCOperand decodeOperand_SReg_384(unsigned Val) const;
- MCOperand decodeOperand_SReg_512(unsigned Val) const;
-
- MCOperand decodeOperand_AGPR_32(unsigned Val) const;
- MCOperand decodeOperand_AReg_64(unsigned Val) const;
- MCOperand decodeOperand_AReg_128(unsigned Val) const;
- MCOperand decodeOperand_AReg_256(unsigned Val) const;
- MCOperand decodeOperand_AReg_288(unsigned Val) const;
- MCOperand decodeOperand_AReg_320(unsigned Val) const;
- MCOperand decodeOperand_AReg_352(unsigned Val) const;
- MCOperand decodeOperand_AReg_384(unsigned Val) const;
- MCOperand decodeOperand_AReg_512(unsigned Val) const;
- MCOperand decodeOperand_AReg_1024(unsigned Val) const;
- MCOperand decodeOperand_AV_32(unsigned Val) const;
- MCOperand decodeOperand_AV_64(unsigned Val) const;
- MCOperand decodeOperand_AV_128(unsigned Val) const;
- MCOperand decodeOperand_AVDst_128(unsigned Val) const;
- MCOperand decodeOperand_AVDst_512(unsigned Val) const;
-
enum OpWidthTy {
OPW32,
OPW64,
@@ -244,18 +214,21 @@ public:
unsigned getTtmpClassId(const OpWidthTy Width) const;
static MCOperand decodeIntImmed(unsigned Imm);
- static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm);
+ static MCOperand decodeFPImmed(unsigned ImmWidth, unsigned Imm);
+
MCOperand decodeMandatoryLiteralConstant(unsigned Imm) const;
MCOperand decodeLiteralConstant() const;
MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val,
- bool MandatoryLiteral = false) const;
- MCOperand decodeDstOp(const OpWidthTy Width, unsigned Val) const;
+ bool MandatoryLiteral = false,
+ unsigned ImmWidth = 0) const;
+
MCOperand decodeVOPDDstYOp(MCInst &Inst, unsigned Val) const;
MCOperand decodeSpecialReg32(unsigned Val) const;
MCOperand decodeSpecialReg64(unsigned Val) const;
- MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val) const;
+ MCOperand decodeSDWASrc(const OpWidthTy Width, unsigned Val,
+ unsigned ImmWidth = 0) const;
MCOperand decodeSDWASrc16(unsigned Val) const;
MCOperand decodeSDWASrc32(unsigned Val) const;
MCOperand decodeSDWAVopcDst(unsigned Val) const;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 09f59af06589..5c86d80e7dd2 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -466,7 +466,7 @@ class FLAT_AtomicRet_Pseudo<string opName, dag outs, dag ins,
let PseudoInstr = NAME # "_RTN";
}
-multiclass FLAT_Atomic_Pseudo<
+multiclass FLAT_Atomic_Pseudo_NO_RTN<
string opName,
RegisterClass vdst_rc,
ValueType vt,
@@ -484,7 +484,16 @@ multiclass FLAT_Atomic_Pseudo<
let FPAtomic = isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
+}
+multiclass FLAT_Atomic_Pseudo_RTN<
+ string opName,
+ RegisterClass vdst_rc,
+ ValueType vt,
+ ValueType data_vt = vt,
+ RegisterClass data_rc = vdst_rc,
+ bit isFP = isFloatType<data_vt>.ret,
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs getLdStRegisterOperand<vdst_rc>.ret:$vdst),
(ins VReg_64:$vaddr, data_op:$vdata, flat_offset:$offset, CPol_GLC1:$cpol),
@@ -496,6 +505,18 @@ multiclass FLAT_Atomic_Pseudo<
}
}
+multiclass FLAT_Atomic_Pseudo<
+ string opName,
+ RegisterClass vdst_rc,
+ ValueType vt,
+ ValueType data_vt = vt,
+ RegisterClass data_rc = vdst_rc,
+ bit isFP = isFloatType<data_vt>.ret,
+ RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
+ defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc, isFP, data_op>;
+ defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc, isFP, data_op>;
+}
+
multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
string opName,
RegisterClass vdst_rc,
@@ -709,11 +730,14 @@ let SubtargetPredicate = isGFX90APlus in {
defm GLOBAL_ATOMIC_MAX_F64 : FLAT_Global_Atomic_Pseudo<"global_atomic_max_f64", VReg_64, f64>;
} // End SubtargetPredicate = isGFX90APlus
-let SubtargetPredicate = isGFX940Plus in {
+let SubtargetPredicate = HasAtomicFlatPkAdd16Insts in {
defm FLAT_ATOMIC_PK_ADD_F16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_f16", VGPR_32, v2f16>;
- defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2f16>;
- defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2f16>;
-} // End SubtargetPredicate = isGFX940Plus
+ let FPAtomic = 1 in
+ defm FLAT_ATOMIC_PK_ADD_BF16 : FLAT_Atomic_Pseudo<"flat_atomic_pk_add_bf16", VGPR_32, v2i16>;
+} // End SubtargetPredicate = HasAtomicFlatPkAdd16Insts
+
+let SubtargetPredicate = HasAtomicGlobalPkAddBF16Inst, FPAtomic = 1 in
+ defm GLOBAL_ATOMIC_PK_ADD_BF16 : FLAT_Global_Atomic_Pseudo<"global_atomic_pk_add_bf16", VGPR_32, v2i16>;
// GFX7-, GFX10-, GFX11-only flat instructions.
let SubtargetPredicate = isGFX7GFX10GFX11 in {
@@ -917,7 +941,7 @@ let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_add_f32", VGPR_32, f32
>;
-let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in
+let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
@@ -925,7 +949,7 @@ let OtherPredicates = [HasAtomicFaddRtnInsts] in
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
"global_atomic_add_f32", VGPR_32, f32
>;
-let OtherPredicates = [isGFX90APlus] in
+let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
"global_atomic_pk_add_f16", VGPR_32, v2f16
>;
@@ -937,73 +961,73 @@ let OtherPredicates = [isGFX90APlus] in
// Patterns for global loads with no offset.
class FlatLoadPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (FlatOffset i64:$vaddr, i16:$offset))),
+ (vt (node (FlatOffset i64:$vaddr, i32:$offset))),
(inst $vaddr, $offset)
>;
class FlatLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (FlatOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
+ (node (FlatOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
>;
class FlatSignedLoadPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (GlobalOffset (i64 VReg_64:$vaddr), i16:$offset), vt:$in),
+ (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
>;
class GlobalLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$in)),
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$in)),
(inst $saddr, $voffset, $offset, 0, $in)
>;
class FlatLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i16:$offset))),
+ (vt (node (GlobalOffset (i64 VReg_64:$vaddr), i32:$offset))),
(inst $vaddr, $offset)
>;
class GlobalLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset))),
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset))),
(inst $saddr, $voffset, $offset, 0)
>;
class GlobalStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset)),
+ (node vt:$data, (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset)),
(inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
class GlobalAtomicStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$data),
+ (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$data),
(inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
class GlobalAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
- (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), data_vt:$data)),
+ (vt (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), data_vt:$data)),
(inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset)
>;
class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i16:$offset), vt:$data),
+ (node (GlobalSAddr (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset), vt:$data),
(inst $voffset, getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
class FlatStorePat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node vt:$data, (FlatOffset i64:$vaddr, i16:$offset)),
+ (node vt:$data, (FlatOffset i64:$vaddr, i32:$offset)),
(inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node vt:$data, (GlobalOffset i64:$vaddr, i16:$offset)),
+ (node vt:$data, (GlobalOffset i64:$vaddr, i32:$offset)),
(inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
class FlatStoreAtomicPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
- (node (FlatOffset i64:$vaddr, i16:$offset), vt:$data),
+ (node (FlatOffset i64:$vaddr, i32:$offset), vt:$data),
(inst $vaddr, getVregSrcForVT<vt>.ret:$data, $offset)
>;
@@ -1011,7 +1035,7 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
// atomic store follows atomic binop convention so the address comes
// first.
- (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data),
+ (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data),
(inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
@@ -1020,17 +1044,17 @@ multiclass FlatAtomicPat <string inst, string node, ValueType vt,
defvar rtnNode = !cast<PatFrags>(node#"_"#vt.Size);
defvar noRtnNode = !cast<PatFrags>(node#"_noret_"#vt.Size);
- def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
let AddedComplexity = 1 in
- def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
- (vt (node (GlobalOffset i64:$vaddr, i16:$offset), data_vt:$data)),
+ (vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
@@ -1063,49 +1087,49 @@ multiclass FlatSignedAtomicPatWithAddrSpace<string inst, string intr, string add
}
class ScratchLoadSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset))),
+ (vt (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset))),
(inst $vaddr, $offset)
>;
class ScratchLoadSignedPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset), vt:$in),
+ (node (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset), vt:$in),
(inst $vaddr, $offset, 0, $in)
>;
class ScratchStoreSignedPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i16:$offset)),
+ (node vt:$data, (ScratchOffset (i32 VGPR_32:$vaddr), i32:$offset)),
(inst getVregSrcForVT<vt>.ret:$data, $vaddr, $offset)
>;
class ScratchLoadSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset))),
+ (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset))),
(inst $saddr, $offset)
>;
class ScratchLoadSaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
+ (vt (node (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset), vt:$in)),
(inst $saddr, $offset, 0, $in)
>;
class ScratchStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i16:$offset)),
+ (node vt:$data, (ScratchSAddr (i32 SGPR_32:$saddr), i32:$offset)),
(inst getVregSrcForVT<vt>.ret:$data, $saddr, $offset)
>;
class ScratchLoadSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset))),
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset))),
(inst $vaddr, $saddr, $offset, 0)
>;
class ScratchStoreSVaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
- (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset)),
+ (node vt:$data, (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset)),
(inst getVregSrcForVT<vt>.ret:$data, $vaddr, $saddr, $offset)
>;
class ScratchLoadSVaddrPat_D16 <FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> : GCNPat <
- (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i16:$offset), vt:$in)),
+ (vt (node (ScratchSVAddr (i32 VGPR_32:$vaddr), (i32 SGPR_32:$saddr), i32:$offset), vt:$in)),
(inst $vaddr, $saddr, $offset, 0, $in)
>;
@@ -1160,8 +1184,8 @@ def : FlatStoreAtomicPat <FLAT_STORE_SHORT, atomic_store_16_flat, i16>;
foreach as = [ "flat", "global" ] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD", "atomic_load_add_"#as, i32>;
defm : FlatAtomicPat <"FLAT_ATOMIC_SUB", "atomic_load_sub_"#as, i32>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_INC", "atomic_inc_"#as, i32>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_DEC", "atomic_dec_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_INC", "atomic_load_uinc_wrap_"#as, i32>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_DEC", "atomic_load_udec_wrap_"#as, i32>;
defm : FlatAtomicPat <"FLAT_ATOMIC_AND", "atomic_load_and_"#as, i32>;
defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX", "atomic_load_max_"#as, i32>;
defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX", "atomic_load_umax_"#as, i32>;
@@ -1174,8 +1198,8 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_XOR", "atomic_load_xor_"#as, i32>;
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_X2", "atomic_load_add_"#as, i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_SUB_X2", "atomic_load_sub_"#as, i64>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_INC_X2", "atomic_inc_"#as, i64>;
-defm : FlatAtomicPat <"FLAT_ATOMIC_DEC_X2", "atomic_dec_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_INC_X2", "atomic_load_uinc_wrap_"#as, i64>;
+defm : FlatAtomicPat <"FLAT_ATOMIC_DEC_X2", "atomic_load_udec_wrap_"#as, i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_AND_X2", "atomic_load_and_"#as, i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_SMAX_X2", "atomic_load_max_"#as, i64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_UMAX_X2", "atomic_load_umax_"#as, i64>;
@@ -1429,8 +1453,8 @@ defm : GlobalFLATAtomicStorePats <GLOBAL_STORE_DWORDX2, atomic_store_64_global,
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD", "atomic_load_add_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB", "atomic_load_sub_global", i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", "atomic_inc_global", i32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", "atomic_dec_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC", "atomic_load_uinc_wrap_global", i32>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC", "atomic_load_udec_wrap_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND", "atomic_load_and_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX", "atomic_load_max_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX", "atomic_load_umax_global", i32>;
@@ -1444,8 +1468,8 @@ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_inc_global", i64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", "atomic_dec_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_INC_X2", "atomic_load_uinc_wrap_global", i64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_DEC_X2", "atomic_load_udec_wrap_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_AND_X2", "atomic_load_and_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SMAX_X2", "atomic_load_max_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_UMAX_X2", "atomic_load_umax_global", i64>;
@@ -1459,12 +1483,23 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i
let OtherPredicates = [isGFX10Plus] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
-defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin", f32>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax", f32>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin", f32>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax", f32>;
+}
+
+let OtherPredicates = [isGFX10Only] in {
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN_X2", "atomic_load_fmin_global", f64>;
+defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX_X2", "atomic_load_fmax_global", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN_X2", "int_amdgcn_global_atomic_fmin", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX_X2", "int_amdgcn_global_atomic_fmax", f64>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMIN_X2", "atomic_load_fmin_flat", f64>;
+defm : FlatSignedAtomicPat <"FLAT_ATOMIC_FMAX_X2", "atomic_load_fmax_flat", f64>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMIN_X2", "int_amdgcn_flat_atomic_fmin", f64>;
+defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_FMAX_X2", "int_amdgcn_flat_atomic_fmax", f64>;
}
let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
@@ -1473,7 +1508,7 @@ defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amd
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>;
}
-let OtherPredicates = [HasAtomicPkFaddNoRtnInsts] in {
+let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
}
@@ -1484,14 +1519,17 @@ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgc
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_ADD_F32", "int_amdgcn_global_atomic_fadd", "global_addrspace", f32>;
}
+let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
+defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
+defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
+}
+
let OtherPredicates = [isGFX90APlus] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_flat_atomic_fadd", "global_addrspace", f64>;
defm : GlobalFLATAtomicPatsWithAddrSpace<"GLOBAL_ATOMIC_ADD_F64", "int_amdgcn_global_atomic_fadd", "global_addrspace", f64>;
-defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "global_addrspace", v2f16>;
-defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_PK_ADD_F16", "int_amdgcn_global_atomic_fadd", "global_addrspace", v2f16>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MIN_F64", "int_amdgcn_global_atomic_fmin", f64>;
defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_MAX_F64", "int_amdgcn_global_atomic_fmax", f64>;
defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
@@ -1507,12 +1545,14 @@ defm : FlatSignedAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>
defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_ADD_F32", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", f32>;
}
-let OtherPredicates = [isGFX940Plus] in {
+let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
defm : FlatSignedAtomicPatWithAddrSpace <"FLAT_ATOMIC_PK_ADD_F16", "int_amdgcn_flat_atomic_fadd", "flat_addrspace", v2f16>;
defm : FlatSignedAtomicIntrPat <"FLAT_ATOMIC_PK_ADD_BF16", "int_amdgcn_flat_atomic_fadd_v2bf16", v2i16>;
-defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
}
+let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "int_amdgcn_global_atomic_fadd_v2bf16", v2i16>;
+
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
@@ -2171,12 +2211,16 @@ class FLAT_Real_gfx11 <bits<7> op, FLAT_Pseudo ps, string opName = ps.Mnemonic>
let Inst{55} = ps.sve;
}
-multiclass FLAT_Real_Base_gfx11<bits<7> op, string ps, string opName, int renamed = false> {
+multiclass FLAT_Aliases_gfx11<string ps, string opName, int renamed> {
+ if renamed then
+ def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX11Plus]>;
+}
+
+multiclass FLAT_Real_Base_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
+ FLAT_Aliases_gfx11<ps, opName, renamed> {
def _gfx11 : FLAT_Real_gfx11<op, !cast<FLAT_Pseudo>(ps), opName> {
let Inst{54-48} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
}
- if renamed then
- def _renamed_gfx11 : MnemonicAlias<!cast<FLAT_Pseudo>(ps).Mnemonic, opName>, Requires<[isGFX11Plus]>;
}
multiclass FLAT_Real_RTN_gfx11<bits<7> op, string ps, string opName> {
@@ -2219,7 +2263,8 @@ multiclass FLAT_Real_GlblAtomics_gfx11<bits<7> op, string ps, string opName, int
FLAT_Real_RTN_gfx11<op, ps, opName>,
FLAT_Real_SADDR_RTN_gfx11<op, ps, opName>;
-multiclass FLAT_Real_GlblAtomics_RTN_gfx11<bits<7> op, string ps, string opName> :
+multiclass FLAT_Real_GlblAtomics_RTN_gfx11<bits<7> op, string ps, string opName, int renamed = false> :
+ FLAT_Aliases_gfx11<ps#"_RTN", opName, renamed>,
FLAT_Real_RTN_gfx11<op, ps, opName>,
FLAT_Real_SADDR_RTN_gfx11<op, ps, opName>;
@@ -2312,7 +2357,7 @@ defm GLOBAL_ATOMIC_SWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x033, "GLOBAL_ATO
defm GLOBAL_ATOMIC_CMPSWAP_B32 : FLAT_Real_GlblAtomics_gfx11<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>;
defm GLOBAL_ATOMIC_ADD_U32 : FLAT_Real_GlblAtomics_gfx11<0x035, "GLOBAL_ATOMIC_ADD", "global_atomic_add_u32", true>;
defm GLOBAL_ATOMIC_SUB_U32 : FLAT_Real_GlblAtomics_gfx11<0x036, "GLOBAL_ATOMIC_SUB", "global_atomic_sub_u32", true>;
-defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_RTN_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32">;
+defm GLOBAL_ATOMIC_CSUB_U32 : FLAT_Real_GlblAtomics_RTN_gfx11<0x037, "GLOBAL_ATOMIC_CSUB", "global_atomic_csub_u32", true>;
defm GLOBAL_ATOMIC_MIN_I32 : FLAT_Real_GlblAtomics_gfx11<0x038, "GLOBAL_ATOMIC_SMIN", "global_atomic_min_i32", true>;
defm GLOBAL_ATOMIC_MIN_U32 : FLAT_Real_GlblAtomics_gfx11<0x039, "GLOBAL_ATOMIC_UMIN", "global_atomic_min_u32", true>;
defm GLOBAL_ATOMIC_MAX_I32 : FLAT_Real_GlblAtomics_gfx11<0x03a, "GLOBAL_ATOMIC_SMAX", "global_atomic_max_i32", true>;
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
index f2452a275bdc..c9e0c6849568 100644
--- a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -42,6 +42,16 @@ namespace {
class GCNCreateVOPD : public MachineFunctionPass {
private:
+ class VOPDCombineInfo {
+ public:
+ VOPDCombineInfo() {}
+ VOPDCombineInfo(MachineInstr *First, MachineInstr *Second)
+ : FirstMI(First), SecondMI(Second) {}
+
+ MachineInstr *FirstMI;
+ MachineInstr *SecondMI;
+ };
+
public:
static char ID;
const GCNSubtarget *ST = nullptr;
@@ -57,10 +67,9 @@ public:
return "GCN Create VOPD Instructions";
}
- bool doReplace(const SIInstrInfo *SII,
- std::pair<MachineInstr *, MachineInstr *> &Pair) {
- auto *FirstMI = Pair.first;
- auto *SecondMI = Pair.second;
+ bool doReplace(const SIInstrInfo *SII, VOPDCombineInfo &CI) {
+ auto *FirstMI = CI.FirstMI;
+ auto *SecondMI = CI.SecondMI;
unsigned Opc1 = FirstMI->getOpcode();
unsigned Opc2 = SecondMI->getOpcode();
int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1),
@@ -94,7 +103,7 @@ public:
VOPDInst.copyImplicitOps(*MI[CompIdx]);
LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: "
- << *Pair.first << "\tY: " << *Pair.second << "\n");
+ << *CI.FirstMI << "\tY: " << *CI.SecondMI << "\n");
for (auto CompIdx : VOPD::COMPONENTS)
MI[CompIdx]->eraseFromParent();
@@ -114,7 +123,7 @@ public:
const SIInstrInfo *SII = ST->getInstrInfo();
bool Changed = false;
- SmallVector<std::pair<MachineInstr *, MachineInstr *>> ReplaceCandidates;
+ SmallVector<VOPDCombineInfo> ReplaceCandidates;
for (auto &MBB : MF) {
auto MII = MBB.begin(), E = MBB.end();
@@ -130,24 +139,24 @@ public:
unsigned Opc2 = SecondMI->getOpcode();
llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);
llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);
- std::pair<MachineInstr *, MachineInstr *> Pair;
+ VOPDCombineInfo CI;
if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y)
- Pair = {FirstMI, SecondMI};
+ CI = VOPDCombineInfo(FirstMI, SecondMI);
else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)
- Pair = {SecondMI, FirstMI};
+ CI = VOPDCombineInfo(SecondMI, FirstMI);
else
continue;
// checkVOPDRegConstraints cares about program order, but doReplace
// cares about X-Y order in the constituted VOPD
if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) {
- ReplaceCandidates.push_back(Pair);
+ ReplaceCandidates.push_back(CI);
++MII;
}
}
}
- for (auto &Pair : ReplaceCandidates) {
- Changed |= doReplace(SII, Pair);
+ for (auto &CI : ReplaceCandidates) {
+ Changed |= doReplace(SII, CI);
}
return Changed;
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index b33e614a071c..2d53b2a70dbe 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -16,7 +16,7 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
@@ -588,23 +588,21 @@ int GCNHazardRecognizer::getWaitStatesSinceSetReg(IsHazardFn IsHazard,
static void addRegUnits(const SIRegisterInfo &TRI, BitVector &BV,
MCRegister Reg) {
- for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI)
- BV.set(*RUI);
+ for (MCRegUnit Unit : TRI.regunits(Reg))
+ BV.set(Unit);
}
static void addRegsToSet(const SIRegisterInfo &TRI,
iterator_range<MachineInstr::const_mop_iterator> Ops,
- BitVector &Set) {
+ BitVector &DefSet, BitVector &UseSet) {
for (const MachineOperand &Op : Ops) {
if (Op.isReg())
- addRegUnits(TRI, Set, Op.getReg().asMCReg());
+ addRegUnits(TRI, Op.isDef() ? DefSet : UseSet, Op.getReg().asMCReg());
}
}
void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
- // XXX: Do we need to worry about implicit operands
- addRegsToSet(TRI, MI.defs(), ClauseDefs);
- addRegsToSet(TRI, MI.uses(), ClauseUses);
+ addRegsToSet(TRI, MI.operands(), ClauseDefs, ClauseUses);
}
static bool breaksSMEMSoftClause(MachineInstr *MI) {
@@ -1033,11 +1031,11 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
const MachineRegisterInfo &MRI = MF.getRegInfo();
int WaitStatesNeeded = 0;
- for (unsigned I = InlineAsm::MIOp_FirstOperand, E = IA->getNumOperands();
- I != E; ++I) {
- const MachineOperand &Op = IA->getOperand(I);
+ for (const MachineOperand &Op :
+ llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
if (Op.isReg() && Op.isDef()) {
- WaitStatesNeeded = std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
+ WaitStatesNeeded =
+ std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
}
}
@@ -1172,7 +1170,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
(MI.getOpcode() == AMDGPU::S_WAITCNT &&
!MI.getOperand(0).getImm()) ||
(MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- MI.getOperand(0).getImm() == 0xffe3);
+ AMDGPU::DepCtr::decodeFieldVmVsrc(MI.getOperand(0).getImm()) == 0);
};
if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
@@ -1182,7 +1180,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
const SIInstrInfo *TII = ST.getInstrInfo();
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(0xffe3);
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
return true;
}
@@ -1295,7 +1293,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
return true;
}
if (MI.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- (MI.getOperand(0).getImm() & 0xfffe) == 0xfffe)
+ AMDGPU::DepCtr::decodeFieldSaSdst(MI.getOperand(0).getImm()) == 0)
return true;
return false;
};
@@ -1306,7 +1304,7 @@ bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(0xfffe);
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
return true;
}
@@ -1454,7 +1452,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
(I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
(I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- I.getOperand(0).getImm() == 0xffe3);
+ AMDGPU::DepCtr::decodeFieldVmVsrc(I.getOperand(0).getImm()) == 0);
};
if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) ==
@@ -1463,7 +1461,7 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(0xffe3);
+ .addImm(AMDGPU::DepCtr::encodeFieldVmVsrc(0));
return true;
}
@@ -1525,7 +1523,7 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
if (SIInstrInfo::isVMEM(I) || SIInstrInfo::isFLAT(I) ||
SIInstrInfo::isDS(I) || SIInstrInfo::isEXP(I) ||
(I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- I.getOperand(0).getImm() == 0x0fff))
+ AMDGPU::DepCtr::decodeFieldVaVdst(I.getOperand(0).getImm()) == 0))
return HazardExpired;
// Track registers writes
@@ -1687,10 +1685,10 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
return false;
// Hazard is observed - insert a wait on va_dst counter to ensure hazard is
- // avoided (mask 0x0fff achieves this).
+ // avoided.
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(0x0fff);
+ .addImm(AMDGPU::DepCtr::encodeFieldVaVdst(0));
return true;
}
@@ -2026,7 +2024,7 @@ int GCNHazardRecognizer::checkMAIHazards908(MachineInstr *MI) {
MaxWaitStates);
int NeedWaitStates = MFMAWritesAGPROverlappedSrcABWaitStates;
int SrcCIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
- int OpNo = MI->getOperandNo(&Op);
+ int OpNo = Op.getOperandNo();
if (OpNo == SrcCIdx) {
NeedWaitStates = MFMAWritesAGPROverlappedSrcCWaitStates;
} else if (Opc == AMDGPU::V_ACCVGPR_READ_B32_e64) {
@@ -2205,7 +2203,7 @@ int GCNHazardRecognizer::checkMAIHazards90A(MachineInstr *MI) {
if (NumWaitStates == std::numeric_limits<int>::max())
continue;
- int OpNo = MI->getOperandNo(&Use);
+ int OpNo = Use.getOperandNo();
unsigned Opc1 = MI1->getOpcode();
int NeedWaitStates = 0;
if (OpNo == SrcCIdx) {
@@ -2781,7 +2779,7 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
auto IsExpiredFn = [&MRI, this](const MachineInstr &I, int) {
// s_waitcnt_depctr sa_sdst(0) mitigates hazard.
if (I.getOpcode() == AMDGPU::S_WAITCNT_DEPCTR &&
- !(I.getOperand(0).getImm() & 0x1))
+ AMDGPU::DepCtr::decodeFieldSaSdst(I.getOperand(0).getImm()) == 0)
return true;
// VALU access to any SGPR or literal constant other than HazardReg
@@ -2831,7 +2829,7 @@ bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
// Add s_waitcnt_depctr sa_sdst(0) after SALU write.
BuildMI(*MI->getParent(), NextMI, MI->getDebugLoc(),
TII.get(AMDGPU::S_WAITCNT_DEPCTR))
- .addImm(0xfffe);
+ .addImm(AMDGPU::DepCtr::encodeFieldSaSdst(0));
// SALU write may be s_getpc in a bundle.
if (MI->getOpcode() == AMDGPU::S_GETPC_B64) {
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 77960ef62f3a..d89c9b1febde 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -367,9 +367,8 @@ void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
}
if (!MI->isDebugInstr()) {
// Reset read - undef flags and update them later.
- for (auto &Op : MI->operands())
- if (Op.isReg() && Op.isDef())
- Op.setIsUndef(false);
+ for (auto &Op : MI->all_defs())
+ Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true,
diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 366bc0a8ec0d..4c9ad9b5bcf7 100644
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -237,7 +237,7 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
- if (ST->getGeneration() < GCNSubtarget::GFX10)
+ if (!ST->hasNSAEncoding())
return false;
MRI = &MF.getRegInfo();
diff --git a/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp
new file mode 100644
index 000000000000..b50af38683ed
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNPreRALongBranchReg.cpp
@@ -0,0 +1,139 @@
+//===-- GCNPreRALongBranchReg.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// \file
+// \brief Pass to estimate pre RA branch size and reserve a pair of SGPRs if
+// there is a long branch. Branch size at this point is difficult to track since
+// we have no idea what spills will be inserted later on. We just assume 8 bytes
+// per instruction to compute approximations without computing the actual
+// instruction size to see if we're in the neighborhood of the maximum branch
+// distance threshold. Tuning of what is considered "long" is handled through
+// amdgpu-long-branch-factor cl argument which sets LongBranchFactor.
+//===----------------------------------------------------------------------===//
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-pre-ra-long-branch-reg"
+
+namespace {
+
+static cl::opt<double> LongBranchFactor(
+ "amdgpu-long-branch-factor", cl::init(1.0), cl::Hidden,
+ cl::desc("Factor to apply to what qualifies as a long branch "
+ "to reserve a pair of scalar registers. If this value "
+ "is 0 the long branch registers are never reserved. As this "
+ "value grows the greater chance the branch distance will fall "
+ "within the threshold and the registers will be marked to be "
+ "reserved. We lean towards always reserving a register for "
+ "long jumps"));
+
+class GCNPreRALongBranchReg : public MachineFunctionPass {
+
+ struct BasicBlockInfo {
+ // Offset - Distance from the beginning of the function to the beginning
+ // of this basic block.
+ uint64_t Offset = 0;
+ // Size - Size of the basic block in bytes
+ uint64_t Size = 0;
+ };
+ void generateBlockInfo(MachineFunction &MF,
+ SmallVectorImpl<BasicBlockInfo> &BlockInfo);
+
+public:
+ static char ID;
+ GCNPreRALongBranchReg() : MachineFunctionPass(ID) {
+ initializeGCNPreRALongBranchRegPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ StringRef getPassName() const override {
+ return "AMDGPU Pre-RA Long Branch Reg";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+} // End anonymous namespace.
+char GCNPreRALongBranchReg::ID = 0;
+
+INITIALIZE_PASS(GCNPreRALongBranchReg, DEBUG_TYPE,
+ "AMDGPU Pre-RA Long Branch Reg", false, false)
+
+char &llvm::GCNPreRALongBranchRegID = GCNPreRALongBranchReg::ID;
+void GCNPreRALongBranchReg::generateBlockInfo(
+ MachineFunction &MF, SmallVectorImpl<BasicBlockInfo> &BlockInfo) {
+
+ BlockInfo.resize(MF.getNumBlockIDs());
+
+ // Approximate the size of all basic blocks by just
+ // assuming 8 bytes per instruction
+ for (const MachineBasicBlock &MBB : MF) {
+ uint64_t NumInstr = 0;
+ // Loop through the basic block and add up all non-debug
+ // non-meta instructions
+ for (const MachineInstr &MI : MBB) {
+ // isMetaInstruction is a superset of isDebugInstr
+ if (MI.isMetaInstruction())
+ continue;
+ NumInstr += 1;
+ }
+ // Approximate size as just 8 bytes per instruction
+ BlockInfo[MBB.getNumber()].Size = 8 * NumInstr;
+ }
+ uint64_t PrevNum = (&MF)->begin()->getNumber();
+ for (auto &MBB :
+ make_range(std::next(MachineFunction::iterator((&MF)->begin())),
+ (&MF)->end())) {
+ uint64_t Num = MBB.getNumber();
+ // Compute the offset immediately following this block.
+ BlockInfo[Num].Offset = BlockInfo[PrevNum].Offset + BlockInfo[PrevNum].Size;
+ PrevNum = Num;
+ }
+}
+bool GCNPreRALongBranchReg::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = STM.getInstrInfo();
+ const SIRegisterInfo *TRI = STM.getRegisterInfo();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // For now, reserve highest available SGPR pair. After RA,
+ // shift down to a lower unused pair of SGPRs
+ // If all registers are used, then findUnusedRegister will return
+ // AMDGPU::NoRegister.
+ constexpr bool ReserveHighestRegister = true;
+ Register LongBranchReservedReg = TRI->findUnusedRegister(
+ MRI, &AMDGPU::SGPR_64RegClass, MF, ReserveHighestRegister);
+ if (!LongBranchReservedReg)
+ return false;
+
+ // Approximate code size and offsets of each basic block
+ SmallVector<BasicBlockInfo, 16> BlockInfo;
+ generateBlockInfo(MF, BlockInfo);
+
+ for (const MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::const_iterator Last = MBB.getLastNonDebugInstr();
+ if (Last == MBB.end() || !Last->isUnconditionalBranch())
+ continue;
+ MachineBasicBlock *DestBB = TII->getBranchDestBlock(*Last);
+ uint64_t BlockDistance = static_cast<uint64_t>(
+ LongBranchFactor * BlockInfo[DestBB->getNumber()].Offset);
+ // If the distance falls outside the threshold assume it is a long branch
+ // and we need to reserve the registers
+ if (!TII->isBranchOffsetInRange(Last->getOpcode(), BlockDistance)) {
+ MFI->setLongBranchReservedReg(LongBranchReservedReg);
+ return true;
+ }
+ }
+ return false;
+}
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index d86138154be6..b9c9358f88b9 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -196,6 +196,14 @@ def : ProcessorModel<"gfx940", SIDPGFX940FullSpeedModel,
FeatureISAVersion9_4_0.Features
>;
+def : ProcessorModel<"gfx941", SIDPGFX940FullSpeedModel,
+ FeatureISAVersion9_4_1.Features
+>;
+
+def : ProcessorModel<"gfx942", SIDPGFX940FullSpeedModel,
+ FeatureISAVersion9_4_2.Features
+>;
+
//===----------------------------------------------------------------------===//
// GCN GFX10.
//===----------------------------------------------------------------------===//
@@ -263,3 +271,11 @@ def : ProcessorModel<"gfx1102", GFX11SpeedModel,
def : ProcessorModel<"gfx1103", GFX11SpeedModel,
FeatureISAVersion11_0_3.Features
>;
+
+def : ProcessorModel<"gfx1150", GFX11SpeedModel,
+ FeatureISAVersion11_5_0.Features
+>;
+
+def : ProcessorModel<"gfx1151", GFX11SpeedModel,
+ FeatureISAVersion11_5_1.Features
+>;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index f9bed9a76c6f..68cf97170369 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -286,8 +286,8 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
// update max pressure
MaxPressure = max(AtMIPressure, MaxPressure);
- for (const auto &MO : MI.operands()) {
- if (!MO.isReg() || !MO.isDef() || !MO.getReg().isVirtual() || MO.isDead())
+ for (const auto &MO : MI.all_defs()) {
+ if (!MO.getReg().isVirtual() || MO.isDead())
continue;
auto Reg = MO.getReg();
@@ -336,23 +336,38 @@ bool GCNDownwardRPTracker::advanceBeforeNext() {
assert(SI.isValid());
// Remove dead registers or mask bits.
- for (auto &It : LiveRegs) {
- const LiveInterval &LI = LIS.getInterval(It.first);
+ SmallSet<Register, 8> SeenRegs;
+ for (auto &MO : LastTrackedMI->operands()) {
+ if (!MO.isReg() || !MO.getReg().isVirtual())
+ continue;
+ if (MO.isUse() && !MO.readsReg())
+ continue;
+ if (!SeenRegs.insert(MO.getReg()).second)
+ continue;
+ const LiveInterval &LI = LIS.getInterval(MO.getReg());
if (LI.hasSubRanges()) {
+ auto It = LiveRegs.end();
for (const auto &S : LI.subranges()) {
if (!S.liveAt(SI)) {
- auto PrevMask = It.second;
- It.second &= ~S.LaneMask;
- CurPressure.inc(It.first, PrevMask, It.second, *MRI);
+ if (It == LiveRegs.end()) {
+ It = LiveRegs.find(MO.getReg());
+ if (It == LiveRegs.end())
+ llvm_unreachable("register isn't live");
+ }
+ auto PrevMask = It->second;
+ It->second &= ~S.LaneMask;
+ CurPressure.inc(MO.getReg(), PrevMask, It->second, *MRI);
}
}
+ if (It != LiveRegs.end() && It->second.none())
+ LiveRegs.erase(It);
} else if (!LI.liveAt(SI)) {
- auto PrevMask = It.second;
- It.second = LaneBitmask::getNone();
- CurPressure.inc(It.first, PrevMask, It.second, *MRI);
+ auto It = LiveRegs.find(MO.getReg());
+ if (It == LiveRegs.end())
+ llvm_unreachable("register isn't live");
+ CurPressure.inc(MO.getReg(), It->second, LaneBitmask::getNone(), *MRI);
+ LiveRegs.erase(It);
}
- if (It.second.none())
- LiveRegs.erase(It.first);
}
MaxPressure = max(MaxPressure, CurPressure);
@@ -367,9 +382,7 @@ void GCNDownwardRPTracker::advanceToNext() {
NextMI = skipDebugInstructionsForward(NextMI, MBBEnd);
// Add new registers or mask bits.
- for (const auto &MO : LastTrackedMI->operands()) {
- if (!MO.isReg() || !MO.isDef())
- continue;
+ for (const auto &MO : LastTrackedMI->all_defs()) {
Register Reg = MO.getReg();
if (!Reg.isVirtual())
continue;
diff --git a/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp
new file mode 100644
index 000000000000..99db7e4af9fd
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNRewritePartialRegUses.cpp
@@ -0,0 +1,502 @@
+//===-------------- GCNRewritePartialRegUses.cpp --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// RenameIndependentSubregs pass leaves large partially used super registers,
+/// for example:
+/// undef %0.sub4:VReg_1024 = ...
+/// %0.sub5:VReg_1024 = ...
+/// %0.sub6:VReg_1024 = ...
+/// %0.sub7:VReg_1024 = ...
+/// use %0.sub4_sub5_sub6_sub7
+/// use %0.sub6_sub7
+///
+/// GCNRewritePartialRegUses goes right after RenameIndependentSubregs and
+/// rewrites such partially used super registers with registers of minimal size:
+/// undef %0.sub0:VReg_128 = ...
+/// %0.sub1:VReg_128 = ...
+/// %0.sub2:VReg_128 = ...
+/// %0.sub3:VReg_128 = ...
+/// use %0.sub0_sub1_sub2_sub3
+/// use %0.sub2_sub3
+///
+/// This allows to avoid subreg lanemasks tracking during register pressure
+/// calculation and creates more possibilities for the code unaware of lanemasks
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIRegisterInfo.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "rewrite-partial-reg-uses"
+
+namespace {
+
+class GCNRewritePartialRegUses : public MachineFunctionPass {
+public:
+ static char ID;
+ GCNRewritePartialRegUses() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "Rewrite Partial Register Uses";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ MachineRegisterInfo *MRI;
+ const SIRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ LiveIntervals *LIS;
+
+ /// Rewrite partially used register Reg by shifting all its subregisters to
+ /// the right and replacing the original register with a register of minimal
+ /// size. Return true if the change has been made.
+ bool rewriteReg(Register Reg) const;
+
+ /// Value type for SubRegMap below.
+ struct SubRegInfo {
+ /// Register class required to hold the value stored in the SubReg.
+ const TargetRegisterClass *RC;
+
+ /// Index for the right-shifted subregister. If 0 this is the "covering"
+ /// subreg i.e. subreg that covers all others. Covering subreg becomes the
+ /// whole register after the replacement.
+ unsigned SubReg = AMDGPU::NoSubRegister;
+ SubRegInfo(const TargetRegisterClass *RC_ = nullptr) : RC(RC_) {}
+ };
+
+ /// Map OldSubReg -> { RC, NewSubReg }. Used as in/out container.
+ typedef SmallDenseMap<unsigned, SubRegInfo> SubRegMap;
+
+ /// Given register class RC and the set of used subregs as keys in the SubRegs
+ /// map return new register class and indexes of right-shifted subregs as
+ /// values in SubRegs map such that the resulting regclass would contain
+ /// registers of minimal size.
+ const TargetRegisterClass *getMinSizeReg(const TargetRegisterClass *RC,
+ SubRegMap &SubRegs) const;
+
+ /// Given regclass RC and pairs of [OldSubReg, SubRegRC] in SubRegs try to
+ /// find new regclass such that:
+ /// 1. It has subregs obtained by shifting each OldSubReg by RShift number
+ /// of bits to the right. Every "shifted" subreg should have the same
+ /// SubRegRC. SubRegRC can be null, in this case it initialized using
+ /// getSubRegisterClass. If CoverSubregIdx is not zero it's a subreg that
+ /// "covers" all other subregs in pairs. Basically such subreg becomes a
+ /// whole register.
+ /// 2. Resulting register class contains registers of minimal size but not
+ /// less than RegNumBits.
+ ///
+ /// SubRegs is map of OldSubReg -> [SubRegRC, NewSubReg] and is used as in/out
+ /// parameter:
+ /// OldSubReg - input parameter,
+ /// SubRegRC - in/out, should be changed for unknown regclass,
+ /// NewSubReg - output, contains shifted subregs on return.
+ const TargetRegisterClass *
+ getRegClassWithShiftedSubregs(const TargetRegisterClass *RC, unsigned RShift,
+ unsigned RegNumBits, unsigned CoverSubregIdx,
+ SubRegMap &SubRegs) const;
+
+ /// Update live intervals after rewriting OldReg to NewReg with SubRegs map
+ /// describing OldSubReg -> NewSubReg mapping.
+ void updateLiveIntervals(Register OldReg, Register NewReg,
+ SubRegMap &SubRegs) const;
+
+ /// Helper methods.
+
+ /// Return reg class expected by a MO's parent instruction for a given MO.
+ const TargetRegisterClass *getOperandRegClass(MachineOperand &MO) const;
+
+ /// Find right-shifted by RShift amount version of the SubReg if it exists,
+ /// return 0 otherwise.
+ unsigned shiftSubReg(unsigned SubReg, unsigned RShift) const;
+
+ /// Find subreg index with a given Offset and Size, return 0 if there is no
+ /// such subregister index. The result is cached in SubRegs data-member.
+ unsigned getSubReg(unsigned Offset, unsigned Size) const;
+
+ /// Cache for getSubReg method: {Offset, Size} -> SubReg index.
+ mutable SmallDenseMap<std::pair<unsigned, unsigned>, unsigned> SubRegs;
+
+ /// Return bit mask that contains all register classes that are projected into
+ /// RC by SubRegIdx. The result is cached in SuperRegMasks data-member.
+ const uint32_t *getSuperRegClassMask(const TargetRegisterClass *RC,
+ unsigned SubRegIdx) const;
+
+ /// Cache for getSuperRegClassMask method: { RC, SubRegIdx } -> Class bitmask.
+ mutable SmallDenseMap<std::pair<const TargetRegisterClass *, unsigned>,
+ const uint32_t *>
+ SuperRegMasks;
+
+ /// Return bitmask containing all allocatable register classes with registers
+ /// aligned at AlignNumBits. The result is cached in
+ /// AllocatableAndAlignedRegClassMasks data-member.
+ const BitVector &
+ getAllocatableAndAlignedRegClassMask(unsigned AlignNumBits) const;
+
+ /// Cache for getAllocatableAndAlignedRegClassMask method:
+ /// AlignNumBits -> Class bitmask.
+ mutable SmallDenseMap<unsigned, BitVector> AllocatableAndAlignedRegClassMasks;
+};
+
+} // end anonymous namespace
+
+// TODO: move this to the tablegen and use binary search by Offset.
+unsigned GCNRewritePartialRegUses::getSubReg(unsigned Offset,
+ unsigned Size) const {
+ const auto [I, Inserted] = SubRegs.try_emplace({Offset, Size}, 0);
+ if (Inserted) {
+ for (unsigned Idx = 1, E = TRI->getNumSubRegIndices(); Idx < E; ++Idx) {
+ if (TRI->getSubRegIdxOffset(Idx) == Offset &&
+ TRI->getSubRegIdxSize(Idx) == Size) {
+ I->second = Idx;
+ break;
+ }
+ }
+ }
+ return I->second;
+}
+
+unsigned GCNRewritePartialRegUses::shiftSubReg(unsigned SubReg,
+ unsigned RShift) const {
+ unsigned Offset = TRI->getSubRegIdxOffset(SubReg) - RShift;
+ return getSubReg(Offset, TRI->getSubRegIdxSize(SubReg));
+}
+
+const uint32_t *
+GCNRewritePartialRegUses::getSuperRegClassMask(const TargetRegisterClass *RC,
+ unsigned SubRegIdx) const {
+ const auto [I, Inserted] =
+ SuperRegMasks.try_emplace({RC, SubRegIdx}, nullptr);
+ if (Inserted) {
+ for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI) {
+ if (RCI.getSubReg() == SubRegIdx) {
+ I->second = RCI.getMask();
+ break;
+ }
+ }
+ }
+ return I->second;
+}
+
+const BitVector &GCNRewritePartialRegUses::getAllocatableAndAlignedRegClassMask(
+ unsigned AlignNumBits) const {
+ const auto [I, Inserted] =
+ AllocatableAndAlignedRegClassMasks.try_emplace(AlignNumBits);
+ if (Inserted) {
+ BitVector &BV = I->second;
+ BV.resize(TRI->getNumRegClasses());
+ for (unsigned ClassID = 0; ClassID < TRI->getNumRegClasses(); ++ClassID) {
+ auto *RC = TRI->getRegClass(ClassID);
+ if (RC->isAllocatable() && TRI->isRegClassAligned(RC, AlignNumBits))
+ BV.set(ClassID);
+ }
+ }
+ return I->second;
+}
+
+const TargetRegisterClass *
+GCNRewritePartialRegUses::getRegClassWithShiftedSubregs(
+ const TargetRegisterClass *RC, unsigned RShift, unsigned RegNumBits,
+ unsigned CoverSubregIdx, SubRegMap &SubRegs) const {
+
+ unsigned RCAlign = TRI->getRegClassAlignmentNumBits(RC);
+ LLVM_DEBUG(dbgs() << " Shift " << RShift << ", reg align " << RCAlign
+ << '\n');
+
+ BitVector ClassMask(getAllocatableAndAlignedRegClassMask(RCAlign));
+ for (auto &[OldSubReg, SRI] : SubRegs) {
+ auto &[SubRegRC, NewSubReg] = SRI;
+
+ // Register class may be unknown, for example:
+ // undef %0.sub4:sgpr_1024 = S_MOV_B32 01
+ // %0.sub5:sgpr_1024 = S_MOV_B32 02
+ // %1:vreg_64 = COPY %0.sub4_sub5
+ // Register classes for subregs 'sub4' and 'sub5' are known from the
+ // description of destination operand of S_MOV_B32 instruction but the
+ // class for the subreg 'sub4_sub5' isn't specified by the COPY instruction.
+ if (!SubRegRC)
+ SubRegRC = TRI->getSubRegisterClass(RC, OldSubReg);
+
+ if (!SubRegRC)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << " " << TRI->getSubRegIndexName(OldSubReg) << ':'
+ << TRI->getRegClassName(SubRegRC)
+ << (SubRegRC->isAllocatable() ? "" : " not alloc")
+ << " -> ");
+
+ if (OldSubReg == CoverSubregIdx) {
+ NewSubReg = AMDGPU::NoSubRegister;
+ LLVM_DEBUG(dbgs() << "whole reg");
+ } else {
+ NewSubReg = shiftSubReg(OldSubReg, RShift);
+ if (!NewSubReg) {
+ LLVM_DEBUG(dbgs() << "none\n");
+ return nullptr;
+ }
+ LLVM_DEBUG(dbgs() << TRI->getSubRegIndexName(NewSubReg));
+ }
+
+ const uint32_t *Mask = NewSubReg ? getSuperRegClassMask(SubRegRC, NewSubReg)
+ : SubRegRC->getSubClassMask();
+ if (!Mask)
+ llvm_unreachable("no register class mask?");
+
+ ClassMask.clearBitsNotInMask(Mask);
+ // Don't try to early exit because checking if ClassMask has set bits isn't
+ // that cheap and we expect it to pass in most cases.
+ LLVM_DEBUG(dbgs() << ", num regclasses " << ClassMask.count() << '\n');
+ }
+
+ // ClassMask is the set of all register classes such that each class is
+ // allocatable, aligned, has all shifted subregs and each subreg has required
+ // register class (see SubRegRC above). Now select first (that is largest)
+ // register class with registers of minimal but not less than RegNumBits size.
+ // We have to check register size because we may encounter classes of smaller
+ // registers like VReg_1 in some situations.
+ const TargetRegisterClass *MinRC = nullptr;
+ unsigned MinNumBits = std::numeric_limits<unsigned>::max();
+ for (unsigned ClassID : ClassMask.set_bits()) {
+ auto *RC = TRI->getRegClass(ClassID);
+ unsigned NumBits = TRI->getRegSizeInBits(*RC);
+ if (NumBits < MinNumBits && NumBits >= RegNumBits) {
+ MinNumBits = NumBits;
+ MinRC = RC;
+ }
+ if (MinNumBits == RegNumBits)
+ break;
+ }
+#ifndef NDEBUG
+ if (MinRC) {
+ assert(MinRC->isAllocatable() && TRI->isRegClassAligned(MinRC, RCAlign));
+ for (auto [SubReg, SRI] : SubRegs)
+ // Check that all registers in MinRC support SRI.SubReg subregister.
+ assert(MinRC == TRI->getSubClassWithSubReg(MinRC, SRI.SubReg));
+ }
+#endif
+ // There might be zero RShift - in this case we are just trying to find a smaller
+ // register.
+ return (MinRC != RC || RShift != 0) ? MinRC : nullptr;
+}
+
+const TargetRegisterClass *
+GCNRewritePartialRegUses::getMinSizeReg(const TargetRegisterClass *RC,
+ SubRegMap &SubRegs) const {
+ unsigned CoverSubreg = AMDGPU::NoSubRegister;
+ unsigned Offset = std::numeric_limits<unsigned>::max();
+ unsigned End = 0;
+ for (auto [SubReg, SRI] : SubRegs) {
+ unsigned SubRegOffset = TRI->getSubRegIdxOffset(SubReg);
+ unsigned SubRegEnd = SubRegOffset + TRI->getSubRegIdxSize(SubReg);
+ if (SubRegOffset < Offset) {
+ Offset = SubRegOffset;
+ CoverSubreg = AMDGPU::NoSubRegister;
+ }
+ if (SubRegEnd > End) {
+ End = SubRegEnd;
+ CoverSubreg = AMDGPU::NoSubRegister;
+ }
+ if (SubRegOffset == Offset && SubRegEnd == End)
+ CoverSubreg = SubReg;
+ }
+ // If covering subreg is found shift everything so the covering subreg would
+ // be in the rightmost position.
+ if (CoverSubreg != AMDGPU::NoSubRegister)
+ return getRegClassWithShiftedSubregs(RC, Offset, End - Offset, CoverSubreg,
+ SubRegs);
+
+ // Otherwise find subreg with maximum required alignment and shift it and all
+ // other subregs to the rightmost possible position with respect to the
+ // alignment.
+ unsigned MaxAlign = 0;
+ for (auto [SubReg, SRI] : SubRegs)
+ MaxAlign = std::max(MaxAlign, TRI->getSubRegAlignmentNumBits(RC, SubReg));
+
+ unsigned FirstMaxAlignedSubRegOffset = std::numeric_limits<unsigned>::max();
+ for (auto [SubReg, SRI] : SubRegs) {
+ if (TRI->getSubRegAlignmentNumBits(RC, SubReg) != MaxAlign)
+ continue;
+ FirstMaxAlignedSubRegOffset =
+ std::min(FirstMaxAlignedSubRegOffset, TRI->getSubRegIdxOffset(SubReg));
+ if (FirstMaxAlignedSubRegOffset == Offset)
+ break;
+ }
+
+ unsigned NewOffsetOfMaxAlignedSubReg =
+ alignTo(FirstMaxAlignedSubRegOffset - Offset, MaxAlign);
+
+ if (NewOffsetOfMaxAlignedSubReg > FirstMaxAlignedSubRegOffset)
+ llvm_unreachable("misaligned subreg");
+
+ unsigned RShift = FirstMaxAlignedSubRegOffset - NewOffsetOfMaxAlignedSubReg;
+ return getRegClassWithShiftedSubregs(RC, RShift, End - RShift, 0, SubRegs);
+}
+
+// Only the subrange's lanemasks of the original interval need to be modified.
+// Subrange for a covering subreg becomes the main range.
+void GCNRewritePartialRegUses::updateLiveIntervals(Register OldReg,
+ Register NewReg,
+ SubRegMap &SubRegs) const {
+ if (!LIS->hasInterval(OldReg))
+ return;
+
+ auto &OldLI = LIS->getInterval(OldReg);
+ auto &NewLI = LIS->createEmptyInterval(NewReg);
+
+ auto &Allocator = LIS->getVNInfoAllocator();
+ NewLI.setWeight(OldLI.weight());
+
+ for (auto &SR : OldLI.subranges()) {
+ auto I = find_if(SubRegs, [&](auto &P) {
+ return SR.LaneMask == TRI->getSubRegIndexLaneMask(P.first);
+ });
+
+ if (I == SubRegs.end()) {
+ // There might be a situation when subranges don't exactly match used
+ // subregs, for example:
+ // %120 [160r,1392r:0) 0@160r
+ // L000000000000C000 [160r,1392r:0) 0@160r
+ // L0000000000003000 [160r,1392r:0) 0@160r
+ // L0000000000000C00 [160r,1392r:0) 0@160r
+ // L0000000000000300 [160r,1392r:0) 0@160r
+ // L0000000000000003 [160r,1104r:0) 0@160r
+ // L000000000000000C [160r,1104r:0) 0@160r
+ // L0000000000000030 [160r,1104r:0) 0@160r
+ // L00000000000000C0 [160r,1104r:0) 0@160r
+ // but used subregs are:
+ // sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7, L000000000000FFFF
+ // sub0_sub1_sub2_sub3, L00000000000000FF
+ // sub4_sub5_sub6_sub7, L000000000000FF00
+ // In this example subregs sub0_sub1_sub2_sub3 and sub4_sub5_sub6_sub7
+ // have several subranges with the same lifetime. For such cases just
+ // recreate the interval.
+ LIS->removeInterval(OldReg);
+ LIS->removeInterval(NewReg);
+ LIS->createAndComputeVirtRegInterval(NewReg);
+ return;
+ }
+
+ if (unsigned NewSubReg = I->second.SubReg)
+ NewLI.createSubRangeFrom(Allocator,
+ TRI->getSubRegIndexLaneMask(NewSubReg), SR);
+ else // This is the covering subreg (0 index) - set it as main range.
+ NewLI.assign(SR, Allocator);
+
+ SubRegs.erase(I);
+ }
+ if (NewLI.empty())
+ NewLI.assign(OldLI, Allocator);
+ NewLI.verify(MRI);
+ LIS->removeInterval(OldReg);
+}
+
+const TargetRegisterClass *
+GCNRewritePartialRegUses::getOperandRegClass(MachineOperand &MO) const {
+ MachineInstr *MI = MO.getParent();
+ return TII->getRegClass(TII->get(MI->getOpcode()), MI->getOperandNo(&MO), TRI,
+ *MI->getParent()->getParent());
+}
+
+bool GCNRewritePartialRegUses::rewriteReg(Register Reg) const {
+ auto Range = MRI->reg_nodbg_operands(Reg);
+ if (Range.begin() == Range.end())
+ return false;
+
+ for (MachineOperand &MO : Range) {
+ if (MO.getSubReg() == AMDGPU::NoSubRegister) // Whole reg used, quit.
+ return false;
+ }
+
+ auto *RC = MRI->getRegClass(Reg);
+ LLVM_DEBUG(dbgs() << "Try to rewrite partial reg " << printReg(Reg, TRI)
+ << ':' << TRI->getRegClassName(RC) << '\n');
+
+ // Collect used subregs and constrained reg classes inferred from instruction
+ // operands.
+ SubRegMap SubRegs;
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ assert(MO.getSubReg() != AMDGPU::NoSubRegister);
+ auto *OpDescRC = getOperandRegClass(MO);
+ const auto [I, Inserted] = SubRegs.try_emplace(MO.getSubReg(), OpDescRC);
+ if (!Inserted && OpDescRC) {
+ SubRegInfo &SRI = I->second;
+ SRI.RC = SRI.RC ? TRI->getCommonSubClass(SRI.RC, OpDescRC) : OpDescRC;
+ if (!SRI.RC) {
+ LLVM_DEBUG(dbgs() << " Couldn't find common target regclass\n");
+ return false;
+ }
+ }
+ }
+
+ auto *NewRC = getMinSizeReg(RC, SubRegs);
+ if (!NewRC) {
+ LLVM_DEBUG(dbgs() << " No improvement achieved\n");
+ return false;
+ }
+
+ Register NewReg = MRI->createVirtualRegister(NewRC);
+ LLVM_DEBUG(dbgs() << " Success " << printReg(Reg, TRI) << ':'
+ << TRI->getRegClassName(RC) << " -> "
+ << printReg(NewReg, TRI) << ':'
+ << TRI->getRegClassName(NewRC) << '\n');
+
+ for (auto &MO : make_early_inc_range(MRI->reg_operands(Reg))) {
+ MO.setReg(NewReg);
+ // Debug info can refer to the whole reg, just leave it as it is for now.
+ // TODO: create some DI shift expression?
+ if (MO.isDebug() && MO.getSubReg() == 0)
+ continue;
+ unsigned SubReg = SubRegs[MO.getSubReg()].SubReg;
+ MO.setSubReg(SubReg);
+ if (SubReg == AMDGPU::NoSubRegister && MO.isDef())
+ MO.setIsUndef(false);
+ }
+
+ if (LIS)
+ updateLiveIntervals(Reg, NewReg, SubRegs);
+
+ return true;
+}
+
+bool GCNRewritePartialRegUses::runOnMachineFunction(MachineFunction &MF) {
+ MRI = &MF.getRegInfo();
+ TRI = static_cast<const SIRegisterInfo *>(MRI->getTargetRegisterInfo());
+ TII = MF.getSubtarget().getInstrInfo();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+ bool Changed = false;
+ for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
+ Changed |= rewriteReg(Register::index2VirtReg(I));
+ }
+ return Changed;
+}
+
+char GCNRewritePartialRegUses::ID;
+
+char &llvm::GCNRewritePartialRegUsesID = GCNRewritePartialRegUses::ID;
+
+INITIALIZE_PASS_BEGIN(GCNRewritePartialRegUses, DEBUG_TYPE,
+ "Rewrite Partial Register Uses", false, false)
+INITIALIZE_PASS_END(GCNRewritePartialRegUses, DEBUG_TYPE,
+ "Rewrite Partial Register Uses", false, false)
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 6946a05bc551..994cfea1fd7d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -45,6 +45,13 @@ static cl::opt<unsigned> ScheduleMetricBias(
"100 to chase the occupancy only."),
cl::init(10));
+static cl::opt<bool>
+ RelaxedOcc("amdgpu-schedule-relaxed-occupancy", cl::Hidden,
+ cl::desc("Relax occupancy targets for kernels which are memory "
+ "bound (amdgpu-membound-threshold), or "
+ "Wave Limited (amdgpu-limit-wave-threshold)."),
+ cl::init(false));
+
const unsigned ScheduleMetrics::ScaleFactor = 100;
GCNSchedStrategy::GCNSchedStrategy(const MachineSchedContext *C)
@@ -67,7 +74,10 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
// Set the initial TargetOccupnacy to the maximum occupancy that we can
// achieve for this function. This effectively sets a lower bound on the
// 'Critical' register limits in the scheduler.
- TargetOccupancy = MFI.getOccupancy();
+ // Allow for lower occupancy targets if kernel is wave limited or memory
+ // bound, and using the relaxed occupancy feature.
+ TargetOccupancy =
+ RelaxedOcc ? MFI.getMinAllowedOccupancy() : MFI.getOccupancy();
SGPRCriticalLimit =
std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
@@ -471,6 +481,12 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(
StartingOccupancy(MFI.getOccupancy()), MinOccupancy(StartingOccupancy) {
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
+ if (RelaxedOcc) {
+ MinOccupancy = std::min(MFI.getMinAllowedOccupancy(), StartingOccupancy);
+ if (MinOccupancy != StartingOccupancy)
+ LLVM_DEBUG(dbgs() << "Allowing Occupancy drops to " << MinOccupancy
+ << ".\n");
+ }
}
std::unique_ptr<GCNSchedStage>
@@ -511,11 +527,19 @@ void GCNScheduleDAGMILive::computeBlockPressure(unsigned RegionIdx,
// If the block has the only successor then live-ins of that successor are
// live-outs of the current block. We can reuse calculated live set if the
// successor will be sent to scheduling past current block.
+
+ // However, due to the bug in LiveInterval analysis it may happen that two
+ // predecessors of the same successor block have different lane bitmasks for
+ // a live-out register. Workaround that by sticking to one-to-one relationship
+ // i.e. one predecessor with one successor block.
const MachineBasicBlock *OnlySucc = nullptr;
- if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) {
- SlotIndexes *Ind = LIS->getSlotIndexes();
- if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin()))
- OnlySucc = *MBB->succ_begin();
+ if (MBB->succ_size() == 1) {
+ auto *Candidate = *MBB->succ_begin();
+ if (!Candidate->empty() && Candidate->pred_size() == 1) {
+ SlotIndexes *Ind = LIS->getSlotIndexes();
+ if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(Candidate))
+ OnlySucc = Candidate;
+ }
}
// Scheduler sends regions from the end of the block upwards.
@@ -864,7 +888,8 @@ void GCNSchedStage::setupNewBlock() {
DAG.startBlock(CurrentMBB);
// Get real RP for the region if it hasn't be calculated before. After the
// initial schedule stage real RP will be collected after scheduling.
- if (StageID == GCNSchedStageID::OccInitialSchedule)
+ if (StageID == GCNSchedStageID::OccInitialSchedule ||
+ StageID == GCNSchedStageID::ILPInitialSchedule)
DAG.computeBlockPressure(RegionIdx, CurrentMBB);
}
@@ -1100,6 +1125,10 @@ bool UnclusteredHighRPStage::shouldRevertScheduling(unsigned WavesAfter) {
return true;
}
+ // Do not attempt to relax schedule even more if we are already spilling.
+ if (isRegionWithExcessRP())
+ return false;
+
LLVM_DEBUG(
dbgs()
<< "\n\t *** In shouldRevertScheduling ***\n"
@@ -1188,9 +1217,8 @@ void GCNSchedStage::revertScheduling() {
}
// Reset read-undef flags and update them later.
- for (auto &Op : MI->operands())
- if (Op.isReg() && Op.isDef())
- Op.setIsUndef(false);
+ for (auto &Op : MI->all_defs())
+ Op.setIsUndef(false);
RegisterOperands RegOpers;
RegOpers.collect(*MI, *DAG.TRI, DAG.MRI, DAG.ShouldTrackLaneMasks, false);
if (!MI->isDebugInstr()) {
@@ -1463,8 +1491,8 @@ bool PreRARematStage::isTriviallyReMaterializable(const MachineInstr &MI) {
if (!DAG.TII->isTriviallyReMaterializable(MI))
return false;
- for (const MachineOperand &MO : MI.operands())
- if (MO.isReg() && MO.isUse() && MO.getReg().isVirtual())
+ for (const MachineOperand &MO : MI.all_uses())
+ if (MO.getReg().isVirtual())
return false;
return true;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 2017ae84353c..ef5470df876d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -15,10 +15,12 @@
#define LLVM_LIB_TARGET_AMDGPU_GCNSUBTARGET_H
#include "AMDGPUCallLowering.h"
+#include "AMDGPURegisterBankInfo.h"
#include "AMDGPUSubtarget.h"
#include "SIFrameLowering.h"
#include "SIISelLowering.h"
#include "SIInstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
#define GET_SUBTARGETINFO_HEADER
@@ -51,7 +53,7 @@ private:
std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
std::unique_ptr<InstructionSelector> InstSelector;
std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ std::unique_ptr<AMDGPURegisterBankInfo> RegBankInfo;
protected:
// Basic subtarget description.
@@ -63,7 +65,6 @@ protected:
unsigned MaxPrivateElementSize = 0;
// Possibly statically set by tablegen, but may want to be overridden.
- bool FastFMAF32 = false;
bool FastDenormalF32 = false;
bool HalfRate64Ops = false;
bool FullRate64Ops = false;
@@ -132,7 +133,7 @@ protected:
bool HasA16 = false;
bool HasG16 = false;
bool HasNSAEncoding = false;
- unsigned NSAMaxSize = 0;
+ bool HasPartialNSAEncoding = false;
bool GFX10_AEncoding = false;
bool GFX10_BEncoding = false;
bool HasDLInsts = false;
@@ -146,12 +147,17 @@ protected:
bool HasDot7Insts = false;
bool HasDot8Insts = false;
bool HasDot9Insts = false;
+ bool HasDot10Insts = false;
bool HasMAIInsts = false;
bool HasFP8Insts = false;
bool HasPkFmacF16Inst = false;
+ bool HasAtomicDsPkAdd16Insts = false;
+ bool HasAtomicFlatPkAdd16Insts = false;
bool HasAtomicFaddRtnInsts = false;
bool HasAtomicFaddNoRtnInsts = false;
- bool HasAtomicPkFaddNoRtnInsts = false;
+ bool HasAtomicBufferGlobalPkAddF16NoRtnInsts = false;
+ bool HasAtomicBufferGlobalPkAddF16Insts = false;
+ bool HasAtomicGlobalPkAddBF16Inst = false;
bool HasFlatAtomicFaddF32Inst = false;
bool SupportsSRAMECC = false;
@@ -173,6 +179,7 @@ protected:
bool ScalarFlatScratchInsts = false;
bool HasArchitectedFlatScratch = false;
bool EnableFlatScratch = false;
+ bool HasArchitectedSGPRs = false;
bool AddNoCarryInsts = false;
bool HasUnpackedD16VMem = false;
bool LDSMisalignedBug = false;
@@ -198,6 +205,7 @@ protected:
bool HasMADIntraFwdBug = false;
bool HasVOPDInsts = false;
bool HasVALUTransUseHazard = false;
+ bool HasForceStoreSC0SC1 = false;
// Dummy feature to use for assembler in tablegen.
bool FeatureDisable = false;
@@ -248,7 +256,7 @@ public:
return Legalizer.get();
}
- const RegisterBankInfo *getRegBankInfo() const override {
+ const AMDGPURegisterBankInfo *getRegBankInfo() const override {
return RegBankInfo.get();
}
@@ -283,7 +291,7 @@ public:
/// Return the number of high bits known to be zero for a frame index.
unsigned getKnownHighZeroBitsForFrameIndex() const {
- return countLeadingZeros(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
+ return llvm::countl_zero(getMaxWaveScratchSize()) + getWavefrontSizeLog2();
}
int getLDSBankCount() const {
@@ -319,10 +327,6 @@ public:
return FP64;
}
- bool hasFastFMAF32() const {
- return FastFMAF32;
- }
-
bool hasHalfRate64Ops() const {
return HalfRate64Ops;
}
@@ -738,6 +742,10 @@ public:
return HasDot9Insts;
}
+ bool hasDot10Insts() const {
+ return HasDot10Insts;
+ }
+
bool hasMAIInsts() const {
return HasMAIInsts;
}
@@ -750,6 +758,10 @@ public:
return HasPkFmacF16Inst;
}
+ bool hasAtomicDsPkAdd16Insts() const { return HasAtomicDsPkAdd16Insts; }
+
+ bool hasAtomicFlatPkAdd16Insts() const { return HasAtomicFlatPkAdd16Insts; }
+
bool hasAtomicFaddInsts() const {
return HasAtomicFaddRtnInsts || HasAtomicFaddNoRtnInsts;
}
@@ -758,7 +770,17 @@ public:
bool hasAtomicFaddNoRtnInsts() const { return HasAtomicFaddNoRtnInsts; }
- bool hasAtomicPkFaddNoRtnInsts() const { return HasAtomicPkFaddNoRtnInsts; }
+ bool hasAtomicBufferGlobalPkAddF16NoRtnInsts() const {
+ return HasAtomicBufferGlobalPkAddF16NoRtnInsts;
+ }
+
+ bool hasAtomicBufferGlobalPkAddF16Insts() const {
+ return HasAtomicBufferGlobalPkAddF16Insts;
+ }
+
+ bool hasAtomicGlobalPkAddBF16Inst() const {
+ return HasAtomicGlobalPkAddBF16Inst;
+ }
bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
@@ -924,7 +946,9 @@ public:
bool hasNSAEncoding() const { return HasNSAEncoding; }
- unsigned getNSAMaxSize() const { return NSAMaxSize; }
+ bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
+
+ unsigned getNSAMaxSize() const { return AMDGPU::getNSAMaxSize(*this); }
bool hasGFX10_AEncoding() const {
return GFX10_AEncoding;
@@ -1070,6 +1094,8 @@ public:
bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
+ bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
+
bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; }
/// Return if operations acting on VGPR tuples require even alignment.
@@ -1126,6 +1152,9 @@ public:
/// In this case it is readonly.
bool flatScratchIsArchitected() const { return HasArchitectedFlatScratch; }
+ /// \returns true if the architected SGPRs are enabled.
+ bool hasArchitectedSGPRs() const { return HasArchitectedSGPRs; }
+
/// \returns true if the machine has merged shaders in which s0-s7 are
/// reserved by the hardware and user SGPRs start at s8
bool hasMergedShaders() const {
@@ -1323,6 +1352,14 @@ public:
// \returns the number of address arguments from which to enable MIMG NSA
// on supported architectures.
unsigned getNSAThreshold(const MachineFunction &MF) const;
+
+ // \returns true if the subtarget has a hazard requiring an "s_nop 0"
+ // instruction before "s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)".
+ bool requiresNopBeforeDeallocVGPRs() const {
+ // Currently all targets that support the dealloc VGPRs message also require
+ // the nop.
+ return true;
+ }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
index 95ea42267ccf..29c9b9ccf276 100644
--- a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -63,7 +63,7 @@ bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
}() && "Expected FirstMI to precede SecondMI");
// Cannot pair dependent instructions
for (const auto &Use : SecondMI.uses())
- if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg()))
+ if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg(), TRI))
return false;
auto getVRegIdx = [&](unsigned OpcodeIdx, unsigned OperandIdx) {
diff --git a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
index 1f65376890da..4956a1586774 100644
--- a/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
+++ b/llvm/lib/Target/AMDGPU/LDSDIRInstructions.td
@@ -34,7 +34,7 @@ class LDSDIRe<bits<2> op, bit is_direct> : Enc32 {
class LDSDIR_getIns<bit direct> {
dag ret = !if(direct,
(ins wait_vdst:$waitvdst),
- (ins Attr:$attr, AttrChan:$attrchan, wait_vdst:$waitvdst)
+ (ins InterpAttr:$attr, InterpAttrChan:$attrchan, wait_vdst:$waitvdst)
);
}
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
index 24c9cc2d7dd2..a1f8be403c44 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.cpp
@@ -348,9 +348,9 @@ createAMDGPUInstrPostProcess(const MCSubtargetInfo &STI,
/// Extern function to initialize the targets for the AMDGPU backend
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMCA() {
- TargetRegistry::RegisterCustomBehaviour(getTheAMDGPUTarget(),
+ TargetRegistry::RegisterCustomBehaviour(getTheR600Target(),
createAMDGPUCustomBehaviour);
- TargetRegistry::RegisterInstrPostProcess(getTheAMDGPUTarget(),
+ TargetRegistry::RegisterInstrPostProcess(getTheR600Target(),
createAMDGPUInstrPostProcess);
TargetRegistry::RegisterCustomBehaviour(getTheGCNTarget(),
diff --git a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
index 7a0d454c3578..cb1436d319c9 100644
--- a/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
+++ b/llvm/lib/Target/AMDGPU/MCA/AMDGPUCustomBehaviour.h
@@ -19,7 +19,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/MCA/CustomBehaviour.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/TargetParser/TargetParser.h"
namespace llvm {
namespace mca {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index f0653aec925d..44109b9d2919 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -19,7 +19,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/TargetRegistry.h"
#include "llvm/Support/EndianStream.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -79,7 +79,7 @@ bool AMDGPUAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
bool AMDGPUAsmBackend::mayNeedRelaxation(const MCInst &Inst,
const MCSubtargetInfo &STI) const {
- if (!STI.getFeatureBits()[AMDGPU::FeatureOffset3fBug])
+ if (!STI.hasFeature(AMDGPU::FeatureOffset3fBug))
return false;
if (AMDGPU::getSOPPWithRelaxation(Inst.getOpcode()) >= 0)
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 066b36622a16..3f188478ca8b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -74,9 +74,9 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AMDGPU_REL32;
case FK_Data_4:
case FK_SecRel_4:
- return ELF::R_AMDGPU_ABS32;
+ return IsPCRel ? ELF::R_AMDGPU_REL32 : ELF::R_AMDGPU_ABS32;
case FK_Data_8:
- return ELF::R_AMDGPU_ABS64;
+ return IsPCRel ? ELF::R_AMDGPU_REL64 : ELF::R_AMDGPU_ABS64;
}
if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index e465267f2c20..ad55c73b22ea 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -19,7 +19,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -60,11 +60,6 @@ void AMDGPUInstPrinter::printU4ImmOperand(const MCInst *MI, unsigned OpNo,
O << formatHex(MI->getOperand(OpNo).getImm() & 0xf);
}
-void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- O << formatHex(MI->getOperand(OpNo).getImm() & 0xff);
-}
-
void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -105,21 +100,6 @@ void AMDGPUInstPrinter::printNamedBit(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printOffen(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "offen");
-}
-
-void AMDGPUInstPrinter::printIdxen(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "idxen");
-}
-
-void AMDGPUInstPrinter::printAddr64(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "addr64");
-}
-
void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -141,15 +121,10 @@ void AMDGPUInstPrinter::printFlatOffset(const MCInst *MI, unsigned OpNo,
bool IsFlatSeg = !(Desc.TSFlags &
(SIInstrFlags::FlatGlobal | SIInstrFlags::FlatScratch));
- if (IsFlatSeg) { // Unsigned offset
+ if (IsFlatSeg) // Unsigned offset
printU16ImmDecOperand(MI, OpNo, O);
- } else { // Signed offset
- if (AMDGPU::isGFX10(STI)) {
- O << formatDec(SignExtend32<12>(MI->getOperand(OpNo).getImm()));
- } else {
- O << formatDec(SignExtend32<13>(MI->getOperand(OpNo).getImm()));
- }
- }
+ else // Signed offset
+ O << formatDec(SignExtend32(Imm, AMDGPU::getNumFlatOffsetBits(STI)));
}
}
@@ -196,11 +171,6 @@ void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
printU32ImmOperand(MI, OpNo, STI, O);
}
-void AMDGPUInstPrinter::printGDS(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "gds");
-}
-
void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
auto Imm = MI->getOperand(OpNo).getImm();
@@ -218,15 +188,6 @@ void AMDGPUInstPrinter::printCPol(const MCInst *MI, unsigned OpNo,
O << " /* unexpected cache policy bit */";
}
-void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
-}
-
-void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "tfe");
-}
-
void AMDGPUInstPrinter::printDMask(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
if (MI->getOperand(OpNo).getImm()) {
@@ -247,16 +208,6 @@ void AMDGPUInstPrinter::printDim(const MCInst *MI, unsigned OpNo,
O << Dim;
}
-void AMDGPUInstPrinter::printUNorm(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "unorm");
-}
-
-void AMDGPUInstPrinter::printDA(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "da");
-}
-
void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
if (STI.hasFeature(AMDGPU::FeatureR128A16))
@@ -265,33 +216,6 @@ void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "r128");
}
-void AMDGPUInstPrinter::printA16(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "a16");
-}
-
-void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "lwe");
-}
-
-void AMDGPUInstPrinter::printD16(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "d16");
-}
-
-void AMDGPUInstPrinter::printExpCompr(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "compr");
-}
-
-void AMDGPUInstPrinter::printExpVM(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "vm");
-}
-
void AMDGPUInstPrinter::printFORMAT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -462,7 +386,7 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
else if (Imm == 0xC400)
O<< "-4.0";
else if (Imm == 0x3118 &&
- STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) {
+ STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm)) {
O << "0.15915494";
} else {
uint64_t Imm16 = static_cast<uint16_t>(Imm);
@@ -486,26 +410,26 @@ void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
return;
}
- if (Imm == FloatToBits(0.0f))
+ if (Imm == llvm::bit_cast<uint32_t>(0.0f))
O << "0.0";
- else if (Imm == FloatToBits(1.0f))
+ else if (Imm == llvm::bit_cast<uint32_t>(1.0f))
O << "1.0";
- else if (Imm == FloatToBits(-1.0f))
+ else if (Imm == llvm::bit_cast<uint32_t>(-1.0f))
O << "-1.0";
- else if (Imm == FloatToBits(0.5f))
+ else if (Imm == llvm::bit_cast<uint32_t>(0.5f))
O << "0.5";
- else if (Imm == FloatToBits(-0.5f))
+ else if (Imm == llvm::bit_cast<uint32_t>(-0.5f))
O << "-0.5";
- else if (Imm == FloatToBits(2.0f))
+ else if (Imm == llvm::bit_cast<uint32_t>(2.0f))
O << "2.0";
- else if (Imm == FloatToBits(-2.0f))
+ else if (Imm == llvm::bit_cast<uint32_t>(-2.0f))
O << "-2.0";
- else if (Imm == FloatToBits(4.0f))
+ else if (Imm == llvm::bit_cast<uint32_t>(4.0f))
O << "4.0";
- else if (Imm == FloatToBits(-4.0f))
+ else if (Imm == llvm::bit_cast<uint32_t>(-4.0f))
O << "-4.0";
else if (Imm == 0x3e22f983 &&
- STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+ STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
O << "0.15915494";
else
O << formatHex(static_cast<uint64_t>(Imm));
@@ -520,26 +444,26 @@ void AMDGPUInstPrinter::printImmediate64(uint64_t Imm,
return;
}
- if (Imm == DoubleToBits(0.0))
+ if (Imm == llvm::bit_cast<uint64_t>(0.0))
O << "0.0";
- else if (Imm == DoubleToBits(1.0))
+ else if (Imm == llvm::bit_cast<uint64_t>(1.0))
O << "1.0";
- else if (Imm == DoubleToBits(-1.0))
+ else if (Imm == llvm::bit_cast<uint64_t>(-1.0))
O << "-1.0";
- else if (Imm == DoubleToBits(0.5))
+ else if (Imm == llvm::bit_cast<uint64_t>(0.5))
O << "0.5";
- else if (Imm == DoubleToBits(-0.5))
+ else if (Imm == llvm::bit_cast<uint64_t>(-0.5))
O << "-0.5";
- else if (Imm == DoubleToBits(2.0))
+ else if (Imm == llvm::bit_cast<uint64_t>(2.0))
O << "2.0";
- else if (Imm == DoubleToBits(-2.0))
+ else if (Imm == llvm::bit_cast<uint64_t>(-2.0))
O << "-2.0";
- else if (Imm == DoubleToBits(4.0))
+ else if (Imm == llvm::bit_cast<uint64_t>(4.0))
O << "4.0";
- else if (Imm == DoubleToBits(-4.0))
+ else if (Imm == llvm::bit_cast<uint64_t>(-4.0))
O << "-4.0";
else if (Imm == 0x3fc45f306dc9c882 &&
- STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
+ STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
O << "0.15915494309189532";
else {
assert(isUInt<32>(Imm) || isInt<32>(Imm));
@@ -597,7 +521,7 @@ void AMDGPUInstPrinter::printDefaultVccOperand(bool FirstOperand,
raw_ostream &O) {
if (!FirstOperand)
O << ", ";
- printRegOperand(STI.getFeatureBits()[AMDGPU::FeatureWavefrontSize64]
+ printRegOperand(STI.hasFeature(AMDGPU::FeatureWavefrontSize64)
? AMDGPU::VCC
: AMDGPU::VCC_LO,
O, MRI);
@@ -718,7 +642,7 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
if (!isUInt<16>(Op.getImm()) &&
- STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) {
+ STI.hasFeature(AMDGPU::FeatureVOP3Literal)) {
printImmediate32(Op.getImm(), STI, O);
break;
}
@@ -742,9 +666,10 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
O << formatDec(Op.getImm());
break;
case MCOI::OPERAND_REGISTER:
- // FIXME: This should be removed and handled somewhere else. Seems to come
- // from a disassembler bug.
- O << "/*invalid immediate*/";
+ // Disassembler does not fail when operand should not allow immediate
+ // operands but decodes them into 32bit immediate operand.
+ printImmediate32(Op.getImm(), STI, O);
+ O << "/*Invalid immediate*/";
break;
default:
// We hit this for the immediate instruction bits that don't yet have a
@@ -761,9 +686,9 @@ void AMDGPUInstPrinter::printRegularOperand(const MCInst *MI, unsigned OpNo,
int RCID = Desc.operands()[OpNo].RegClass;
unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
if (RCBits == 32)
- printImmediate32(FloatToBits(Value), STI, O);
+ printImmediate32(llvm::bit_cast<uint32_t>((float)Value), STI, O);
else if (RCBits == 64)
- printImmediate64(DoubleToBits(Value), STI, O);
+ printImmediate64(llvm::bit_cast<uint64_t>(Value), STI, O);
else
llvm_unreachable("Invalid register class size");
}
@@ -1012,16 +937,16 @@ void AMDGPUInstPrinter::printDPPCtrl(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printRowMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
+void AMDGPUInstPrinter::printDppRowMask(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
O << " row_mask:";
printU4ImmOperand(MI, OpNo, STI, O);
}
-void AMDGPUInstPrinter::printBankMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
+void AMDGPUInstPrinter::printDppBankMask(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
O << " bank_mask:";
printU4ImmOperand(MI, OpNo, STI, O);
}
@@ -1035,9 +960,8 @@ void AMDGPUInstPrinter::printDppBoundCtrl(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printFI(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
+void AMDGPUInstPrinter::printDppFI(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
using namespace llvm::AMDGPU::DPP;
unsigned Imm = MI->getOperand(OpNo).getImm();
if (Imm == DPP_FI_1 || Imm == DPP8_FI_1) {
@@ -1287,9 +1211,9 @@ void AMDGPUInstPrinter::printInterpAttrChan(const MCInst *MI, unsigned OpNum,
O << '.' << "xyzw"[Chan & 0x3];
}
-void AMDGPUInstPrinter::printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
+void AMDGPUInstPrinter::printGPRIdxMode(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
using namespace llvm::AMDGPU::VGPRIndexMode;
unsigned Val = MI->getOperand(OpNo).getImm();
@@ -1338,18 +1262,6 @@ void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo,
O << Asm;
}
-void AMDGPUInstPrinter::printHigh(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "high");
-}
-
-void AMDGPUInstPrinter::printClampSI(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- printNamedBit(MI, OpNo, O, "clamp");
-}
-
void AMDGPUInstPrinter::printOModSI(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
@@ -1496,7 +1408,7 @@ void AMDGPUInstPrinter::printSwizzle(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printSWaitCnt(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI.getCPU());
@@ -1558,7 +1470,7 @@ void AMDGPUInstPrinter::printDepCtr(const MCInst *MI, unsigned OpNo,
}
}
-void AMDGPUInstPrinter::printDelayFlag(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printSDelayALU(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
const char *BadInstId = "/* invalid instid value */";
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index 3486cca712ae..3b14faab136b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -38,7 +38,6 @@ public:
private:
void printU4ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -48,9 +47,6 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printNamedBit(const MCInst *MI, unsigned OpNo, raw_ostream &O,
StringRef BitName);
- void printOffen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printIdxen(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printAddr64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printFlatOffset(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -68,34 +64,14 @@ private:
const MCSubtargetInfo &STI, raw_ostream &O);
void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printGDS(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
void printCPol(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printDim(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printUNorm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printDA(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printA16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printLWE(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
- void printD16(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
- void printExpCompr(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
- void printExpVM(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
void printFORMAT(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printSymbolicFormat(const MCInst *MI,
@@ -132,14 +108,14 @@ private:
raw_ostream &O);
void printDPPCtrl(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printRowMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printBankMask(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDppRowMask(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDppBankMask(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printDppBoundCtrl(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printFI(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
+ void printDppFI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printSDWASel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printSDWADstSel(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -166,8 +142,8 @@ private:
void printInterpAttrChan(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printVGPRIndexMode(const MCInst *MI, unsigned OpNo,
- const MCSubtargetInfo &STI, raw_ostream &O);
+ void printGPRIdxMode(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
void printMemOperand(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printBLGP(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -205,12 +181,8 @@ public:
protected:
void printAbs(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printHigh(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
void printClamp(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printClampSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
- raw_ostream &O);
void printOModSI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printLiteral(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
@@ -241,11 +213,11 @@ protected:
raw_ostream &O);
void printSwizzle(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printWaitFlag(const MCInst *MI, unsigned OpNo,
+ void printSWaitCnt(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printDepCtr(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
- void printDelayFlag(const MCInst *MI, unsigned OpNo,
+ void printSDelayALU(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printHwreg(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index ded3fb7ab8d9..d539d75fdff0 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -8,9 +8,9 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUMCAsmInfo.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCSubtargetInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/TargetParser/Triple.h"
using namespace llvm;
@@ -40,7 +40,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
HasNoDeadStrip = true;
//===--- Dwarf Emission Directives -----------------------------------===//
SupportsDebugInformation = true;
- UsesCFIForDebug = true;
+ UsesCFIWithoutEH = true;
DwarfRegNumForCFI = true;
UseIntegratedAssembler = false;
@@ -58,11 +58,11 @@ unsigned AMDGPUMCAsmInfo::getMaxInstLength(const MCSubtargetInfo *STI) const {
return MaxInstLength;
// Maximum for NSA encoded images
- if (STI->getFeatureBits()[AMDGPU::FeatureNSAEncoding])
+ if (STI->hasFeature(AMDGPU::FeatureNSAEncoding))
return 20;
// 64-bit instruction with 32-bit literal.
- if (STI->getFeatureBits()[AMDGPU::FeatureVOP3Literal])
+ if (STI->hasFeature(AMDGPU::FeatureVOP3Literal))
return 12;
return 8;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index 93bec8aaadfd..5e77a8caa04e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUCodeEmitter.cpp - AMDGPU Code Emitter interface -------------===//
+//===-- AMDGPUMCCodeEmitter.cpp - AMDGPU Code Emitter ---------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -7,14 +7,586 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// CodeEmitter interface for SI codegen.
+/// The AMDGPU code emitter produces machine code that can be executed
+/// directly on the GPU device.
//
//===----------------------------------------------------------------------===//
-#include "AMDGPUMCCodeEmitter.h"
+#include "MCTargetDesc/AMDGPUFixupKinds.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIDefines.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/TargetParser/SubtargetFeature.h"
+#include <optional>
using namespace llvm;
-// pin vtable to this file
-void AMDGPUMCCodeEmitter::anchor() {}
+namespace {
+class AMDGPUMCCodeEmitter : public MCCodeEmitter {
+ const MCRegisterInfo &MRI;
+ const MCInstrInfo &MCII;
+
+public:
+ AMDGPUMCCodeEmitter(const MCInstrInfo &MCII, const MCRegisterInfo &MRI)
+ : MRI(MRI), MCII(MCII) {}
+
+ /// Encode the instruction and write it to the OS.
+ void encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ void getMachineOpValue(const MCInst &MI, const MCOperand &MO, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Use a fixup to encode the simm16 field for SOPP branch
+ /// instructions.
+ void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+private:
+ uint64_t getImplicitOpSelHiEncoding(int Opcode) const;
+ void getMachineOpValueCommon(const MCInst &MI, const MCOperand &MO,
+ unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// Encode an fp or int literal.
+ std::optional<uint32_t> getLitEncoding(const MCOperand &MO,
+ const MCOperandInfo &OpInfo,
+ const MCSubtargetInfo &STI) const;
+
+ void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
+ APInt &Inst, APInt &Scratch,
+ const MCSubtargetInfo &STI) const;
+};
+
+} // end anonymous namespace
+
+MCCodeEmitter *llvm::createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
+ MCContext &Ctx) {
+ return new AMDGPUMCCodeEmitter(MCII, *Ctx.getRegisterInfo());
+}
+
+// Returns the encoding value to use if the given integer is an integer inline
+// immediate value, or 0 if it is not.
+template <typename IntTy>
+static uint32_t getIntInlineImmEncoding(IntTy Imm) {
+ if (Imm >= 0 && Imm <= 64)
+ return 128 + Imm;
+
+ if (Imm >= -16 && Imm <= -1)
+ return 192 + std::abs(Imm);
+
+ return 0;
+}
+
+static uint32_t getLit16IntEncoding(uint16_t Val, const MCSubtargetInfo &STI) {
+ uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
+ return IntImm == 0 ? 255 : IntImm;
+}
+
+static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) {
+ uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
+ if (IntImm != 0)
+ return IntImm;
+
+ if (Val == 0x3800) // 0.5
+ return 240;
+
+ if (Val == 0xB800) // -0.5
+ return 241;
+
+ if (Val == 0x3C00) // 1.0
+ return 242;
+
+ if (Val == 0xBC00) // -1.0
+ return 243;
+
+ if (Val == 0x4000) // 2.0
+ return 244;
+
+ if (Val == 0xC000) // -2.0
+ return 245;
+
+ if (Val == 0x4400) // 4.0
+ return 246;
+
+ if (Val == 0xC400) // -4.0
+ return 247;
+
+ if (Val == 0x3118 && // 1.0 / (2.0 * pi)
+ STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
+ return 248;
+
+ return 255;
+}
+
+static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) {
+ uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val));
+ if (IntImm != 0)
+ return IntImm;
+
+ if (Val == llvm::bit_cast<uint32_t>(0.5f))
+ return 240;
+
+ if (Val == llvm::bit_cast<uint32_t>(-0.5f))
+ return 241;
+
+ if (Val == llvm::bit_cast<uint32_t>(1.0f))
+ return 242;
+
+ if (Val == llvm::bit_cast<uint32_t>(-1.0f))
+ return 243;
+
+ if (Val == llvm::bit_cast<uint32_t>(2.0f))
+ return 244;
+
+ if (Val == llvm::bit_cast<uint32_t>(-2.0f))
+ return 245;
+
+ if (Val == llvm::bit_cast<uint32_t>(4.0f))
+ return 246;
+
+ if (Val == llvm::bit_cast<uint32_t>(-4.0f))
+ return 247;
+
+ if (Val == 0x3e22f983 && // 1.0 / (2.0 * pi)
+ STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
+ return 248;
+
+ return 255;
+}
+
+static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI) {
+ uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val));
+ if (IntImm != 0)
+ return IntImm;
+
+ if (Val == llvm::bit_cast<uint64_t>(0.5))
+ return 240;
+
+ if (Val == llvm::bit_cast<uint64_t>(-0.5))
+ return 241;
+
+ if (Val == llvm::bit_cast<uint64_t>(1.0))
+ return 242;
+
+ if (Val == llvm::bit_cast<uint64_t>(-1.0))
+ return 243;
+
+ if (Val == llvm::bit_cast<uint64_t>(2.0))
+ return 244;
+
+ if (Val == llvm::bit_cast<uint64_t>(-2.0))
+ return 245;
+
+ if (Val == llvm::bit_cast<uint64_t>(4.0))
+ return 246;
+
+ if (Val == llvm::bit_cast<uint64_t>(-4.0))
+ return 247;
+
+ if (Val == 0x3fc45f306dc9c882 && // 1.0 / (2.0 * pi)
+ STI.hasFeature(AMDGPU::FeatureInv2PiInlineImm))
+ return 248;
+
+ return 255;
+}
+
+std::optional<uint32_t>
+AMDGPUMCCodeEmitter::getLitEncoding(const MCOperand &MO,
+ const MCOperandInfo &OpInfo,
+ const MCSubtargetInfo &STI) const {
+ int64_t Imm;
+ if (MO.isExpr()) {
+ const auto *C = dyn_cast<MCConstantExpr>(MO.getExpr());
+ if (!C)
+ return 255;
+
+ Imm = C->getValue();
+ } else {
+
+ assert(!MO.isDFPImm());
+
+ if (!MO.isImm())
+ return {};
+
+ Imm = MO.getImm();
+ }
+
+ switch (OpInfo.OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_INT32:
+ case AMDGPU::OPERAND_REG_IMM_FP32:
+ case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
+ case AMDGPU::OPERAND_REG_IMM_V2INT32:
+ case AMDGPU::OPERAND_REG_IMM_V2FP32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
+ return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
+
+ case AMDGPU::OPERAND_REG_IMM_INT64:
+ case AMDGPU::OPERAND_REG_IMM_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
+ return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
+
+ case AMDGPU::OPERAND_REG_IMM_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
+ // FIXME Is this correct? What do inline immediates do on SI for f16 src
+ // which does not have f16 support?
+ return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16: {
+ if (!isUInt<16>(Imm) && STI.hasFeature(AMDGPU::FeatureVOP3Literal))
+ return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
+ if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
+ return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+ [[fallthrough]];
+ }
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
+ uint16_t Lo16 = static_cast<uint16_t>(Imm);
+ uint32_t Encoding = getLit16Encoding(Lo16, STI);
+ return Encoding;
+ }
+ case AMDGPU::OPERAND_KIMM32:
+ case AMDGPU::OPERAND_KIMM16:
+ return MO.getImm();
+ default:
+ llvm_unreachable("invalid operand size");
+ }
+}
+
+uint64_t AMDGPUMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const {
+ using namespace AMDGPU::VOP3PEncoding;
+ using namespace AMDGPU::OpName;
+
+ if (AMDGPU::hasNamedOperand(Opcode, op_sel_hi)) {
+ if (AMDGPU::hasNamedOperand(Opcode, src2))
+ return 0;
+ if (AMDGPU::hasNamedOperand(Opcode, src1))
+ return OP_SEL_HI_2;
+ if (AMDGPU::hasNamedOperand(Opcode, src0))
+ return OP_SEL_HI_1 | OP_SEL_HI_2;
+ }
+ return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2;
+}
+
+static bool isVCMPX64(const MCInstrDesc &Desc) {
+ return (Desc.TSFlags & SIInstrFlags::VOP3) &&
+ Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC);
+}
+
+void AMDGPUMCCodeEmitter::encodeInstruction(const MCInst &MI,
+ SmallVectorImpl<char> &CB,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ int Opcode = MI.getOpcode();
+ APInt Encoding, Scratch;
+ getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI);
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ unsigned bytes = Desc.getSize();
+
+ // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions.
+ // Note that accvgpr_read/write are MAI, have src0, but do not use op_sel.
+ if ((Desc.TSFlags & SIInstrFlags::VOP3P) ||
+ Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
+ Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) {
+ Encoding |= getImplicitOpSelHiEncoding(Opcode);
+ }
+
+ // GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC.
+ // Documentation requires dst to be encoded as EXEC (0x7E),
+ // but it looks like the actual value encoded for dst operand
+ // is ignored by HW. It was decided to define dst as "do not care"
+ // in td files to allow disassembler accept any dst value.
+ // However, dst is encoded as EXEC for compatibility with SP3.
+ if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) {
+ assert((Encoding & 0xFF) == 0);
+ Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO);
+ }
+
+ for (unsigned i = 0; i < bytes; i++) {
+ CB.push_back((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i));
+ }
+
+ // NSA encoding.
+ if (AMDGPU::isGFX10Plus(STI) && Desc.TSFlags & SIInstrFlags::MIMG) {
+ int vaddr0 = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::vaddr0);
+ int srsrc = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::srsrc);
+ assert(vaddr0 >= 0 && srsrc > vaddr0);
+ unsigned NumExtraAddrs = srsrc - vaddr0 - 1;
+ unsigned NumPadding = (-NumExtraAddrs) & 3;
+
+ for (unsigned i = 0; i < NumExtraAddrs; ++i) {
+ getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), Encoding, Fixups,
+ STI);
+ CB.push_back((uint8_t)Encoding.getLimitedValue());
+ }
+ CB.append(NumPadding, 0);
+ }
+
+ if ((bytes > 8 && STI.hasFeature(AMDGPU::FeatureVOP3Literal)) ||
+ (bytes > 4 && !STI.hasFeature(AMDGPU::FeatureVOP3Literal)))
+ return;
+
+ // Do not print literals from SISrc Operands for insts with mandatory literals
+ if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::imm))
+ return;
+
+ // Check for additional literals
+ for (unsigned i = 0, e = Desc.getNumOperands(); i < e; ++i) {
+
+ // Check if this operand should be encoded as [SV]Src
+ if (!AMDGPU::isSISrcOperand(Desc, i))
+ continue;
+
+ // Is this operand a literal immediate?
+ const MCOperand &Op = MI.getOperand(i);
+ auto Enc = getLitEncoding(Op, Desc.operands()[i], STI);
+ if (!Enc || *Enc != 255)
+ continue;
+
+ // Yes! Encode it
+ int64_t Imm = 0;
+
+ if (Op.isImm())
+ Imm = Op.getImm();
+ else if (Op.isExpr()) {
+ if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr()))
+ Imm = C->getValue();
+
+ } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
+ llvm_unreachable("Must be immediate or expr");
+
+ support::endian::write<uint32_t>(CB, Imm, support::endianness::little);
+
+ // Only one literal value allowed
+ break;
+ }
+}
+
+void AMDGPUMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
+ APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ if (MO.isExpr()) {
+ const MCExpr *Expr = MO.getExpr();
+ MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br;
+ Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
+ Op = APInt::getZero(96);
+ } else {
+ getMachineOpValue(MI, MO, Op, Fixups, STI);
+ }
+}
+
+void AMDGPUMCCodeEmitter::getSMEMOffsetEncoding(
+ const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
+ auto Offset = MI.getOperand(OpNo).getImm();
+ // VI only supports 20-bit unsigned offsets.
+ assert(!AMDGPU::isVI(STI) || isUInt<20>(Offset));
+ Op = Offset;
+}
+
+void AMDGPUMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
+ APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ using namespace AMDGPU::SDWA;
+
+ uint64_t RegEnc = 0;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ if (MO.isReg()) {
+ unsigned Reg = MO.getReg();
+ RegEnc |= MRI.getEncodingValue(Reg);
+ RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
+ if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
+ RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
+ }
+ Op = RegEnc;
+ return;
+ } else {
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI);
+ if (Enc && *Enc != 255) {
+ Op = *Enc | SDWA9EncValues::SRC_SGPR_MASK;
+ return;
+ }
+ }
+
+ llvm_unreachable("Unsupported operand kind");
+}
+
+void AMDGPUMCCodeEmitter::getSDWAVopcDstEncoding(
+ const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
+ using namespace AMDGPU::SDWA;
+
+ uint64_t RegEnc = 0;
+
+ const MCOperand &MO = MI.getOperand(OpNo);
+
+ unsigned Reg = MO.getReg();
+ if (Reg != AMDGPU::VCC && Reg != AMDGPU::VCC_LO) {
+ RegEnc |= MRI.getEncodingValue(Reg);
+ RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
+ RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK;
+ }
+ Op = RegEnc;
+}
+
+void AMDGPUMCCodeEmitter::getAVOperandEncoding(
+ const MCInst &MI, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
+ unsigned Reg = MI.getOperand(OpNo).getReg();
+ uint64_t Enc = MRI.getEncodingValue(Reg);
+
+ // VGPR and AGPR have the same encoding, but SrcA and SrcB operands of mfma
+ // instructions use acc[0:1] modifier bits to distinguish. These bits are
+ // encoded as a virtual 9th bit of the register for these operands.
+ if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_96RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_288RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_320RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_352RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_384RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
+ Enc |= 512;
+
+ Op = Enc;
+}
+
+static bool needsPCRel(const MCExpr *Expr) {
+ switch (Expr->getKind()) {
+ case MCExpr::SymbolRef: {
+ auto *SE = cast<MCSymbolRefExpr>(Expr);
+ MCSymbolRefExpr::VariantKind Kind = SE->getKind();
+ return Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_LO &&
+ Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_HI;
+ }
+ case MCExpr::Binary: {
+ auto *BE = cast<MCBinaryExpr>(Expr);
+ if (BE->getOpcode() == MCBinaryExpr::Sub)
+ return false;
+ return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS());
+ }
+ case MCExpr::Unary:
+ return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr());
+ case MCExpr::Target:
+ case MCExpr::Constant:
+ return false;
+ }
+ llvm_unreachable("invalid kind");
+}
+
+void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+ const MCOperand &MO, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MO.isReg()){
+ Op = MRI.getEncodingValue(MO.getReg());
+ return;
+ }
+ unsigned OpNo = &MO - MI.begin();
+ getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI);
+}
+
+void AMDGPUMCCodeEmitter::getMachineOpValueCommon(
+ const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op,
+ SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
+
+ if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
+ // FIXME: If this is expression is PCRel or not should not depend on what
+ // the expression looks like. Given that this is just a general expression,
+ // it should probably be FK_Data_4 and whatever is producing
+ //
+ // s_add_u32 s2, s2, (extern_const_addrspace+16
+ //
+ // And expecting a PCRel should instead produce
+ //
+ // .Ltmp1:
+ // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1
+ MCFixupKind Kind;
+ if (needsPCRel(MO.getExpr()))
+ Kind = FK_PCRel_4;
+ else
+ Kind = FK_Data_4;
+
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint32_t Offset = Desc.getSize();
+ assert(Offset == 4 || Offset == 8);
+
+ Fixups.push_back(MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc()));
+ }
+
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
+ if (auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI)) {
+ Op = *Enc;
+ return;
+ }
+ } else if (MO.isImm()) {
+ Op = MO.getImm();
+ return;
+ }
+
+ llvm_unreachable("Encoding of this operand type is not supported yet.");
+}
+
+#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
deleted file mode 100644
index 200c9b8726e2..000000000000
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ /dev/null
@@ -1,68 +0,0 @@
-//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// CodeEmitter interface for SI codegen.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
-#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUMCCODEEMITTER_H
-
-#include "llvm/ADT/APInt.h"
-#include "llvm/MC/MCCodeEmitter.h"
-
-namespace llvm {
-
-class MCInst;
-class MCInstrInfo;
-class MCOperand;
-class MCSubtargetInfo;
-
-class AMDGPUMCCodeEmitter : public MCCodeEmitter {
- virtual void anchor();
-
-protected:
- const MCInstrInfo &MCII;
-
- AMDGPUMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
-
-public:
- void getBinaryCodeForInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups,
- APInt &Inst, APInt &Scratch,
- const MCSubtargetInfo &STI) const;
-
- virtual void getMachineOpValue(const MCInst &MI, const MCOperand &MO,
- APInt &Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const = 0;
-
- virtual void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const = 0;
-
- virtual void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const = 0;
-
- virtual void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const = 0;
-
- virtual void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
- APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const = 0;
-
- virtual void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const = 0;
-};
-
-} // End namespace llvm
-
-#endif
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 8a9fea3c8d26..a6a01479b5b1 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -150,8 +150,9 @@ static MCInstrAnalysis *createAMDGPUMCInstrAnalysis(const MCInstrInfo *Info) {
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMC() {
TargetRegistry::RegisterMCInstrInfo(getTheGCNTarget(), createAMDGPUMCInstrInfo);
- TargetRegistry::RegisterMCInstrInfo(getTheAMDGPUTarget(), createR600MCInstrInfo);
- for (Target *T : {&getTheAMDGPUTarget(), &getTheGCNTarget()}) {
+ TargetRegistry::RegisterMCInstrInfo(getTheR600Target(),
+ createR600MCInstrInfo);
+ for (Target *T : {&getTheR600Target(), &getTheGCNTarget()}) {
RegisterMCAsmInfo<AMDGPUMCAsmInfo> X(*T);
TargetRegistry::RegisterMCRegInfo(*T, createAMDGPUMCRegisterInfo);
@@ -163,14 +164,14 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetMC() {
}
// R600 specific registration
- TargetRegistry::RegisterMCCodeEmitter(getTheAMDGPUTarget(),
+ TargetRegistry::RegisterMCCodeEmitter(getTheR600Target(),
createR600MCCodeEmitter);
TargetRegistry::RegisterObjectTargetStreamer(
- getTheAMDGPUTarget(), createAMDGPUObjectTargetStreamer);
+ getTheR600Target(), createAMDGPUObjectTargetStreamer);
// GCN specific registration
TargetRegistry::RegisterMCCodeEmitter(getTheGCNTarget(),
- createSIMCCodeEmitter);
+ createAMDGPUMCCodeEmitter);
TargetRegistry::RegisterAsmTargetStreamer(getTheGCNTarget(),
createAMDGPUAsmTargetStreamer);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index c2e2563c3989..006115ba14fc 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -32,8 +32,8 @@ enum AMDGPUDwarfFlavour : unsigned { Wave64 = 0, Wave32 = 1 };
MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour);
-MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII,
- MCContext &Ctx);
+MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII,
+ MCContext &Ctx);
MCAsmBackend *createAMDGPUAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 7a4af1af33d6..1bd3cdc67800 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -26,7 +26,7 @@
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/FormattedStream.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -107,6 +107,8 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: AK = GK_GFX941; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: AK = GK_GFX942; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
@@ -122,6 +124,8 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101: AK = GK_GFX1101; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102: AK = GK_GFX1102; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103: AK = GK_GFX1103; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150: AK = GK_GFX1150; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151: AK = GK_GFX1151; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
}
@@ -176,6 +180,8 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A;
case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C;
case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940;
+ case GK_GFX941: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941;
+ case GK_GFX942: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942;
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
@@ -191,6 +197,8 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX1101: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1101;
case GK_GFX1102: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1102;
case GK_GFX1103: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1103;
+ case GK_GFX1150: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1150;
+ case GK_GFX1151: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1151;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
@@ -320,7 +328,7 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr) {
+ bool ReserveVCC, bool ReserveFlatScr, unsigned CodeObjectVersion) {
IsaVersion IVersion = getIsaVersion(STI.getCPU());
OS << "\t.amdhsa_kernel " << KernelName << '\n';
@@ -367,7 +375,7 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
PRINT_FIELD(OS, ".amdhsa_wavefront_size32", KD,
kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
- if (AMDGPU::getAmdhsaCodeObjectVersion() >= 5)
+ if (CodeObjectVersion >= AMDGPU::AMDHSA_COV5)
PRINT_FIELD(OS, ".amdhsa_uses_dynamic_stack", KD, kernel_code_properties,
amdhsa::KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
PRINT_FIELD(OS,
@@ -407,19 +415,17 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
if (IVersion.Major >= 7 && !ReserveFlatScr && !hasArchitectedFlatScratch(STI))
OS << "\t\t.amdhsa_reserve_flat_scratch " << ReserveFlatScr << '\n';
- if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {
- switch (*HsaAbiVer) {
- default:
- break;
- case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
- break;
- case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
- case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
- case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
- if (getTargetID()->isXnackSupported())
- OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n';
- break;
- }
+ switch (CodeObjectVersion) {
+ default:
+ break;
+ case AMDGPU::AMDHSA_COV2:
+ break;
+ case AMDGPU::AMDHSA_COV3:
+ case AMDGPU::AMDHSA_COV4:
+ case AMDGPU::AMDHSA_COV5:
+ if (getTargetID()->isXnackSupported())
+ OS << "\t\t.amdhsa_reserve_xnack_mask " << getTargetID()->isXnackOnOrAny() << '\n';
+ break;
}
PRINT_FIELD(OS, ".amdhsa_float_round_mode_32", KD,
@@ -850,7 +856,8 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ unsigned CodeObjectVersion) {
auto &Streamer = getStreamer();
auto &Context = Streamer.getContext();
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 50511794a013..db43de8fcc5f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -93,7 +93,8 @@ public:
virtual void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr){};
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ unsigned CodeObjectVersion){};
static StringRef getArchNameFromElfMach(unsigned ElfMach);
static unsigned getElfMach(StringRef GPU);
@@ -104,12 +105,15 @@ public:
std::optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() {
return TargetID;
}
- void initializeTargetID(const MCSubtargetInfo &STI) {
+ void initializeTargetID(const MCSubtargetInfo &STI,
+ unsigned CodeObjectVersion) {
assert(TargetID == std::nullopt && "TargetID can only be initialized once");
TargetID.emplace(STI);
+ getTargetID()->setCodeObjectVersion(CodeObjectVersion);
}
- void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) {
- initializeTargetID(STI);
+ void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString,
+ unsigned CodeObjectVersion) {
+ initializeTargetID(STI, CodeObjectVersion);
assert(getTargetID() != std::nullopt && "TargetID is None");
getTargetID()->setTargetIDFromFeaturesString(FeatureString);
@@ -153,7 +157,8 @@ public:
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ unsigned CodeObjectVersion) override;
};
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
@@ -213,7 +218,8 @@ public:
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
+ unsigned CodeObjectVersion) override;
};
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
index f77ed1faf029..22d0594e2b86 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600InstPrinter.cpp
@@ -97,7 +97,7 @@ void R600InstPrinter::printLiteral(const MCInst *MI, unsigned OpNo,
assert(Op.isImm() || Op.isExpr());
if (Op.isImm()) {
int64_t Imm = Op.getImm();
- O << Imm << '(' << BitsToFloat(Imm) << ')';
+ O << Imm << '(' << llvm::bit_cast<float>(static_cast<uint32_t>(Imm)) << ')';
}
if (Op.isExpr()) {
Op.getExpr()->print(O << '@', &MAI);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 3d926e52c368..bbbfbe4faa0f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -21,8 +21,8 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/EndianStream.h"
+#include "llvm/TargetParser/SubtargetFeature.h"
using namespace llvm;
@@ -39,7 +39,7 @@ public:
R600MCCodeEmitter &operator=(const R600MCCodeEmitter &) = delete;
/// Encode the instruction and write it to the OS.
- void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ void encodeInstruction(const MCInst &MI, SmallVectorImpl<char> &CB,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
@@ -49,9 +49,8 @@ public:
const MCSubtargetInfo &STI) const;
private:
-
- void Emit(uint32_t value, raw_ostream &OS) const;
- void Emit(uint64_t value, raw_ostream &OS) const;
+ void emit(uint32_t value, SmallVectorImpl<char> &CB) const;
+ void emit(uint64_t value, SmallVectorImpl<char> &CB) const;
unsigned getHWReg(unsigned regNo) const;
@@ -84,7 +83,8 @@ MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII,
return new R600MCCodeEmitter(MCII, *Ctx.getRegisterInfo());
}
-void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+void R600MCCodeEmitter::encodeInstruction(const MCInst &MI,
+ SmallVectorImpl<char> &CB,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
@@ -97,13 +97,13 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
} else if (IS_VTX(Desc)) {
uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups, STI);
uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset
- if (!(STI.getFeatureBits()[R600::FeatureCaymanISA])) {
+ if (!(STI.hasFeature(R600::FeatureCaymanISA))) {
InstWord2 |= 1 << 19; // Mega-Fetch bit
}
- Emit(InstWord01, OS);
- Emit(InstWord2, OS);
- Emit((uint32_t) 0, OS);
+ emit(InstWord01, CB);
+ emit(InstWord2, CB);
+ emit((uint32_t)0, CB);
} else if (IS_TEX(Desc)) {
int64_t Sampler = MI.getOperand(14).getImm();
@@ -125,28 +125,28 @@ void R600MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
SrcSelect[ELEMENT_W] << 29 | Offsets[0] << 0 | Offsets[1] << 5 |
Offsets[2] << 10;
- Emit(Word01, OS);
- Emit(Word2, OS);
- Emit((uint32_t) 0, OS);
+ emit(Word01, CB);
+ emit(Word2, CB);
+ emit((uint32_t)0, CB);
} else {
uint64_t Inst = getBinaryCodeForInstr(MI, Fixups, STI);
- if ((STI.getFeatureBits()[R600::FeatureR600ALUInst]) &&
+ if ((STI.hasFeature(R600::FeatureR600ALUInst)) &&
((Desc.TSFlags & R600_InstFlag::OP1) ||
Desc.TSFlags & R600_InstFlag::OP2)) {
uint64_t ISAOpCode = Inst & (0x3FFULL << 39);
Inst &= ~(0x3FFULL << 39);
Inst |= ISAOpCode << 1;
}
- Emit(Inst, OS);
+ emit(Inst, CB);
}
}
-void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const {
- support::endian::write(OS, Value, support::little);
+void R600MCCodeEmitter::emit(uint32_t Value, SmallVectorImpl<char> &CB) const {
+ support::endian::write(CB, Value, support::little);
}
-void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const {
- support::endian::write(OS, Value, support::little);
+void R600MCCodeEmitter::emit(uint64_t Value, SmallVectorImpl<char> &CB) const {
+ support::endian::write(CB, Value, support::little);
}
unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
index b9ff195e0ddc..6f2ccb137235 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCTargetDesc.cpp
@@ -13,7 +13,7 @@
#include "R600MCTargetDesc.h"
#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/TargetParser/SubtargetFeature.h"
using namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
deleted file mode 100644
index f659f08de027..000000000000
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ /dev/null
@@ -1,594 +0,0 @@
-//===-- SIMCCodeEmitter.cpp - SI Code Emitter -----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// The SI code emitter produces machine code that can be executed
-/// directly on the GPU device.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/AMDGPUFixupKinds.h"
-#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-#include "SIDefines.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/Support/Casting.h"
-#include <optional>
-
-using namespace llvm;
-
-namespace {
-
-class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
- const MCRegisterInfo &MRI;
-
- /// Encode an fp or int literal
- std::optional<uint32_t> getLitEncoding(const MCOperand &MO,
- const MCOperandInfo &OpInfo,
- const MCSubtargetInfo &STI) const;
-
-public:
- SIMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
- : AMDGPUMCCodeEmitter(mcii), MRI(*ctx.getRegisterInfo()) {}
- SIMCCodeEmitter(const SIMCCodeEmitter &) = delete;
- SIMCCodeEmitter &operator=(const SIMCCodeEmitter &) = delete;
-
- /// Encode the instruction and write it to the OS.
- void encodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
-
- void getMachineOpValue(const MCInst &MI, const MCOperand &MO, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
-
- /// Use a fixup to encode the simm16 field for SOPP branch
- /// instructions.
- void getSOPPBrEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
-
- void getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
-
- void getSDWASrcEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
-
- void getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
-
- void getAVOperandEncoding(const MCInst &MI, unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
-
-private:
- uint64_t getImplicitOpSelHiEncoding(int Opcode) const;
- void getMachineOpValueCommon(const MCInst &MI, const MCOperand &MO,
- unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
-};
-
-} // end anonymous namespace
-
-MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
- MCContext &Ctx) {
- return new SIMCCodeEmitter(MCII, Ctx);
-}
-
-// Returns the encoding value to use if the given integer is an integer inline
-// immediate value, or 0 if it is not.
-template <typename IntTy>
-static uint32_t getIntInlineImmEncoding(IntTy Imm) {
- if (Imm >= 0 && Imm <= 64)
- return 128 + Imm;
-
- if (Imm >= -16 && Imm <= -1)
- return 192 + std::abs(Imm);
-
- return 0;
-}
-
-static uint32_t getLit16IntEncoding(uint16_t Val, const MCSubtargetInfo &STI) {
- uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
- return IntImm == 0 ? 255 : IntImm;
-}
-
-static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) {
- uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
- if (IntImm != 0)
- return IntImm;
-
- if (Val == 0x3800) // 0.5
- return 240;
-
- if (Val == 0xB800) // -0.5
- return 241;
-
- if (Val == 0x3C00) // 1.0
- return 242;
-
- if (Val == 0xBC00) // -1.0
- return 243;
-
- if (Val == 0x4000) // 2.0
- return 244;
-
- if (Val == 0xC000) // -2.0
- return 245;
-
- if (Val == 0x4400) // 4.0
- return 246;
-
- if (Val == 0xC400) // -4.0
- return 247;
-
- if (Val == 0x3118 && // 1.0 / (2.0 * pi)
- STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
- return 248;
-
- return 255;
-}
-
-static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) {
- uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val));
- if (IntImm != 0)
- return IntImm;
-
- if (Val == FloatToBits(0.5f))
- return 240;
-
- if (Val == FloatToBits(-0.5f))
- return 241;
-
- if (Val == FloatToBits(1.0f))
- return 242;
-
- if (Val == FloatToBits(-1.0f))
- return 243;
-
- if (Val == FloatToBits(2.0f))
- return 244;
-
- if (Val == FloatToBits(-2.0f))
- return 245;
-
- if (Val == FloatToBits(4.0f))
- return 246;
-
- if (Val == FloatToBits(-4.0f))
- return 247;
-
- if (Val == 0x3e22f983 && // 1.0 / (2.0 * pi)
- STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
- return 248;
-
- return 255;
-}
-
-static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI) {
- uint32_t IntImm = getIntInlineImmEncoding(static_cast<int64_t>(Val));
- if (IntImm != 0)
- return IntImm;
-
- if (Val == DoubleToBits(0.5))
- return 240;
-
- if (Val == DoubleToBits(-0.5))
- return 241;
-
- if (Val == DoubleToBits(1.0))
- return 242;
-
- if (Val == DoubleToBits(-1.0))
- return 243;
-
- if (Val == DoubleToBits(2.0))
- return 244;
-
- if (Val == DoubleToBits(-2.0))
- return 245;
-
- if (Val == DoubleToBits(4.0))
- return 246;
-
- if (Val == DoubleToBits(-4.0))
- return 247;
-
- if (Val == 0x3fc45f306dc9c882 && // 1.0 / (2.0 * pi)
- STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
- return 248;
-
- return 255;
-}
-
-std::optional<uint32_t>
-SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
- const MCOperandInfo &OpInfo,
- const MCSubtargetInfo &STI) const {
- int64_t Imm;
- if (MO.isExpr()) {
- const auto *C = dyn_cast<MCConstantExpr>(MO.getExpr());
- if (!C)
- return 255;
-
- Imm = C->getValue();
- } else {
-
- assert(!MO.isDFPImm());
-
- if (!MO.isImm())
- return {};
-
- Imm = MO.getImm();
- }
-
- switch (OpInfo.OperandType) {
- case AMDGPU::OPERAND_REG_IMM_INT32:
- case AMDGPU::OPERAND_REG_IMM_FP32:
- case AMDGPU::OPERAND_REG_IMM_FP32_DEFERRED:
- case AMDGPU::OPERAND_REG_INLINE_C_INT32:
- case AMDGPU::OPERAND_REG_INLINE_C_FP32:
- case AMDGPU::OPERAND_REG_INLINE_AC_INT32:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP32:
- case AMDGPU::OPERAND_REG_IMM_V2INT32:
- case AMDGPU::OPERAND_REG_IMM_V2FP32:
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT32:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP32:
- return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
-
- case AMDGPU::OPERAND_REG_IMM_INT64:
- case AMDGPU::OPERAND_REG_IMM_FP64:
- case AMDGPU::OPERAND_REG_INLINE_C_INT64:
- case AMDGPU::OPERAND_REG_INLINE_C_FP64:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP64:
- return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
-
- case AMDGPU::OPERAND_REG_IMM_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
- return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
- case AMDGPU::OPERAND_REG_IMM_FP16:
- case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
- // FIXME Is this correct? What do inline immediates do on SI for f16 src
- // which does not have f16 support?
- return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
- case AMDGPU::OPERAND_REG_IMM_V2INT16:
- case AMDGPU::OPERAND_REG_IMM_V2FP16: {
- if (!isUInt<16>(Imm) && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal])
- return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
- if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
- return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
- [[fallthrough]];
- }
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
- return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
- uint16_t Lo16 = static_cast<uint16_t>(Imm);
- uint32_t Encoding = getLit16Encoding(Lo16, STI);
- return Encoding;
- }
- case AMDGPU::OPERAND_KIMM32:
- case AMDGPU::OPERAND_KIMM16:
- return MO.getImm();
- default:
- llvm_unreachable("invalid operand size");
- }
-}
-
-uint64_t SIMCCodeEmitter::getImplicitOpSelHiEncoding(int Opcode) const {
- using namespace AMDGPU::VOP3PEncoding;
- using namespace AMDGPU::OpName;
-
- if (AMDGPU::hasNamedOperand(Opcode, op_sel_hi)) {
- if (AMDGPU::hasNamedOperand(Opcode, src2))
- return 0;
- if (AMDGPU::hasNamedOperand(Opcode, src1))
- return OP_SEL_HI_2;
- if (AMDGPU::hasNamedOperand(Opcode, src0))
- return OP_SEL_HI_1 | OP_SEL_HI_2;
- }
- return OP_SEL_HI_0 | OP_SEL_HI_1 | OP_SEL_HI_2;
-}
-
-static bool isVCMPX64(const MCInstrDesc &Desc) {
- return (Desc.TSFlags & SIInstrFlags::VOP3) &&
- Desc.hasImplicitDefOfPhysReg(AMDGPU::EXEC);
-}
-
-void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- int Opcode = MI.getOpcode();
- APInt Encoding, Scratch;
- getBinaryCodeForInstr(MI, Fixups, Encoding, Scratch, STI);
- const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
- unsigned bytes = Desc.getSize();
-
- // Set unused op_sel_hi bits to 1 for VOP3P and MAI instructions.
- // Note that accvgpr_read/write are MAI, have src0, but do not use op_sel.
- if ((Desc.TSFlags & SIInstrFlags::VOP3P) ||
- Opcode == AMDGPU::V_ACCVGPR_READ_B32_vi ||
- Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_vi) {
- Encoding |= getImplicitOpSelHiEncoding(Opcode);
- }
-
- // GFX10+ v_cmpx opcodes promoted to VOP3 have implied dst=EXEC.
- // Documentation requires dst to be encoded as EXEC (0x7E),
- // but it looks like the actual value encoded for dst operand
- // is ignored by HW. It was decided to define dst as "do not care"
- // in td files to allow disassembler accept any dst value.
- // However, dst is encoded as EXEC for compatibility with SP3.
- if (AMDGPU::isGFX10Plus(STI) && isVCMPX64(Desc)) {
- assert((Encoding & 0xFF) == 0);
- Encoding |= MRI.getEncodingValue(AMDGPU::EXEC_LO);
- }
-
- for (unsigned i = 0; i < bytes; i++) {
- OS.write((uint8_t)Encoding.extractBitsAsZExtValue(8, 8 * i));
- }
-
- // NSA encoding.
- if (AMDGPU::isGFX10Plus(STI) && Desc.TSFlags & SIInstrFlags::MIMG) {
- int vaddr0 = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::vaddr0);
- int srsrc = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
- AMDGPU::OpName::srsrc);
- assert(vaddr0 >= 0 && srsrc > vaddr0);
- unsigned NumExtraAddrs = srsrc - vaddr0 - 1;
- unsigned NumPadding = (-NumExtraAddrs) & 3;
-
- for (unsigned i = 0; i < NumExtraAddrs; ++i) {
- getMachineOpValue(MI, MI.getOperand(vaddr0 + 1 + i), Encoding, Fixups,
- STI);
- OS.write((uint8_t)Encoding.getLimitedValue());
- }
- for (unsigned i = 0; i < NumPadding; ++i)
- OS.write(0);
- }
-
- if ((bytes > 8 && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]) ||
- (bytes > 4 && !STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal]))
- return;
-
- // Do not print literals from SISrc Operands for insts with mandatory literals
- if (AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::imm))
- return;
-
- // Check for additional literals
- for (unsigned i = 0, e = Desc.getNumOperands(); i < e; ++i) {
-
- // Check if this operand should be encoded as [SV]Src
- if (!AMDGPU::isSISrcOperand(Desc, i))
- continue;
-
- // Is this operand a literal immediate?
- const MCOperand &Op = MI.getOperand(i);
- auto Enc = getLitEncoding(Op, Desc.operands()[i], STI);
- if (!Enc || *Enc != 255)
- continue;
-
- // Yes! Encode it
- int64_t Imm = 0;
-
- if (Op.isImm())
- Imm = Op.getImm();
- else if (Op.isExpr()) {
- if (const auto *C = dyn_cast<MCConstantExpr>(Op.getExpr()))
- Imm = C->getValue();
-
- } else if (!Op.isExpr()) // Exprs will be replaced with a fixup value.
- llvm_unreachable("Must be immediate or expr");
-
- for (unsigned j = 0; j < 4; j++) {
- OS.write((uint8_t) ((Imm >> (8 * j)) & 0xff));
- }
-
- // Only one literal value allowed
- break;
- }
-}
-
-void SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
- APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- const MCOperand &MO = MI.getOperand(OpNo);
-
- if (MO.isExpr()) {
- const MCExpr *Expr = MO.getExpr();
- MCFixupKind Kind = (MCFixupKind)AMDGPU::fixup_si_sopp_br;
- Fixups.push_back(MCFixup::create(0, Expr, Kind, MI.getLoc()));
- Op = APInt::getNullValue(96);
- } else {
- getMachineOpValue(MI, MO, Op, Fixups, STI);
- }
-}
-
-void SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
- APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- auto Offset = MI.getOperand(OpNo).getImm();
- // VI only supports 20-bit unsigned offsets.
- assert(!AMDGPU::isVI(STI) || isUInt<20>(Offset));
- Op = Offset;
-}
-
-void SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
- APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- using namespace AMDGPU::SDWA;
-
- uint64_t RegEnc = 0;
-
- const MCOperand &MO = MI.getOperand(OpNo);
-
- if (MO.isReg()) {
- unsigned Reg = MO.getReg();
- RegEnc |= MRI.getEncodingValue(Reg);
- RegEnc &= SDWA9EncValues::SRC_VGPR_MASK;
- if (AMDGPU::isSGPR(AMDGPU::mc2PseudoReg(Reg), &MRI)) {
- RegEnc |= SDWA9EncValues::SRC_SGPR_MASK;
- }
- Op = RegEnc;
- return;
- } else {
- const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
- auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI);
- if (Enc && *Enc != 255) {
- Op = *Enc | SDWA9EncValues::SRC_SGPR_MASK;
- return;
- }
- }
-
- llvm_unreachable("Unsupported operand kind");
-}
-
-void SIMCCodeEmitter::getSDWAVopcDstEncoding(const MCInst &MI, unsigned OpNo,
- APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- using namespace AMDGPU::SDWA;
-
- uint64_t RegEnc = 0;
-
- const MCOperand &MO = MI.getOperand(OpNo);
-
- unsigned Reg = MO.getReg();
- if (Reg != AMDGPU::VCC && Reg != AMDGPU::VCC_LO) {
- RegEnc |= MRI.getEncodingValue(Reg);
- RegEnc &= SDWA9EncValues::VOPC_DST_SGPR_MASK;
- RegEnc |= SDWA9EncValues::VOPC_DST_VCC_MASK;
- }
- Op = RegEnc;
-}
-
-void SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
- APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- unsigned Reg = MI.getOperand(OpNo).getReg();
- uint64_t Enc = MRI.getEncodingValue(Reg);
-
- // VGPR and AGPR have the same encoding, but SrcA and SrcB operands of mfma
- // instructions use acc[0:1] modifier bits to distinguish. These bits are
- // encoded as a virtual 9th bit of the register for these operands.
- if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_96RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_224RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_288RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_320RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_352RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_384RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
- Enc |= 512;
-
- Op = Enc;
-}
-
-static bool needsPCRel(const MCExpr *Expr) {
- switch (Expr->getKind()) {
- case MCExpr::SymbolRef: {
- auto *SE = cast<MCSymbolRefExpr>(Expr);
- MCSymbolRefExpr::VariantKind Kind = SE->getKind();
- return Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_LO &&
- Kind != MCSymbolRefExpr::VK_AMDGPU_ABS32_HI;
- }
- case MCExpr::Binary: {
- auto *BE = cast<MCBinaryExpr>(Expr);
- if (BE->getOpcode() == MCBinaryExpr::Sub)
- return false;
- return needsPCRel(BE->getLHS()) || needsPCRel(BE->getRHS());
- }
- case MCExpr::Unary:
- return needsPCRel(cast<MCUnaryExpr>(Expr)->getSubExpr());
- case MCExpr::Target:
- case MCExpr::Constant:
- return false;
- }
- llvm_unreachable("invalid kind");
-}
-
-void SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
- const MCOperand &MO, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- if (MO.isReg()){
- Op = MRI.getEncodingValue(MO.getReg());
- return;
- }
- unsigned OpNo = &MO - MI.begin();
- getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI);
-}
-
-void SIMCCodeEmitter::getMachineOpValueCommon(
- const MCInst &MI, const MCOperand &MO, unsigned OpNo, APInt &Op,
- SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
-
- if (MO.isExpr() && MO.getExpr()->getKind() != MCExpr::Constant) {
- // FIXME: If this is expression is PCRel or not should not depend on what
- // the expression looks like. Given that this is just a general expression,
- // it should probably be FK_Data_4 and whatever is producing
- //
- // s_add_u32 s2, s2, (extern_const_addrspace+16
- //
- // And expecting a PCRel should instead produce
- //
- // .Ltmp1:
- // s_add_u32 s2, s2, (extern_const_addrspace+16)-.Ltmp1
- MCFixupKind Kind;
- if (needsPCRel(MO.getExpr()))
- Kind = FK_PCRel_4;
- else
- Kind = FK_Data_4;
-
- const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
- uint32_t Offset = Desc.getSize();
- assert(Offset == 4 || Offset == 8);
-
- Fixups.push_back(MCFixup::create(Offset, MO.getExpr(), Kind, MI.getLoc()));
- }
-
- const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
- if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
- if (auto Enc = getLitEncoding(MO, Desc.operands()[OpNo], STI)) {
- Op = *Enc;
- return;
- }
- } else if (MO.isImm()) {
- Op = MO.getImm();
- return;
- }
-
- llvm_unreachable("Encoding of this operand type is not supported yet.");
-}
-
-#include "AMDGPUGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index c295b7f79442..d924f733624a 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -48,6 +48,7 @@ class MIMGBaseOpcode : PredicateControl {
bit IsAtomicRet = 0;
bit MSAA = 0;
bit BVH = 0;
+ bit A16 = 0;
}
def MIMGBaseOpcode : GenericEnum {
@@ -59,7 +60,7 @@ def MIMGBaseOpcodesTable : GenericTable {
let CppTypeName = "MIMGBaseOpcodeInfo";
let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
"Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
- "LodOrClampOrMip", "HasD16", "MSAA", "BVH"];
+ "LodOrClampOrMip", "HasD16", "MSAA", "BVH", "A16"];
string TypeOf_BaseOpcode = "MIMGBaseOpcode";
let PrimaryKey = ["BaseOpcode"];
@@ -206,7 +207,6 @@ class MIMG <dag outs, string dns = "">
: MIMG_Base <outs, dns> {
let hasPostISelHook = 1;
- let AsmMatchConverter = "cvtMIMG";
Instruction Opcode = !cast<Instruction>(NAME);
MIMGBaseOpcode BaseOpcode;
@@ -235,22 +235,41 @@ def getMIMGInfo : SearchIndex {
let Key = ["Opcode"];
}
-// This class used to use !foldl to memoize the AddrAsmNames list.
-// It turned out that that was much slower than using !filter.
+class NSAHelper {
+ dag AddrIns;
+ string AddrAsm;
+ int NSA;
+}
+
class MIMGNSAHelper<int num_addrs,
- list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)> {
- list<string> AddrAsmNames =
- !foreach(i, !filter(i, [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
- !lt(i, num_addrs)), "vaddr" # i);
- dag AddrIns = !dag(ins, addr_types, AddrAsmNames);
- string AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
+ list<RegisterClass> addr_types=!listsplat(VGPR_32, num_addrs)>
+ : NSAHelper<> {
+ list<string> AddrAsmNames = !foreach(i, !range(num_addrs), "vaddr" # i);
+ let AddrIns = !dag(ins, addr_types, AddrAsmNames);
+ let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
- int NSA = !if(!le(num_addrs, 1), ?,
+ let NSA = !if(!le(num_addrs, 1), ?,
!if(!le(num_addrs, 5), 1,
!if(!le(num_addrs, 9), 2,
!if(!le(num_addrs, 13), 3, ?))));
}
+class PartialNSAHelper<int num_addrs, int max_addr, RegisterClass LastAddrRC>
+ : NSAHelper<> {
+
+ list<RegisterClass> addr_types =
+ !if(!ge(num_addrs, max_addr),
+ !listconcat(!listsplat(VGPR_32, !sub(max_addr, 1)), [LastAddrRC]),
+ !listsplat(VGPR_32, num_addrs));
+
+ int VAddrCount = !if(!gt(num_addrs, max_addr), max_addr, num_addrs);
+ list<string> AddrAsmNames = !foreach(i, !range(VAddrCount), "vaddr" # i);
+
+ let AddrIns = !dag(ins, addr_types, AddrAsmNames);
+ let AddrAsm = "[$" # !interleave(AddrAsmNames, ", $") # "]";
+ let NSA = 1;
+}
+
// Base class of all pre-gfx10 MIMG instructions.
class MIMG_gfx6789<bits<8> op, dag outs, string dns = "">
: MIMG<outs, dns>, MIMGe_gfx6789<op> {
@@ -321,7 +340,8 @@ class MIMG_gfx11<int op, dag outs, string dns = "">
// Base class for all NSA MIMG instructions.
// Note that 1-dword addresses always use non-NSA variants.
class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
- list<RegisterClass> addr_types=[]>
+ list<RegisterClass> addr_types=[],
+ RegisterClass LastAddrRC = VGPR_32>
: MIMG<outs, dns>, MIMGe_gfx11<op> {
let SubtargetPredicate = isGFX11Plus;
let AssemblerPredicate = isGFX11Plus;
@@ -329,9 +349,9 @@ class MIMG_nsa_gfx11<int op, dag outs, int num_addrs, string dns="",
let MIMGEncoding = MIMGEncGfx11NSA;
let VAddrOperands = num_addrs;
- MIMGNSAHelper nsah = !if(!empty(addr_types),
- MIMGNSAHelper<num_addrs>,
- MIMGNSAHelper<num_addrs, addr_types>);
+ NSAHelper nsah = !if(!empty(addr_types),
+ PartialNSAHelper<num_addrs, 5, LastAddrRC>,
+ MIMGNSAHelper<num_addrs, addr_types>);
dag AddrIns = nsah.AddrIns;
string AddrAsm = nsah.AddrAsm;
@@ -672,7 +692,6 @@ class MIMG_Atomic_gfx6789_base <bits<8> op, string asm, RegisterClass data_rc,
RegisterClass addr_rc, string dns="">
: MIMG_gfx6789 <op, (outs data_rc:$vdst), dns> {
let Constraints = "$vdst = $vdata";
- let AsmMatchConverter = "cvtMIMGAtomic";
let InOperandList = (ins data_rc:$vdata, addr_rc:$vaddr, SReg_256:$srsrc,
DMask:$dmask, UNorm:$unorm, CPol:$cpol,
@@ -684,7 +703,6 @@ class MIMG_Atomic_gfx90a_base <bits<8> op, string asm, RegisterClass data_rc,
RegisterClass addr_rc, string dns="">
: MIMG_gfx90a <op, (outs getLdStRegisterOperand<data_rc>.ret:$vdst), dns> {
let Constraints = "$vdst = $vdata";
- let AsmMatchConverter = "cvtMIMGAtomic";
let InOperandList = (ins getLdStRegisterOperand<data_rc>.ret:$vdata,
addr_rc:$vaddr, SReg_256:$srsrc,
@@ -720,7 +738,6 @@ class MIMG_Atomic_gfx10<mimgopc op, string opcode,
: MIMG_gfx10<!cast<int>(op.GFX10M), (outs DataRC:$vdst),
!if(enableDisasm, "AMDGPU", "")> {
let Constraints = "$vdst = $vdata";
- let AsmMatchConverter = "cvtMIMGAtomic";
let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
@@ -734,7 +751,6 @@ class MIMG_Atomic_nsa_gfx10<mimgopc op, string opcode,
: MIMG_nsa_gfx10<!cast<int>(op.GFX10M), (outs DataRC:$vdst), num_addrs,
!if(enableDisasm, "AMDGPU", "")> {
let Constraints = "$vdst = $vdata";
- let AsmMatchConverter = "cvtMIMGAtomic";
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
@@ -750,7 +766,6 @@ class MIMG_Atomic_gfx11<mimgopc op, string opcode,
: MIMG_gfx11<!cast<int>(op.GFX11), (outs DataRC:$vdst),
!if(enableDisasm, "AMDGPU", "")> {
let Constraints = "$vdst = $vdata";
- let AsmMatchConverter = "cvtMIMGAtomic";
let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, CPol:$cpol,
@@ -764,7 +779,6 @@ class MIMG_Atomic_nsa_gfx11<mimgopc op, string opcode,
: MIMG_nsa_gfx11<!cast<int>(op.GFX11), (outs DataRC:$vdst), num_addrs,
!if(enableDisasm, "AMDGPU", "")> {
let Constraints = "$vdst = $vdata";
- let AsmMatchConverter = "cvtMIMGAtomic";
let InOperandList = !con((ins DataRC:$vdata),
AddrIns,
@@ -934,8 +948,9 @@ class MIMG_Sampler_gfx11<mimgopc op, string opcode,
class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode,
RegisterClass DataRC, int num_addrs,
- string dns="">
- : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns> {
+ RegisterClass LastVAddrSize, string dns="">
+ : MIMG_nsa_gfx11<op.GFX11, (outs DataRC:$vdata), num_addrs, dns, [],
+ LastVAddrSize> {
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
Dim:$dim, UNorm:$unorm, CPol:$cpol,
@@ -946,29 +961,34 @@ class MIMG_Sampler_nsa_gfx11<mimgopc op, string opcode,
#!if(BaseOpcode.HasD16, "$d16", "");
}
-class MIMGAddrSize<int dw, bit enable_disasm> {
+class MIMGAddrSize<int dw, bit enable_disasm, int AddrDW = dw> {
int NumWords = dw;
- RegisterClass RegClass = !if(!le(NumWords, 0), ?,
- !if(!eq(NumWords, 1), VGPR_32,
- !if(!eq(NumWords, 2), VReg_64,
- !if(!eq(NumWords, 3), VReg_96,
- !if(!eq(NumWords, 4), VReg_128,
- !if(!eq(NumWords, 5), VReg_160,
- !if(!eq(NumWords, 6), VReg_192,
- !if(!eq(NumWords, 7), VReg_224,
- !if(!le(NumWords, 8), VReg_256,
- !if(!le(NumWords, 9), VReg_288,
- !if(!le(NumWords, 10), VReg_320,
- !if(!le(NumWords, 11), VReg_352,
- !if(!le(NumWords, 12), VReg_384,
- !if(!le(NumWords, 16), VReg_512, ?))))))))))))));
+ RegisterClass RegClass = !if(!le(AddrDW, 0), ?,
+ !if(!eq(AddrDW, 1), VGPR_32,
+ !if(!eq(AddrDW, 2), VReg_64,
+ !if(!eq(AddrDW, 3), VReg_96,
+ !if(!eq(AddrDW, 4), VReg_128,
+ !if(!eq(AddrDW, 5), VReg_160,
+ !if(!eq(AddrDW, 6), VReg_192,
+ !if(!eq(AddrDW, 7), VReg_224,
+ !if(!eq(AddrDW, 8), VReg_256,
+ !if(!eq(AddrDW, 9), VReg_288,
+ !if(!eq(AddrDW, 10), VReg_320,
+ !if(!eq(AddrDW, 11), VReg_352,
+ !if(!eq(AddrDW, 12), VReg_384,
+ !if(!le(AddrDW, 16), VReg_512, ?))))))))))))));
// Whether the instruction variant with this vaddr size should be enabled for
// the auto-generated disassembler.
bit Disassemble = enable_disasm;
}
+// Returns the MIMGAddrSize with the size of last VAddr for partial NSA
+class LastVAddrSize <int dw, int max_idx, bit enable_disasm>
+ : MIMGAddrSize<dw, enable_disasm,
+ !if(!gt(dw, max_idx), !sub(dw, max_idx), 0)>;
+
// Return whether x is in lst.
class isIntInList<int x, list<int> lst> {
bit ret = !foldl(0, lst, lhs, y, !or(lhs, !eq(x, y)));
@@ -985,7 +1005,8 @@ class MIMGAddrSizes_dw_range<list<int> range> {
int Max = !if(!empty(!tail(range)), Min, !head(!tail(range)));
}
-class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16> {
+class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16,
+ int nsa_max_addr = 5> {
// List of all possible numbers of address words, taking all combinations of
// A16 and image dimension into account (note: no MSAA, since this is for
// sample/gather ops).
@@ -1031,6 +1052,21 @@ class MIMG_Sampler_AddrSizes<AMDGPUSampleVariant sample, bit isG16> {
!if(isIntInList<dw, AllNumAddrWords>.ret,
!listconcat(lhs, [MIMGAddrSize<dw, !empty(lhs)>]),
lhs))));
+
+ // In NSA format if there is a requirement for more VGPRs than the format
+ // supports, then the rest are sequential after the last one. Generate
+ // machine instructions for all possible number of words. The disassembler
+ // defaults to the largest number of arguments but no larger than max nsa
+ // size. List is generated with the register class needed for last vaddr since
+ // it is the only one that could have a register other than VGPR32.
+ int EnableDisasmNum = !foldl(!head(AllNumAddrWords), !tail(AllNumAddrWords),
+ acc, var, !if(!le(var, nsa_max_addr), var, acc));
+ list<LastVAddrSize> PartialNSAInstrs =
+ !foldl([]<LastVAddrSize>, [12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2], lhs, dw,
+ !if(isIntInList<dw, AllNumAddrWords>.ret,
+ !listconcat(lhs, [LastVAddrSize<dw, !sub(nsa_max_addr, 1),
+ !eq(dw, EnableDisasmNum)>]),
+ lhs));
}
multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
@@ -1066,9 +1102,14 @@ multiclass MIMG_Sampler_Src_Helper <mimgopc op, string asm,
: MIMG_Sampler_nsa_gfx10<op, asm, dst_rc, addr.NumWords,
!if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
}
- if !and(op.HAS_GFX11, !le(addr.NumWords, 5)) then {
+ }
+ }
+
+ foreach addr = MIMG_Sampler_AddrSizes<sample, isG16, 5/*MaxNSASize*/>.PartialNSAInstrs in {
+ let VAddrDwords = addr.NumWords in {
+ if op.HAS_GFX11 then {
def _V # addr.NumWords # _nsa_gfx11
- : MIMG_Sampler_nsa_gfx11<op, asm, dst_rc, addr.NumWords,
+ : MIMG_Sampler_nsa_gfx11<op, asm, dst_rc, addr.NumWords, addr.RegClass,
!if(!and(enableDisasm, addr.Disassemble), "AMDGPU", "")>;
}
}
@@ -1144,51 +1185,43 @@ class MIMG_IntersectRay_Helper<bit Is64, bit IsA16> {
[node_ptr_type, VGPR_32, VReg_96, VReg_96, VReg_96]);
}
-class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC, bit IsA16>
+class MIMG_IntersectRay_gfx10<mimgopc op, string opcode, RegisterClass AddrRC>
: MIMG_gfx10<op.GFX10M, (outs VReg_128:$vdata), "AMDGPU"> {
-
- let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc),
- !if(IsA16, (ins A16:$a16), (ins)));
- let AsmString = opcode#" $vdata, $vaddr0, $srsrc"#!if(IsA16, "$a16", "");
+ let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16);
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16";
let nsa = 0;
}
-class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs, bit IsA16>
+class MIMG_IntersectRay_nsa_gfx10<mimgopc op, string opcode, int num_addrs>
: MIMG_nsa_gfx10<op.GFX10M, (outs VReg_128:$vdata), num_addrs, "AMDGPU"> {
- let InOperandList = !con(nsah.AddrIns,
- (ins SReg_128:$srsrc),
- !if(IsA16, (ins A16:$a16), (ins)));
- let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(IsA16, "$a16", "");
+ let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16));
+ let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16";
}
-class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterClass AddrRC, bit IsA16>
+class MIMG_IntersectRay_gfx11<mimgopc op, string opcode, RegisterClass AddrRC>
: MIMG_gfx11<op.GFX11, (outs VReg_128:$vdata), "AMDGPU"> {
-
- let InOperandList = !con((ins AddrRC:$vaddr0, SReg_128:$srsrc),
- !if(IsA16, (ins A16:$a16), (ins)));
- let AsmString = opcode#" $vdata, $vaddr0, $srsrc"#!if(IsA16, "$a16", "");
+ let InOperandList = (ins AddrRC:$vaddr0, SReg_128:$srsrc, A16:$a16);
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$a16";
let nsa = 0;
}
class MIMG_IntersectRay_nsa_gfx11<mimgopc op, string opcode, int num_addrs,
- bit IsA16, list<RegisterClass> addr_types>
+ list<RegisterClass> addr_types>
: MIMG_nsa_gfx11<op.GFX11, (outs VReg_128:$vdata), num_addrs, "AMDGPU",
addr_types> {
- let InOperandList = !con(nsah.AddrIns,
- (ins SReg_128:$srsrc),
- !if(IsA16, (ins A16:$a16), (ins)));
- let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc"#!if(IsA16, "$a16", "");
+ let InOperandList = !con(nsah.AddrIns, (ins SReg_128:$srsrc, A16:$a16));
+ let AsmString = opcode#" $vdata, "#nsah.AddrAsm#", $srsrc$a16";
}
multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16> {
defvar info = MIMG_IntersectRay_Helper<Is64, IsA16>;
def "" : MIMGBaseOpcode {
let BVH = 1;
+ let A16 = IsA16;
}
- let AsmMatchConverter = !if(IsA16, "cvtIntersectRay", ""),
- dmask = 0xf,
+ let dmask = 0xf,
unorm = 1,
d16 = 0,
cpol = 0,
@@ -1201,17 +1234,17 @@ multiclass MIMG_IntersectRay<mimgopc op, string opcode, bit Is64, bit IsA16> {
d16 = 0,
BaseOpcode = !cast<MIMGBaseOpcode>(NAME),
VDataDwords = 4 in {
- def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass, IsA16> {
+ def _sa_gfx10 : MIMG_IntersectRay_gfx10<op, opcode, info.RegClass> {
let VAddrDwords = info.VAddrDwords;
}
- def _sa_gfx11 : MIMG_IntersectRay_gfx11<op, opcode, info.RegClass, IsA16> {
+ def _sa_gfx11 : MIMG_IntersectRay_gfx11<op, opcode, info.RegClass> {
let VAddrDwords = info.VAddrDwords;
}
- def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs, IsA16> {
+ def _nsa_gfx10 : MIMG_IntersectRay_nsa_gfx10<op, opcode, info.num_addrs> {
let VAddrDwords = info.num_addrs;
}
def _nsa_gfx11 : MIMG_IntersectRay_nsa_gfx11<op, opcode,
- info.gfx11_nsa_addrs, IsA16,
+ info.gfx11_nsa_addrs,
info.gfx11_addr_types> {
let VAddrDwords = info.num_addrs;
}
diff --git a/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp
index 50a90dd03f38..20c2ff8a4fd7 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelDAGToDAG.cpp
@@ -20,7 +20,7 @@
namespace {
class R600DAGToDAGISel : public AMDGPUDAGToDAGISel {
- const R600Subtarget *Subtarget;
+ const R600Subtarget *Subtarget = nullptr;
bool isConstantLoad(const MemSDNode *N, int cbID) const;
bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue &IntPtr);
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index fad393267a71..ad072cfe23b1 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -953,10 +953,8 @@ SDValue R600TargetLowering::lowerADDRSPACECAST(SDValue Op,
unsigned SrcAS = ASC->getSrcAddressSpace();
unsigned DestAS = ASC->getDestAddressSpace();
- if (auto *ConstSrc = dyn_cast<ConstantSDNode>(Op.getOperand(0))) {
- if (SrcAS == AMDGPUAS::FLAT_ADDRESS && ConstSrc->isNullValue())
- return DAG.getConstant(TM.getNullPointerValue(DestAS), SL, VT);
- }
+ if (isNullConstant(Op.getOperand(0)) && SrcAS == AMDGPUAS::FLAT_ADDRESS)
+ return DAG.getConstant(TM.getNullPointerValue(DestAS), SL, VT);
return Op;
}
@@ -1656,7 +1654,7 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],
BuildVector = CompactSwizzlableVector(DAG, BuildVector, SwizzleRemap);
for (unsigned i = 0; i < 4; i++) {
unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
- if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
+ if (SwizzleRemap.contains(Idx))
Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
}
@@ -1664,7 +1662,7 @@ SDValue R600TargetLowering::OptimizeSwizzle(SDValue BuildVector, SDValue Swz[],
BuildVector = ReorganizeVector(DAG, BuildVector, SwizzleRemap);
for (unsigned i = 0; i < 4; i++) {
unsigned Idx = cast<ConstantSDNode>(Swz[i])->getZExtValue();
- if (SwizzleRemap.find(Idx) != SwizzleRemap.end())
+ if (SwizzleRemap.contains(Idx))
Swz[i] = DAG.getConstant(SwizzleRemap[Idx], DL, MVT::i32);
}
@@ -2182,3 +2180,18 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
return Node;
}
+
+TargetLowering::AtomicExpansionKind
+R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+ switch (RMW->getOperation()) {
+ case AtomicRMWInst::UIncWrap:
+ case AtomicRMWInst::UDecWrap:
+ // FIXME: Cayman at least appears to have instructions for this, but the
+ // instruction defintions appear to be missing.
+ return AtomicExpansionKind::CmpXChg;
+ default:
+ break;
+ }
+
+ return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
+}
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.h b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
index 8a5479db4ee6..fc361c01bc67 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.h
@@ -114,6 +114,9 @@ private:
SelectionDAG &DAG) const;
SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+
+ TargetLowering::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const override;
};
} // End namespace llvm;
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 4056274cd440..7f874b245b8f 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -328,7 +328,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
if (Reg == R600::OQAP) {
Result.push_back(std::pair(Index, 0U));
}
- if (PV.find(Reg) != PV.end()) {
+ if (PV.contains(Reg)) {
// 255 is used to tells its a PS/PV reg
Result.push_back(std::pair(255, 0U));
continue;
diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index b53e9c258fd9..f4dfbe8adc75 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1090,7 +1090,7 @@ multiclass CUBE_Common <bits<11> inst> {
} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
- inst, "EXP_IEEE", fexp2
+ inst, "EXP_IEEE", AMDGPUexp
> {
let Itinerary = TransALU;
}
@@ -1124,7 +1124,7 @@ class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP <
>;
class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper <
- inst, "LOG_IEEE", flog2
+ inst, "LOG_IEEE", AMDGPUlog
> {
let Itinerary = TransALU;
}
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
index c01f9c4794c7..1a1be4a44285 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.cpp
@@ -82,10 +82,10 @@ bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
-unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
+unsigned R600TTIImpl::getMaxInterleaveFactor(ElementCount VF) {
// Disable unrolling if the loop is not vectorized.
// TODO: Enable this again.
- if (VF == 1)
+ if (VF.isScalar())
return 1;
return 8;
diff --git a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
index 8dacae0abb7b..2934b0151f4d 100644
--- a/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600TargetTransformInfo.h
@@ -57,7 +57,7 @@ public:
unsigned AddrSpace) const;
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
- unsigned getMaxInterleaveFactor(unsigned VF);
+ unsigned getMaxInterleaveFactor(ElementCount VF);
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
using BaseT::getVectorInstrCost;
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index f232bc9b3852..b87cd8c66cc8 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -13,8 +13,8 @@
#include "AMDGPU.h"
#include "GCNSubtarget.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
@@ -36,7 +36,7 @@ using StackEntry = std::pair<BasicBlock *, Value *>;
using StackVector = SmallVector<StackEntry, 16>;
class SIAnnotateControlFlow : public FunctionPass {
- LegacyDivergenceAnalysis *DA;
+ UniformityInfo *UA;
Type *Boolean;
Type *Void;
@@ -99,7 +99,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addRequired<UniformityInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<TargetPassConfig>();
@@ -112,7 +112,7 @@ public:
INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
"Annotate SI Control Flow", false, false)
@@ -146,7 +146,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
/// Is the branch condition uniform or did the StructurizeCFG pass
/// consider it as such?
bool SIAnnotateControlFlow::isUniform(BranchInst *T) {
- return DA->isUniform(T) ||
+ return UA->isUniform(T) ||
T->getMetadata("structurizecfg.uniform") != nullptr;
}
@@ -336,7 +336,7 @@ bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
bool SIAnnotateControlFlow::runOnFunction(Function &F) {
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DA = &getAnalysis<LegacyDivergenceAnalysis>();
+ UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 97a583421a7e..cd1818285e3e 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -16,11 +16,36 @@ namespace llvm {
// This needs to be kept in sync with the field bits in SIRegisterClass.
enum SIRCFlags : uint8_t {
- // For vector registers.
- HasVGPR = 1 << 0,
- HasAGPR = 1 << 1,
- HasSGPR = 1 << 2
-}; // enum SIRCFlags
+ RegTupleAlignUnitsWidth = 2,
+ HasVGPRBit = RegTupleAlignUnitsWidth,
+ HasAGPRBit,
+ HasSGPRbit,
+
+ HasVGPR = 1 << HasVGPRBit,
+ HasAGPR = 1 << HasAGPRBit,
+ HasSGPR = 1 << HasSGPRbit,
+
+ RegTupleAlignUnitsMask = (1 << RegTupleAlignUnitsWidth) - 1,
+ RegKindMask = (HasVGPR | HasAGPR | HasSGPR)
+}; // enum SIRCFlagsr
+
+namespace SIEncodingFamily {
+// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
+// and the columns of the getMCOpcodeGen table.
+enum {
+ SI = 0,
+ VI = 1,
+ SDWA = 2,
+ SDWA9 = 3,
+ GFX80 = 4,
+ GFX9 = 5,
+ GFX10 = 6,
+ SDWA10 = 7,
+ GFX90A = 8,
+ GFX940 = 9,
+ GFX11 = 10,
+};
+}
namespace SIInstrFlags {
// This needs to be kept in sync with the field bits in InstSI.
@@ -133,6 +158,9 @@ enum : uint64_t {
// Whether tied sources will be read.
TiedSourceNotRead = UINT64_C(1) << 60,
+
+ // Is never uniform.
+ IsNeverUniform = UINT64_C(1) << 61,
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -222,6 +250,7 @@ enum OperandType : unsigned {
// NEG and SEXT share same bit-mask because they can't be set simultaneously.
namespace SISrcMods {
enum : unsigned {
+ NONE = 0,
NEG = 1 << 0, // Floating-point negate modifier
ABS = 1 << 1, // Floating-point absolute modifier
SEXT = 1 << 0, // Integer sign-extend modifier
@@ -333,7 +362,7 @@ enum Id { // Message ID, width(4) [3:0].
ID_SAVEWAVE = 4, // added in GFX8, removed in GFX11
ID_STALL_WAVE_GEN = 5, // added in GFX9
ID_HALT_WAVES = 6, // added in GFX9
- ID_ORDERED_PS_DONE = 7, // added in GFX9
+ ID_ORDERED_PS_DONE = 7, // added in GFX9, removed in GFX11
ID_EARLY_PRIM_DEALLOC = 8, // added in GFX9, removed in GFX10
ID_GS_ALLOC_REQ = 9, // added in GFX9
ID_GET_DOORBELL = 10, // added in GFX9, removed in GFX11
@@ -401,19 +430,26 @@ enum Id { // HwRegCode, (6) [5:0]
ID_TBA_HI = 17,
ID_TMA_LO = 18,
ID_TMA_HI = 19,
- ID_XCC_ID = 20,
- ID_SQ_PERF_SNAPSHOT_DATA = 21,
- ID_SQ_PERF_SNAPSHOT_DATA1 = 22,
- ID_SQ_PERF_SNAPSHOT_PC_LO = 23,
- ID_SQ_PERF_SNAPSHOT_PC_HI = 24,
ID_FLAT_SCR_LO = 20,
ID_FLAT_SCR_HI = 21,
ID_XNACK_MASK = 22,
ID_HW_ID1 = 23,
ID_HW_ID2 = 24,
ID_POPS_PACKER = 25,
+ ID_PERF_SNAPSHOT_DATA = 27,
ID_SHADER_CYCLES = 29,
+ // Register numbers reused in GFX11+
+ ID_PERF_SNAPSHOT_PC_LO = 18,
+ ID_PERF_SNAPSHOT_PC_HI = 19,
+
+ // GFX940 specific registers
+ ID_XCC_ID = 20,
+ ID_SQ_PERF_SNAPSHOT_DATA = 21,
+ ID_SQ_PERF_SNAPSHOT_DATA1 = 22,
+ ID_SQ_PERF_SNAPSHOT_PC_LO = 23,
+ ID_SQ_PERF_SNAPSHOT_PC_HI = 24,
+
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
@@ -909,6 +945,17 @@ enum Offset_COV5 : unsigned {
};
} // namespace ImplicitArg
+
+namespace VirtRegFlag {
+// Virtual register flags used for various target specific handlings during
+// codegen.
+enum Register_Flag : uint8_t {
+ // Register operand in a whole-wave mode operation.
+ WWM_REG = 1 << 0,
+};
+
+} // namespace VirtRegFlag
+
} // namespace AMDGPU
#define R_00B028_SPI_SHADER_PGM_RSRC1_PS 0x00B028
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index e5a028823e72..db323465c153 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -231,7 +231,7 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
UseMI->getOpcode() <= TargetOpcode::GENERIC_OP_END)
return false;
- unsigned OpIdx = UseMI->getOperandNo(&MO);
+ unsigned OpIdx = MO.getOperandNo();
if (OpIdx >= UseMI->getDesc().getNumOperands() ||
!TII->isOperandLegal(*UseMI, OpIdx, &Src))
return false;
@@ -658,7 +658,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
TRI->getEquivalentSGPRClass(SrcRC);
Register NewDst = MRI->createVirtualRegister(DestRC);
MachineBasicBlock *BlockToInsertCopy =
- MI.isPHI() ? MI.getOperand(MI.getOperandNo(&MO) + 1).getMBB()
+ MI.isPHI() ? MI.getOperand(MO.getOperandNo() + 1).getMBB()
: MBB;
MachineBasicBlock::iterator PointToInsertCopy =
MI.isPHI() ? BlockToInsertCopy->getFirstInstrTerminator() : I;
@@ -869,7 +869,9 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
return true;
}
if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
- TII->moveToVALU(MI, MDT);
+ SIInstrWorklist worklist;
+ worklist.insert(&MI);
+ TII->moveToVALU(worklist, MDT);
return true;
}
@@ -991,6 +993,10 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
LoweringWorklist.push_back(C.second.ID);
}
+ // Store all the V2S copy instructions that need to be moved to VALU
+ // in the Copies worklist.
+ SIInstrWorklist Copies;
+
while (!LoweringWorklist.empty()) {
unsigned CurID = LoweringWorklist.pop_back_val();
auto CurInfoIt = V2SCopies.find(CurID);
@@ -1013,10 +1019,13 @@ void SIFixSGPRCopies::lowerVGPR2SGPRCopies(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "V2S copy " << *C.Copy
<< " is being turned to VALU\n");
V2SCopies.erase(C.ID);
- TII->moveToVALU(*C.Copy, MDT);
+ Copies.insert(C.Copy);
}
}
+ TII->moveToVALU(Copies, MDT);
+ Copies.clear();
+
// Now do actual lowering
for (auto C : V2SCopies) {
MachineInstr *MI = C.second.Copy;
diff --git a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
index f7e3ea5fc072..08272a9ddfd3 100644
--- a/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixVGPRCopies.cpp
@@ -31,6 +31,11 @@ public:
initializeSIFixVGPRCopiesPass(*PassRegistry::getPassRegistry());
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override { return "SI Fix VGPR copies"; }
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 9c0c665a318c..9f1d6038f1b6 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -111,9 +111,11 @@ public:
std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
bool tryFoldOMod(MachineInstr &MI);
bool tryFoldRegSequence(MachineInstr &MI);
- bool tryFoldLCSSAPhi(MachineInstr &MI);
+ bool tryFoldPhiAGPR(MachineInstr &MI);
bool tryFoldLoad(MachineInstr &MI);
+ bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
+
public:
SIFoldOperands() : MachineFunctionPass(ID) {
initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
@@ -138,6 +140,16 @@ char SIFoldOperands::ID = 0;
char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
+static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ const MachineOperand &MO) {
+ const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
+ if (const TargetRegisterClass *SubRC =
+ TRI.getSubRegisterClass(RC, MO.getSubReg()))
+ RC = SubRC;
+ return RC;
+}
+
// Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
static unsigned macToMad(unsigned Opc) {
switch (Opc) {
@@ -341,14 +353,17 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
// to fold the operand.
MI->setDesc(TII->get(NewOpc));
- if (!AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
- AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel))
+ bool AddOpSel = !AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel) &&
+ AMDGPU::hasNamedOperand(NewOpc, AMDGPU::OpName::op_sel);
+ if (AddOpSel)
MI->addOperand(MachineOperand::CreateImm(0));
bool FoldAsMAD = tryAddToFoldList(FoldList, MI, OpNo, OpToFold);
if (FoldAsMAD) {
MI->untieRegOperand(OpNo);
return true;
}
+ if (AddOpSel)
+ MI->removeOperand(MI->getNumExplicitOperands() - 1);
MI->setDesc(TII->get(Opc));
}
@@ -893,11 +908,10 @@ void SIFoldOperands::foldOperand(
TRI->getRegClass(FoldDesc.operands()[0].RegClass);
// Split 64-bit constants into 32-bits for folding.
- if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
+ if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(*FoldRC) == 64) {
Register UseReg = UseOp.getReg();
const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
-
- if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
+ if (AMDGPU::getRegBitWidth(*UseRC) != 64)
return;
APInt Imm(64, OpToFold.getImm());
@@ -1628,52 +1642,175 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
return true;
}
-// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
-// This should allow folding of an AGPR into a consumer which may support it.
-// I.e.:
-//
-// loop: // loop:
-// %1:vreg = COPY %0:areg // exit:
-// exit: => // %1:areg = PHI %0:areg, %loop
-// %2:vreg = PHI %1:vreg, %loop // %2:vreg = COPY %1:areg
-bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
- assert(PHI.isPHI());
+/// Checks whether \p Copy is a AGPR -> VGPR copy. Returns `true` on success and
+/// stores the AGPR register in \p OutReg and the subreg in \p OutSubReg
+static bool isAGPRCopy(const SIRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI, const MachineInstr &Copy,
+ Register &OutReg, unsigned &OutSubReg) {
+ assert(Copy.isCopy());
- if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
+ const MachineOperand &CopySrc = Copy.getOperand(1);
+ Register CopySrcReg = CopySrc.getReg();
+ if (!CopySrcReg.isVirtual())
return false;
- Register PhiIn = PHI.getOperand(1).getReg();
- Register PhiOut = PHI.getOperand(0).getReg();
- if (PHI.getOperand(1).getSubReg() ||
- !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
+ // Common case: copy from AGPR directly, e.g.
+ // %1:vgpr_32 = COPY %0:agpr_32
+ if (TRI.isAGPR(MRI, CopySrcReg)) {
+ OutReg = CopySrcReg;
+ OutSubReg = CopySrc.getSubReg();
+ return true;
+ }
+
+ // Sometimes it can also involve two copies, e.g.
+ // %1:vgpr_256 = COPY %0:agpr_256
+ // %2:vgpr_32 = COPY %1:vgpr_256.sub0
+ const MachineInstr *CopySrcDef = MRI.getVRegDef(CopySrcReg);
+ if (!CopySrcDef || !CopySrcDef->isCopy())
return false;
- // A single use should not matter for correctness, but if it has another use
- // inside the loop we may perform copy twice in a worst case.
- if (!MRI->hasOneNonDBGUse(PhiIn))
+ const MachineOperand &OtherCopySrc = CopySrcDef->getOperand(1);
+ Register OtherCopySrcReg = OtherCopySrc.getReg();
+ if (!OtherCopySrcReg.isVirtual() ||
+ CopySrcDef->getOperand(0).getSubReg() != AMDGPU::NoSubRegister ||
+ OtherCopySrc.getSubReg() != AMDGPU::NoSubRegister ||
+ !TRI.isAGPR(MRI, OtherCopySrcReg))
return false;
- MachineInstr *Copy = MRI->getVRegDef(PhiIn);
- if (!Copy || !Copy->isCopy())
+ OutReg = OtherCopySrcReg;
+ OutSubReg = CopySrc.getSubReg();
+ return true;
+}
+
+// Try to hoist an AGPR to VGPR copy across a PHI.
+// This should allow folding of an AGPR into a consumer which may support it.
+//
+// Example 1: LCSSA PHI
+// loop:
+// %1:vreg = COPY %0:areg
+// exit:
+// %2:vreg = PHI %1:vreg, %loop
+// =>
+// loop:
+// exit:
+// %1:areg = PHI %0:areg, %loop
+// %2:vreg = COPY %1:areg
+//
+// Example 2: PHI with multiple incoming values:
+// entry:
+// %1:vreg = GLOBAL_LOAD(..)
+// loop:
+// %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
+// %3:areg = COPY %2:vreg
+// %4:areg = (instr using %3:areg)
+// %5:vreg = COPY %4:areg
+// =>
+// entry:
+// %1:vreg = GLOBAL_LOAD(..)
+// %2:areg = COPY %1:vreg
+// loop:
+// %3:areg = PHI %2:areg, %entry, %X:areg,
+// %4:areg = (instr using %3:areg)
+bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
+ assert(PHI.isPHI());
+
+ Register PhiOut = PHI.getOperand(0).getReg();
+ if (!TRI->isVGPR(*MRI, PhiOut))
return false;
- Register CopyIn = Copy->getOperand(1).getReg();
- if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
+ // Iterate once over all incoming values of the PHI to check if this PHI is
+ // eligible, and determine the exact AGPR RC we'll target.
+ const TargetRegisterClass *ARC = nullptr;
+ for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
+ MachineOperand &MO = PHI.getOperand(K);
+ MachineInstr *Copy = MRI->getVRegDef(MO.getReg());
+ if (!Copy || !Copy->isCopy())
+ continue;
+
+ Register AGPRSrc;
+ unsigned AGPRRegMask = AMDGPU::NoSubRegister;
+ if (!isAGPRCopy(*TRI, *MRI, *Copy, AGPRSrc, AGPRRegMask))
+ continue;
+
+ const TargetRegisterClass *CopyInRC = MRI->getRegClass(AGPRSrc);
+ if (const auto *SubRC = TRI->getSubRegisterClass(CopyInRC, AGPRRegMask))
+ CopyInRC = SubRC;
+
+ if (ARC && !ARC->hasSubClassEq(CopyInRC))
+ return false;
+ ARC = CopyInRC;
+ }
+
+ if (!ARC)
return false;
- const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
+ bool IsAGPR32 = (ARC == &AMDGPU::AGPR_32RegClass);
+
+ // Rewrite the PHI's incoming values to ARC.
+ LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
+ for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
+ MachineOperand &MO = PHI.getOperand(K);
+ Register Reg = MO.getReg();
+
+ MachineBasicBlock::iterator InsertPt;
+ MachineBasicBlock *InsertMBB = nullptr;
+
+ // Look at the def of Reg, ignoring all copies.
+ unsigned CopyOpc = AMDGPU::COPY;
+ if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
+
+ // Look at pre-existing COPY instructions from ARC: Steal the operand. If
+ // the copy was single-use, it will be removed by DCE later.
+ if (Def->isCopy()) {
+ Register AGPRSrc;
+ unsigned AGPRSubReg = AMDGPU::NoSubRegister;
+ if (isAGPRCopy(*TRI, *MRI, *Def, AGPRSrc, AGPRSubReg)) {
+ MO.setReg(AGPRSrc);
+ MO.setSubReg(AGPRSubReg);
+ continue;
+ }
+
+ // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
+ // GFX908 directly instead of a COPY. Otherwise, SIFoldOperand may try
+ // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
+ // is unlikely to be profitable.
+ //
+ // Note that V_ACCVGPR_WRITE is only used for AGPR_32.
+ MachineOperand &CopyIn = Def->getOperand(1);
+ if (IsAGPR32 && !ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
+ TRI->isSGPRReg(*MRI, CopyIn.getReg()))
+ CopyOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64;
+ }
+
+ InsertMBB = Def->getParent();
+ InsertPt = InsertMBB->SkipPHIsLabelsAndDebug(++Def->getIterator());
+ } else {
+ InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
+ InsertPt = InsertMBB->getFirstTerminator();
+ }
+
+ Register NewReg = MRI->createVirtualRegister(ARC);
+ MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
+ TII->get(CopyOpc), NewReg)
+ .addReg(Reg);
+ MO.setReg(NewReg);
+
+ (void)MI;
+ LLVM_DEBUG(dbgs() << " Created COPY: " << *MI);
+ }
+
+ // Replace the PHI's result with a new register.
Register NewReg = MRI->createVirtualRegister(ARC);
- PHI.getOperand(1).setReg(CopyIn);
PHI.getOperand(0).setReg(NewReg);
+ // COPY that new register back to the original PhiOut register. This COPY will
+ // usually be folded out later.
MachineBasicBlock *MBB = PHI.getParent();
- BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
+ BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
TII->get(AMDGPU::COPY), PhiOut)
- .addReg(NewReg, RegState::Kill);
- Copy->eraseFromParent(); // We know this copy had a single use.
-
- LLVM_DEBUG(dbgs() << "Folded " << PHI);
+ .addReg(NewReg);
+ LLVM_DEBUG(dbgs() << " Done: Folded " << PHI);
return true;
}
@@ -1733,6 +1870,101 @@ bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
return true;
}
+// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
+// For GFX90A and later, this is pretty much always a good thing, but for GFX908
+// there's cases where it can create a lot more AGPR-AGPR copies, which are
+// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
+//
+// This function looks at all AGPR PHIs in a basic block and collects their
+// operands. Then, it checks for register that are used more than once across
+// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
+// having to create one VGPR temporary per use, which can get very messy if
+// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
+// element).
+//
+// Example
+// a:
+// %in:agpr_256 = COPY %foo:vgpr_256
+// c:
+// %x:agpr_32 = ..
+// b:
+// %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
+// %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
+// %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
+// =>
+// a:
+// %in:agpr_256 = COPY %foo:vgpr_256
+// %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
+// %tmp_agpr:agpr_32 = COPY %tmp
+// c:
+// %x:agpr_32 = ..
+// b:
+// %0:areg = PHI %tmp_agpr, %a, %x, %c
+// %1:areg = PHI %tmp_agpr, %a, %y, %c
+// %2:areg = PHI %tmp_agpr, %a, %z, %c
+bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
+ // This is only really needed on GFX908 where AGPR-AGPR copies are
+ // unreasonably difficult.
+ if (ST->hasGFX90AInsts())
+ return false;
+
+ // Look at all AGPR Phis and collect the register + subregister used.
+ DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
+ RegToMO;
+
+ for (auto &MI : MBB) {
+ if (!MI.isPHI())
+ break;
+
+ if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
+ continue;
+
+ for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
+ MachineOperand &PhiMO = MI.getOperand(K);
+ RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
+ }
+ }
+
+ // For all (Reg, SubReg) pair that are used more than once, cache the value in
+ // a VGPR.
+ bool Changed = false;
+ for (const auto &[Entry, MOs] : RegToMO) {
+ if (MOs.size() == 1)
+ continue;
+
+ const auto [Reg, SubReg] = Entry;
+ MachineInstr *Def = MRI->getVRegDef(Reg);
+ MachineBasicBlock *DefMBB = Def->getParent();
+
+ // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
+ // out.
+ const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
+ Register TempVGPR =
+ MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
+ MachineInstr *VGPRCopy =
+ BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
+ TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
+ .addReg(Reg, /* flags */ 0, SubReg);
+
+ // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
+ Register TempAGPR = MRI->createVirtualRegister(ARC);
+ BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
+ TII->get(AMDGPU::COPY), TempAGPR)
+ .addReg(TempVGPR);
+
+ LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
+ for (MachineOperand *MO : MOs) {
+ MO->setReg(TempAGPR);
+ MO->setSubReg(AMDGPU::NoSubRegister);
+ LLVM_DEBUG(dbgs() << " Changed PHI Operand: " << *MO << "\n");
+ }
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -1766,7 +1998,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- if (MI.isPHI() && tryFoldLCSSAPhi(MI)) {
+ if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
Changed = true;
continue;
}
@@ -1791,6 +2023,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
!tryFoldOMod(MI))
Changed |= tryFoldClamp(MI);
}
+
+ Changed |= tryOptimizeAGPRPhis(*MBB);
}
return Changed;
diff --git a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index a1eb8150595f..edcfd994033e 100644
--- a/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -119,9 +119,7 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
// If this is a load instruction where the result has been coalesced with an operand, then we cannot clause it.
for (const MachineOperand &ResMO : MI.defs()) {
Register ResReg = ResMO.getReg();
- for (const MachineOperand &MO : MI.uses()) {
- if (!MO.isReg() || MO.isDef())
- continue;
+ for (const MachineOperand &MO : MI.all_uses()) {
if (MO.getReg() == ResReg)
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index c2bc95930272..865caae240f3 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -64,9 +64,12 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
return MCRegister();
}
+/// Query target location for spilling SGPRs
+/// \p IncludeScratchCopy : Also look for free scratch SGPRs
static void getVGPRSpillLaneOrTempRegister(
MachineFunction &MF, LivePhysRegs &LiveRegs, Register SGPR,
- const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass) {
+ const TargetRegisterClass &RC = AMDGPU::SReg_32_XM0_XEXECRegClass,
+ bool IncludeScratchCopy = true) {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
@@ -77,9 +80,12 @@ static void getVGPRSpillLaneOrTempRegister(
// We need to save and restore the given SGPR.
+ Register ScratchSGPR;
// 1: Try to save the given register into an unused scratch SGPR. The LiveRegs
- // should have all the callee saved registers marked as used.
- Register ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC);
+ // should have all the callee saved registers marked as used. For certain
+ // cases we skip copy to scratch SGPR.
+ if (IncludeScratchCopy)
+ ScratchSGPR = findUnusedRegister(MF.getRegInfo(), LiveRegs, RC);
if (!ScratchSGPR) {
int FI = FrameInfo.CreateStackObject(Size, Alignment, true, nullptr,
@@ -93,10 +99,10 @@ static void getVGPRSpillLaneOrTempRegister(
SGPR, PrologEpilogSGPRSaveRestoreInfo(
SGPRSaveKind::SPILL_TO_VGPR_LANE, FI));
- LLVM_DEBUG(
- auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front();
- dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
- << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';);
+ LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front();
+ dbgs() << printReg(SGPR, TRI) << " requires fallback spill to "
+ << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+ << '\n';);
} else {
// Remove dead <FI> index
MF.getFrameInfo().RemoveStackObject(FI);
@@ -258,7 +264,7 @@ class PrologEpilogSGPRSpillBuilder {
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
- FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
+ FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
assert(Spill.size() == NumSubRegs);
for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -303,7 +309,7 @@ class PrologEpilogSGPRSpillBuilder {
void restoreFromVGPRLane(const int FI) {
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
ArrayRef<SIRegisterInfo::SpilledReg> Spill =
- FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI);
+ FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI);
assert(Spill.size() == NumSubRegs);
for (unsigned I = 0; I < NumSubRegs; ++I) {
@@ -565,7 +571,7 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
// reserved input we needed. Also for PAL, make sure we don't clobber
// the GIT pointer passed in SGPR0 or SGPR8.
if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
- !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
+ (!GITPtrLoReg || !TRI->isSubRegisterEq(Reg, GITPtrLoReg))) {
MRI.replaceRegWith(ScratchRsrcReg, Reg);
MFI->setScratchRSrcReg(Reg);
return Reg;
@@ -935,8 +941,7 @@ void SIFrameLowering::emitCSRSpillStores(
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+ BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
} else {
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
/*IsProlog*/ true,
@@ -948,8 +953,7 @@ void SIFrameLowering::emitCSRSpillStores(
if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+ BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
.addReg(ScratchExecCopy, RegState::Kill);
LiveRegs.addReg(ScratchExecCopy);
}
@@ -1040,8 +1044,7 @@ void SIFrameLowering::emitCSRSpillRestores(
if (!WWMCalleeSavedRegs.empty()) {
if (ScratchExecCopy) {
unsigned MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+ BuildMI(MBB, MBBI, DL, TII->get(MovOpc), TRI.getExec()).addImm(-1);
} else {
ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, DL,
/*IsProlog*/ false,
@@ -1053,8 +1056,7 @@ void SIFrameLowering::emitCSRSpillRestores(
if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
+ BuildMI(MBB, MBBI, DL, TII->get(ExecMov), TRI.getExec())
.addReg(ScratchExecCopy, RegState::Kill);
}
}
@@ -1350,8 +1352,9 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
TRI->isAGPR(MRI, VReg))) {
- // FIXME: change to enterBasicBlockEnd()
- RS->enterBasicBlock(MBB);
+ assert(RS != nullptr);
+ RS->enterBasicBlockEnd(MBB);
+ RS->backward(MI);
TRI->eliminateFrameIndex(MI, 0, FIOp, RS);
SpillFIs.set(FI);
continue;
@@ -1436,20 +1439,36 @@ void SIFrameLowering::processFunctionBeforeFrameIndicesReplaced(
TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
if (UnusedLowVGPR && (TRI->getHWRegIndex(UnusedLowVGPR) <
TRI->getHWRegIndex(VGPRForAGPRCopy))) {
- // Call to setVGPRForAGPRCopy() should happen first before calling
- // freezeReservedRegs() so that getReservedRegs() can reserve this newly
- // identified VGPR (for AGPR copy).
+ // Reserve this newly identified VGPR (for AGPR copy)
+ // reserved registers should already be frozen at this point
+ // so we can avoid calling MRI.freezeReservedRegs and just use
+ // MRI.reserveReg
FuncInfo->setVGPRForAGPRCopy(UnusedLowVGPR);
- MRI.freezeReservedRegs(MF);
+ MRI.reserveReg(UnusedLowVGPR, TRI);
}
}
+ // We initally reserved the highest available SGPR pair for long branches
+ // now, after RA, we shift down to a lower unused one if one exists
+ Register LongBranchReservedReg = FuncInfo->getLongBranchReservedReg();
+ Register UnusedLowSGPR =
+ TRI->findUnusedRegister(MRI, &AMDGPU::SGPR_64RegClass, MF);
+ // If LongBranchReservedReg is null then we didn't find a long branch
+ // and never reserved a register to begin with so there is nothing to
+ // shift down. Then if UnusedLowSGPR is null, there isn't available lower
+ // register to use so just keep the original one we set.
+ if (LongBranchReservedReg && UnusedLowSGPR) {
+ FuncInfo->setLongBranchReservedReg(UnusedLowSGPR);
+ MRI.reserveReg(UnusedLowSGPR, TRI);
+ }
}
// The special SGPR spills like the one needed for FP, BP or any reserved
// registers delayed until frame lowering.
void SIFrameLowering::determinePrologEpilogSGPRSaves(
- MachineFunction &MF, BitVector &SavedVGPRs) const {
+ MachineFunction &MF, BitVector &SavedVGPRs,
+ bool NeedExecCopyReservedReg) const {
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -1461,6 +1480,26 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
for (unsigned I = 0; CSRegs[I]; ++I)
LiveRegs.addReg(CSRegs[I]);
+ const TargetRegisterClass &RC = *TRI->getWaveMaskRegClass();
+
+ if (NeedExecCopyReservedReg) {
+ Register ReservedReg = MFI->getSGPRForEXECCopy();
+ assert(ReservedReg && "Should have reserved an SGPR for EXEC copy.");
+ Register UnusedScratchReg = findUnusedRegister(MRI, LiveRegs, RC);
+ if (UnusedScratchReg) {
+ // If found any unused scratch SGPR, reserve the register itself for Exec
+ // copy and there is no need for any spills in that case.
+ MFI->setSGPRForEXECCopy(UnusedScratchReg);
+ LiveRegs.addReg(UnusedScratchReg);
+ } else {
+ // Needs spill.
+ assert(!MFI->hasPrologEpilogSGPRSpillEntry(ReservedReg) &&
+ "Re-reserving spill slot for EXEC copy register");
+ getVGPRSpillLaneOrTempRegister(MF, LiveRegs, ReservedReg, RC,
+ /*IncludeScratchCopy=*/false);
+ }
+ }
+
// hasFP only knows about stack objects that already exist. We're now
// determining the stack slots that will be created, so we have to predict
// them. Stack objects force FP usage with calls.
@@ -1499,7 +1538,10 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ bool NeedExecCopyReservedReg = false;
+ MachineInstr *ReturnMI = nullptr;
for (MachineBasicBlock &MBB : MF) {
for (MachineInstr &MI : MBB) {
// WRITELANE instructions used for SGPR spills can overwrite the inactive
@@ -1516,6 +1558,25 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
MFI->allocateWWMSpill(MF, MI.getOperand(0).getReg());
else if (MI.getOpcode() == AMDGPU::V_READLANE_B32)
MFI->allocateWWMSpill(MF, MI.getOperand(1).getReg());
+ else if (TII->isWWMRegSpillOpcode(MI.getOpcode()))
+ NeedExecCopyReservedReg = true;
+ else if (MI.getOpcode() == AMDGPU::SI_RETURN ||
+ MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
+ // We expect all return to be the same size.
+ assert(!ReturnMI ||
+ (count_if(MI.operands(), [](auto Op) { return Op.isReg(); }) ==
+ count_if(ReturnMI->operands(), [](auto Op) { return Op.isReg(); })));
+ ReturnMI = &MI;
+ }
+ }
+ }
+
+ // Remove any VGPRs used in the return value because these do not need to be saved.
+ // This prevents CSR restore from clobbering return VGPRs.
+ if (ReturnMI) {
+ for (auto &Op : ReturnMI->operands()) {
+ if (Op.isReg())
+ SavedVGPRs.reset(Op.getReg());
}
}
@@ -1528,7 +1589,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (!ST.hasGFX90AInsts())
SavedVGPRs.clearBitsInMask(TRI->getAllAGPRRegMask());
- determinePrologEpilogSGPRSaves(MF, SavedVGPRs);
+ determinePrologEpilogSGPRSaves(MF, SavedVGPRs, NeedExecCopyReservedReg);
// The Whole-Wave VGPRs need to be specially inserted in the prolog, so don't
// allow the default insertion to handle them.
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index def07dc4b1f7..0060fc0be431 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -34,8 +34,8 @@ public:
RegScavenger *RS = nullptr) const override;
void determineCalleeSavesSGPR(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const;
- void determinePrologEpilogSGPRSaves(MachineFunction &MF,
- BitVector &SavedRegs) const;
+ void determinePrologEpilogSGPRSaves(MachineFunction &MF, BitVector &SavedRegs,
+ bool NeedExecCopyReservedReg) const;
void emitCSRSpillStores(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, DebugLoc &DL,
LivePhysRegs &LiveRegs, Register FrameReg,
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e0ad11d5af24..3148f49ff0d5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,14 +15,17 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
@@ -35,8 +38,9 @@
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ModRef.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/ModRef.h"
+#include <optional>
using namespace llvm;
@@ -55,14 +59,14 @@ static cl::opt<bool> UseDivergentRegisterIndexing(
cl::desc("Use indirect register addressing for divergent indexes"),
cl::init(false));
-static bool hasFP32Denormals(const MachineFunction &MF) {
+static bool denormalModeIsFlushAllF32(const MachineFunction &MF) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- return Info->getMode().allFP32Denormals();
+ return Info->getMode().FP32Denormals == DenormalMode::getPreserveSign();
}
-static bool hasFP64FP16Denormals(const MachineFunction &MF) {
+static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- return Info->getMode().allFP64FP16Denormals();
+ return Info->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign();
}
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
@@ -215,6 +219,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
+ setOperationAction(ISD::FSQRT, MVT::f64, Custom);
+
setOperationAction(ISD::SELECT_CC,
{MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
@@ -244,13 +250,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::UADDO, ISD::USUBO}, MVT::i32, Legal);
- setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i32, Legal);
+ setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i32, Legal);
setOperationAction({ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS}, MVT::i64,
Expand);
#if 0
- setOperationAction({ISD::ADDCARRY, ISD::SUBCARRY}, MVT::i64, Legal);
+ setOperationAction({ISD::UADDO_CARRY, ISD::USUBO_CARRY}, MVT::i64, Legal);
#endif
// We only support LOAD/STORE and vector manipulation ops for vectors
@@ -470,6 +476,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::f64, Custom);
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+ setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
+ Legal);
+ setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
setOperationAction({ISD::FSIN, ISD::FCOS, ISD::FDIV}, MVT::f32, Custom);
setOperationAction(ISD::FDIV, MVT::f64, Custom);
@@ -514,9 +523,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
// F16 - VOP1 Actions.
- setOperationAction(
- {ISD::FP_ROUND, ISD::FCOS, ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
- MVT::f16, Custom);
+ setOperationAction({ISD::FP_ROUND, ISD::STRICT_FP_ROUND, ISD::FCOS,
+ ISD::FSIN, ISD::FROUND, ISD::FPTRUNC_ROUND},
+ MVT::f16, Custom);
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
@@ -526,7 +535,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP2 Actions.
setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
-
+ setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, MVT::f16, Custom);
+ setOperationAction(ISD::FFREXP, MVT::f16, Custom);
setOperationAction(ISD::FDIV, MVT::f16, Custom);
// F16 - VOP3 Actions.
@@ -728,25 +738,25 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
- MVT::v2i16, MVT::v2f16},
+ MVT::v2i16, MVT::v2f16, MVT::i128},
Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN,
{MVT::v2f16, MVT::v2i16, MVT::v3f16, MVT::v3i16,
MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::Other, MVT::f16,
- MVT::i16, MVT::i8},
+ MVT::i16, MVT::i8, MVT::i128},
Custom);
setOperationAction(ISD::INTRINSIC_VOID,
{MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v3i16,
MVT::v3f16, MVT::v4f16, MVT::v4i16, MVT::f16, MVT::i16,
- MVT::i8},
+ MVT::i8, MVT::i128},
Custom);
setTargetDAGCombine({ISD::ADD,
- ISD::ADDCARRY,
+ ISD::UADDO_CARRY,
ISD::SUB,
- ISD::SUBCARRY,
+ ISD::USUBO_CARRY,
ISD::FADD,
ISD::FSUB,
ISD::FMINNUM,
@@ -769,7 +779,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ZERO_EXTEND,
ISD::SIGN_EXTEND_INREG,
ISD::EXTRACT_VECTOR_ELT,
- ISD::INSERT_VECTOR_ELT});
+ ISD::INSERT_VECTOR_ELT,
+ ISD::FCOPYSIGN});
+
+ if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
+ setTargetDAGCombine(ISD::FP_ROUND);
// All memory operations. Some folding on the pointer operand is done to help
// matching the constant offsets in the addressing modes.
@@ -791,6 +805,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_UMIN,
ISD::ATOMIC_LOAD_UMAX,
ISD::ATOMIC_LOAD_FADD,
+ ISD::ATOMIC_LOAD_UINC_WRAP,
+ ISD::ATOMIC_LOAD_UDEC_WRAP,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_W_CHAIN});
@@ -816,10 +832,10 @@ bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
EVT DestVT, EVT SrcVT) const {
return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
(Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
- DestVT.getScalarType() == MVT::f32 &&
- SrcVT.getScalarType() == MVT::f16 &&
- // TODO: This probably only requires no input flushing?
- !hasFP32Denormals(DAG.getMachineFunction());
+ DestVT.getScalarType() == MVT::f32 &&
+ SrcVT.getScalarType() == MVT::f16 &&
+ // TODO: This probably only requires no input flushing?
+ denormalModeIsFlushAllF32(DAG.getMachineFunction());
}
bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
@@ -829,7 +845,7 @@ bool SITargetLowering::isFPExtFoldable(const MachineInstr &MI, unsigned Opcode,
DestTy.getScalarSizeInBits() == 32 &&
SrcTy.getScalarSizeInBits() == 16 &&
// TODO: This probably only requires no input flushing?
- !hasFP32Denormals(*MI.getMF());
+ denormalModeIsFlushAllF32(*MI.getMF());
}
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
@@ -976,6 +992,26 @@ static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
return memVTFromLoadIntrData(ST->getContainedType(0), MaxNumLanes);
}
+/// Map address space 7 to MVT::v5i32 because that's its in-memory
+/// representation. This return value is vector-typed because there is no
+/// MVT::i160 and it is not clear if one can be added. While this could
+/// cause issues during codegen, these address space 7 pointers will be
+/// rewritten away by then. Therefore, we can return MVT::v5i32 in order
+/// to allow pre-codegen passes that query TargetTransformInfo, often for cost
+/// modeling, to work.
+MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
+ if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
+ return MVT::v5i32;
+ return AMDGPUTargetLowering::getPointerTy(DL, AS);
+}
+/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
+/// v8i32 when padding is added.
+MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
+ if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
+ return MVT::v8i32;
+ return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
+}
+
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &CI,
MachineFunction &MF,
@@ -993,11 +1029,22 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
// TODO: Should images get their own address space?
- Info.fallbackAddressSpace = AMDGPUAS::BUFFER_FAT_POINTER;
+ Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
if (RsrcIntr->IsImage)
Info.align.reset();
+ Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
+ if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
+ if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
+ // We conservatively set the memory operand of a buffer intrinsic to the
+ // base resource pointer, so that we can access alias information about
+ // those pointers. Cases like "this points at the same value
+ // but with a different offset" are handled in
+ // areMemAccessesTriviallyDisjoint.
+ Info.ptrVal = RsrcArg;
+ }
+
Info.flags |= MachineMemOperand::MODereferenceable;
if (ME.onlyReadsMemory()) {
unsigned MaxNumLanes = 4;
@@ -1050,7 +1097,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
default:
break;
case Intrinsic::amdgcn_raw_buffer_load_lds:
- case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
return true;
@@ -1061,8 +1110,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
switch (IntrID) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_fadd:
@@ -1083,7 +1130,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_buffer_atomic_fadd: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
- Info.fallbackAddressSpace = AMDGPUAS::BUFFER_FAT_POINTER;
+ Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
Info.align.reset();
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
@@ -1093,6 +1140,15 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
+ case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
+ Info.ptrVal = nullptr;
+ Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ return true;
+ }
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1121,7 +1177,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
- Info.fallbackAddressSpace = AMDGPUAS::BUFFER_FAT_POINTER;
+ Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
Info.align.reset();
Info.flags |= MachineMemOperand::MOLoad |
MachineMemOperand::MODereferenceable;
@@ -1204,8 +1260,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
SmallVectorImpl<Value*> &Ops,
Type *&AccessTy) const {
switch (II->getIntrinsicID()) {
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_append:
@@ -1313,7 +1367,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
- AS == AMDGPUAS::BUFFER_FAT_POINTER) {
+ AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -1336,12 +1390,16 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// in 8-bits, it can use a smaller encoding.
if (!isUInt<32>(AM.BaseOffs / 4))
return false;
- } else if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
+ } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
- } else
- llvm_unreachable("unhandled generation");
+ } else {
+ // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
+ // for S_BUFFER_* instructions).
+ if (!isInt<21>(AM.BaseOffs))
+ return false;
+ }
if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
return true;
@@ -1350,11 +1408,12 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return true;
return false;
+ }
- } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
+ if (AS == AMDGPUAS::PRIVATE_ADDRESS)
return isLegalMUBUFAddressingMode(AM);
- } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
- AS == AMDGPUAS::REGION_ADDRESS) {
+
+ if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
// field.
// XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
@@ -1369,8 +1428,9 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
return true;
return false;
- } else if (AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
+ }
+
+ if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
// For an unknown address space, this usually means that this is for some
// reason being used for pure arithmetic, and not based on some addressing
// computation. We don't have instructions that compute pointers with any
@@ -1544,18 +1604,14 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
return AlignedBy4;
}
- if (Subtarget->hasUnalignedBufferAccessEnabled()) {
- // If we have a uniform constant load, it still requires using a slow
- // buffer instruction if unaligned.
- if (IsFast) {
- // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
- // 2-byte alignment is worse than 1 unless doing a 2-byte access.
- *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
- AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
- Alignment >= Align(4) : Alignment != Align(2);
- }
+ // So long as they are correct, wide global memory operations perform better
+ // than multiple smaller memory ops -- even when misaligned
+ if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
+ if (IsFast)
+ *IsFast = Size;
- return true;
+ return Alignment >= Align(4) ||
+ Subtarget->hasUnalignedBufferAccessEnabled();
}
// Smaller than dword value must be aligned.
@@ -1864,7 +1920,7 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
return DAG.getUNDEF(VT);
}
- return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
+ return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
}
static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
@@ -2082,7 +2138,9 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasDispatchPtr())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
- if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5)
+ const Module *M = MF.getFunction().getParent();
+ if (Info.hasQueuePtr() &&
+ AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5)
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
@@ -2132,7 +2190,9 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
CCInfo.AllocateReg(DispatchPtrReg);
}
- if (Info.hasQueuePtr() && AMDGPU::getAmdhsaCodeObjectVersion() < 5) {
+ const Module *M = MF.getFunction().getParent();
+ if (Info.hasQueuePtr() &&
+ AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
@@ -2175,11 +2235,16 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
SIMachineFunctionInfo &Info,
CallingConv::ID CallConv,
bool IsShader) const {
+ bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
if (Subtarget->hasUserSGPRInit16Bug() && !IsShader) {
// Note: user SGPRs are handled by the front-end for graphics shaders
// Pad up the used user SGPRs with dead inputs.
- unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
+ // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
+ // before enabling architected SGPRs for workgroup IDs.
+ assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
+
+ unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
// Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
// rely on it to reach 16 since if we end up having no stack usage, it will
// not really be added.
@@ -2195,20 +2260,26 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
}
if (Info.hasWorkGroupIDX()) {
- Register Reg = Info.addWorkGroupIDX();
- MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs);
+ if (!HasArchitectedSGPRs)
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDY()) {
- Register Reg = Info.addWorkGroupIDY();
- MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs);
+ if (!HasArchitectedSGPRs)
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDZ()) {
- Register Reg = Info.addWorkGroupIDZ();
- MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+ Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs);
+ if (!HasArchitectedSGPRs)
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
+
CCInfo.AllocateReg(Reg);
}
@@ -2395,8 +2466,6 @@ SDValue SITargetLowering::LowerFormalArguments(
return DAG.getEntryNode();
}
- Info->allocateKnownAddressLDSGlobal(Fn);
-
SmallVector<ISD::InputArg, 16> Splits;
SmallVector<CCValAssign, 16> ArgLocs;
BitVector Skipped(Ins.size());
@@ -2409,11 +2478,14 @@ SDValue SITargetLowering::LowerFormalArguments(
if (IsGraphics) {
assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
- (!Info->hasFlatScratchInit() || Subtarget->enableFlatScratch()) &&
- !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
- !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
- !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
- !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
+ !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() &&
+ !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+ !Info->hasWorkItemIDZ());
+ if (!Subtarget->enableFlatScratch())
+ assert(!Info->hasFlatScratchInit());
+ if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())
+ assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+ !Info->hasWorkGroupIDZ());
}
if (CallConv == CallingConv::AMDGPU_PS) {
@@ -2451,7 +2523,7 @@ SDValue SITargetLowering::LowerFormalArguments(
unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
if ((PsInputBits & 0x7F) == 0 ||
((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
- Info->markPSInputEnabled(countTrailingZeros(Info->getPSInputAddr()));
+ Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
}
} else if (IsKernel) {
assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
@@ -2610,7 +2682,7 @@ SDValue SITargetLowering::LowerFormalArguments(
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
ArgUsageInfo.setFuncArgInfo(Fn, Info->getArgInfo());
- unsigned StackArgSize = CCInfo.getNextStackOffset();
+ unsigned StackArgSize = CCInfo.getStackSize();
Info->setBytesInStackArgArea(StackArgSize);
return Chains.empty() ? Chain :
@@ -2632,7 +2704,17 @@ bool SITargetLowering::CanLowerReturn(
SmallVector<CCValAssign, 16> RVLocs;
CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
- return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg));
+ if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
+ return false;
+
+ // We must use the stack if return would require unavailable registers.
+ unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
+ unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
+ for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
+ if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
+ return false;
+
+ return true;
}
SDValue
@@ -2665,7 +2747,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Analyze outgoing return values.
CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
- SDValue Flag;
+ SDValue Glue;
SmallVector<SDValue, 48> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
@@ -2697,8 +2779,8 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
llvm_unreachable("Unknown loc info!");
}
- Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
- Flag = Chain.getValue(1);
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
+ Glue = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
@@ -2721,17 +2803,17 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Update chain and glue.
RetOps[0] = Chain;
- if (Flag.getNode())
- RetOps.push_back(Flag);
+ if (Glue.getNode())
+ RetOps.push_back(Glue);
unsigned Opc = AMDGPUISD::ENDPGM;
if (!IsWaveEnd)
- Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
+ Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_GLUE;
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
SDValue SITargetLowering::LowerCallResult(
- SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool IsVarArg,
+ SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
SDValue ThisVal) const {
@@ -2749,9 +2831,9 @@ SDValue SITargetLowering::LowerCallResult(
SDValue Val;
if (VA.isRegLoc()) {
- Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
+ Val = DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
Chain = Val.getValue(1);
- InFlag = Val.getValue(2);
+ InGlue = Val.getValue(2);
} else if (VA.isMemLoc()) {
report_fatal_error("TODO: return values in memory");
} else
@@ -3066,7 +3148,7 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
// If the stack arguments for this call do not fit into our own save area then
// the call cannot be made tail.
// TODO: Is this really necessary?
- if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
+ if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
return false;
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -3122,21 +3204,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported required tail call to function ");
}
- if (AMDGPU::isShader(CallConv)) {
- // Note the issue is with the CC of the called function, not of the call
- // itself.
- return lowerUnhandledCall(CLI, InVals,
- "unsupported call to a shader function ");
- }
-
- if (AMDGPU::isShader(MF.getFunction().getCallingConv()) &&
- CallConv != CallingConv::AMDGPU_Gfx) {
- // Only allow calls with specific calling conventions.
- return lowerUnhandledCall(CLI, InVals,
- "unsupported calling convention for call from "
- "graphics shader of function ");
- }
-
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -3173,7 +3240,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
- unsigned NumBytes = CCInfo.getNextStackOffset();
+ unsigned NumBytes = CCInfo.getStackSize();
if (IsSibCall) {
// Since we're not changing the ABI to make this a tail call, the memory
@@ -3309,11 +3376,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
- SDValue InFlag;
+ SDValue InGlue;
for (auto &RegToPass : RegsToPass) {
Chain = DAG.getCopyToReg(Chain, DL, RegToPass.first,
- RegToPass.second, InFlag);
- InFlag = Chain.getValue(1);
+ RegToPass.second, InGlue);
+ InGlue = Chain.getValue(1);
}
@@ -3322,8 +3389,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// we've carefully laid out the parameters so that when sp is reset they'll be
// in the correct location.
if (IsTailCall && !IsSibCall) {
- Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InFlag, DL);
- InFlag = Chain.getValue(1);
+ Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
+ InGlue = Chain.getValue(1);
}
std::vector<SDValue> Ops;
@@ -3359,8 +3426,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
- if (InFlag.getNode())
- Ops.push_back(InFlag);
+ if (InGlue.getNode())
+ Ops.push_back(InGlue);
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
@@ -3368,22 +3435,24 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// actual call instruction.
if (IsTailCall) {
MFI.setHasTailCall();
- return DAG.getNode(AMDGPUISD::TC_RETURN, DL, NodeTys, Ops);
+ unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ?
+ AMDGPUISD::TC_RETURN_GFX : AMDGPUISD::TC_RETURN;
+ return DAG.getNode(OPC, DL, NodeTys, Ops);
}
// Returns a chain and a flag for retval copy to use.
SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, NodeTys, Ops);
Chain = Call.getValue(0);
- InFlag = Call.getValue(1);
+ InGlue = Call.getValue(1);
uint64_t CalleePopBytes = NumBytes;
- Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InFlag, DL);
+ Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
if (!Ins.empty())
- InFlag = Chain.getValue(1);
+ InGlue = Chain.getValue(1);
// Handle result values, copying them out of physregs into vregs that we
// return.
- return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+ return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
InVals, IsThisReturn,
IsThisReturn ? OutVals[0] : SDValue());
}
@@ -4000,6 +4069,120 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return LoopBB;
}
+static MachineBasicBlock *lowerWaveReduce(MachineInstr &MI,
+ MachineBasicBlock &BB,
+ const GCNSubtarget &ST,
+ unsigned Opc) {
+ MachineRegisterInfo &MRI = BB.getParent()->getRegInfo();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ // Reduction operations depend on whether the input operand is SGPR or VGPR.
+ Register SrcReg = MI.getOperand(1).getReg();
+ bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
+ Register DstReg = MI.getOperand(0).getReg();
+ MachineBasicBlock *RetBB = nullptr;
+ if (isSGPR) {
+ // These operations with a uniform value i.e. SGPR are idempotent.
+ // Reduced value will be same as given sgpr.
+ BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
+ RetBB = &BB;
+ } else {
+ // TODO: Implement DPP Strategy and switch based on immediate strategy
+ // operand. For now, for all the cases (default, Iterative and DPP we use
+ // iterative approach by default.)
+
+ // To reduce the VGPR using iterative approach, we need to iterate
+ // over all the active lanes. Lowering consists of ComputeLoop,
+ // which iterate over only active lanes. We use copy of EXEC register
+ // as induction variable and every active lane modifies it using bitset0
+ // so that we will get the next active lane for next iteration.
+ MachineBasicBlock::iterator I = BB.end();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ // Create Control flow for loop
+ // Split MI's Machine Basic block into For loop
+ auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
+
+ // Create virtual registers required for lowering.
+ const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
+ const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
+ Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register InitalValReg = MRI.createVirtualRegister(DstRegClass);
+
+ Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
+ Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+ Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
+
+ Register FF1Reg = MRI.createVirtualRegister(DstRegClass);
+ Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
+
+ bool IsWave32 = ST.isWave32();
+ unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+ // Create initail values of induction variable from Exec, Accumulator and
+ // insert branch instr to newly created ComputeBlockk
+ uint32_t InitalValue =
+ (Opc == AMDGPU::S_MIN_U32) ? std::numeric_limits<uint32_t>::max() : 0;
+ auto TmpSReg =
+ BuildMI(BB, I, DL, TII->get(MovOpc), LoopIterator).addReg(ExecReg);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_MOV_B32), InitalValReg)
+ .addImm(InitalValue);
+ BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH)).addMBB(ComputeLoop);
+
+ // Start constructing ComputeLoop
+ I = ComputeLoop->end();
+ auto Accumulator =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
+ .addReg(InitalValReg)
+ .addMBB(&BB);
+ auto ActiveBits =
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
+ .addReg(TmpSReg->getOperand(0).getReg())
+ .addMBB(&BB);
+
+ // Perform the computations
+ unsigned SFFOpc = IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
+ auto FF1 = BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
+ .addReg(ActiveBits->getOperand(0).getReg());
+ auto LaneValue = BuildMI(*ComputeLoop, I, DL,
+ TII->get(AMDGPU::V_READLANE_B32), LaneValueReg)
+ .addReg(SrcReg)
+ .addReg(FF1->getOperand(0).getReg());
+ auto NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
+ .addReg(Accumulator->getOperand(0).getReg())
+ .addReg(LaneValue->getOperand(0).getReg());
+
+ // Manipulate the iterator to get the next active lane
+ unsigned BITSETOpc =
+ IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
+ auto NewActiveBits =
+ BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
+ .addReg(FF1->getOperand(0).getReg())
+ .addReg(ActiveBits->getOperand(0).getReg());
+
+ // Add phi nodes
+ Accumulator.addReg(NewAccumulator->getOperand(0).getReg())
+ .addMBB(ComputeLoop);
+ ActiveBits.addReg(NewActiveBits->getOperand(0).getReg())
+ .addMBB(ComputeLoop);
+
+ // Creating branching
+ unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
+ BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
+ .addReg(NewActiveBits->getOperand(0).getReg())
+ .addImm(0);
+ BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+ .addMBB(ComputeLoop);
+
+ RetBB = ComputeEnd;
+ }
+ MI.eraseFromParent();
+ return RetBB;
+}
+
MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
@@ -4008,6 +4191,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
switch (MI.getOpcode()) {
+ case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
+ case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
+ return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
case AMDGPU::S_UADDO_PSEUDO:
case AMDGPU::S_USUBO_PSEUDO: {
const DebugLoc &DL = MI.getDebugLoc();
@@ -4460,15 +4647,54 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
return BB;
}
+ case AMDGPU::S_INVERSE_BALLOT_U32:
+ case AMDGPU::S_INVERSE_BALLOT_U64: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const Register DstReg = MI.getOperand(0).getReg();
+ Register MaskReg = MI.getOperand(1).getReg();
+
+ const bool IsVALU = TRI->isVectorRegister(MRI, MaskReg);
+
+ if (IsVALU) {
+ MaskReg = TII->readlaneVGPRToSGPR(MaskReg, MI, MRI);
+ }
+
+ BuildMI(*BB, &MI, DL, TII->get(AMDGPU::COPY), DstReg).addReg(MaskReg);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::ENDPGM_TRAP: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
+ MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
+ MI.addOperand(MachineOperand::CreateImm(0));
+ return BB;
+ }
+
+ // We need a block split to make the real endpgm a terminator. We also don't
+ // want to break phis in successor blocks, so we can't just delete to the
+ // end of the block.
+
+ MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
+ MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
+ MF->push_back(TrapBB);
+ BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
+ .addImm(0);
+ BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
+ .addMBB(TrapBB);
+
+ BB->addSuccessor(TrapBB);
+ MI.eraseFromParent();
+ return SplitBB;
+ }
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
}
-bool SITargetLowering::hasBitPreservingFPLogic(EVT VT) const {
- return isTypeLegal(VT.getScalarType());
-}
-
bool SITargetLowering::hasAtomicFaddRtnForTy(SDValue &Op) const {
switch (Op.getValue(0).getSimpleValueType().SimpleTy) {
case MVT::f32:
@@ -4542,7 +4768,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
// Otherwise f32 mad is always full rate and returns the same result as
// the separate operations so should be preferred over fma.
// However does not support denormals.
- if (hasFP32Denormals(MF))
+ if (!denormalModeIsFlushAllF32(MF))
return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
// If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
@@ -4551,7 +4777,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
case MVT::f64:
return true;
case MVT::f16:
- return Subtarget->has16BitInsts() && hasFP64FP16Denormals(MF);
+ return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
default:
break;
}
@@ -4580,9 +4806,10 @@ bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
return false;
if (Ty.getScalarSizeInBits() == 16)
- return Subtarget->hasMadF16() && !hasFP64FP16Denormals(*MI.getMF());
+ return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
if (Ty.getScalarSizeInBits() == 32)
- return Subtarget->hasMadMacF32Insts() && !hasFP32Denormals(*MI.getMF());
+ return Subtarget->hasMadMacF32Insts() &&
+ denormalModeIsFlushAllF32(*MI.getMF());
return false;
}
@@ -4594,10 +4821,10 @@ bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
EVT VT = N->getValueType(0);
if (VT == MVT::f32)
return Subtarget->hasMadMacF32Insts() &&
- !hasFP32Denormals(DAG.getMachineFunction());
+ denormalModeIsFlushAllF32(DAG.getMachineFunction());
if (VT == MVT::f16) {
return Subtarget->hasMadF16() &&
- !hasFP64FP16Denormals(DAG.getMachineFunction());
+ denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
}
return false;
@@ -4613,7 +4840,10 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4f16 || VT == MVT::v4i16);
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
+ VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
+ VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
+ VT == MVT::v32f32);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4696,12 +4926,16 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
"Load should return a value and a chain");
return Result;
}
-
+ case ISD::FSQRT:
+ if (Op.getValueType() == MVT::f64)
+ return lowerFSQRTF64(Op, DAG);
+ return SDValue();
case ISD::FSIN:
case ISD::FCOS:
return LowerTrig(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::FDIV: return LowerFDIV(Op, DAG);
+ case ISD::FFREXP: return LowerFFREXP(Op, DAG);
case ISD::ATOMIC_CMP_SWAP: return LowerATOMIC_CMP_SWAP(Op, DAG);
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::GlobalAddress: {
@@ -4726,6 +4960,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BUILD_VECTOR:
return lowerBUILD_VECTOR(Op, DAG);
case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND:
return lowerFP_ROUND(Op, DAG);
case ISD::FPTRUNC_ROUND: {
unsigned Opc;
@@ -4757,6 +4992,9 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMINNUM:
case ISD::FMAXNUM:
return lowerFMINNUM_FMAXNUM(Op, DAG);
+ case ISD::FLDEXP:
+ case ISD::STRICT_FLDEXP:
+ return lowerFLDEXP(Op, DAG);
case ISD::FMA:
return splitTernaryVectorOp(Op, DAG);
case ISD::FP_TO_SINT:
@@ -5038,6 +5276,9 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::INTRINSIC_WO_CHAIN: {
unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IID) {
+ case Intrinsic::amdgcn_make_buffer_rsrc:
+ Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
+ return;
case Intrinsic::amdgcn_cvt_pkrtz: {
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
@@ -5142,6 +5383,7 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
default:
+ AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
break;
}
}
@@ -5349,6 +5591,10 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
if (SrcVT != MVT::f64)
return Op;
+ // TODO: Handle strictfp
+ if (Op.getOpcode() != ISD::FP_ROUND)
+ return Op;
+
SDLoc DL(Op);
SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
@@ -5375,6 +5621,40 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
return Op;
}
+SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::f16);
+
+ SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
+ EVT ExpVT = Exp.getValueType();
+ if (ExpVT == MVT::i16)
+ return Op;
+
+ SDLoc DL(Op);
+
+ // Correct the exponent type for f16 to i16.
+ // Clamp the range of the exponent to the instruction's range.
+
+ // TODO: This should be a generic narrowing legalization, and can easily be
+ // for GlobalISel.
+
+ SDValue MinExp = DAG.getConstant(minIntN(16), DL, ExpVT);
+ SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
+
+ SDValue MaxExp = DAG.getConstant(maxIntN(16), DL, ExpVT);
+ SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
+
+ SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
+
+ if (IsStrict) {
+ return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
+ {Op.getOperand(0), Op.getOperand(1), TruncExp});
+ }
+
+ return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
+}
+
SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc SL(Op);
@@ -5431,26 +5711,20 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
return lowerTrapEndpgm(Op, DAG);
- if (std::optional<uint8_t> HsaAbiVer = AMDGPU::getHsaAbiVersion(Subtarget)) {
- switch (*HsaAbiVer) {
- case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
- case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
- return lowerTrapHsaQueuePtr(Op, DAG);
- case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
- case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
- return Subtarget->supportsGetDoorbellID() ?
- lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG);
- }
- }
+ const Module *M = DAG.getMachineFunction().getFunction().getParent();
+ unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
+ if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
+ return lowerTrapHsaQueuePtr(Op, DAG);
- llvm_unreachable("Unknown trap handler");
+ return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
+ lowerTrapHsaQueuePtr(Op, DAG);
}
SDValue SITargetLowering::lowerTrapEndpgm(
SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
- return DAG.getNode(AMDGPUISD::ENDPGM, SL, MVT::Other, Chain);
+ return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
}
SDValue SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
@@ -5471,7 +5745,8 @@ SDValue SITargetLowering::lowerTrapHsaQueuePtr(
SDValue QueuePtr;
// For code object version 5, QueuePtr is passed through implicit kernarg.
- if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+ const Module *M = DAG.getMachineFunction().getFunction().getParent();
+ if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
QueuePtr =
loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
} else {
@@ -5574,7 +5849,8 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// For code object version 5, private_base and shared_base are passed through
// implicit kernargs.
- if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+ const Module *M = DAG.getMachineFunction().getFunction().getParent();
+ if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
ImplicitParameter Param =
(AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
@@ -5721,6 +5997,35 @@ SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
SDLoc SL(Op);
+ if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
+ // Insert 32-bit registers at a time.
+ assert(InsNumElts % 2 == 0 && "expect legal vector types");
+
+ unsigned VecNumElts = VecVT.getVectorNumElements();
+ EVT NewVecVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
+ EVT NewInsVT = InsNumElts == 2 ? MVT::i32
+ : EVT::getVectorVT(*DAG.getContext(),
+ MVT::i32, InsNumElts / 2);
+
+ Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
+ Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
+
+ for (unsigned I = 0; I != InsNumElts / 2; ++I) {
+ SDValue Elt;
+ if (InsNumElts == 2) {
+ Elt = Ins;
+ } else {
+ Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
+ DAG.getConstant(I, SL, MVT::i32));
+ }
+ Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
+ DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
+ }
+
+ return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
+ }
+
for (unsigned I = 0; I != InsNumElts; ++I) {
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
DAG.getConstant(I, SL, MVT::i32));
@@ -6130,7 +6435,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
if (DAG.getDataLayout().getTypeAllocSize(Ty).isZero()) {
assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
// Adjust alignment for that dynamic shared memory array.
- MFI->setDynLDSAlign(DAG.getDataLayout(), *cast<GlobalVariable>(GV));
+ Function &F = DAG.getMachineFunction().getFunction();
+ MFI->setDynLDSAlign(F, *cast<GlobalVariable>(GV));
return SDValue(
DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
}
@@ -6572,15 +6878,24 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
//
- // TODO: we can actually allow partial NSA where the final register is a
- // contiguous set of the remaining addresses.
- // This could help where there are more addresses than supported.
- bool UseNSA = ST->hasFeature(AMDGPU::FeatureNSAEncoding) &&
- VAddrs.size() >= (unsigned)ST->getNSAThreshold(MF) &&
- VAddrs.size() <= (unsigned)ST->getNSAMaxSize();
+ // Partial NSA is allowed on GFX11 where the final register is a contiguous
+ // set of the remaining addresses.
+ const unsigned NSAMaxSize = ST->getNSAMaxSize();
+ const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
+ const bool UseNSA = ST->hasNSAEncoding() &&
+ VAddrs.size() >= ST->getNSAThreshold(MF) &&
+ (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
+ const bool UsePartialNSA =
+ UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
+
SDValue VAddr;
- if (!UseNSA)
+ if (UsePartialNSA) {
+ VAddr = getBuildDwordsVector(DAG, DL,
+ ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
+ }
+ else if (!UseNSA) {
VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
+ }
SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
@@ -6648,7 +6963,11 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<SDValue, 26> Ops;
if (BaseOpcode->Store || BaseOpcode->Atomic)
Ops.push_back(VData); // vdata
- if (UseNSA)
+ if (UsePartialNSA) {
+ append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
+ Ops.push_back(VAddr);
+ }
+ else if (UseNSA)
append_range(Ops, VAddrs);
else
Ops.push_back(VAddr);
@@ -6696,7 +7015,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
NumVDataDwords, NumVAddrDwords);
if (Opcode == -1)
- return makeV_ILLEGAL(Op, DAG);
+ report_fatal_error(
+ "requested image instruction is not supported on this GPU");
}
if (Opcode == -1 &&
Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
@@ -6706,7 +7026,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
NumVDataDwords, NumVAddrDwords);
}
- assert(Opcode != -1);
+ if (Opcode == -1)
+ return Op;
MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
if (auto MemOp = dyn_cast<MemSDNode>(Op)) {
@@ -7021,8 +7342,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitRemovedIntrinsicError(DAG, DL, VT);
}
case Intrinsic::amdgcn_ldexp:
- return DAG.getNode(AMDGPUISD::LDEXP, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
+ return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_fract:
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
@@ -7170,52 +7490,27 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
-/// Update \p MMO based on the offset inputs to an intrinsic.
-static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset,
- SDValue SOffset, SDValue Offset,
- SDValue VIndex = SDValue()) {
- if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
- !isa<ConstantSDNode>(Offset)) {
- // The combined offset is not known to be constant, so we cannot represent
- // it in the MMO. Give up.
- MMO->setValue((Value *)nullptr);
- return;
- }
-
- if (VIndex && (!isa<ConstantSDNode>(VIndex) ||
- !cast<ConstantSDNode>(VIndex)->isZero())) {
- // The strided index component of the address is not known to be zero, so we
- // cannot represent it in the MMO. Give up.
- MMO->setValue((Value *)nullptr);
- return;
- }
-
- MMO->setOffset(cast<ConstantSDNode>(VOffset)->getSExtValue() +
- cast<ConstantSDNode>(SOffset)->getSExtValue() +
- cast<ConstantSDNode>(Offset)->getSExtValue());
-}
-
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SelectionDAG &DAG,
unsigned NewOpcode) const {
SDLoc DL(Op);
SDValue VData = Op.getOperand(2);
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- VData, // vdata
- Op.getOperand(3), // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- Offsets.first, // voffset
- Op.getOperand(5), // soffset
- Offsets.second, // offset
- Op.getOperand(6), // cachepolicy
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(0), // Chain
+ VData, // vdata
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
auto *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
EVT MemVT = VData.getValueType();
return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
@@ -7224,10 +7519,8 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
// Return a value to use for the idxen operand by examining the vindex operand.
static unsigned getIdxEn(SDValue VIndex) {
- if (auto VIndexC = dyn_cast<ConstantSDNode>(VIndex))
- // No need to set idxen if vindex is known to be zero.
- return VIndexC->getZExtValue() != 0;
- return 1;
+ // No need to set idxen if vindex is known to be zero.
+ return isNullConstant(VIndex) ? 0 : 1;
}
SDValue
@@ -7236,21 +7529,21 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
SDLoc DL(Op);
SDValue VData = Op.getOperand(2);
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- VData, // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- Offsets.first, // voffset
- Op.getOperand(6), // soffset
- Offsets.second, // offset
- Op.getOperand(7), // cachepolicy
- DAG.getTargetConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(0), // Chain
+ VData, // vdata
+ Rsrc, // rsrc
+ Op.getOperand(4), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(6), // soffset
+ Offsets.second, // offset
+ Op.getOperand(7), // cachepolicy
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
auto *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
EVT MemVT = VData.getValueType();
return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
@@ -7330,19 +7623,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getOperand(0), M->getOperand(2), M->getOperand(3),
M->getMemOperand());
}
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax: {
MemSDNode *M = cast<MemSDNode>(Op);
unsigned Opc;
switch (IntrID) {
- case Intrinsic::amdgcn_atomic_inc:
- Opc = AMDGPUISD::ATOMIC_INC;
- break;
- case Intrinsic::amdgcn_atomic_dec:
- Opc = AMDGPUISD::ATOMIC_DEC;
- break;
case Intrinsic::amdgcn_ds_fmin:
Opc = AMDGPUISD::ATOMIC_LOAD_FMIN;
break;
@@ -7384,7 +7669,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
EVT LoadVT = Op.getValueType();
if (LoadVT.getScalarType() == MVT::f16)
@@ -7400,43 +7684,50 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getMemOperand(), DAG);
}
case Intrinsic::amdgcn_raw_buffer_load:
- case Intrinsic::amdgcn_raw_buffer_load_format: {
- const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format;
+ case Intrinsic::amdgcn_raw_ptr_buffer_load:
+ case Intrinsic::amdgcn_raw_buffer_load_format:
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
+ const bool IsFormat =
+ IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
+ IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- Offsets.first, // voffset
- Op.getOperand(4), // soffset
- Offsets.second, // offset
- Op.getOperand(5), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(0), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(4), // soffset
+ Offsets.second, // offset
+ Op.getOperand(5), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
auto *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5]);
return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_struct_buffer_load:
- case Intrinsic::amdgcn_struct_buffer_load_format: {
- const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format;
+ case Intrinsic::amdgcn_struct_ptr_buffer_load:
+ case Intrinsic::amdgcn_struct_buffer_load_format:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_format: {
+ const bool IsFormat =
+ IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
+ IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Offsets.first, // voffset
- Op.getOperand(5), // soffset
- Offsets.second, // offset
- Op.getOperand(6), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(0), // Chain
+ Rsrc, // rsrc
+ Op.getOperand(3), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
- auto *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[3], Ops[4], Ops[5], Ops[2]);
return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_tbuffer_load: {
@@ -7467,21 +7758,23 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
DAG);
}
- case Intrinsic::amdgcn_raw_tbuffer_load: {
+ case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- Offsets.first, // voffset
- Op.getOperand(4), // soffset
- Offsets.second, // offset
- Op.getOperand(5), // format
- Op.getOperand(6), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(0), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(4), // soffset
+ Offsets.second, // offset
+ Op.getOperand(5), // format
+ Op.getOperand(6), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -7491,21 +7784,23 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
DAG);
}
- case Intrinsic::amdgcn_struct_tbuffer_load: {
+ case Intrinsic::amdgcn_struct_tbuffer_load:
+ case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Offsets.first, // voffset
- Op.getOperand(5), // soffset
- Offsets.second, // offset
- Op.getOperand(6), // format
- Op.getOperand(7), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(0), // Chain
+ Rsrc, // rsrc
+ Op.getOperand(3), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // format
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -7545,7 +7840,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
unsigned Opcode = 0;
switch (IntrID) {
@@ -7593,69 +7887,99 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_SWAP);
case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_SMIN);
case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_UMIN);
case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_SMAX);
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_UMAX);
case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
@@ -7677,49 +8001,50 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
- case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: {
+ case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // src
- Op.getOperand(3), // cmp
- Op.getOperand(4), // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- Offsets.first, // voffset
- Op.getOperand(6), // soffset
- Offsets.second, // offset
- Op.getOperand(7), // cachepolicy
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // src
+ Op.getOperand(3), // cmp
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(6), // soffset
+ Offsets.second, // offset
+ Op.getOperand(7), // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
- case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: {
+ case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
+ SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // src
- Op.getOperand(3), // cmp
- Op.getOperand(4), // rsrc
- Op.getOperand(5), // vindex
- Offsets.first, // voffset
- Op.getOperand(7), // soffset
- Offsets.second, // offset
- Op.getOperand(8), // cachepolicy
- DAG.getTargetConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // src
+ Op.getOperand(3), // cmp
+ Rsrc, // rsrc
+ Op.getOperand(5), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(7), // soffset
+ Offsets.second, // offset
+ Op.getOperand(8), // cachepolicy
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[5], Ops[6], Ops[7], Ops[4]);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -7844,8 +8169,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
Ops.push_back(TDescr);
- if (IsA16)
- Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
+ Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
Ops.push_back(M->getChain());
auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
@@ -7853,11 +8177,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
DAG.setNodeMemRefs(NewNode, {MemRef});
return SDValue(NewNode, 0);
}
- case Intrinsic::amdgcn_global_atomic_fadd: {
- if (!Subtarget->hasAtomicFaddNoRtnInsts())
- return makeV_ILLEGAL(Op, DAG);
- return SDValue();
- }
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin:
@@ -8102,23 +8421,25 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
M->getMemoryVT(), M->getMemOperand());
}
- case Intrinsic::amdgcn_struct_tbuffer_store: {
+ case Intrinsic::amdgcn_struct_tbuffer_store:
+ case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
SDValue VData = Op.getOperand(2);
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
if (IsD16)
VData = handleD16VData(VData, DAG);
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
SDValue Ops[] = {
- Chain,
- VData, // vdata
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- Offsets.first, // voffset
- Op.getOperand(6), // soffset
- Offsets.second, // offset
- Op.getOperand(7), // format
- Op.getOperand(8), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(1, DL, MVT::i1), // idxen
+ Chain,
+ VData, // vdata
+ Rsrc, // rsrc
+ Op.getOperand(4), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(6), // soffset
+ Offsets.second, // offset
+ Op.getOperand(7), // format
+ Op.getOperand(8), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -8127,23 +8448,25 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
M->getMemoryVT(), M->getMemOperand());
}
- case Intrinsic::amdgcn_raw_tbuffer_store: {
+ case Intrinsic::amdgcn_raw_tbuffer_store:
+ case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
SDValue VData = Op.getOperand(2);
bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
if (IsD16)
VData = handleD16VData(VData, DAG);
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
- Chain,
- VData, // vdata
- Op.getOperand(3), // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- Offsets.first, // voffset
- Op.getOperand(5), // soffset
- Offsets.second, // offset
- Op.getOperand(6), // format
- Op.getOperand(7), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ Chain,
+ VData, // vdata
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // format
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -8178,7 +8501,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
@@ -8190,9 +8512,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_raw_buffer_store:
- case Intrinsic::amdgcn_raw_buffer_store_format: {
+ case Intrinsic::amdgcn_raw_ptr_buffer_store:
+ case Intrinsic::amdgcn_raw_buffer_store_format:
+ case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
const bool IsFormat =
- IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;
+ IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
+ IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
SDValue VData = Op.getOperand(2);
EVT VDataVT = VData.getValueType();
@@ -8209,23 +8534,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
}
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
- Chain,
- VData,
- Op.getOperand(3), // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- Offsets.first, // voffset
- Op.getOperand(5), // soffset
- Offsets.second, // offset
- Op.getOperand(6), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ Chain,
+ VData,
+ Rsrc,
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(5), // soffset
+ Offsets.second, // offset
+ Op.getOperand(6), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
unsigned Opc =
IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
@@ -8236,9 +8561,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
case Intrinsic::amdgcn_struct_buffer_store:
- case Intrinsic::amdgcn_struct_buffer_store_format: {
+ case Intrinsic::amdgcn_struct_ptr_buffer_store:
+ case Intrinsic::amdgcn_struct_buffer_store_format:
+ case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
const bool IsFormat =
- IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;
+ IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
+ IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
SDValue VData = Op.getOperand(2);
EVT VDataVT = VData.getValueType();
@@ -8256,23 +8584,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
}
+ auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
SDValue Ops[] = {
- Chain,
- VData,
- Op.getOperand(3), // rsrc
- Op.getOperand(4), // vindex
- Offsets.first, // voffset
- Op.getOperand(6), // soffset
- Offsets.second, // offset
- Op.getOperand(7), // cachepolicy, swizzled buffer
- DAG.getTargetConstant(1, DL, MVT::i1), // idxen
+ Chain,
+ VData,
+ Rsrc,
+ Op.getOperand(4), // vindex
+ Offsets.first, // voffset
+ Op.getOperand(6), // soffset
+ Offsets.second, // offset
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
- unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
- AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+ unsigned Opc =
+ !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
- updateBufferMMO(M->getMemOperand(), Ops[4], Ops[5], Ops[6], Ops[3]);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
@@ -8283,9 +8611,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
M->getMemoryVT(), M->getMemOperand());
}
case Intrinsic::amdgcn_raw_buffer_load_lds:
- case Intrinsic::amdgcn_struct_buffer_load_lds: {
+ case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_buffer_load_lds:
+ case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
unsigned Opc;
- bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds;
+ bool HasVIndex =
+ IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
+ IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
unsigned OpOffset = HasVIndex ? 1 : 0;
SDValue VOffset = Op.getOperand(5 + OpOffset);
auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
@@ -8328,7 +8660,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
else if (HasVOffset)
Ops.push_back(VOffset);
- Ops.push_back(Op.getOperand(2)); // rsrc
+ SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
+ Ops.push_back(Rsrc);
Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
@@ -8341,8 +8674,10 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto *M = cast<MemSDNode>(Op);
MachineMemOperand *LoadMMO = M->getMemOperand();
+ // Don't set the offset value here because the pointer points to the base of
+ // the buffer.
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
- LoadPtrI.Offset = Op->getConstantOperandVal(7 + OpOffset);
+
MachinePointerInfo StorePtrI = LoadPtrI;
StorePtrI.V = nullptr;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
@@ -8450,27 +8785,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
}
}
-SDValue SITargetLowering::makeV_ILLEGAL(SDValue Op, SelectionDAG & DAG) const {
- // Create the V_ILLEGAL node.
- SDLoc DL(Op);
- auto Opcode = Subtarget->getGeneration() < AMDGPUSubtarget::GFX10 ?
- AMDGPU::V_ILLEGAL_gfx6_gfx7_gfx8_gfx9 : AMDGPU::V_ILLEGAL;
- auto EntryNode = DAG.getEntryNode();
- auto IllegalNode = DAG.getMachineNode(Opcode, DL, MVT::Other, EntryNode);
- auto IllegalVal = SDValue(IllegalNode, 0u);
-
- // Add the V_ILLEGAL node to the root chain to prevent its removal.
- auto Chains = SmallVector<SDValue, 2u>();
- Chains.push_back(IllegalVal);
- Chains.push_back(DAG.getRoot());
- auto Root = DAG.getTokenFactor(SDLoc(Chains.back()), Chains);
- DAG.setRoot(Root);
-
- // Merge with UNDEF to satisfy return value requirements.
- auto UndefVal = DAG.getUNDEF(Op.getValueType());
- return DAG.getMergeValues({UndefVal, IllegalVal}, DL);
-}
-
// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
// offset (the offset that is included in bounds checking and swizzling, to be
// split between the instruction's voffset and immoffset fields) and soffset
@@ -8480,7 +8794,7 @@ SDValue SITargetLowering::makeV_ILLEGAL(SDValue Op, SelectionDAG & DAG) const {
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
SDValue Offset, SelectionDAG &DAG) const {
SDLoc DL(Offset);
- const unsigned MaxImm = 4095;
+ const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
SDValue N0 = Offset;
ConstantSDNode *C1 = nullptr;
@@ -8493,13 +8807,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
if (C1) {
unsigned ImmOffset = C1->getZExtValue();
- // If the immediate value is too big for the immoffset field, put the value
- // and -4096 into the immoffset field so that the value that is copied/added
- // for the voffset field is a multiple of 4096, and it stands more chance
- // of being CSEd with the copy/add for another similar load/store.
- // However, do not do that rounding down to a multiple of 4096 if that is a
- // negative number, as it appears to be illegal to have a negative offset
- // in the vgpr, even if adding the immediate offset makes it positive.
+ // If the immediate value is too big for the immoffset field, put only bits
+ // that would normally fit in the immoffset field. The remaining value that
+ // is copied/added for the voffset field is a large power of 2, and it
+ // stands more chance of being CSEd with the copy/add for another similar
+ // load/store.
+ // However, do not do that rounding down if that is a negative
+ // number, as it appears to be illegal to have a negative offset in the
+ // vgpr, even if adding the immediate offset makes it positive.
unsigned Overflow = ImmOffset & ~MaxImm;
ImmOffset -= Overflow;
if ((int32_t)Overflow < 0) {
@@ -8530,12 +8845,12 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
SelectionDAG &DAG, SDValue *Offsets,
Align Alignment) const {
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
SDLoc DL(CombinedOffset);
- if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
+ if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
uint32_t Imm = C->getZExtValue();
uint32_t SOffset, ImmOffset;
- if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget,
- Alignment)) {
+ if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
@@ -8547,8 +8862,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
SDValue N1 = CombinedOffset.getOperand(1);
uint32_t SOffset, ImmOffset;
int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
- if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
- Subtarget, Alignment)) {
+ if (Offset >= 0 &&
+ TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
@@ -8560,6 +8875,55 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}
+SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
+ SelectionDAG &DAG) const {
+ if (!MaybePointer.getValueType().isScalarInteger())
+ return MaybePointer;
+
+ SDLoc DL(MaybePointer);
+
+ SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
+ return Rsrc;
+}
+
+// Wrap a global or flat pointer into a buffer intrinsic using the flags
+// specified in the intrinsic.
+SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
+ SelectionDAG &DAG) const {
+ SDLoc Loc(Op);
+
+ SDValue Pointer = Op->getOperand(1);
+ SDValue Stride = Op->getOperand(2);
+ SDValue NumRecords = Op->getOperand(3);
+ SDValue Flags = Op->getOperand(4);
+
+ auto [LowHalf, HighHalf] = DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
+ SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
+ SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
+ std::optional<uint32_t> ConstStride = std::nullopt;
+ if (auto *ConstNode = dyn_cast<ConstantSDNode>(Stride))
+ ConstStride = ConstNode->getZExtValue();
+
+ SDValue NewHighHalf = Masked;
+ if (!ConstStride || *ConstStride != 0) {
+ SDValue ShiftedStride;
+ if (ConstStride) {
+ ShiftedStride = DAG.getConstant(*ConstStride << 16, Loc, MVT::i32);
+ } else {
+ SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
+ ShiftedStride =
+ DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
+ DAG.getShiftAmountConstant(16, MVT::i32, Loc));
+ }
+ NewHighHalf = DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
+ }
+
+ SDValue Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf,
+ NewHighHalf, NumRecords, Flags);
+ SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
+ return RsrcPtr;
+}
+
// Handle 8 bit and 16 bit buffer loads
SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
EVT LoadVT, SDLoc DL,
@@ -8683,6 +9047,14 @@ SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const
return DAG.getMergeValues({ Cvt, NewLoad.getValue(1) }, SL);
}
+static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
+ const SIMachineFunctionInfo &Info) {
+ // TODO: Should check if the address can definitely not access stack.
+ if (Info.isEntryFunction())
+ return Info.hasFlatScratchInit();
+ return true;
+}
+
SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
LoadSDNode *Load = cast<LoadSDNode>(Op);
@@ -8749,7 +9121,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// then we need to use the same legalization rules we use for private.
if (AS == AMDGPUAS::FLAT_ADDRESS &&
!Subtarget->hasMultiDwordFlatScratchAddressing())
- AS = MFI->hasFlatScratchInit() ?
+ AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI) ?
AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
unsigned NumElements = MemVT.getVectorNumElements();
@@ -8883,26 +9255,30 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
-
- // Without !fpmath accuracy information, we can't do more because we don't
- // know exactly whether rcp is accurate enough to meet !fpmath requirement.
- if (!AllowInaccurateRcp)
- return SDValue();
+ bool AllowInaccurateRcp = Flags.hasApproximateFuncs() ||
+ DAG.getTarget().Options.UnsafeFPMath;
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
+ // Without !fpmath accuracy information, we can't do more because we don't
+ // know exactly whether rcp is accurate enough to meet !fpmath requirement.
+ // f16 is always accurate enough
+ if (!AllowInaccurateRcp && VT != MVT::f16)
+ return SDValue();
+
if (CLHS->isExactlyValue(1.0)) {
// v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
// the CI documentation has a worst case error of 1 ulp.
// OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
// use it as long as we aren't trying to use denormals.
//
- // v_rcp_f16 and v_rsq_f16 DO support denormals.
+ // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
// 1.0 / sqrt(x) -> rsq(x)
// XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
// error seems really high at 2^29 ULP.
+
+ // XXX - do we need afn for this or is arcp sufficent?
if (RHS.getOpcode() == ISD::FSQRT)
return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
@@ -8918,6 +9294,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}
+ // For f16 require arcp only.
+ // For f32 require afn+arcp.
+ if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
+ return SDValue();
+
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
@@ -9017,16 +9398,17 @@ SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
// Faster 2.5 ULP division that does not support denormals.
SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
+ SDNodeFlags Flags = Op->getFlags();
SDLoc SL(Op);
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
- SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS);
+ SDValue r1 = DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
- const APFloat K0Val(BitsToFloat(0x6f800000));
+ const APFloat K0Val(0x1p+96f);
const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
- const APFloat K1Val(BitsToFloat(0x2f800000));
+ const APFloat K1Val(0x1p-32f);
const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
@@ -9036,30 +9418,27 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
- SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One);
+ SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
- // TODO: Should this propagate fast-math-flags?
- r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3);
+ r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
// rcp does not support denormals.
- SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1);
+ SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
- return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
+ return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
}
// Returns immediate value for setting the F32 denorm mode when using the
// S_DENORM_MODE instruction.
-static SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
- const SDLoc &SL, const GCNSubtarget *ST) {
+static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
+ const SIMachineFunctionInfo *Info,
+ const GCNSubtarget *ST) {
assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
- int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
- ? FP_DENORM_FLUSH_NONE
- : FP_DENORM_FLUSH_IN_FLUSH_OUT;
-
- int Mode = SPDenormMode | (DPDenormModeDefault << 2);
- return DAG.getTargetConstant(Mode, SL, MVT::i32);
+ uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
+ uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
+ return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
}
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
@@ -9097,7 +9476,11 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
(1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
- const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ const DenormalMode DenormMode = Info->getMode().FP32Denormals;
+
+ const bool HasFP32Denormals = DenormMode == DenormalMode::getIEEE();
if (!HasFP32Denormals) {
// Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
@@ -9109,7 +9492,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDNode *EnableDenorm;
if (Subtarget->hasDenormModeInst()) {
const SDValue EnableDenormValue =
- getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
+ getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
DAG.getEntryNode(), EnableDenormValue).getNode();
@@ -9149,10 +9532,13 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
NumeratorScaled, Fma3, Flags);
if (!HasFP32Denormals) {
+ // FIXME: This mishandles dynamic denormal mode. We need to query the
+ // current mode and restore the original.
+
SDNode *DisableDenorm;
if (Subtarget->hasDenormModeInst()) {
- const SDValue DisableDenormValue =
- getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
+ const SDValue DisableDenormValue = getSPDenormModeValue(
+ FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
Fma4.getValue(1), DisableDenormValue,
@@ -9260,6 +9646,36 @@ SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Unexpected type for fdiv");
}
+SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT VT = Val.getValueType();
+ EVT ResultExpVT = Op->getValueType(1);
+ EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
+
+ SDValue Mant = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, VT,
+ DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
+
+ SDValue Exp = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
+ DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
+
+ if (Subtarget->hasFractBug()) {
+ SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
+ SDValue Inf = DAG.getConstantFP(
+ APFloat::getInf(SelectionDAG::EVTToAPFloatSemantics(VT)), dl, VT);
+
+ SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
+ SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
+ Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
+ Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
+ }
+
+ SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
+ return DAG.getMergeValues({Mant, CastExp}, dl);
+}
+
SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
StoreSDNode *Store = cast<StoreSDNode>(Op);
@@ -9287,7 +9703,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
// then we need to use the same legalization rules we use for private.
if (AS == AMDGPUAS::FLAT_ADDRESS &&
!Subtarget->hasMultiDwordFlatScratchAddressing())
- AS = MFI->hasFlatScratchInit() ?
+ AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI) ?
AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
unsigned NumElements = VT.getVectorNumElements();
@@ -9338,6 +9754,87 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
+ // For double type, the SQRT and RSQ instructions don't have required
+ // precision, we apply Goldschmidt's algorithm to improve the result:
+ //
+ // y0 = rsq(x)
+ // g0 = x * y0
+ // h0 = 0.5 * y0
+ //
+ // r0 = 0.5 - h0 * g0
+ // g1 = g0 * r0 + g0
+ // h1 = h0 * r0 + h0
+ //
+ // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
+ // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
+ // h2 = h1 * r1 + h1
+ //
+ // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
+ // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
+ //
+ // sqrt(x) = g3
+
+ SDNodeFlags Flags = Op->getFlags();
+
+ SDLoc DL(Op);
+
+ SDValue X = Op.getOperand(0);
+ SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
+
+ SDValue Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
+
+ SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
+
+ // Scale up input if it is too small.
+ SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
+ SDValue ScaleUp =
+ DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
+ SDValue SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
+
+ SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
+
+ SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
+
+ SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
+ SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
+
+ SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
+ SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
+
+ SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
+
+ SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
+
+ SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
+ SDValue SqrtD0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
+
+ SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
+
+ SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
+ SDValue SqrtD1 =
+ DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
+
+ SDValue SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
+
+ SDValue ScaleDownFactor = DAG.getConstant(-128, DL, MVT::i32);
+ SDValue ScaleDown =
+ DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleDownFactor, ZeroInt);
+ SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
+
+ // TODO: Switch to fcmp oeq 0 for finite only. Can't fully remove this check
+ // with finite only or nsz because rsq(+/-0) = +/-inf
+
+ // TODO: Check for DAZ and expand to subnormals
+ SDValue IsZeroOrInf =
+ DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
+ DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
+
+ // If x is +INF, +0, or -0, use its original value
+ return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
+ Flags);
+}
+
SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
EVT VT = Op.getValueType();
@@ -9432,7 +9929,53 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue MagnitudeOp = N->getOperand(0);
+ SDValue SignOp = N->getOperand(1);
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc DL(N);
+
+ // f64 fcopysign is really an f32 copysign on the high bits, so replace the
+ // lower half with a copy.
+ // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
+ if (MagnitudeOp.getValueType() == MVT::f64) {
+ SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, MagnitudeOp);
+ SDValue MagLo =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+ DAG.getConstant(0, DL, MVT::i32));
+ SDValue MagHi =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
+ DAG.getConstant(1, DL, MVT::i32));
+
+ SDValue HiOp =
+ DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOp);
+
+ SDValue Vector = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
+
+ return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
+ }
+
+ if (SignOp.getValueType() != MVT::f64)
+ return SDValue();
+
+ // Reduce width of sign operand, we only need the highest bit.
+ //
+ // fcopysign f64:x, f64:y ->
+ // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
+ // TODO: In some cases it might make sense to go all the way to f16.
+ SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
+ SDValue SignAsF32 =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
+ DAG.getConstant(1, DL, MVT::i32));
+
+ return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
+ SignAsF32);
+}
+
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
+// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
+// bits
// This is a variant of
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
@@ -9467,8 +10010,14 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
if (!CAdd)
return SDValue();
- // If the resulting offset is too large, we can't fold it into the addressing
- // mode offset.
+ SelectionDAG &DAG = DCI.DAG;
+
+ if (N0->getOpcode() == ISD::OR &&
+ !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
+ return SDValue();
+
+ // If the resulting offset is too large, we can't fold it into the
+ // addressing mode offset.
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
@@ -9478,7 +10027,6 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
return SDValue();
- SelectionDAG &DAG = DCI.DAG;
SDLoc SL(N);
EVT VT = N->getValueType(0);
@@ -9604,7 +10152,7 @@ static uint32_t getConstantPermuteMask(uint32_t C) {
// value 0-3 selects corresponding source byte;
// value 0xc selects zero;
// value 0xff selects 0xff.
-static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
+static uint32_t getPermuteMask(SDValue V) {
assert(V.getValueSizeInBits() == 32);
if (V.getNumOperands() != 2)
@@ -9620,15 +10168,13 @@ static uint32_t getPermuteMask(SelectionDAG &DAG, SDValue V) {
default:
break;
case ISD::AND:
- if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ if (uint32_t ConstMask = getConstantPermuteMask(C))
return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
- }
break;
case ISD::OR:
- if (uint32_t ConstMask = getConstantPermuteMask(C)) {
+ if (uint32_t ConstMask = getConstantPermuteMask(C))
return (0x03020100 & ~ConstMask) | ConstMask;
- }
break;
case ISD::SHL:
@@ -9676,7 +10222,7 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
(Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
unsigned Shift = CShift->getZExtValue();
- unsigned NB = CRHS->getAPIntValue().countTrailingZeros();
+ unsigned NB = CRHS->getAPIntValue().countr_zero();
unsigned Offset = NB + Shift;
if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
SDLoc SL(N);
@@ -9787,8 +10333,8 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
- uint32_t LHSMask = getPermuteMask(DAG, LHS);
- uint32_t RHSMask = getPermuteMask(DAG, RHS);
+ uint32_t LHSMask = getPermuteMask(LHS);
+ uint32_t RHSMask = getPermuteMask(RHS);
if (LHSMask != ~0u && RHSMask != ~0u) {
// Canonicalize the expression in an attempt to have fewer unique masks
// and therefore fewer registers used to hold the masks.
@@ -9834,6 +10380,325 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
return SDValue();
}
+// A key component of v_perm is a mapping between byte position of the src
+// operands, and the byte position of the dest. To provide such, we need: 1. the
+// node that provides x byte of the dest of the OR, and 2. the byte of the node
+// used to provide that x byte. calculateByteProvider finds which node provides
+// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
+// and finds an ultimate src and byte position For example: The supported
+// LoadCombine pattern for vector loads is as follows
+// t1
+// or
+// / \
+// t2 t3
+// zext shl
+// | | \
+// t4 t5 16
+// or anyext
+// / \ |
+// t6 t7 t8
+// srl shl or
+// / | / \ / \
+// t9 t10 t11 t12 t13 t14
+// trunc* 8 trunc* 8 and and
+// | | / | | \
+// t15 t16 t17 t18 t19 t20
+// trunc* 255 srl -256
+// | / \
+// t15 t15 16
+//
+// *In this example, the truncs are from i32->i16
+//
+// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
+// respectively. calculateSrcByte would find (given node) -> ultimate src &
+// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
+// After finding the mapping, we can combine the tree into vperm t15, t16,
+// 0x05000407
+
+// Find the source and byte position from a node.
+// \p DestByte is the byte position of the dest of the or that the src
+// ultimately provides. \p SrcIndex is the byte of the src that maps to this
+// dest of the or byte. \p Depth tracks how many recursive iterations we have
+// performed.
+static const std::optional<ByteProvider<SDValue>>
+calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
+ unsigned Depth = 0) {
+ // We may need to recursively traverse a series of SRLs
+ if (Depth >= 6)
+ return std::nullopt;
+
+ switch (Op->getOpcode()) {
+ case ISD::TRUNCATE: {
+ if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
+ return std::nullopt;
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ }
+
+ case ISD::SRL: {
+ auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!ShiftOp)
+ return std::nullopt;
+
+ uint64_t BitShift = ShiftOp->getZExtValue();
+
+ if (BitShift % 8 != 0)
+ return std::nullopt;
+
+ SrcIndex += BitShift / 8;
+
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ }
+
+ default: {
+ if (Op.getScalarValueSizeInBits() != 32)
+ return std::nullopt;
+
+ return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
+ }
+ }
+ llvm_unreachable("fully handled switch");
+}
+
+// For a byte position in the result of an Or, traverse the tree and find the
+// node (and the byte of the node) which ultimately provides this {Or,
+// BytePosition}. \p Op is the operand we are currently examining. \p Index is
+// the byte position of the Op that corresponds with the originally requested
+// byte of the Or \p Depth tracks how many recursive iterations we have
+// performed. \p StartingIndex is the originally requested byte of the Or
+static const std::optional<ByteProvider<SDValue>>
+calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
+ unsigned StartingIndex = 0) {
+ // Finding Src tree of RHS of or typically requires at least 1 additional
+ // depth
+ if (Depth > 6)
+ return std::nullopt;
+
+ unsigned BitWidth = Op.getScalarValueSizeInBits();
+ if (BitWidth % 8 != 0)
+ return std::nullopt;
+ assert(Index < BitWidth / 8 && "invalid index requested");
+
+ switch (Op.getOpcode()) {
+ case ISD::OR: {
+ auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
+ StartingIndex);
+ if (!RHS)
+ return std::nullopt;
+ auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
+ StartingIndex);
+ if (!LHS)
+ return std::nullopt;
+ // A well formed Or will have two ByteProviders for each byte, one of which
+ // is constant zero
+ if (!LHS->isConstantZero() && !RHS->isConstantZero())
+ return std::nullopt;
+ if (!LHS || LHS->isConstantZero())
+ return RHS;
+ if (!RHS || RHS->isConstantZero())
+ return LHS;
+ return std::nullopt;
+ }
+
+ case ISD::AND: {
+ auto BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!BitMaskOp)
+ return std::nullopt;
+
+ uint32_t BitMask = BitMaskOp->getZExtValue();
+ // Bits we expect for our StartingIndex
+ uint32_t IndexMask = 0xFF << (Index * 8);
+
+ if ((IndexMask & BitMask) != IndexMask) {
+ // If the result of the and partially provides the byte, then it
+ // is not well formatted
+ if (IndexMask & BitMask)
+ return std::nullopt;
+ return ByteProvider<SDValue>::getConstantZero();
+ }
+
+ return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
+ }
+
+ case ISD::SRL: {
+ auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!ShiftOp)
+ return std::nullopt;
+
+ uint64_t BitShift = ShiftOp->getZExtValue();
+ if (BitShift % 8)
+ return std::nullopt;
+
+ auto BitsProvided = Op.getScalarValueSizeInBits();
+ if (BitsProvided % 8 != 0)
+ return std::nullopt;
+
+ uint64_t BytesProvided = BitsProvided / 8;
+ uint64_t ByteShift = BitShift / 8;
+ // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
+ // If the byte we are trying to provide (as tracked by index) falls in this
+ // range, then the SRL provides the byte. The byte of interest of the src of
+ // the SRL is Index + ByteShift
+ return BytesProvided - ByteShift > Index
+ ? calculateSrcByte(Op->getOperand(0), StartingIndex,
+ Index + ByteShift)
+ : ByteProvider<SDValue>::getConstantZero();
+ }
+
+ case ISD::SHL: {
+ auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!ShiftOp)
+ return std::nullopt;
+
+ uint64_t BitShift = ShiftOp->getZExtValue();
+ if (BitShift % 8 != 0)
+ return std::nullopt;
+ uint64_t ByteShift = BitShift / 8;
+
+ // If we are shifting by an amount greater than (or equal to)
+ // the index we are trying to provide, then it provides 0s. If not,
+ // then this bytes are not definitively 0s, and the corresponding byte
+ // of interest is Index - ByteShift of the src
+ return Index < ByteShift
+ ? ByteProvider<SDValue>::getConstantZero()
+ : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
+ Depth + 1, StartingIndex);
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ SDValue NarrowOp = Op->getOperand(0);
+ unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+ if (NarrowBitWidth % 8 != 0)
+ return std::nullopt;
+ uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+ if (Index >= NarrowByteWidth)
+ return Op.getOpcode() == ISD::ZERO_EXTEND
+ ? std::optional<ByteProvider<SDValue>>(
+ ByteProvider<SDValue>::getConstantZero())
+ : std::nullopt;
+ return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
+ }
+
+ case ISD::TRUNCATE: {
+ unsigned NarrowBitWidth = Op.getScalarValueSizeInBits();
+ if (NarrowBitWidth % 8 != 0)
+ return std::nullopt;
+ uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+ if (NarrowByteWidth >= Index) {
+ return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
+ StartingIndex);
+ }
+
+ return std::nullopt;
+ }
+
+ case ISD::LOAD: {
+ auto L = cast<LoadSDNode>(Op.getNode());
+ unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
+ if (NarrowBitWidth % 8 != 0)
+ return std::nullopt;
+ uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+
+ // If the width of the load does not reach byte we are trying to provide for
+ // and it is not a ZEXTLOAD, then the load does not provide for the byte in
+ // question
+ if (Index >= NarrowByteWidth) {
+ return L->getExtensionType() == ISD::ZEXTLOAD
+ ? std::optional<ByteProvider<SDValue>>(
+ ByteProvider<SDValue>::getConstantZero())
+ : std::nullopt;
+ }
+
+ if (NarrowByteWidth > Index) {
+ return calculateSrcByte(Op, StartingIndex, Index);
+ }
+
+ return std::nullopt;
+ }
+
+ case ISD::BSWAP:
+ return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
+ Depth + 1, StartingIndex);
+ default: {
+ return std::nullopt;
+ }
+ }
+
+ llvm_unreachable("fully handled switch");
+}
+
+// Returns true if the Operand is a scalar and is 16 bits
+static bool is16BitScalarOp(SDValue &Operand) {
+ switch (Operand.getOpcode()) {
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ auto OpVT = Operand.getOperand(0).getValueType();
+ return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
+ }
+ case ISD::LOAD: {
+ LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
+ auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
+ if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
+ ExtType == ISD::EXTLOAD) {
+ auto MemVT = L->getMemoryVT();
+ return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
+ }
+ return false;
+ }
+ default:
+ return false;
+ }
+}
+
// Returns true when a 16-bit chunk of a v_perm mask selects two bytes that
// are consecutive in increasing address order and whose first byte sits at
// an even (16-bit-aligned) offset from byte 0.
static bool addresses16Bits(int Mask) {
  int LoSel = Mask & 0xff;
  int HiSel = (Mask & 0xff00) >> 8;

  assert(LoSel < 8 && HiSel < 8);

  // The two selected bytes must be adjacent and ascending, and the first one
  // must start at an aligned location for 16-bit instructions. A counter
  // example is taking 2 consecutive bytes starting at the 8th bit: code is
  // still needed to extract the 16-bit operand, so i8 v_perm is better.
  return (HiSel == LoSel + 1) && (LoSel % 2 == 0);
}
+
+// Do not lower into v_perm if the operands are actually 16 bit
+// and the selected bits (based on PermMask) correspond with two
+// easily addressable 16 bit operands.
+static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,
+ SDValue &OtherOp) {
+ int Low16 = PermMask & 0xffff;
+ int Hi16 = (PermMask & 0xffff0000) >> 16;
+
+ // ByteProvider only accepts 32 bit operands
+ assert(Op.getValueType().getSizeInBits() == 32);
+ assert(OtherOp.getValueType().getSizeInBits() == 32);
+
+ auto OpIs16Bit = is16BitScalarOp(Op);
+ auto OtherOpIs16Bit = is16BitScalarOp(Op);
+
+ // If there is a size mismatch, then we must use masking on at least one
+ // operand
+ if (OpIs16Bit != OtherOpIs16Bit)
+ return true;
+
+ // If both operands are 16 bit, return whether or not we cleanly address both
+ if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))
+ return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
+
+ // Both are 32 bit operands
+ return true;
+}
+
SDValue SITargetLowering::performOrCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -9884,8 +10749,36 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
- uint32_t LHSMask = getPermuteMask(DAG, LHS);
- uint32_t RHSMask = getPermuteMask(DAG, RHS);
+
+ // If all the uses of an or need to extract the individual elements, do not
+ // attempt to lower into v_perm
+ auto usesCombinedOperand = [](SDNode *OrUse) {
+ // If we have any non-vectorized use, then it is a candidate for v_perm
+ if (OrUse->getOpcode() != ISD::BITCAST ||
+ !OrUse->getValueType(0).isVector())
+ return true;
+
+ // If we have any non-vectorized use, then it is a candidate for v_perm
+ for (auto VUse : OrUse->uses()) {
+ if (!VUse->getValueType(0).isVector())
+ return true;
+
+ // If the use of a vector is a store, then combining via a v_perm
+ // is beneficial.
+ // TODO -- whitelist more uses
+ for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
+ if (VUse->getOpcode() == VectorwiseOp)
+ return true;
+ }
+ return false;
+ };
+
+ if (!any_of(N->uses(), usesCombinedOperand))
+ return SDValue();
+
+ uint32_t LHSMask = getPermuteMask(LHS);
+ uint32_t RHSMask = getPermuteMask(RHS);
+
if (LHSMask != ~0u && RHSMask != ~0u) {
// Canonicalize the expression in an attempt to have fewer unique masks
// and therefore fewer registers used to hold the masks.
@@ -9918,6 +10811,71 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
DAG.getConstant(Sel, DL, MVT::i32));
}
}
+ if (LHSMask == ~0u || RHSMask == ~0u) {
+ SmallVector<ByteProvider<SDValue>, 8> PermNodes;
+
+ // VT is known to be MVT::i32, so we need to provide 4 bytes.
+ assert(VT == MVT::i32);
+ for (int i = 0; i < 4; i++) {
+ // Find the ByteProvider that provides the ith byte of the result of OR
+ std::optional<ByteProvider<SDValue>> P =
+ calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
+ // TODO support constantZero
+ if (!P || P->isConstantZero())
+ return SDValue();
+
+ PermNodes.push_back(*P);
+ }
+ if (PermNodes.size() != 4)
+ return SDValue();
+
+ int FirstSrc = 0;
+ std::optional<int> SecondSrc;
+ uint64_t permMask = 0x00000000;
+ for (size_t i = 0; i < PermNodes.size(); i++) {
+ auto PermOp = PermNodes[i];
+ // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
+ // by sizeof(Src2) = 4
+ int SrcByteAdjust = 4;
+
+ if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
+ if (SecondSrc.has_value())
+ if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
+ return SDValue();
+ // Set the index of the second distinct Src node
+ SecondSrc = i;
+ assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==
+ 32);
+ SrcByteAdjust = 0;
+ }
+ assert(PermOp.SrcOffset + SrcByteAdjust < 8);
+ assert(!DAG.getDataLayout().isBigEndian());
+ permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
+ }
+
+ SDValue Op = *PermNodes[FirstSrc].Src;
+ SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
+ : *PermNodes[FirstSrc].Src;
+
+ // Check that we are not just extracting the bytes in order from an op
+ if (Op == OtherOp) {
+ int Low16 = permMask & 0xffff;
+ int Hi16 = (permMask & 0xffff0000) >> 16;
+
+ bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
+ bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
+
+ // The perm op would really just produce Op. So combine into Op
+ if (WellFormedLow && WellFormedHi)
+ return Op;
+ }
+
+ if (hasEightBitAccesses(permMask, Op, OtherOp)) {
+ SDLoc DL(N);
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
+ DAG.getConstant(permMask, DL, MVT::i32));
+ }
+ }
}
if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
@@ -9966,20 +10924,40 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
return RV;
- EVT VT = N->getValueType(0);
- if (VT != MVT::i64)
- return SDValue();
-
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
- if (CRHS) {
+ SelectionDAG &DAG = DCI.DAG;
+
+ EVT VT = N->getValueType(0);
+ if (CRHS && VT == MVT::i64) {
if (SDValue Split
= splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
return Split;
}
+ // Make sure to apply the 64-bit constant splitting fold before trying to fold
+ // fneg-like xors into 64-bit select.
+ if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
+ // This looks like an fneg, try to fold as a source modifier.
+ if (CRHS && CRHS->getAPIntValue().isSignMask() &&
+ shouldFoldFNegIntoSrc(N, LHS)) {
+ // xor (select c, a, b), 0x80000000 ->
+ // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
+ SDLoc DL(N);
+ SDValue CastLHS =
+ DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
+ SDValue CastRHS =
+ DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
+ SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
+ SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
+ SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
+ LHS->getOperand(0), FNegLHS, FNegRHS);
+ return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
+ }
+ }
+
return SDValue();
}
@@ -10086,10 +11064,15 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
return true;
if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
- auto F = CFP->getValueAPF();
+ const auto &F = CFP->getValueAPF();
if (F.isNaN() && F.isSignaling())
return false;
- return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType());
+ if (!F.isDenormal())
+ return true;
+
+ DenormalMode Mode =
+ DAG.getMachineFunction().getDenormalMode(F.getSemantics());
+ return Mode == DenormalMode::getIEEE();
}
// If source is a result of another standard FP operation it is already in
@@ -10111,6 +11094,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FREM:
case ISD::FP_ROUND:
case ISD::FP_EXTEND:
+ case ISD::FLDEXP:
case AMDGPUISD::FMUL_LEGACY:
case AMDGPUISD::FMAD_FTZ:
case AMDGPUISD::RCP:
@@ -10118,11 +11102,12 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case AMDGPUISD::RSQ_CLAMP:
case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::RCP_IFLAG:
+ case AMDGPUISD::LOG:
+ case AMDGPUISD::EXP:
case AMDGPUISD::DIV_SCALE:
case AMDGPUISD::DIV_FMAS:
case AMDGPUISD::DIV_FIXUP:
case AMDGPUISD::FRACT:
- case AMDGPUISD::LDEXP:
case AMDGPUISD::CVT_PKRTZ_F16_F32:
case AMDGPUISD::CVT_F32_UBYTE0:
case AMDGPUISD::CVT_F32_UBYTE1:
@@ -10156,6 +11141,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
// snans will be quieted, so we only need to worry about denormals.
if (Subtarget->supportsMinMaxDenormModes() ||
+ // FIXME: denormalsEnabledForType is broken for dynamic
denormalsEnabledForType(DAG, Op.getValueType()))
return true;
@@ -10225,6 +11211,8 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_trig_preop:
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_exp2:
return true;
default:
break;
@@ -10233,6 +11221,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
[[fallthrough]];
}
default:
+ // FIXME: denormalsEnabledForType is broken for dynamic
return denormalsEnabledForType(DAG, Op.getValueType()) &&
DAG.isKnownNeverSNaN(Op);
}
@@ -10254,8 +11243,11 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
if (FCR->Value.isSignaling())
return false;
- return !FCR->Value.isDenormal() ||
- denormalsEnabledForType(MRI.getType(FCR->VReg), MF);
+ if (!FCR->Value.isDenormal())
+ return true;
+
+ DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
+ return Mode == DenormalMode::getIEEE();
}
if (MaxDepth == 0)
@@ -10298,6 +11290,7 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
case AMDGPU::G_FMINNUM_IEEE:
case AMDGPU::G_FMAXNUM_IEEE: {
if (Subtarget->supportsMinMaxDenormModes() ||
+ // FIXME: denormalsEnabledForType is broken for dynamic
denormalsEnabledForType(MRI.getType(Reg), MF))
return true;
@@ -10316,6 +11309,8 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
case Intrinsic::amdgcn_fmed3:
case Intrinsic::amdgcn_sin:
case Intrinsic::amdgcn_cos:
+ case Intrinsic::amdgcn_log:
+ case Intrinsic::amdgcn_exp2:
case Intrinsic::amdgcn_log_clamp:
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
@@ -10352,9 +11347,16 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
SDValue SITargetLowering::getCanonicalConstantFP(
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
// Flush denormals to 0 if not enabled.
- if (C.isDenormal() && !denormalsEnabledForType(DAG, VT)) {
- return DAG.getConstantFP(APFloat::getZero(C.getSemantics(),
- C.isNegative()), SL, VT);
+ if (C.isDenormal()) {
+ DenormalMode Mode =
+ DAG.getMachineFunction().getDenormalMode(C.getSemantics());
+ if (Mode == DenormalMode::getPreserveSign()) {
+ return DAG.getConstantFP(
+ APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
+ }
+
+ if (Mode != DenormalMode::getIEEE())
+ return SDValue();
}
if (C.isNaN()) {
@@ -10490,45 +11492,41 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
}
}
-SDValue SITargetLowering::performIntMed3ImmCombine(
- SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1, bool Signed) const {
- ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
- if (!K1)
- return SDValue();
+SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
+ const SDLoc &SL, SDValue Src,
+ SDValue MinVal,
+ SDValue MaxVal,
+ bool Signed) const {
- ConstantSDNode *K0 = dyn_cast<ConstantSDNode>(Op0.getOperand(1));
- if (!K0)
+ // med3 comes from
+ // min(max(x, K0), K1), K0 < K1
+ // max(min(x, K0), K1), K1 < K0
+ //
+ // "MinVal" and "MaxVal" respectively refer to the rhs of the
+ // min/max op.
+ ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
+ ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
+
+ if (!MinK || !MaxK)
return SDValue();
if (Signed) {
- if (K0->getAPIntValue().sge(K1->getAPIntValue()))
+ if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
return SDValue();
} else {
- if (K0->getAPIntValue().uge(K1->getAPIntValue()))
+ if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
return SDValue();
}
- EVT VT = K0->getValueType(0);
+ EVT VT = MinK->getValueType(0);
unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
- if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
- return DAG.getNode(Med3Opc, SL, VT,
- Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
- }
-
- // If there isn't a 16-bit med3 operation, convert to 32-bit.
- if (VT == MVT::i16) {
- MVT NVT = MVT::i32;
- unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-
- SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
- SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
- SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
-
- SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
- return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
- }
+ if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
+ return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
+ // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
+ // not available, but this is unlikely to be profitable as constants
+ // will often need to be materialized & extended, especially on
+ // pre-GFX10 where VOP3 instructions couldn't take literal operands.
return SDValue();
}
@@ -10640,13 +11638,26 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
}
// min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
+ // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
- if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, true))
+ if (SDValue Med3 = performIntMed3ImmCombine(
+ DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
+ return Med3;
+ }
+ if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
+ if (SDValue Med3 = performIntMed3ImmCombine(
+ DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
return Med3;
}
if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
- if (SDValue Med3 = performIntMed3ImmCombine(DAG, SDLoc(N), Op0, Op1, false))
+ if (SDValue Med3 = performIntMed3ImmCombine(
+ DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
+ return Med3;
+ }
+ if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
+ if (SDValue Med3 = performIntMed3ImmCombine(
+ DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
return Med3;
}
@@ -10930,6 +11941,70 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
return DAG.getBuildVector(VecVT, SL, Ops);
}
+/// Return the source of an fp_extend from f16 to f32, or a converted FP
+/// constant.
+static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
+ if (Src.getOpcode() == ISD::FP_EXTEND &&
+ Src.getOperand(0).getValueType() == MVT::f16) {
+ return Src.getOperand(0);
+ }
+
+ if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
+ APFloat Val = CFP->getValueAPF();
+ bool LosesInfo = true;
+ Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
+ if (!LosesInfo)
+ return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
+ }
+
+ return SDValue();
+}
+
+SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
+ "combine only useful on gfx8");
+
+ SDValue TruncSrc = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::f16)
+ return SDValue();
+
+ if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
+ TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+
+ // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
+ // and expanding it with min/max saves 1 instruction vs. casting to f32 and
+ // casting back.
+
+ // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
+ // fmin(fmax(a, b), fmax(fmin(a, b), c))
+ SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
+ if (!A)
+ return SDValue();
+
+ SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
+ if (!B)
+ return SDValue();
+
+ SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
+ if (!C)
+ return SDValue();
+
+ // This changes signaling nan behavior. If an input is a signaling nan, it
+ // would have been quieted by the fpext originally. We don't care because
+ // these are unconstrained ops. If we needed to insert quieting canonicalizes
+ // we would be worse off than just doing the promotion.
+ SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
+ SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
+ SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
+ return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
+}
+
unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
const SDNode *N0,
const SDNode *N1) const {
@@ -10937,10 +12012,11 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
// Only do this if we are not trying to support denormals. v_mad_f32 does not
// support denormals ever.
- if (((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) ||
- (VT == MVT::f16 && !hasFP64FP16Denormals(DAG.getMachineFunction()) &&
- getSubtarget()->hasMadF16())) &&
- isOperationLegal(ISD::FMAD, VT))
+ if (((VT == MVT::f32 &&
+ denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
+ (VT == MVT::f16 && Subtarget->hasMadF16() &&
+ denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
+ isOperationLegal(ISD::FMAD, VT))
return ISD::FMAD;
const TargetOptions &Options = DAG.getTarget().Options;
@@ -11093,7 +12169,6 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
// The actual DAG is noisier than the pseudo code, but only due to
// instructions that disassemble values into low and high parts, and
// assemble the final result.
- SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
SDValue One = DAG.getConstant(1, SL, MVT::i32);
auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
@@ -11102,8 +12177,8 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
- auto AccumLo = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, Zero);
- auto AccumHi = DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, Accum, One);
+ SDValue AccumLo, AccumHi;
+ std::tie(AccumLo, AccumHi) = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
if (!MulLHSUnsigned32) {
auto MulLHSHi =
@@ -11152,11 +12227,11 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
return SDValue();
- // add x, zext (setcc) => addcarry x, 0, setcc
- // add x, sext (setcc) => subcarry x, 0, setcc
+ // add x, zext (setcc) => uaddo_carry x, 0, setcc
+ // add x, sext (setcc) => usubo_carry x, 0, setcc
unsigned Opc = LHS.getOpcode();
if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
- Opc == ISD::ANY_EXTEND || Opc == ISD::ADDCARRY)
+ Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
std::swap(RHS, LHS);
Opc = RHS.getOpcode();
@@ -11172,15 +12247,15 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
break;
SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
- Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::SUBCARRY : ISD::ADDCARRY;
+ Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
return DAG.getNode(Opc, SL, VTList, Args);
}
- case ISD::ADDCARRY: {
- // add x, (addcarry y, 0, cc) => addcarry x, y, cc
- auto C = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
- if (!C || C->getZExtValue() != 0) break;
+ case ISD::UADDO_CARRY: {
+ // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
+ if (!isNullConstant(RHS.getOperand(1)))
+ break;
SDValue Args[] = { LHS, RHS.getOperand(0), RHS.getOperand(2) };
- return DAG.getNode(ISD::ADDCARRY, SDLoc(N), RHS->getVTList(), Args);
+ return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
}
}
return SDValue();
@@ -11198,8 +12273,8 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
- // sub x, zext (setcc) => subcarry x, 0, setcc
- // sub x, sext (setcc) => addcarry x, 0, setcc
+ // sub x, zext (setcc) => usubo_carry x, 0, setcc
+ // sub x, sext (setcc) => uaddo_carry x, 0, setcc
unsigned Opc = RHS.getOpcode();
switch (Opc) {
default: break;
@@ -11213,18 +12288,18 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
break;
SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
- Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY;
+ Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
return DAG.getNode(Opc, SL, VTList, Args);
}
}
- if (LHS.getOpcode() == ISD::SUBCARRY) {
- // sub (subcarry x, 0, cc), y => subcarry x, y, cc
+ if (LHS.getOpcode() == ISD::USUBO_CARRY) {
+ // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
if (!C || !C->isZero())
return SDValue();
SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
- return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
+ return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
}
return SDValue();
}
@@ -11235,19 +12310,18 @@ SDValue SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
if (N->getValueType(0) != MVT::i32)
return SDValue();
- auto C = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!C || C->getZExtValue() != 0)
+ if (!isNullConstant(N->getOperand(1)))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
SDValue LHS = N->getOperand(0);
- // addcarry (add x, y), 0, cc => addcarry x, y, cc
- // subcarry (sub x, y), 0, cc => subcarry x, y, cc
+ // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
+ // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
unsigned LHSOpc = LHS.getOpcode();
unsigned Opc = N->getOpcode();
- if ((LHSOpc == ISD::ADD && Opc == ISD::ADDCARRY) ||
- (LHSOpc == ISD::SUB && Opc == ISD::SUBCARRY)) {
+ if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
+ (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
SDValue Args[] = { LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2) };
return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
}
@@ -11599,8 +12673,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performAddCombine(N, DCI);
case ISD::SUB:
return performSubCombine(N, DCI);
- case ISD::ADDCARRY:
- case ISD::SUBCARRY:
+ case ISD::UADDO_CARRY:
+ case ISD::USUBO_CARRY:
return performAddCarrySubCarryCombine(N, DCI);
case ISD::FADD:
return performFAddCombine(N, DCI);
@@ -11637,12 +12711,12 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFCanonicalizeCombine(N, DCI);
case AMDGPUISD::RCP:
return performRcpCombine(N, DCI);
+ case ISD::FLDEXP:
case AMDGPUISD::FRACT:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_LEGACY:
case AMDGPUISD::RCP_IFLAG:
- case AMDGPUISD::RSQ_CLAMP:
- case AMDGPUISD::LDEXP: {
+ case AMDGPUISD::RSQ_CLAMP: {
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(0);
if (Src.isUndef())
@@ -11652,6 +12726,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
return performUCharToFloatCombine(N, DCI);
+ case ISD::FCOPYSIGN:
+ return performFCopySignCombine(N, DCI);
case AMDGPUISD::CVT_F32_UBYTE0:
case AMDGPUISD::CVT_F32_UBYTE1:
case AMDGPUISD::CVT_F32_UBYTE2:
@@ -11685,6 +12761,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performExtractVectorEltCombine(N, DCI);
case ISD::INSERT_VECTOR_ELT:
return performInsertVectorEltCombine(N, DCI);
+ case ISD::FP_ROUND:
+ return performFPRoundCombine(N, DCI);
case ISD::LOAD: {
if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
return Widended;
@@ -11778,7 +12856,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
// Set which texture component corresponds to the lane.
unsigned Comp;
for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
- Comp = countTrailingZeros(Dmask);
+ Comp = llvm::countr_zero(Dmask);
Dmask &= ~(1 << Comp);
}
@@ -12548,6 +13626,15 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
}
+ // TODO: Move this logic to getReservedRegs()
+ // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
+ unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
+ Register SReg = ST.isWave32()
+ ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
+ : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
+ &AMDGPU::SGPR_64RegClass);
+ Info->setSGPRForEXECCopy(SReg);
+
assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
Info->getStackPtrOffsetReg()));
if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
@@ -12591,6 +13678,41 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
TargetLoweringBase::finalizeLowering(MF);
}
+void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ Known.resetAll();
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ switch (IID) {
+ case Intrinsic::amdgcn_mbcnt_lo:
+ case Intrinsic::amdgcn_mbcnt_hi: {
+ const GCNSubtarget &ST =
+ DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
+ // These return at most the (wavefront size - 1) + src1
+ // As long as src1 is an immediate we can calc known bits
+ KnownBits Src1Known = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
+ unsigned Src1ValBits = Src1Known.countMaxActiveBits();
+ unsigned MaxActiveBits = std::max(Src1ValBits, ST.getWavefrontSizeLog2());
+ // Cater for potential carry
+ MaxActiveBits += Src1ValBits ? 1 : 0;
+ unsigned Size = Op.getValueType().getSizeInBits();
+ if (MaxActiveBits < Size)
+ Known.Zero.setHighBits(Size - MaxActiveBits);
+ return;
+ }
+ }
+ break;
+ }
+ }
+ return AMDGPUTargetLowering::computeKnownBitsForTargetNode(
+ Op, Known, DemandedElts, DAG, Depth);
+}
+
void SITargetLowering::computeKnownBitsForFrameIndex(
const int FI, KnownBits &Known, const MachineFunction &MF) const {
TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
@@ -12605,7 +13727,7 @@ static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelKnownBits &KB,
KnownBits &Known, unsigned Dim) {
unsigned MaxValue =
ST.getMaxWorkitemID(KB.getMachineFunction().getFunction(), Dim);
- Known.Zero.setHighBits(countLeadingZeros(MaxValue));
+ Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
}
void SITargetLowering::computeKnownBitsForTargetInstr(
@@ -12636,7 +13758,7 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
// based on the actual size because we don't know if it's accurate or not
// at any given point.
Known.Zero.setHighBits(
- countLeadingZeros(getSubtarget()->getAddressableLocalMemorySize()));
+ llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
break;
}
}
@@ -12648,6 +13770,30 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
Known.Zero.setHighBits(16);
break;
+ case AMDGPU::G_AMDGPU_SMED3:
+ case AMDGPU::G_AMDGPU_UMED3: {
+ auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
+
+ KnownBits Known2;
+ KB.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
+ if (Known2.isUnknown())
+ break;
+
+ KnownBits Known1;
+ KB.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
+ if (Known1.isUnknown())
+ break;
+
+ KnownBits Known0;
+ KB.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
+ if (Known0.isUnknown())
+ break;
+
+ // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
+ Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
+ Known.One = Known0.One & Known1.One & Known2.One;
+ break;
+ }
}
}
@@ -12759,9 +13905,9 @@ static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
return false;
}
-bool SITargetLowering::isSDNodeSourceOfDivergence(
- const SDNode *N, FunctionLoweringInfo *FLI,
- LegacyDivergenceAnalysis *KDA) const {
+bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
+ FunctionLoweringInfo *FLI,
+ UniformityInfo *UA) const {
switch (N->getOpcode()) {
case ISD::CopyFromReg: {
const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
@@ -12774,7 +13920,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(
return !TRI->isSGPRReg(MRI, Reg);
if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
- return KDA->isDivergent(V);
+ return UA->isDivergent(V);
assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
return !TRI->isSGPRReg(MRI, Reg);
@@ -12794,8 +13940,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(
return AMDGPU::isIntrinsicSourceOfDivergence(
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
case AMDGPUISD::ATOMIC_CMP_SWAP:
- case AMDGPUISD::ATOMIC_INC:
- case AMDGPUISD::ATOMIC_DEC:
case AMDGPUISD::ATOMIC_LOAD_FMIN:
case AMDGPUISD::ATOMIC_LOAD_FMAX:
case AMDGPUISD::BUFFER_ATOMIC_SWAP:
@@ -12830,10 +13974,10 @@ bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
EVT VT) const {
switch (VT.getScalarType().getSimpleVT().SimpleTy) {
case MVT::f32:
- return hasFP32Denormals(DAG.getMachineFunction());
+ return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
case MVT::f64:
case MVT::f16:
- return hasFP64FP16Denormals(DAG.getMachineFunction());
+ return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
default:
return false;
}
@@ -12843,10 +13987,10 @@ bool SITargetLowering::denormalsEnabledForType(LLT Ty,
MachineFunction &MF) const {
switch (Ty.getScalarSizeInBits()) {
case 32:
- return hasFP32Denormals(MF);
+ return !denormalModeIsFlushAllF32(MF);
case 64:
case 16:
- return hasFP64FP16Denormals(MF);
+ return !denormalModeIsFlushAllF64F16(MF);
default:
return false;
}
@@ -12930,6 +14074,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (AMDGPU::isFlatGlobalAddrSpace(AS) &&
Subtarget->hasAtomicFaddNoRtnInsts()) {
+ if (Subtarget->hasGFX940Insts())
+ return AtomicExpansionKind::None;
+
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
return AtomicExpansionKind::CmpXChg;
@@ -13054,6 +14201,8 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
// uniform values (as produced by the mask results of control flow intrinsics)
// used outside of divergent blocks. The phi users need to also be treated as
// always uniform.
+//
+// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
unsigned WaveSize) {
// FIXME: We assume we never cast the mask results of a control flow
@@ -13153,6 +14302,11 @@ bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
hasMemSDNodeUser(*N0->use_begin()));
}
+bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
+ Register N0, Register N1) const {
+ return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
+}
+
MachineMemOperand::Flags
SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
// Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
@@ -13196,37 +14350,36 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
assert(AI->getOperation() == AtomicRMWInst::FAdd &&
"only fadd is supported for now");
- // Given: atomicrmw fadd float* %addr, float %val ordering
+ // Given: atomicrmw fadd ptr %addr, float %val ordering
//
// With this expansion we produce the following code:
// [...]
- // %int8ptr = bitcast float* %addr to i8*
// br label %atomicrmw.check.shared
//
// atomicrmw.check.shared:
- // %is.shared = call i1 @llvm.amdgcn.is.shared(i8* %int8ptr)
+ // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
// br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
//
// atomicrmw.shared:
- // %cast.shared = addrspacecast float* %addr to float addrspace(3)*
- // %loaded.shared = atomicrmw fadd float addrspace(3)* %cast.shared,
+ // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
+ // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
// float %val ordering
// br label %atomicrmw.phi
//
// atomicrmw.check.private:
- // %is.private = call i1 @llvm.amdgcn.is.private(i8* %int8ptr)
+ // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
// br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
//
// atomicrmw.private:
- // %cast.private = addrspacecast float* %addr to float addrspace(5)*
- // %loaded.private = load float, float addrspace(5)* %cast.private
+ // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
+ // %loaded.private = load float, ptr addrspace(5) %cast.private
// %val.new = fadd float %loaded.private, %val
- // store float %val.new, float addrspace(5)* %cast.private
+ // store float %val.new, ptr addrspace(5) %cast.private
// br label %atomicrmw.phi
//
// atomicrmw.global:
- // %cast.global = addrspacecast float* %addr to float addrspace(1)*
- // %loaded.global = atomicrmw fadd float addrspace(1)* %cast.global,
+ // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
+ // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
// float %val ordering
// br label %atomicrmw.phi
//
@@ -13259,7 +14412,6 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Value *Val = AI->getValOperand();
Type *ValTy = Val->getType();
Value *Addr = AI->getPointerOperand();
- PointerType *PtrTy = cast<PointerType>(Addr->getType());
auto CreateNewAtomicRMW = [AI](IRBuilder<> &Builder, Value *Addr,
Value *Val) -> Value * {
@@ -13275,30 +14427,27 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
std::prev(BB->end())->eraseFromParent();
Builder.SetInsertPoint(BB);
- Value *Int8Ptr = Builder.CreateBitCast(Addr, Builder.getInt8PtrTy());
Builder.CreateBr(CheckSharedBB);
Builder.SetInsertPoint(CheckSharedBB);
CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared, {},
- {Int8Ptr}, nullptr, "is.shared");
+ {Addr}, nullptr, "is.shared");
Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
Builder.SetInsertPoint(SharedBB);
Value *CastToLocal = Builder.CreateAddrSpaceCast(
- Addr,
- PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::LOCAL_ADDRESS));
+ Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
Value *LoadedShared = CreateNewAtomicRMW(Builder, CastToLocal, Val);
Builder.CreateBr(PhiBB);
Builder.SetInsertPoint(CheckPrivateBB);
CallInst *IsPrivate = Builder.CreateIntrinsic(
- Intrinsic::amdgcn_is_private, {}, {Int8Ptr}, nullptr, "is.private");
+ Intrinsic::amdgcn_is_private, {}, {Addr}, nullptr, "is.private");
Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
Builder.SetInsertPoint(PrivateBB);
Value *CastToPrivate = Builder.CreateAddrSpaceCast(
- Addr,
- PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::PRIVATE_ADDRESS));
+ Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
Value *LoadedPrivate =
Builder.CreateLoad(ValTy, CastToPrivate, "loaded.private");
Value *NewVal = Builder.CreateFAdd(LoadedPrivate, Val, "val.new");
@@ -13307,8 +14456,7 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
Builder.SetInsertPoint(GlobalBB);
Value *CastToGlobal = Builder.CreateAddrSpaceCast(
- Addr,
- PointerType::getWithSamePointeeType(PtrTy, AMDGPUAS::GLOBAL_ADDRESS));
+ Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
Value *LoadedGlobal = CreateNewAtomicRMW(Builder, CastToGlobal, Val);
Builder.CreateBr(PhiBB);
@@ -13322,3 +14470,25 @@ void SITargetLowering::emitExpandAtomicRMW(AtomicRMWInst *AI) const {
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
}
+
+LoadInst *
+SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
+ IRBuilder<> Builder(AI);
+ auto Order = AI->getOrdering();
+
+ // The optimization removes store aspect of the atomicrmw. Therefore, cache
+ // must be flushed if the atomic ordering had a release semantics. This is
+ // not necessary a fence, a release fence just coincides to do that flush.
+ // Avoid replacing of an atomicrmw with a release semantics.
+ if (isReleaseOrStronger(Order))
+ return nullptr;
+
+ LoadInst *LI = Builder.CreateAlignedLoad(
+ AI->getType(), AI->getPointerOperand(), AI->getAlign());
+ LI->setAtomic(Order, AI->getSyncScopeID());
+ LI->copyMetadata(*AI);
+ LI->takeName(AI);
+ AI->replaceAllUsesWith(LI);
+ AI->eraseFromParent();
+ return LI;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 3b2c58108667..1745c0b9e88e 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -87,8 +87,6 @@ private:
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
- SDValue makeV_ILLEGAL(SDValue Op, SelectionDAG &DAG) const;
-
// The raw.tbuffer and struct.tbuffer intrinsics have two offset args: offset
// (the offset that is included in bounds checking and swizzling, to be split
// between the instruction's voffset and immoffset fields) and soffset (the
@@ -108,8 +106,10 @@ private:
SDValue LowerFDIV32(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFFREXP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTrig(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
@@ -143,6 +143,7 @@ private:
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const;
@@ -167,6 +168,8 @@ private:
SDValue performUCharToFloatCombine(SDNode *N,
DAGCombinerInfo &DCI) const;
+ SDValue performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
SDValue performSHLPtrCombine(SDNode *N,
unsigned AS,
EVT MemVT,
@@ -191,12 +194,14 @@ private:
SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
SDValue Op0, SDValue Op1) const;
SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
- SDValue Op0, SDValue Op1, bool Signed) const;
+ SDValue Src, SDValue MinVal, SDValue MaxVal,
+ bool Signed) const;
SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performInsertVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performFPRoundCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue reassociateScalarOps(SDNode *N, SelectionDAG &DAG) const;
unsigned getFusedOpcode(const SelectionDAG &DAG,
@@ -250,6 +255,17 @@ private:
void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
SDValue *Offsets, Align Alignment = Align(4)) const;
+ // Convert the i128 that an addrspace(8) pointer is natively represented as
+ // into the v4i32 that all the buffer intrinsics expect to receive. We can't
+ // add register classes for i128 on pain of the promotion logic going haywire,
+ // so this slightly ugly hack is what we've got. If passed a non-pointer
+ // argument (as would be seen in older buffer intrinsics), does nothing.
+ SDValue bufferRsrcPtrToVector(SDValue MaybePointer, SelectionDAG &DAG) const;
+
+ // Wrap a 64-bit pointer into a v4i32 (which is how all SelectionDAG code
+ // represents ptr addrspace(8)) using the flags specified in the intrinsic.
+ SDValue lowerPointerAsRsrcIntrin(SDNode *Op, SelectionDAG &DAG) const;
+
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
ArrayRef<SDValue> Ops, MemSDNode *M) const;
@@ -272,6 +288,12 @@ public:
bool isShuffleMaskLegal(ArrayRef<int> /*Mask*/, EVT /*VT*/) const override;
+ // While address space 7 should never make it to codegen, it still needs to
+ // have a MVT to prevent some analyses that query this function from breaking,
+ // so, to work around the lack of i160, map it to v5i32.
+ MVT getPointerTy(const DataLayout &DL, unsigned AS) const override;
+ MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override;
+
bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
MachineFunction &MF,
unsigned IntrinsicID) const override;
@@ -331,6 +353,12 @@ public:
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+ unsigned combineRepeatedFPDivisors() const override {
+ // Combine multiple FDIVs with the same divisor into multiple FMULs by the
+ // reciprocal.
+ return 2;
+ }
+
bool supportSplitCSR(MachineFunction *MF) const override;
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
@@ -361,7 +389,7 @@ public:
SmallVectorImpl<SDValue> &MemOpChains,
SDValue Chain) const;
- SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+ SDValue LowerCallResult(SDValue Chain, SDValue InGlue,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &DL, SelectionDAG &DAG,
@@ -396,7 +424,6 @@ public:
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const override;
- bool hasBitPreservingFPLogic(EVT VT) const override;
bool hasAtomicFaddRtnForTy(SDValue &Op) const;
bool enableAggressiveFMAFusion(EVT VT) const override;
bool enableAggressiveFMAFusion(LLT Ty) const override;
@@ -452,6 +479,10 @@ public:
void finalizeLowering(MachineFunction &MF) const override;
+ void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known,
+ const APInt &DemandedElts,
+ const SelectionDAG &DAG,
+ unsigned Depth = 0) const override;
void computeKnownBitsForFrameIndex(int FrameIdx,
KnownBits &Known,
const MachineFunction &MF) const override;
@@ -464,14 +495,17 @@ public:
Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R,
const MachineRegisterInfo &MRI,
unsigned Depth = 0) const override;
- bool isSDNodeSourceOfDivergence(const SDNode *N,
- FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;
+ bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI,
+ UniformityInfo *UA) const override;
bool hasMemSDNodeUser(SDNode *N) const;
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0,
SDValue N1) const override;
+ bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0,
+ Register N1) const override;
+
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
bool isCanonicalized(Register Reg, MachineFunction &MF,
@@ -495,6 +529,9 @@ public:
shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override;
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override;
+ LoadInst *
+ lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override;
+
const TargetRegisterClass *getRegClassFor(MVT VT,
bool isDivergent) const override;
bool requiresUniformRegister(MachineFunction &MF,
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 851c407bb255..4b0283b27a6f 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -35,7 +35,7 @@
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
#define DEBUG_TYPE "si-insert-waitcnts"
@@ -57,8 +57,6 @@ namespace {
// associated with the operand. Used for determining whether
// s_waitcnt instruction needs to be emitted.
-#define CNT_MASK(t) (1u << (t))
-
enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
} // namespace
@@ -88,19 +86,20 @@ struct RegisterEncoding {
};
enum WaitEventType {
- VMEM_ACCESS, // vector-memory read & write
- VMEM_READ_ACCESS, // vector-memory read
- VMEM_WRITE_ACCESS, // vector-memory write
- LDS_ACCESS, // lds read & write
- GDS_ACCESS, // gds read & write
- SQ_MESSAGE, // send message
- SMEM_ACCESS, // scalar-memory read & write
- EXP_GPR_LOCK, // export holding on its data src
- GDS_GPR_LOCK, // GDS holding on its data and addr src
- EXP_POS_ACCESS, // write to export position
- EXP_PARAM_ACCESS, // write to export parameter
- VMW_GPR_LOCK, // vector-memory write holding on its data src
- EXP_LDS_ACCESS, // read by ldsdir counting as export
+ VMEM_ACCESS, // vector-memory read & write
+ VMEM_READ_ACCESS, // vector-memory read
+ VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
+ SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
+ LDS_ACCESS, // lds read & write
+ GDS_ACCESS, // gds read & write
+ SQ_MESSAGE, // send message
+ SMEM_ACCESS, // scalar-memory read & write
+ EXP_GPR_LOCK, // export holding on its data src
+ GDS_GPR_LOCK, // GDS holding on its data and addr src
+ EXP_POS_ACCESS, // write to export position
+ EXP_PARAM_ACCESS, // write to export parameter
+ VMW_GPR_LOCK, // vector-memory write holding on its data src
+ EXP_LDS_ACCESS, // read by ldsdir counting as export
NUM_WAIT_EVENTS,
};
@@ -110,7 +109,7 @@ static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
(1 << SQ_MESSAGE),
(1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
(1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
- (1 << VMEM_WRITE_ACCESS)};
+ (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)};
// The mapping is:
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
@@ -372,11 +371,8 @@ private:
MachinePostDominatorTree *PDT;
struct BlockInfo {
- MachineBasicBlock *MBB;
std::unique_ptr<WaitcntBrackets> Incoming;
bool Dirty = true;
-
- explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
};
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
@@ -386,6 +382,10 @@ private:
bool ForceEmitZeroWaitcnts;
bool ForceEmitWaitcnt[NUM_INST_CNTS];
+ // S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
+ // message.
+ DenseSet<MachineInstr *> ReleaseVGPRInsts;
+
public:
static char ID;
@@ -398,6 +398,7 @@ public:
bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
bool isPreheaderToFlush(MachineBasicBlock &MBB,
WaitcntBrackets &ScoreBrackets);
+ bool isVMEMOrFlatVMEM(const MachineInstr &MI) const;
bool runOnMachineFunction(MachineFunction &MF) override;
StringRef getPassName() const override {
@@ -418,10 +419,6 @@ public:
return false;
}
- AMDGPU::Waitcnt allZeroWaitcnt() const {
- return AMDGPU::Waitcnt::allZero(ST->hasVscnt());
- }
-
void setForceEmitWaitcnt() {
// For non-debug builds, ForceEmitWaitcnt has been initialized to false;
// For debug builds, get the debug counter info and adjust if need be
@@ -455,13 +452,19 @@ public:
assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
if (!ST->hasVscnt())
return VMEM_ACCESS;
- if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst))
+ if (Inst.mayStore() && !SIInstrInfo::isAtomicRet(Inst)) {
+ // FLAT and SCRATCH instructions may access scratch. Other VMEM
+ // instructions do not.
+ if (SIInstrInfo::isFLAT(Inst) && mayAccessScratchThroughFlat(Inst))
+ return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
+ }
return VMEM_READ_ACCESS;
}
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
+ bool mayAccessScratchThroughFlat(const MachineInstr &MI) const;
bool generateWaitcntInstBefore(MachineInstr &MI,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr,
@@ -1029,7 +1032,18 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
- Wait = Wait.combined(allZeroWaitcnt());
+ Wait = Wait.combined(AMDGPU::Waitcnt::allZeroExceptVsCnt());
+ }
+ // Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
+ // stores. In this case it can be useful to send a message to explicitly
+ // release all VGPRs before the stores have completed, but it is only safe to
+ // do this if there are no outstanding scratch stores.
+ else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
+ MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
+ if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 &&
+ ScoreBrackets.getScoreRange(VS_CNT) != 0 &&
+ !ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
+ ReleaseVGPRInsts.insert(&MI);
}
// Resolve vm waits before gs-done.
else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
@@ -1214,7 +1228,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
- Wait = Wait.combined(allZeroWaitcnt());
+ Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1230,7 +1244,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.simplifyWaitcnt(Wait);
if (ForceEmitZeroWaitcnts)
- Wait = allZeroWaitcnt();
+ Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt();
if (ForceEmitWaitcnt[VM_CNT])
Wait.VmCnt = 0;
@@ -1238,8 +1252,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
Wait.ExpCnt = 0;
if (ForceEmitWaitcnt[LGKM_CNT])
Wait.LgkmCnt = 0;
- if (ForceEmitWaitcnt[VS_CNT])
- Wait.VsCnt = 0;
if (FlushVmCnt) {
if (ScoreBrackets.hasPendingEvent(VM_CNT))
@@ -1384,6 +1396,32 @@ bool SIInsertWaitcnts::mayAccessLDSThroughFlat(const MachineInstr &MI) const {
return false;
}
+// This is a flat memory operation. Check to see if it has memory tokens for
+// either scratch or FLAT.
+bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
+ const MachineInstr &MI) const {
+ assert(TII->isFLAT(MI));
+
+ // SCRATCH instructions always access scratch.
+ if (TII->isFLATScratch(MI))
+ return true;
+
+ // GLOBAL instructions never access scratch.
+ if (TII->isFLATGlobal(MI))
+ return false;
+
+ // If there are no memory operands then conservatively assume the flat
+ // operation may access scratch.
+ if (MI.memoperands_empty())
+ return true;
+
+ // See if any memory operand specifies an address space that involves scratch.
+ return any_of(MI.memoperands(), [](const MachineMemOperand *Memop) {
+ unsigned AS = Memop->getAddrSpace();
+ return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
+ });
+}
+
void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
WaitcntBrackets *ScoreBrackets) {
// Now look at the instruction opcode. If it is a memory access
@@ -1436,7 +1474,7 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
} else if (Inst.isCall()) {
if (callWaitsOnFunctionReturn(Inst)) {
// Act as a wait on everything
- ScoreBrackets->applyWaitcnt(allZeroWaitcnt());
+ ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
} else {
// May need to way wait for anything.
ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt());
@@ -1703,6 +1741,11 @@ bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
return UpdateCache(false);
}
+bool SIInsertWaitcnts::isVMEMOrFlatVMEM(const MachineInstr &MI) const {
+ return SIInstrInfo::isVMEM(MI) ||
+ (SIInstrInfo::isFLAT(MI) && mayAccessVMEMThroughFlat(MI));
+}
+
// Return true if it is better to flush the vmcnt counter in the preheader of
// the given loop. We currently decide to flush in two situations:
// 1. The loop contains vmem store(s), no vmem load and at least one use of a
@@ -1721,7 +1764,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
for (MachineBasicBlock *MBB : ML->blocks()) {
for (MachineInstr &MI : *MBB) {
- if (SIInstrInfo::isVMEM(MI)) {
+ if (isVMEMOrFlatVMEM(MI)) {
if (MI.mayLoad())
HasVMemLoad = true;
if (MI.mayStore())
@@ -1749,7 +1792,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
}
}
// VMem load vgpr def
- else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
+ else if (isVMEMOrFlatVMEM(MI) && MI.mayLoad() && Op.isDef())
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
// If we find a register that is loaded inside the loop, 1. and 2.
// are invalidated and we can exit.
@@ -1813,10 +1856,6 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
;
BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
- if (ST->hasVscnt())
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(0);
Modified = true;
}
@@ -1824,7 +1863,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// Keep iterating over the blocks in reverse post order, inserting and
// updating s_waitcnt where needed, until a fix point is reached.
for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
- BlockInfos.insert({MBB, BlockInfo(MBB)});
+ BlockInfos.insert({MBB, BlockInfo()});
std::unique_ptr<WaitcntBrackets> Brackets;
bool Repeat;
@@ -1833,6 +1872,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
++BII) {
+ MachineBasicBlock *MBB = BII->first;
BlockInfo &BI = BII->second;
if (!BI.Dirty)
continue;
@@ -1849,12 +1889,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
*Brackets = WaitcntBrackets(ST, Limits, Encoding);
}
- Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
+ Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
BI.Dirty = false;
if (Brackets->hasPendingEvent()) {
BlockInfo *MoveBracketsToSucc = nullptr;
- for (MachineBasicBlock *Succ : BI.MBB->successors()) {
+ for (MachineBasicBlock *Succ : MBB->successors()) {
auto SuccBII = BlockInfos.find(Succ);
BlockInfo &SuccBI = SuccBII->second;
if (!SuccBI.Incoming) {
@@ -1924,5 +1964,18 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
}
+ // Insert DEALLOC_VGPR messages before previously identified S_ENDPGM
+ // instructions.
+ for (MachineInstr *MI : ReleaseVGPRInsts) {
+ if (ST->requiresNopBeforeDeallocVGPRs()) {
+ BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_NOP))
+ .addImm(0);
+ }
+ BuildMI(*MI->getParent(), MI, DebugLoc(), TII->get(AMDGPU::S_SENDMSG))
+ .addImm(AMDGPU::SendMsg::ID_DEALLOC_VGPRS_GFX11Plus);
+ Modified = true;
+ }
+ ReleaseVGPRInsts.clear();
+
return Modified;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index d86d4e659803..f674777724eb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -153,6 +153,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that tied source will not be read.
field bit TiedSourceNotRead = 0;
+ // This bit indicates that the instruction is never-uniform/divergent
+ field bit IsNeverUniform = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -234,6 +237,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{60} = TiedSourceNotRead;
+ let TSFlags{61} = IsNeverUniform;
+
let SchedRW = [Write32Bit];
let AsmVariantName = AMDGPUAsmVariants.Default;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 2cbc90219334..278cf2b69ee3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -329,8 +329,8 @@ bool SIInstrInfo::getMemOperandsWithOffsetWidth(
const MachineOperand *Offset1Op =
getNamedOperand(LdSt, AMDGPU::OpName::offset1);
- unsigned Offset0 = Offset0Op->getImm();
- unsigned Offset1 = Offset1Op->getImm();
+ unsigned Offset0 = Offset0Op->getImm() & 0xff;
+ unsigned Offset1 = Offset1Op->getImm() & 0xff;
if (Offset0 + 1 != Offset1)
return false;
@@ -537,7 +537,7 @@ static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg,
MCRegister SrcReg, bool KillSrc,
- const char *Msg = "illegal SGPR to VGPR copy") {
+ const char *Msg = "illegal VGPR to SGPR copy") {
MachineFunction *MF = MBB.getParent();
DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
LLVMContext &C = MF->getFunction().getContext();
@@ -578,9 +578,12 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
if (!RegsOverlap) {
for (auto Def = MI, E = MBB.begin(); Def != E; ) {
--Def;
- if (!Def->definesRegister(SrcReg, &RI))
+
+ if (!Def->modifiesRegister(SrcReg, &RI))
continue;
- if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64)
+
+ if (Def->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32_e64 ||
+ Def->getOperand(0).getReg() != SrcReg)
break;
MachineOperand &DefOp = Def->getOperand(1);
@@ -615,8 +618,8 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
}
}
- RS.enterBasicBlock(MBB);
- RS.forward(MI);
+ RS.enterBasicBlockEnd(MBB);
+ RS.backward(MI);
// Ideally we want to have three registers for a long reg_sequence copy
// to hide 2 waitstates between v_mov_b32 and accvgpr_write.
@@ -631,11 +634,12 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII,
assert(MBB.getParent()->getRegInfo().isReserved(Tmp) &&
"VGPR used for an intermediate copy should have been reserved.");
- // Only loop through if there are any free registers left, otherwise
- // scavenger may report a fatal error without emergency spill slot
- // or spill with the slot.
- while (RegNo-- && RS.FindUnusedReg(&AMDGPU::VGPR_32RegClass)) {
- Register Tmp2 = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ // Only loop through if there are any free registers left. We don't want to
+ // spill.
+ while (RegNo--) {
+ Register Tmp2 = RS.scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI,
+ /* RestoreAfter */ false, 0,
+ /* AllowSpill */ false);
if (!Tmp2 || RI.getHWRegIndex(Tmp2) >= MaxVGPRs)
break;
Tmp = Tmp2;
@@ -1394,6 +1398,14 @@ static unsigned getIndirectSGPRWriteMovRelPseudo32(unsigned VecSize) {
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5;
if (VecSize <= 256) // 32 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8;
+ if (VecSize <= 288) // 36 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9;
+ if (VecSize <= 320) // 40 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10;
+ if (VecSize <= 352) // 44 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11;
+ if (VecSize <= 384) // 48 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12;
if (VecSize <= 512) // 64 bytes
return AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16;
if (VecSize <= 1024) // 128 bytes
@@ -1575,6 +1587,30 @@ static unsigned getAVSpillSaveOpcode(unsigned Size) {
}
}
+static unsigned getWWMRegSpillSaveOpcode(unsigned Size) {
+ // Currently, there is only 32-bit WWM register spills needed.
+ if (Size != 4)
+ llvm_unreachable("unknown wwm register spill size");
+
+ return AMDGPU::SI_SPILL_WWM_V32_SAVE;
+}
+
+static unsigned getVectorRegSpillSaveOpcode(Register Reg,
+ const TargetRegisterClass *RC,
+ unsigned Size,
+ const SIRegisterInfo &TRI,
+ const SIMachineFunctionInfo &MFI) {
+ // Choose the right opcode if spilling a WWM register.
+ if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
+ return getWWMRegSpillSaveOpcode(Size);
+
+ if (TRI.isVectorSuperClass(RC))
+ return getAVSpillSaveOpcode(Size);
+
+ return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size)
+ : getVGPRSpillSaveOpcode(Size);
+}
+
void SIInstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex, const TargetRegisterClass *RC,
@@ -1619,11 +1655,8 @@ void SIInstrInfo::storeRegToStackSlot(
return;
}
- unsigned Opcode = RI.isVectorSuperClass(RC)
- ? getAVSpillSaveOpcode(SpillSize)
- : RI.isAGPRClass(RC)
- ? getAGPRSpillSaveOpcode(SpillSize)
- : getVGPRSpillSaveOpcode(SpillSize);
+ unsigned Opcode = getVectorRegSpillSaveOpcode(VReg ? VReg : SrcReg, RC,
+ SpillSize, RI, *MFI);
MFI->setHasSpilledVGPRs();
BuildMI(MBB, MI, DL, get(Opcode))
@@ -1774,6 +1807,29 @@ static unsigned getAVSpillRestoreOpcode(unsigned Size) {
}
}
+static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) {
+ // Currently, there is only 32-bit WWM register spills needed.
+ if (Size != 4)
+ llvm_unreachable("unknown wwm register spill size");
+
+ return AMDGPU::SI_SPILL_WWM_V32_RESTORE;
+}
+
+static unsigned
+getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC,
+ unsigned Size, const SIRegisterInfo &TRI,
+ const SIMachineFunctionInfo &MFI) {
+ // Choose the right opcode if restoring a WWM register.
+ if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG))
+ return getWWMRegSpillRestoreOpcode(Size);
+
+ if (TRI.isVectorSuperClass(RC))
+ return getAVSpillRestoreOpcode(Size);
+
+ return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size)
+ : getVGPRSpillRestoreOpcode(Size);
+}
+
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
Register DestReg, int FrameIndex,
@@ -1817,11 +1873,8 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
return;
}
- unsigned Opcode = RI.isVectorSuperClass(RC)
- ? getAVSpillRestoreOpcode(SpillSize)
- : RI.isAGPRClass(RC)
- ? getAGPRSpillRestoreOpcode(SpillSize)
- : getVGPRSpillRestoreOpcode(SpillSize);
+ unsigned Opcode = getVectorRegSpillRestoreOpcode(VReg ? VReg : DestReg, RC,
+ SpillSize, RI, *MFI);
BuildMI(MBB, MI, DL, get(Opcode), DestReg)
.addFrameIndex(FrameIndex) // vaddr
.addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
@@ -1941,6 +1994,18 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(AMDGPU::S_AND_B32));
break;
+ case AMDGPU::S_AND_SAVEEXEC_B64_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B64));
+ break;
+
+ case AMDGPU::S_AND_SAVEEXEC_B32_term:
+ // This is only a terminator to get the correct spill code placement during
+ // register allocation.
+ MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32));
+ break;
+
case AMDGPU::V_MOV_B64_PSEUDO: {
Register Dst = MI.getOperand(0).getReg();
Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
@@ -2084,6 +2149,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V4:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V5:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V8:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V9:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V10:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V11:
+ case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V12:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V16:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B32_V32:
case AMDGPU::S_INDIRECT_REG_WRITE_MOVREL_B64_V1:
@@ -2345,6 +2414,14 @@ SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
return std::pair(Split[0], Split[1]);
}
+std::optional<DestSourcePair>
+SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
+ if (MI.getOpcode() == AMDGPU::WWM_COPY)
+ return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
+
+ return std::nullopt;
+}
+
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0,
unsigned Src0OpName,
@@ -2522,6 +2599,7 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
+ const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
// FIXME: Virtual register workaround for RegScavenger not working with empty
// blocks.
@@ -2555,12 +2633,6 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
BuildMI(&MBB, DL, get(AMDGPU::S_SETPC_B64))
.addReg(PCReg);
- // FIXME: If spilling is necessary, this will fail because this scavenger has
- // no emergency stack slots. It is non-trivial to spill in this situation,
- // because the restore code needs to be specially placed after the
- // jump. BranchRelaxation then needs to be made aware of the newly inserted
- // block.
- //
// If a spill is needed for the pc register pair, we need to insert a spill
// restore block right before the destination block, and insert a short branch
// into the old destination block's fallthrough predecessor.
@@ -2591,10 +2663,20 @@ void SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
// dest_bb:
// buzz;
- RS->enterBasicBlockEnd(MBB);
- Register Scav = RS->scavengeRegisterBackwards(
- AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
- /* RestoreAfter */ false, 0, /* AllowSpill */ false);
+ Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
+ Register Scav;
+
+ // If we've previously reserved a register for long branches
+ // avoid running the scavenger and just use those registers
+ if (LongBranchReservedReg) {
+ RS->enterBasicBlock(MBB);
+ Scav = LongBranchReservedReg;
+ } else {
+ RS->enterBasicBlockEnd(MBB);
+ Scav = RS->scavengeRegisterBackwards(
+ AMDGPU::SReg_64RegClass, MachineBasicBlock::iterator(GetPC),
+ /* RestoreAfter */ false, 0, /* AllowSpill */ false);
+ }
if (Scav) {
RS->setRegUsed(Scav);
MRI.replaceRegWith(PCReg, Scav);
@@ -2720,11 +2802,13 @@ bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
case AMDGPU::S_OR_B64_term:
case AMDGPU::S_ANDN2_B64_term:
case AMDGPU::S_AND_B64_term:
+ case AMDGPU::S_AND_SAVEEXEC_B64_term:
case AMDGPU::S_MOV_B32_term:
case AMDGPU::S_XOR_B32_term:
case AMDGPU::S_OR_B32_term:
case AMDGPU::S_ANDN2_B32_term:
case AMDGPU::S_AND_B32_term:
+ case AMDGPU::S_AND_SAVEEXEC_B32_term:
break;
case AMDGPU::SI_IF:
case AMDGPU::SI_ELSE:
@@ -2858,7 +2942,7 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
if (MRI.getRegClass(FalseReg) != RC)
return false;
- int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+ int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
// Limit to equal cost for branch vs. N v_cndmask_b32s.
@@ -2873,7 +2957,7 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
if (MRI.getRegClass(FalseReg) != RC)
return false;
- int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+ int NumInsts = AMDGPU::getRegBitWidth(*RC) / 32;
// Multiples of 8 can do s_cselect_b64
if (NumInsts % 2 == 0)
@@ -3004,6 +3088,7 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) {
case AMDGPU::S_MOV_B32:
case AMDGPU::S_MOV_B64:
case AMDGPU::COPY:
+ case AMDGPU::WWM_COPY:
case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
case AMDGPU::V_ACCVGPR_READ_B32_e64:
case AMDGPU::V_ACCVGPR_MOV_B32:
@@ -3084,7 +3169,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
assert(UseMI.getOperand(1).getReg().isVirtual());
}
- UseMI.setDesc(get(NewOpc));
+ const MCInstrDesc &NewMCID = get(NewOpc);
+ if (DstReg.isPhysical() &&
+ !RI.getRegClass(NewMCID.operands()[0].RegClass)->contains(DstReg))
+ return false;
+
+ UseMI.setDesc(NewMCID);
UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
return true;
@@ -4352,7 +4442,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
// Adjust for packed 16 bit values
if (D16 && D16->getImm() && !ST.hasUnpackedD16VMem())
- RegCount >>= 1;
+ RegCount = divideCeil(RegCount, 2);
// Adjust if using LWE or TFE
if ((LWE && LWE->getImm()) || (TFE && TFE->getImm()))
@@ -4365,7 +4455,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
const TargetRegisterClass *DstRC = getOpRegClass(MI, DstIdx);
uint32_t DstSize = RI.getRegSizeInBits(*DstRC) / 32;
if (RegCount > DstSize) {
- ErrInfo = "MIMG instruction returns too many registers for dst "
+ ErrInfo = "Image instruction returns too many registers for dst "
"register class";
return false;
}
@@ -4636,9 +4726,12 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
unsigned VAddrWords;
if (IsNSA) {
VAddrWords = SRsrcIdx - VAddr0Idx;
+ if (ST.hasPartialNSAEncoding() && AddrWords > ST.getNSAMaxSize()) {
+ unsigned LastVAddrIdx = SRsrcIdx - 1;
+ VAddrWords += getOpSize(MI, LastVAddrIdx) / 4 - 1;
+ }
} else {
- const TargetRegisterClass *RC = getOpRegClass(MI, VAddr0Idx);
- VAddrWords = MRI.getTargetRegisterInfo()->getRegSizeInBits(*RC) / 32;
+ VAddrWords = getOpSize(MI, VAddr0Idx) / 4;
if (AddrWords > 12)
AddrWords = 16;
}
@@ -4881,6 +4974,51 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
"Unexpected scalar opcode without corresponding vector one!");
}
+void SIInstrInfo::insertScratchExecCopy(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, Register Reg,
+ bool IsSCCLive,
+ SlotIndexes *Indexes) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ bool IsWave32 = ST.isWave32();
+ if (IsSCCLive) {
+ // Insert two move instructions, one to save the original value of EXEC and
+ // the other to turn on all bits in EXEC. This is required as we can't use
+ // the single instruction S_OR_SAVEEXEC that clobbers SCC.
+ unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg)
+ .addReg(Exec, RegState::Kill);
+ auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1);
+ if (Indexes) {
+ Indexes->insertMachineInstrInMaps(*StoreExecMI);
+ Indexes->insertMachineInstrInMaps(*FlipExecMI);
+ }
+ } else {
+ const unsigned OrSaveExec =
+ IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+ auto SaveExec =
+ BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1);
+ SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead.
+ if (Indexes)
+ Indexes->insertMachineInstrInMaps(*SaveExec);
+ }
+}
+
+void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, Register Reg,
+ SlotIndexes *Indexes) const {
+ unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ auto ExecRestoreMI =
+ BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill);
+ if (Indexes)
+ Indexes->insertMachineInstrInMaps(*ExecRestoreMI);
+}
+
static const TargetRegisterClass *
adjustAllocatableRegClass(const GCNSubtarget &ST, const SIRegisterInfo &RI,
const MachineRegisterInfo &MRI,
@@ -4979,12 +5117,6 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
Opcode = (Size == 64) ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
- const TargetRegisterClass *VRC64 = RI.getVGPR64Class();
- if (RI.getCommonSubClass(VRC64, VRC))
- VRC = VRC64;
- else
- VRC = &AMDGPU::VGPR_32RegClass;
-
Register Reg = MRI.createVirtualRegister(VRC);
DebugLoc DL = MBB->findDebugLoc(I);
BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
@@ -5585,13 +5717,12 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
}
// Emit the actual waterfall loop, executing the wrapped instruction for each
-// unique value of \p Rsrc across all lanes. In the best case we execute 1
+// unique value of \p ScalarOps across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
-static void
-emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
- MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
- MachineBasicBlock &BodyBB, const DebugLoc &DL,
- MachineOperand &Rsrc) {
+static void emitLoadScalarOpsFromVGPRLoop(
+ const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB,
+ MachineBasicBlock &LoopBB, MachineBasicBlock &BodyBB, const DebugLoc &DL,
+ ArrayRef<MachineOperand *> ScalarOps) {
MachineFunction &MF = *OrigBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -5609,72 +5740,105 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
SmallVector<Register, 8> ReadlanePieces;
Register CondReg;
- Register VRsrc = Rsrc.getReg();
- unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
+ for (MachineOperand *ScalarOp : ScalarOps) {
+ unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
+ unsigned NumSubRegs = RegSize / 32;
+ Register VScalarOp = ScalarOp->getReg();
+
+ if (NumSubRegs == 1) {
+ Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
+ .addReg(VScalarOp);
+
+ Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
+
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
+ .addReg(CurReg)
+ .addReg(VScalarOp);
- unsigned RegSize = TRI->getRegSizeInBits(Rsrc.getReg(), MRI);
- unsigned NumSubRegs = RegSize / 32;
- assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 && "Unhandled register size");
+ // Combine the comparison results with AND.
+ if (!CondReg) // First.
+ CondReg = NewCondReg;
+ else { // If not the first, we create an AND.
+ Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
+ BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
+ .addReg(CondReg)
+ .addReg(NewCondReg);
+ CondReg = AndReg;
+ }
- for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
+ // Update ScalarOp operand to use the SGPR ScalarOp.
+ ScalarOp->setReg(CurReg);
+ ScalarOp->setIsKill();
+ } else {
+ unsigned VScalarOpUndef = getUndefRegState(ScalarOp->isUndef());
+ assert(NumSubRegs % 2 == 0 && NumSubRegs <= 32 &&
+ "Unhandled register size");
- Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ for (unsigned Idx = 0; Idx < NumSubRegs; Idx += 2) {
+ Register CurRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register CurRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- // Read the next variant <- also loop target.
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
- .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx));
+ // Read the next variant <- also loop target.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegLo)
+ .addReg(VScalarOp, VScalarOpUndef, TRI->getSubRegFromChannel(Idx));
- // Read the next variant <- also loop target.
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
- .addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx + 1));
+ // Read the next variant <- also loop target.
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurRegHi)
+ .addReg(VScalarOp, VScalarOpUndef,
+ TRI->getSubRegFromChannel(Idx + 1));
- ReadlanePieces.push_back(CurRegLo);
- ReadlanePieces.push_back(CurRegHi);
+ ReadlanePieces.push_back(CurRegLo);
+ ReadlanePieces.push_back(CurRegHi);
- // Comparison is to be done as 64-bit.
- Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
+ // Comparison is to be done as 64-bit.
+ Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), CurReg)
.addReg(CurRegLo)
.addImm(AMDGPU::sub0)
.addReg(CurRegHi)
.addImm(AMDGPU::sub1);
- Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
- auto Cmp =
- BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64), NewCondReg)
- .addReg(CurReg);
- if (NumSubRegs <= 2)
- Cmp.addReg(VRsrc);
- else
- Cmp.addReg(VRsrc, VRsrcUndef, TRI->getSubRegFromChannel(Idx, 2));
+ Register NewCondReg = MRI.createVirtualRegister(BoolXExecRC);
+ auto Cmp = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U64_e64),
+ NewCondReg)
+ .addReg(CurReg);
+ if (NumSubRegs <= 2)
+ Cmp.addReg(VScalarOp);
+ else
+ Cmp.addReg(VScalarOp, VScalarOpUndef,
+ TRI->getSubRegFromChannel(Idx, 2));
- // Combine the comparison results with AND.
- if (!CondReg) // First.
- CondReg = NewCondReg;
- else { // If not the first, we create an AND.
- Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
- BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
+ // Combine the comparison results with AND.
+ if (!CondReg) // First.
+ CondReg = NewCondReg;
+ else { // If not the first, we create an AND.
+ Register AndReg = MRI.createVirtualRegister(BoolXExecRC);
+ BuildMI(LoopBB, I, DL, TII.get(AndOpc), AndReg)
.addReg(CondReg)
.addReg(NewCondReg);
- CondReg = AndReg;
- }
- } // End for loop.
+ CondReg = AndReg;
+ }
+ } // End for loop.
- auto SRsrcRC = TRI->getEquivalentSGPRClass(MRI.getRegClass(VRsrc));
- Register SRsrc = MRI.createVirtualRegister(SRsrcRC);
+ auto SScalarOpRC =
+ TRI->getEquivalentSGPRClass(MRI.getRegClass(VScalarOp));
+ Register SScalarOp = MRI.createVirtualRegister(SScalarOpRC);
- // Build scalar Rsrc.
- auto Merge = BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SRsrc);
- unsigned Channel = 0;
- for (Register Piece : ReadlanePieces) {
- Merge.addReg(Piece)
- .addImm(TRI->getSubRegFromChannel(Channel++));
- }
+ // Build scalar ScalarOp.
+ auto Merge =
+ BuildMI(LoopBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), SScalarOp);
+ unsigned Channel = 0;
+ for (Register Piece : ReadlanePieces) {
+ Merge.addReg(Piece).addImm(TRI->getSubRegFromChannel(Channel++));
+ }
- // Update Rsrc operand to use the SGPR Rsrc.
- Rsrc.setReg(SRsrc);
- Rsrc.setIsKill();
+ // Update ScalarOp operand to use the SGPR ScalarOp.
+ ScalarOp->setReg(SScalarOp);
+ ScalarOp->setIsKill();
+ }
+ }
Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
MRI.setSimpleHint(SaveExec, CondReg);
@@ -5694,14 +5858,15 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
BuildMI(BodyBB, I, DL, TII.get(AMDGPU::SI_WATERFALL_LOOP)).addMBB(&LoopBB);
}
-// Build a waterfall loop around \p MI, replacing the VGPR \p Rsrc register
+// Build a waterfall loop around \p MI, replacing the VGPR \p ScalarOp register
// with SGPRs by iterating over all unique values across all lanes.
// Returns the loop basic block that now contains \p MI.
static MachineBasicBlock *
-loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
- MachineOperand &Rsrc, MachineDominatorTree *MDT,
- MachineBasicBlock::iterator Begin = nullptr,
- MachineBasicBlock::iterator End = nullptr) {
+loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
+ ArrayRef<MachineOperand *> ScalarOps,
+ MachineDominatorTree *MDT,
+ MachineBasicBlock::iterator Begin = nullptr,
+ MachineBasicBlock::iterator End = nullptr) {
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -5728,11 +5893,8 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
MachineBasicBlock::iterator AfterMI = MI;
++AfterMI;
for (auto I = Begin; I != AfterMI; I++) {
- for (auto &MO : I->uses()) {
- if (MO.isReg() && MO.isUse()) {
- MRI.clearKillFlags(MO.getReg());
- }
- }
+ for (auto &MO : I->all_uses())
+ MRI.clearKillFlags(MO.getReg());
}
// To insert the loop we need to split the block. Move everything after this
@@ -5774,7 +5936,7 @@ loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
}
}
- emitLoadSRsrcFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, Rsrc);
+ emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps);
// Restore the EXEC mask
MachineBasicBlock::iterator First = RemainderBB->begin();
@@ -5971,11 +6133,11 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
(isMUBUF(MI) || isMTBUF(MI)))) {
MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc);
if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg())))
- CreatedBB = loadSRsrcFromVGPR(*this, MI, *SRsrc, MDT);
+ CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SRsrc}, MDT);
MachineOperand *SSamp = getNamedOperand(MI, AMDGPU::OpName::ssamp);
if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg())))
- CreatedBB = loadSRsrcFromVGPR(*this, MI, *SSamp, MDT);
+ CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {SSamp}, MDT);
return CreatedBB;
}
@@ -6003,25 +6165,39 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
MI.definesRegister(End->getOperand(1).getReg()))
++End;
- CreatedBB = loadSRsrcFromVGPR(*this, MI, *Dest, MDT, Start, End);
+ CreatedBB =
+ loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
+ }
+ }
+
+ // Legalize MUBUF instructions.
+ bool isSoffsetLegal = true;
+ int SoffsetIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::soffset);
+ if (SoffsetIdx != -1) {
+ MachineOperand *Soffset = &MI.getOperand(SoffsetIdx);
+ if (Soffset->isReg() &&
+ !RI.isSGPRClass(MRI.getRegClass(Soffset->getReg()))) {
+ isSoffsetLegal = false;
}
}
- // Legalize MUBUF* instructions.
+ bool isRsrcLegal = true;
int RsrcIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc);
if (RsrcIdx != -1) {
- // We have an MUBUF instruction
MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
- unsigned RsrcRC = get(MI.getOpcode()).operands()[RsrcIdx].RegClass;
- if (RI.getCommonSubClass(MRI.getRegClass(Rsrc->getReg()),
- RI.getRegClass(RsrcRC))) {
- // The operands are legal.
- // FIXME: We may need to legalize operands besides srsrc.
- return CreatedBB;
+ if (Rsrc->isReg() && !RI.isSGPRClass(MRI.getRegClass(Rsrc->getReg()))) {
+ isRsrcLegal = false;
}
+ }
- // Legalize a VGPR Rsrc.
+ // The operands are legal.
+ if (isRsrcLegal && isSoffsetLegal)
+ return CreatedBB;
+
+ if (!isRsrcLegal) {
+ // Legalize a VGPR Rsrc
//
// If the instruction is _ADDR64, we can avoid a waterfall by extracting
// the base pointer from the VGPR Rsrc, adding it to the VAddr, then using
@@ -6034,6 +6210,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Otherwise we are on non-ADDR64 hardware, and/or we have
// idxen/offen/bothen and we fall back to a waterfall loop.
+ MachineOperand *Rsrc = &MI.getOperand(RsrcIdx);
MachineBasicBlock &MBB = *MI.getParent();
MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr);
@@ -6143,433 +6320,447 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
.addReg(RsrcPtr, 0, AMDGPU::sub1)
.addImm(AMDGPU::sub1);
} else {
- // This is another variant; legalize Rsrc with waterfall loop from VGPRs
- // to SGPRs.
- CreatedBB = loadSRsrcFromVGPR(*this, MI, *Rsrc, MDT);
+ // Legalize a VGPR Rsrc and soffset together.
+ if (!isSoffsetLegal) {
+ MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
+ CreatedBB =
+ loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc, Soffset}, MDT);
+ return CreatedBB;
+ }
+ CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Rsrc}, MDT);
return CreatedBB;
}
}
+
+ // Legalize a VGPR soffset.
+ if (!isSoffsetLegal) {
+ MachineOperand *Soffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
+ CreatedBB = loadMBUFScalarOperandsFromVGPR(*this, MI, {Soffset}, MDT);
+ return CreatedBB;
+ }
return CreatedBB;
}
-MachineBasicBlock *SIInstrInfo::moveToVALU(MachineInstr &TopInst,
- MachineDominatorTree *MDT) const {
- SetVectorType Worklist;
- Worklist.insert(&TopInst);
- MachineBasicBlock *CreatedBB = nullptr;
- MachineBasicBlock *CreatedBBTmp = nullptr;
-
- while (!Worklist.empty()) {
- MachineInstr &Inst = *Worklist.pop_back_val();
- MachineBasicBlock *MBB = Inst.getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-
- unsigned Opcode = Inst.getOpcode();
- unsigned NewOpcode = getVALUOp(Inst);
-
- // Handle some special cases
- switch (Opcode) {
- default:
- break;
- case AMDGPU::S_ADD_U64_PSEUDO:
- case AMDGPU::S_SUB_U64_PSEUDO:
- splitScalar64BitAddSub(Worklist, Inst, MDT);
- Inst.eraseFromParent();
- continue;
- case AMDGPU::S_ADD_I32:
- case AMDGPU::S_SUB_I32: {
- // FIXME: The u32 versions currently selected use the carry.
- bool Changed;
- std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
- if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
- CreatedBB = CreatedBBTmp;
- if (Changed)
- continue;
-
- // Default handling
- break;
- }
- case AMDGPU::S_AND_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
- Inst.eraseFromParent();
- continue;
+void SIInstrWorklist::insert(MachineInstr *MI) {
+ InstrList.insert(MI);
+ // Add MBUF instructiosn to deferred list.
+ int RsrcIdx =
+ AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc);
+ if (RsrcIdx != -1) {
+ DeferredList.insert(MI);
+ }
+}
- case AMDGPU::S_OR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
- Inst.eraseFromParent();
- continue;
+bool SIInstrWorklist::isDeferred(MachineInstr *MI) {
+ return DeferredList.contains(MI);
+}
- case AMDGPU::S_XOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
- Inst.eraseFromParent();
- continue;
+void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
+ MachineDominatorTree *MDT) const {
- case AMDGPU::S_NAND_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
- Inst.eraseFromParent();
+ while (!Worklist.empty()) {
+ MachineInstr &Inst = *Worklist.top();
+ Worklist.erase_top();
+ // Skip MachineInstr in the deferred list.
+ if (Worklist.isDeferred(&Inst))
continue;
+ moveToVALUImpl(Worklist, MDT, Inst);
+ }
- case AMDGPU::S_NOR_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
- Inst.eraseFromParent();
- continue;
+ // Deferred list of instructions will be processed once
+ // all the MachineInstr in the worklist are done.
+ for (MachineInstr *Inst : Worklist.getDeferredList()) {
+ moveToVALUImpl(Worklist, MDT, *Inst);
+ assert(Worklist.empty() &&
+ "Deferred MachineInstr are not supposed to re-populate worklist");
+ }
+}
- case AMDGPU::S_XNOR_B64:
- if (ST.hasDLInsts())
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
- else
- splitScalar64BitXnor(Worklist, Inst, MDT);
- Inst.eraseFromParent();
- continue;
+void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
+ MachineDominatorTree *MDT,
+ MachineInstr &Inst) const {
- case AMDGPU::S_ANDN2_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
- Inst.eraseFromParent();
- continue;
+ MachineBasicBlock *MBB = Inst.getParent();
+ if (!MBB)
+ return;
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ unsigned Opcode = Inst.getOpcode();
+ unsigned NewOpcode = getVALUOp(Inst);
+ // Handle some special cases
+ switch (Opcode) {
+ default:
+ break;
+ case AMDGPU::S_ADD_U64_PSEUDO:
+ case AMDGPU::S_SUB_U64_PSEUDO:
+ splitScalar64BitAddSub(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ return;
+ case AMDGPU::S_ADD_I32:
+ case AMDGPU::S_SUB_I32: {
+ // FIXME: The u32 versions currently selected use the carry.
+ bool Changed;
+ MachineBasicBlock *CreatedBBTmp = nullptr;
+ std::tie(Changed, CreatedBBTmp) = moveScalarAddSub(Worklist, Inst, MDT);
+ if (Changed)
+ return;
- case AMDGPU::S_ORN2_B64:
- splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
- Inst.eraseFromParent();
- continue;
+ // Default handling
+ break;
+ }
+ case AMDGPU::S_AND_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_AND_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_BREV_B64:
- splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_OR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_OR_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_NOT_B64:
- splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_XOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XOR_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_BCNT1_I32_B64:
- splitScalar64BitBCNT(Worklist, Inst);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_NAND_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NAND_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_BFE_I64:
- splitScalar64BitBFE(Worklist, Inst);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_NOR_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_NOR_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_LSHL_B32:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
- swapOperands(Inst);
- }
- break;
- case AMDGPU::S_ASHR_I32:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
- swapOperands(Inst);
- }
- break;
- case AMDGPU::S_LSHR_B32:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
- swapOperands(Inst);
- }
- break;
- case AMDGPU::S_LSHL_B64:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
- swapOperands(Inst);
- }
- break;
- case AMDGPU::S_ASHR_I64:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
- swapOperands(Inst);
- }
- break;
- case AMDGPU::S_LSHR_B64:
- if (ST.hasOnlyRevVALUShifts()) {
- NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
- swapOperands(Inst);
- }
- break;
+ case AMDGPU::S_XNOR_B64:
+ if (ST.hasDLInsts())
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_XNOR_B32, MDT);
+ else
+ splitScalar64BitXnor(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_ABS_I32:
- lowerScalarAbs(Worklist, Inst);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_ANDN2_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ANDN2_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_CBRANCH_SCC0:
- case AMDGPU::S_CBRANCH_SCC1: {
- // Clear unused bits of vcc
- Register CondReg = Inst.getOperand(1).getReg();
- bool IsSCC = CondReg == AMDGPU::SCC;
- Register VCC = RI.getVCC();
- Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
- .addReg(EXEC)
- .addReg(IsSCC ? VCC : CondReg);
- Inst.removeOperand(1);
- }
- break;
+ case AMDGPU::S_ORN2_B64:
+ splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::S_ORN2_B32, MDT);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_BFE_U64:
- case AMDGPU::S_BFM_B64:
- llvm_unreachable("Moving this op to VALU not implemented");
+ case AMDGPU::S_BREV_B64:
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_BREV_B32, true);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_PACK_LL_B32_B16:
- case AMDGPU::S_PACK_LH_B32_B16:
- case AMDGPU::S_PACK_HL_B32_B16:
- case AMDGPU::S_PACK_HH_B32_B16:
- movePackToVALU(Worklist, MRI, Inst);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_NOT_B64:
+ splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::S_NOT_B32);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_XNOR_B32:
- lowerScalarXnor(Worklist, Inst);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_BCNT1_I32_B64:
+ splitScalar64BitBCNT(Worklist, Inst);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_NAND_B32:
- splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_BFE_I64:
+ splitScalar64BitBFE(Worklist, Inst);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_NOR_B32:
- splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_LSHL_B32:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_ASHR_I32:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_ASHRREV_I32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_LSHR_B32:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_LSHRREV_B32_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_LSHL_B64:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_LSHLREV_B64_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_ASHR_I64:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_ASHRREV_I64_e64;
+ swapOperands(Inst);
+ }
+ break;
+ case AMDGPU::S_LSHR_B64:
+ if (ST.hasOnlyRevVALUShifts()) {
+ NewOpcode = AMDGPU::V_LSHRREV_B64_e64;
+ swapOperands(Inst);
+ }
+ break;
- case AMDGPU::S_ANDN2_B32:
- splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_ABS_I32:
+ lowerScalarAbs(Worklist, Inst);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_ORN2_B32:
- splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
- Inst.eraseFromParent();
- continue;
+ case AMDGPU::S_CBRANCH_SCC0:
+ case AMDGPU::S_CBRANCH_SCC1: {
+ // Clear unused bits of vcc
+ Register CondReg = Inst.getOperand(1).getReg();
+ bool IsSCC = CondReg == AMDGPU::SCC;
+ Register VCC = RI.getVCC();
+ Register EXEC = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ unsigned Opc = ST.isWave32() ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(Opc), VCC)
+ .addReg(EXEC)
+ .addReg(IsSCC ? VCC : CondReg);
+ Inst.removeOperand(1);
+ } break;
- // TODO: remove as soon as everything is ready
- // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
- // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
- // can only be selected from the uniform SDNode.
- case AMDGPU::S_ADD_CO_PSEUDO:
- case AMDGPU::S_SUB_CO_PSEUDO: {
- unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
- ? AMDGPU::V_ADDC_U32_e64
- : AMDGPU::V_SUBB_U32_e64;
- const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+ case AMDGPU::S_BFE_U64:
+ case AMDGPU::S_BFM_B64:
+ llvm_unreachable("Moving this op to VALU not implemented");
- Register CarryInReg = Inst.getOperand(4).getReg();
- if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
- Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
- BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
- .addReg(CarryInReg);
- }
+ case AMDGPU::S_PACK_LL_B32_B16:
+ case AMDGPU::S_PACK_LH_B32_B16:
+ case AMDGPU::S_PACK_HL_B32_B16:
+ case AMDGPU::S_PACK_HH_B32_B16:
+ movePackToVALU(Worklist, MRI, Inst);
+ Inst.eraseFromParent();
+ return;
- Register CarryOutReg = Inst.getOperand(1).getReg();
+ case AMDGPU::S_XNOR_B32:
+ lowerScalarXnor(Worklist, Inst);
+ Inst.eraseFromParent();
+ return;
- Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
- MRI.getRegClass(Inst.getOperand(0).getReg())));
- MachineInstr *CarryOp =
- BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
- .addReg(CarryOutReg, RegState::Define)
- .add(Inst.getOperand(2))
- .add(Inst.getOperand(3))
- .addReg(CarryInReg)
- .addImm(0);
- CreatedBBTmp = legalizeOperands(*CarryOp);
- if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
- CreatedBB = CreatedBBTmp;
- MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
- addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
- Inst.eraseFromParent();
- }
- continue;
- case AMDGPU::S_UADDO_PSEUDO:
- case AMDGPU::S_USUBO_PSEUDO: {
- const DebugLoc &DL = Inst.getDebugLoc();
- MachineOperand &Dest0 = Inst.getOperand(0);
- MachineOperand &Dest1 = Inst.getOperand(1);
- MachineOperand &Src0 = Inst.getOperand(2);
- MachineOperand &Src1 = Inst.getOperand(3);
+ case AMDGPU::S_NAND_B32:
+ splitScalarNotBinop(Worklist, Inst, AMDGPU::S_AND_B32);
+ Inst.eraseFromParent();
+ return;
- unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
- ? AMDGPU::V_ADD_CO_U32_e64
- : AMDGPU::V_SUB_CO_U32_e64;
- const TargetRegisterClass *NewRC =
- RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
- Register DestReg = MRI.createVirtualRegister(NewRC);
- MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
- .addReg(Dest1.getReg(), RegState::Define)
- .add(Src0)
- .add(Src1)
- .addImm(0); // clamp bit
+ case AMDGPU::S_NOR_B32:
+ splitScalarNotBinop(Worklist, Inst, AMDGPU::S_OR_B32);
+ Inst.eraseFromParent();
+ return;
- CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
- if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
- CreatedBB = CreatedBBTmp;
+ case AMDGPU::S_ANDN2_B32:
+ splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_AND_B32);
+ Inst.eraseFromParent();
+ return;
- MRI.replaceRegWith(Dest0.getReg(), DestReg);
- addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
- Worklist);
- Inst.eraseFromParent();
- }
- continue;
+ case AMDGPU::S_ORN2_B32:
+ splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
+ Inst.eraseFromParent();
+ return;
- case AMDGPU::S_CSELECT_B32:
- case AMDGPU::S_CSELECT_B64:
- lowerSelect(Worklist, Inst, MDT);
- Inst.eraseFromParent();
- continue;
- case AMDGPU::S_CMP_EQ_I32:
- case AMDGPU::S_CMP_LG_I32:
- case AMDGPU::S_CMP_GT_I32:
- case AMDGPU::S_CMP_GE_I32:
- case AMDGPU::S_CMP_LT_I32:
- case AMDGPU::S_CMP_LE_I32:
- case AMDGPU::S_CMP_EQ_U32:
- case AMDGPU::S_CMP_LG_U32:
- case AMDGPU::S_CMP_GT_U32:
- case AMDGPU::S_CMP_GE_U32:
- case AMDGPU::S_CMP_LT_U32:
- case AMDGPU::S_CMP_LE_U32:
- case AMDGPU::S_CMP_EQ_U64:
- case AMDGPU::S_CMP_LG_U64: {
- const MCInstrDesc &NewDesc = get(NewOpcode);
- Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
- MachineInstr *NewInstr =
- BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
- .add(Inst.getOperand(0))
- .add(Inst.getOperand(1));
- legalizeOperands(*NewInstr, MDT);
- int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
- MachineOperand SCCOp = Inst.getOperand(SCCIdx);
- addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
- Inst.eraseFromParent();
- }
- continue;
- }
+ // TODO: remove as soon as everything is ready
+ // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
+ // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
+ // can only be selected from the uniform SDNode.
+ case AMDGPU::S_ADD_CO_PSEUDO:
+ case AMDGPU::S_SUB_CO_PSEUDO: {
+ unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
+ ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
+ const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
- // We cannot move this instruction to the VALU, so we should try to
- // legalize its operands instead.
- CreatedBBTmp = legalizeOperands(Inst, MDT);
- if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
- CreatedBB = CreatedBBTmp;
- continue;
+ Register CarryInReg = Inst.getOperand(4).getReg();
+ if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
+ Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
+ .addReg(CarryInReg);
}
- // Handle converting generic instructions like COPY-to-SGPR into
- // COPY-to-VGPR.
- if (NewOpcode == Opcode) {
- Register DstReg = Inst.getOperand(0).getReg();
- const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
-
- if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
- NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
- // Instead of creating a copy where src and dst are the same register
- // class, we just replace all uses of dst with src. These kinds of
- // copies interfere with the heuristics MachineSink uses to decide
- // whether or not to split a critical edge. Since the pass assumes
- // that copies will end up as machine instructions and not be
- // eliminated.
- addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
- MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
- MRI.clearKillFlags(Inst.getOperand(1).getReg());
- Inst.getOperand(0).setReg(DstReg);
-
- // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
- // these are deleted later, but at -O0 it would leave a suspicious
- // looking illegal copy of an undef register.
- for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
- Inst.removeOperand(I);
- Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
- continue;
- }
-
- Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
- legalizeOperands(Inst, MDT);
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
- continue;
- }
+ Register CarryOutReg = Inst.getOperand(1).getReg();
- // Use the new VALU Opcode.
- auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
- .setMIFlags(Inst.getFlags());
- for (const MachineOperand &Op : Inst.explicit_operands())
- NewInstr->addOperand(Op);
+ Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
+ MRI.getRegClass(Inst.getOperand(0).getReg())));
+ MachineInstr *CarryOp =
+ BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
+ .addReg(CarryOutReg, RegState::Define)
+ .add(Inst.getOperand(2))
+ .add(Inst.getOperand(3))
+ .addReg(CarryInReg)
+ .addImm(0);
+ legalizeOperands(*CarryOp);
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+ Inst.eraseFromParent();
+ }
+ return;
+ case AMDGPU::S_UADDO_PSEUDO:
+ case AMDGPU::S_USUBO_PSEUDO: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineOperand &Dest0 = Inst.getOperand(0);
+ MachineOperand &Dest1 = Inst.getOperand(1);
+ MachineOperand &Src0 = Inst.getOperand(2);
+ MachineOperand &Src1 = Inst.getOperand(3);
- // Remove any references to SCC. Vector instructions can't read from it, and
- // We're just about to add the implicit use / defs of VCC, and we don't want
- // both.
- for (MachineOperand &Op : Inst.implicit_operands()) {
- if (Op.getReg() == AMDGPU::SCC) {
- // Only propagate through live-def of SCC.
- if (Op.isDef() && !Op.isDead())
- addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
- if (Op.isUse())
- addSCCDefsToVALUWorklist(NewInstr, Worklist);
- }
- }
+ unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
+ ? AMDGPU::V_ADD_CO_U32_e64
+ : AMDGPU::V_SUB_CO_U32_e64;
+ const TargetRegisterClass *NewRC =
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
+ Register DestReg = MRI.createVirtualRegister(NewRC);
+ MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
+ .addReg(Dest1.getReg(), RegState::Define)
+ .add(Src0)
+ .add(Src1)
+ .addImm(0); // clamp bit
+ legalizeOperands(*NewInstr, MDT);
+ MRI.replaceRegWith(Dest0.getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
+ Worklist);
Inst.eraseFromParent();
+ }
+ return;
- Register NewDstReg;
- if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
- Register DstReg = NewInstr->getOperand(0).getReg();
- assert(DstReg.isVirtual());
-
- // Update the destination register class.
- const TargetRegisterClass *NewDstRC =
- getDestEquivalentVGPRClass(*NewInstr);
- assert(NewDstRC);
+ case AMDGPU::S_CSELECT_B32:
+ case AMDGPU::S_CSELECT_B64:
+ lowerSelect(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ return;
+ case AMDGPU::S_CMP_EQ_I32:
+ case AMDGPU::S_CMP_LG_I32:
+ case AMDGPU::S_CMP_GT_I32:
+ case AMDGPU::S_CMP_GE_I32:
+ case AMDGPU::S_CMP_LT_I32:
+ case AMDGPU::S_CMP_LE_I32:
+ case AMDGPU::S_CMP_EQ_U32:
+ case AMDGPU::S_CMP_LG_U32:
+ case AMDGPU::S_CMP_GT_U32:
+ case AMDGPU::S_CMP_GE_U32:
+ case AMDGPU::S_CMP_LT_U32:
+ case AMDGPU::S_CMP_LE_U32:
+ case AMDGPU::S_CMP_EQ_U64:
+ case AMDGPU::S_CMP_LG_U64: {
+ const MCInstrDesc &NewDesc = get(NewOpcode);
+ Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass());
+ MachineInstr *NewInstr =
+ BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg)
+ .add(Inst.getOperand(0))
+ .add(Inst.getOperand(1));
+ legalizeOperands(*NewInstr, MDT);
+ int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC);
+ MachineOperand SCCOp = Inst.getOperand(SCCIdx);
+ addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg);
+ Inst.eraseFromParent();
+ }
+ return;
+ }
- NewDstReg = MRI.createVirtualRegister(NewDstRC);
- MRI.replaceRegWith(DstReg, NewDstReg);
- }
+ if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
+ // We cannot move this instruction to the VALU, so we should try to
+ // legalize its operands instead.
+ legalizeOperands(Inst, MDT);
+ return;
+ }
+ // Handle converting generic instructions like COPY-to-SGPR into
+ // COPY-to-VGPR.
+ if (NewOpcode == Opcode) {
+ Register DstReg = Inst.getOperand(0).getReg();
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
- if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
- // We are converting these to a BFE, so we need to add the missing
- // operands for the size and offset.
- unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
- NewInstr.addImm(0);
- NewInstr.addImm(Size);
- } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
- // The VALU version adds the second operand to the result, so insert an
- // extra 0 operand.
- NewInstr.addImm(0);
+ if (Inst.isCopy() && Inst.getOperand(1).getReg().isVirtual() &&
+ NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
+ // Instead of creating a copy where src and dst are the same register
+ // class, we just replace all uses of dst with src. These kinds of
+ // copies interfere with the heuristics MachineSink uses to decide
+ // whether or not to split a critical edge. Since the pass assumes
+ // that copies will end up as machine instructions and not be
+ // eliminated.
+ addUsersToMoveToVALUWorklist(DstReg, MRI, Worklist);
+ MRI.replaceRegWith(DstReg, Inst.getOperand(1).getReg());
+ MRI.clearKillFlags(Inst.getOperand(1).getReg());
+ Inst.getOperand(0).setReg(DstReg);
+ // Make sure we don't leave around a dead VGPR->SGPR copy. Normally
+ // these are deleted later, but at -O0 it would leave a suspicious
+ // looking illegal copy of an undef register.
+ for (unsigned I = Inst.getNumOperands() - 1; I != 0; --I)
+ Inst.removeOperand(I);
+ Inst.setDesc(get(AMDGPU::IMPLICIT_DEF));
+ return;
}
+ Register NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ legalizeOperands(Inst, MDT);
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
+ return;
+ }
- if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
- const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
- // If we need to move this to VGPRs, we need to unpack the second operand
- // back into the 2 separate ones for bit offset and width.
- assert(OffsetWidthOp.isImm() &&
- "Scalar BFE is only implemented for constant width and offset");
- uint32_t Imm = OffsetWidthOp.getImm();
-
- uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
- uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
- NewInstr->removeOperand(2);
- NewInstr.addImm(Offset);
- NewInstr.addImm(BitWidth);
+ // Use the new VALU Opcode.
+ auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode))
+ .setMIFlags(Inst.getFlags());
+ for (const MachineOperand &Op : Inst.explicit_operands())
+ NewInstr->addOperand(Op);
+ // Remove any references to SCC. Vector instructions can't read from it, and
+ // We're just about to add the implicit use / defs of VCC, and we don't want
+ // both.
+ for (MachineOperand &Op : Inst.implicit_operands()) {
+ if (Op.getReg() == AMDGPU::SCC) {
+ // Only propagate through live-def of SCC.
+ if (Op.isDef() && !Op.isDead())
+ addSCCDefUsersToVALUWorklist(Op, Inst, Worklist);
+ if (Op.isUse())
+ addSCCDefsToVALUWorklist(NewInstr, Worklist);
}
-
- fixImplicitOperands(*NewInstr);
-
- // Legalize the operands
- CreatedBBTmp = legalizeOperands(*NewInstr, MDT);
- if (CreatedBBTmp && TopInst.getParent() == CreatedBBTmp)
- CreatedBB = CreatedBBTmp;
-
- if (NewDstReg)
- addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
}
- return CreatedBB;
+ Inst.eraseFromParent();
+ Register NewDstReg;
+ if (NewInstr->getOperand(0).isReg() && NewInstr->getOperand(0).isDef()) {
+ Register DstReg = NewInstr->getOperand(0).getReg();
+ assert(DstReg.isVirtual());
+ // Update the destination register class.
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*NewInstr);
+ assert(NewDstRC);
+ NewDstReg = MRI.createVirtualRegister(NewDstRC);
+ MRI.replaceRegWith(DstReg, NewDstReg);
+ }
+ if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+ // We are converting these to a BFE, so we need to add the missing
+ // operands for the size and offset.
+ unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+ NewInstr.addImm(0);
+ NewInstr.addImm(Size);
+ } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) {
+ // The VALU version adds the second operand to the result, so insert an
+ // extra 0 operand.
+ NewInstr.addImm(0);
+ }
+ if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+ const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2);
+ // If we need to move this to VGPRs, we need to unpack the second operand
+ // back into the 2 separate ones for bit offset and width.
+ assert(OffsetWidthOp.isImm() &&
+ "Scalar BFE is only implemented for constant width and offset");
+ uint32_t Imm = OffsetWidthOp.getImm();
+ uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+ uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+ NewInstr->removeOperand(2);
+ NewInstr.addImm(Offset);
+ NewInstr.addImm(BitWidth);
+ }
+ fixImplicitOperands(*NewInstr);
+ // Legalize the operands
+ legalizeOperands(*NewInstr, MDT);
+ if (NewDstReg)
+ addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist);
}
// Add/sub require special handling to deal with carry outs.
std::pair<bool, MachineBasicBlock *>
-SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+SIInstrInfo::moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT) const {
if (ST.hasAddNoCarry()) {
// Assume there is no user of scc since we don't select this in that case.
@@ -6604,7 +6795,7 @@ SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
return std::pair(false, nullptr);
}
-void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -6680,7 +6871,7 @@ void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
+void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6707,7 +6898,7 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
+void SIInstrInfo::lowerScalarXnor(SIInstrWorklist &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6772,7 +6963,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
}
}
-void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
+void SIInstrInfo::splitScalarNotBinop(SIInstrWorklist &Worklist,
MachineInstr &Inst,
unsigned Opcode) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -6801,7 +6992,7 @@ void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
-void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
+void SIInstrInfo::splitScalarBinOpN2(SIInstrWorklist &Worklist,
MachineInstr &Inst,
unsigned Opcode) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -6830,9 +7021,9 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
addUsersToMoveToVALUWorklist(NewDest, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitUnaryOp(
- SetVectorType &Worklist, MachineInstr &Inst,
- unsigned Opcode, bool Swap) const {
+void SIInstrInfo::splitScalar64BitUnaryOp(SIInstrWorklist &Worklist,
+ MachineInstr &Inst, unsigned Opcode,
+ bool Swap) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -6889,7 +7080,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitAddSub(SIInstrWorklist &Worklist,
MachineInstr &Inst,
MachineDominatorTree *MDT) const {
bool IsAdd = (Inst.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
@@ -6963,7 +7154,7 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitBinaryOp(SIInstrWorklist &Worklist,
MachineInstr &Inst, unsigned Opcode,
MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -7030,7 +7221,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitXnor(SIInstrWorklist &Worklist,
MachineInstr &Inst,
MachineDominatorTree *MDT) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -7072,8 +7263,8 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
Worklist.insert(&Xor);
}
-void SIInstrInfo::splitScalar64BitBCNT(
- SetVectorType &Worklist, MachineInstr &Inst) const {
+void SIInstrInfo::splitScalar64BitBCNT(SIInstrWorklist &Worklist,
+ MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -7110,7 +7301,7 @@ void SIInstrInfo::splitScalar64BitBCNT(
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
-void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
+void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -7172,9 +7363,8 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
}
void SIInstrInfo::addUsersToMoveToVALUWorklist(
- Register DstReg,
- MachineRegisterInfo &MRI,
- SetVectorType &Worklist) const {
+ Register DstReg, MachineRegisterInfo &MRI,
+ SIInstrWorklist &Worklist) const {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
E = MRI.use_end(); I != E;) {
MachineInstr &UseMI = *I->getParent();
@@ -7208,7 +7398,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
}
}
-void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
+void SIInstrInfo::movePackToVALU(SIInstrWorklist &Worklist,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const {
Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -7283,7 +7473,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
- SetVectorType &Worklist,
+ SIInstrWorklist &Worklist,
Register NewCond) const {
// Ensure that def inst defines SCC, which is still live.
@@ -7326,7 +7516,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
// sure that the instruction that defines SCC is added to the moveToVALU
// worklist.
void SIInstrInfo::addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
- SetVectorType &Worklist) const {
+ SIInstrWorklist &Worklist) const {
// Look for a preceding instruction that either defines VCC or SCC. If VCC
// then there is nothing to do because the defining instruction has been
// converted to a VALU already. If SCC then that instruction needs to be
@@ -7811,6 +8001,16 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
return ArrayRef(TargetFlags);
}
+unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg,
+ const MachineFunction &MF) const {
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ assert(SrcReg.isVirtual());
+ if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG))
+ return AMDGPU::WWM_COPY;
+
+ return AMDGPU::COPY;
+}
+
bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
MI.modifiesRegister(AMDGPU::EXEC, &RI);
@@ -7843,7 +8043,9 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
// If available, prefer to use vcc.
Register UnusedCarry = !RS.isRegUsed(AMDGPU::VCC)
? Register(RI.getVCC())
- : RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
+ : RS.scavengeRegisterBackwards(
+ *RI.getBoolRC(), I, /* RestoreAfter */ false,
+ 0, /* AllowSpill */ false);
// TODO: Users need to deal with this.
if (!UnusedCarry.isValid())
@@ -7874,10 +8076,15 @@ const MCInstrDesc &SIInstrInfo::getKillTerminatorFromPseudo(unsigned Opcode) con
}
}
+unsigned SIInstrInfo::getMaxMUBUFImmOffset() { return (1 << 12) - 1; }
+
void SIInstrInfo::fixImplicitOperands(MachineInstr &MI) const {
if (!ST.isWave32())
return;
+ if (MI.isInlineAsm())
+ return;
+
for (auto &Op : MI.implicit_operands()) {
if (Op.isReg() && Op.getReg() == AMDGPU::VCC)
Op.setReg(AMDGPU::VCC_LO);
@@ -7897,6 +8104,52 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
}
+// Given Imm, split it into the values to put into the SOffset and ImmOffset
+// fields in an MUBUF instruction. Return false if it is not possible (due to a
+// hardware bug needing a workaround).
+//
+// The required alignment ensures that individual address components remain
+// aligned if they are aligned to begin with. It also ensures that additional
+// offsets within the given alignment can be added to the resulting ImmOffset.
+bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
+ uint32_t &ImmOffset, Align Alignment) const {
+ const uint32_t MaxOffset = SIInstrInfo::getMaxMUBUFImmOffset();
+ const uint32_t MaxImm = alignDown(MaxOffset, Alignment.value());
+ uint32_t Overflow = 0;
+
+ if (Imm > MaxImm) {
+ if (Imm <= MaxImm + 64) {
+ // Use an SOffset inline constant for 4..64
+ Overflow = Imm - MaxImm;
+ Imm = MaxImm;
+ } else {
+ // Try to keep the same value in SOffset for adjacent loads, so that
+ // the corresponding register contents can be re-used.
+ //
+ // Load values with all low-bits (except for alignment bits) set into
+ // SOffset, so that a larger range of values can be covered using
+ // s_movk_i32.
+ //
+ // Atomic operations fail to work correctly when individual address
+ // components are unaligned, even if their sum is aligned.
+ uint32_t High = (Imm + Alignment.value()) & ~MaxOffset;
+ uint32_t Low = (Imm + Alignment.value()) & MaxOffset;
+ Imm = Low;
+ Overflow = High - Alignment.value();
+ }
+ }
+
+ // There is a hardware bug in SI and CI which prevents address clamping in
+ // MUBUF instructions from working correctly with SOffsets. The immediate
+ // offset is unaffected.
+ if (Overflow > 0 && ST.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
+ return false;
+
+ ImmOffset = Imm;
+ SOffset = Overflow;
+ return true;
+}
+
// Depending on the used address space and instructions, some immediate offsets
// are allowed and some are not.
// In general, flat instruction offsets can only be non-negative, global and
@@ -7987,23 +8240,7 @@ SIInstrInfo::splitFlatOffset(int64_t COffsetVal, unsigned AddrSpace,
return {ImmField, RemainderOffset};
}
-// This must be kept in sync with the SIEncodingFamily class in SIInstrInfo.td
-// and the columns of the getMCOpcodeGen table.
-enum SIEncodingFamily {
- SI = 0,
- VI = 1,
- SDWA = 2,
- SDWA9 = 3,
- GFX80 = 4,
- GFX9 = 5,
- GFX10 = 6,
- SDWA10 = 7,
- GFX90A = 8,
- GFX940 = 9,
- GFX11 = 10,
-};
-
-static SIEncodingFamily subtargetEncodingFamily(const GCNSubtarget &ST) {
+static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) {
switch (ST.getGeneration()) {
default:
break;
@@ -8042,7 +8279,7 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
}
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
- SIEncodingFamily Gen = subtargetEncodingFamily(ST);
+ unsigned Gen = subtargetEncodingFamily(ST);
if ((get(Opcode).TSFlags & SIInstrFlags::renamedInGFX9) != 0 &&
ST.getGeneration() == AMDGPUSubtarget::GFX9)
@@ -8325,7 +8562,7 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
// A similar issue also exists with spilling and reloading $exec registers.
//
// To prevent that, constrain the %0 register class here.
- if (MI.isFullCopy()) {
+ if (isFullCopyInstr(MI)) {
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
@@ -8368,9 +8605,20 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
unsigned opcode = MI.getOpcode();
if (opcode == AMDGPU::G_INTRINSIC ||
opcode == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS) {
- return AMDGPU::isIntrinsicSourceOfDivergence(MI.getIntrinsicID())
- ? InstructionUniformity::NeverUniform
- : InstructionUniformity::AlwaysUniform;
+ auto IID = static_cast<Intrinsic::ID>(MI.getIntrinsicID());
+ if (AMDGPU::isIntrinsicSourceOfDivergence(IID))
+ return InstructionUniformity::NeverUniform;
+ if (AMDGPU::isIntrinsicAlwaysUniform(IID))
+ return InstructionUniformity::AlwaysUniform;
+
+ switch (IID) {
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else:
+ // FIXME: Uniform if second result
+ break;
+ }
+
+ return InstructionUniformity::Default;
}
// Loads from the private and flat address spaces are divergent, because
@@ -8403,6 +8651,29 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const {
InstructionUniformity
SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
+
+ if (isNeverUniform(MI))
+ return InstructionUniformity::NeverUniform;
+
+ unsigned opcode = MI.getOpcode();
+ if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32)
+ return InstructionUniformity::AlwaysUniform;
+
+ if (isCopyInstr(MI)) {
+ const MachineOperand &srcOp = MI.getOperand(1);
+ if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
+ const TargetRegisterClass *regClass =
+ RI.getPhysRegBaseClass(srcOp.getReg());
+ return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
+ : InstructionUniformity::NeverUniform;
+ }
+ return InstructionUniformity::Default;
+ }
+
+ // GMIR handling
+ if (MI.isPreISelOpcode())
+ return SIInstrInfo::getGenericInstructionUniformity(MI);
+
// Atomics are divergent because they are executed sequentially: when an
// atomic operation refers to the same address in each thread, then each
// thread after the first sees the value written by the previous thread as
@@ -8429,44 +8700,26 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const {
return InstructionUniformity::Default;
}
- unsigned opcode = MI.getOpcode();
- if (opcode == AMDGPU::COPY) {
- const MachineOperand &srcOp = MI.getOperand(1);
- if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
- const TargetRegisterClass *regClass = RI.getPhysRegBaseClass(srcOp.getReg());
- return RI.isSGPRClass(regClass) ? InstructionUniformity::AlwaysUniform
- : InstructionUniformity::NeverUniform;
- }
- return InstructionUniformity::Default;
- }
- if (opcode == AMDGPU::INLINEASM || opcode == AMDGPU::INLINEASM_BR) {
- const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
- for (auto &op : MI.operands()) {
- if (!op.isReg() || !op.isDef())
- continue;
- auto *RC = MRI.getRegClass(op.getReg());
- if (!RC || RI.isDivergentRegClass(RC))
- return InstructionUniformity::NeverUniform;
- }
- return InstructionUniformity::AlwaysUniform;
- }
- if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32)
- return InstructionUniformity::AlwaysUniform;
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const AMDGPURegisterBankInfo *RBI = ST.getRegBankInfo();
- if (opcode == AMDGPU::V_WRITELANE_B32)
- return InstructionUniformity::NeverUniform;
+ // FIXME: It's conceptually broken to report this for an instruction, and not
+ // a specific def operand. For inline asm in particular, there could be mixed
+ // uniform and divergent results.
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &SrcOp = MI.getOperand(I);
+ if (!SrcOp.isReg())
+ continue;
- // GMIR handling
- if (SIInstrInfo::isGenericOpcode(opcode))
- return SIInstrInfo::getGenericInstructionUniformity(MI);
+ Register Reg = SrcOp.getReg();
+ if (!Reg || !SrcOp.readsReg())
+ continue;
- // Handling $vpgr reads
- for (auto srcOp : MI.operands()) {
- if (srcOp.isReg() && srcOp.getReg().isPhysical()) {
- const TargetRegisterClass *regClass = RI.getPhysRegBaseClass(srcOp.getReg());
- if (RI.isVGPRClass(regClass))
- return InstructionUniformity::NeverUniform;
- }
+ // If RegBank is null, this is unassigned or an unallocatable special
+ // register, which are all scalars.
+ const RegisterBank *RegBank = RBI->getRegBank(Reg, MRI, RI);
+ if (RegBank && RegBank->getID() != AMDGPU::SGPRRegBankID)
+ return InstructionUniformity::NeverUniform;
}
// TODO: Uniformity check condtions above can be rearranged for more
@@ -8622,7 +8875,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
else
return false;
- unsigned BitNo = countTrailingZeros((uint64_t)Mask);
+ unsigned BitNo = llvm::countr_zero((uint64_t)Mask);
if (IsSigned && BitNo == SrcSize - 1)
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 025faec0e2cc..b25aae7b2fb0 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -41,6 +41,41 @@ class ScheduleHazardRecognizer;
static const MachineMemOperand::Flags MONoClobber =
MachineMemOperand::MOTargetFlag1;
+/// Utility to store machine instructions worklist.
+struct SIInstrWorklist {
+ SIInstrWorklist() : InstrList() {}
+
+ void insert(MachineInstr *MI);
+
+ MachineInstr *top() const {
+ auto iter = InstrList.begin();
+ return *iter;
+ }
+
+ void erase_top() {
+ auto iter = InstrList.begin();
+ InstrList.erase(iter);
+ }
+
+ bool empty() const { return InstrList.empty(); }
+
+ void clear() {
+ InstrList.clear();
+ DeferredList.clear();
+ }
+
+ bool isDeferred(MachineInstr *MI);
+
+ SetVector<MachineInstr *> &getDeferredList() { return DeferredList; }
+
+private:
+ /// InstrList contains the MachineInstrs.
+ SetVector<MachineInstr *> InstrList;
+ /// Deferred instructions are specific MachineInstr
+ /// that will be added by insert method.
+ SetVector<MachineInstr *> DeferredList;
+};
+
class SIInstrInfo final : public AMDGPUGenInstrInfo {
private:
const SIRegisterInfo RI;
@@ -81,57 +116,50 @@ private:
void swapOperands(MachineInstr &Inst) const;
std::pair<bool, MachineBasicBlock *>
- moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+ moveScalarAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
- void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ void lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
- void lowerScalarAbs(SetVectorType &Worklist,
- MachineInstr &Inst) const;
+ void lowerScalarAbs(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
- void lowerScalarXnor(SetVectorType &Worklist,
- MachineInstr &Inst) const;
+ void lowerScalarXnor(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
- void splitScalarNotBinop(SetVectorType &Worklist,
- MachineInstr &Inst,
+ void splitScalarNotBinop(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode) const;
- void splitScalarBinOpN2(SetVectorType &Worklist,
- MachineInstr &Inst,
+ void splitScalarBinOpN2(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode) const;
- void splitScalar64BitUnaryOp(SetVectorType &Worklist,
- MachineInstr &Inst, unsigned Opcode,
- bool Swap = false) const;
+ void splitScalar64BitUnaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
+ unsigned Opcode, bool Swap = false) const;
- void splitScalar64BitAddSub(SetVectorType &Worklist, MachineInstr &Inst,
+ void splitScalar64BitAddSub(SIInstrWorklist &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
- void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst,
+ void splitScalar64BitBinaryOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
unsigned Opcode,
MachineDominatorTree *MDT = nullptr) const;
- void splitScalar64BitXnor(SetVectorType &Worklist, MachineInstr &Inst,
- MachineDominatorTree *MDT = nullptr) const;
+ void splitScalar64BitXnor(SIInstrWorklist &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
- void splitScalar64BitBCNT(SetVectorType &Worklist,
+ void splitScalar64BitBCNT(SIInstrWorklist &Worklist,
MachineInstr &Inst) const;
- void splitScalar64BitBFE(SetVectorType &Worklist,
- MachineInstr &Inst) const;
- void movePackToVALU(SetVectorType &Worklist,
- MachineRegisterInfo &MRI,
+ void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+ void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI,
- SetVectorType &Worklist) const;
+ SIInstrWorklist &Worklist) const;
void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
- SetVectorType &Worklist,
+ SIInstrWorklist &Worklist,
Register NewCond = Register()) const;
void addSCCDefsToVALUWorklist(MachineInstr *SCCUseInst,
- SetVectorType &Worklist) const;
+ SIInstrWorklist &Worklist) const;
const TargetRegisterClass *
getDestEquivalentVGPRClass(const MachineInstr &Inst) const;
@@ -142,6 +170,12 @@ private:
Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
protected:
+ /// If the specific machine instruction is a instruction that moves/copies
+ /// value from one register to another register return destination and source
+ /// registers as machine operands.
+ std::optional<DestSourcePair>
+ isCopyInstrImpl(const MachineInstr &MI) const override;
+
bool swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0, unsigned Src0OpName,
MachineOperand &Src1, unsigned Src1OpName) const;
@@ -626,6 +660,11 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::SGPRSpill;
}
+ static bool isWWMRegSpillOpcode(uint16_t Opcode) {
+ return Opcode == AMDGPU::SI_SPILL_WWM_V32_SAVE ||
+ Opcode == AMDGPU::SI_SPILL_WWM_V32_RESTORE;
+ }
+
static bool isDPP(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::DPP;
}
@@ -781,6 +820,10 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::FPAtomic;
}
+ static bool isNeverUniform(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsNeverUniform;
+ }
+
static bool doesNotReadTiedSource(const MachineInstr &MI) {
return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead;
}
@@ -790,7 +833,7 @@ public:
}
bool isVGPRCopy(const MachineInstr &MI) const {
- assert(MI.isCopy());
+ assert(isCopyInstr(MI));
Register Dest = MI.getOperand(0).getReg();
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -841,7 +884,7 @@ public:
const MachineOperand &UseMO,
const MachineOperand &DefMO) const {
assert(UseMO.getParent() == &MI);
- int OpIdx = MI.getOperandNo(&UseMO);
+ int OpIdx = UseMO.getOperandNo();
if (OpIdx >= MI.getDesc().NumOperands)
return false;
@@ -860,7 +903,7 @@ public:
if (OpIdx >= MI.getDesc().NumOperands)
return false;
- if (MI.isCopy()) {
+ if (isCopyInstr(MI)) {
unsigned Size = getOpSize(MI, OpIdx);
assert(Size == 8 || Size == 4);
@@ -873,8 +916,7 @@ public:
}
bool isInlineConstant(const MachineOperand &MO) const {
- const MachineInstr *Parent = MO.getParent();
- return isInlineConstant(*Parent, Parent->getOperandNo(&MO));
+ return isInlineConstant(*MO.getParent(), MO.getOperandNo());
}
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
@@ -908,6 +950,15 @@ public:
unsigned getVALUOp(const MachineInstr &MI) const;
+ void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, Register Reg, bool IsSCCLive,
+ SlotIndexes *Indexes = nullptr) const;
+
+ void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ Register Reg, SlotIndexes *Indexes = nullptr) const;
+
/// Return the correct register class for \p OpNo. For target-specific
/// instructions, this will return the register class that has been defined
/// in tablegen. For generic instructions, like REG_SEQUENCE it will return
@@ -1005,11 +1056,14 @@ public:
/// was moved to VGPR. \returns true if succeeded.
bool moveFlatAddrToVGPR(MachineInstr &Inst) const;
- /// Replace this instruction's opcode with the equivalent VALU
- /// opcode. This function will also move the users of \p MI to the
- /// VALU if necessary. If present, \p MDT is updated.
- MachineBasicBlock *moveToVALU(MachineInstr &MI,
- MachineDominatorTree *MDT = nullptr) const;
+ /// Replace the instructions opcode with the equivalent VALU
+ /// opcode. This function will also move the users of MachineInstruntions
+ /// in the \p WorkList to the VALU if necessary. If present, \p MDT is
+ /// updated.
+ void moveToVALU(SIInstrWorklist &Worklist, MachineDominatorTree *MDT) const;
+
+ void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT,
+ MachineInstr &Inst) const;
void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
@@ -1095,6 +1149,9 @@ public:
CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
const ScheduleDAGMI *DAG) const override;
+ unsigned getLiveRangeSplitOpcode(Register Reg,
+ const MachineFunction &MF) const override;
+
bool isBasicBlockPrologue(const MachineInstr &MI) const override;
MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
@@ -1132,6 +1189,11 @@ public:
return isUInt<12>(Imm);
}
+ static unsigned getMaxMUBUFImmOffset();
+
+ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
+ Align Alignment = Align(4)) const;
+
/// Returns if \p Offset is legal for the subtarget as the offset to a FLAT
/// encoded instruction. If \p Signed, this is for an instruction that
/// interprets the offset as signed.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 2066abb0268d..044bc4507d3a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -50,14 +50,6 @@ def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT",
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain, SDNPInGlue]
>;
-def SIatomic_inc : SDNode<"AMDGPUISD::ATOMIC_INC", SDTAtomic2,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
-def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
- [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
->;
-
def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
@@ -355,8 +347,6 @@ class isPackedType<ValueType SrcVT> {
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
-defm atomic_inc : binary_atomic_op_all_as<SIatomic_inc>;
-defm atomic_dec : binary_atomic_op_all_as<SIatomic_dec>;
defm atomic_load_fmin : binary_atomic_op_all_as<SIatomic_fmin, 0>;
defm atomic_load_fmax : binary_atomic_op_all_as<SIatomic_fmax, 0>;
@@ -762,8 +752,8 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
-defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
-defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
+defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
+defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
@@ -931,144 +921,39 @@ def set_glc : SDNodeXForm<timm, [{
// Custom Operands
//===----------------------------------------------------------------------===//
-def SoppBrTarget : AsmOperandClass {
- let Name = "SoppBrTarget";
- let ParserMethod = "parseSOppBrTarget";
-}
-
-def sopp_brtarget : Operand<OtherVT> {
+def SOPPBrTarget : CustomOperand<OtherVT> {
+ let PrintMethod = "printOperand";
let EncoderMethod = "getSOPPBrEncoding";
- let DecoderMethod = "decodeSoppBrTarget";
+ let DecoderMethod = "decodeSOPPBrTarget";
let OperandType = "OPERAND_PCREL";
- let ParserMatchClass = SoppBrTarget;
}
def si_ga : Operand<iPTR>;
-def InterpSlotMatchClass : AsmOperandClass {
- let Name = "InterpSlot";
- let PredicateMethod = "isInterpSlot";
- let ParserMethod = "parseInterpSlot";
- let RenderMethod = "addImmOperands";
-}
-
-def InterpSlot : Operand<i32> {
- let PrintMethod = "printInterpSlot";
- let ParserMatchClass = InterpSlotMatchClass;
- let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def AttrMatchClass : AsmOperandClass {
- let Name = "Attr";
- let PredicateMethod = "isInterpAttr";
- let ParserMethod = "parseInterpAttr";
- let RenderMethod = "addImmOperands";
-}
+def InterpSlot : CustomOperand<i32>;
// It appears to be necessary to create a separate operand for this to
// be able to parse attr<num> with no space.
-def Attr : Operand<i32> {
- let PrintMethod = "printInterpAttr";
- let ParserMatchClass = AttrMatchClass;
- let OperandType = "OPERAND_IMMEDIATE";
-}
-
-def AttrChanMatchClass : AsmOperandClass {
- let Name = "AttrChan";
- let PredicateMethod = "isAttrChan";
- let RenderMethod = "addImmOperands";
-}
-
-def AttrChan : Operand<i32> {
- let PrintMethod = "printInterpAttrChan";
- let ParserMatchClass = AttrChanMatchClass;
- let OperandType = "OPERAND_IMMEDIATE";
-}
+def InterpAttr : CustomOperand<i32>;
-def SendMsgMatchClass : AsmOperandClass {
- let Name = "SendMsg";
- let PredicateMethod = "isSendMsg";
- let ParserMethod = "parseSendMsgOp";
- let RenderMethod = "addImmOperands";
-}
-
-def SwizzleMatchClass : AsmOperandClass {
- let Name = "Swizzle";
- let PredicateMethod = "isSwizzle";
- let ParserMethod = "parseSwizzleOp";
- let RenderMethod = "addImmOperands";
- let IsOptional = 1;
-}
-
-def EndpgmMatchClass : AsmOperandClass {
- let Name = "EndpgmImm";
- let PredicateMethod = "isEndpgm";
- let ParserMethod = "parseEndpgmOp";
- let RenderMethod = "addImmOperands";
- let IsOptional = 1;
-}
-
-def ExpTgtMatchClass : AsmOperandClass {
- let Name = "ExpTgt";
- let PredicateMethod = "isExpTgt";
- let ParserMethod = "parseExpTgt";
- let RenderMethod = "printExpTgt";
-}
-
-def SWaitMatchClass : AsmOperandClass {
- let Name = "SWaitCnt";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "parseSWaitCntOps";
-}
-
-def DepCtrMatchClass : AsmOperandClass {
- let Name = "DepCtr";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "parseDepCtrOps";
-}
-
-def SDelayMatchClass : AsmOperandClass {
- let Name = "SDelayAlu";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "parseSDelayAluOps";
-}
+def InterpAttrChan : ImmOperand<i32>;
def VReg32OrOffClass : AsmOperandClass {
let Name = "VReg32OrOff";
let ParserMethod = "parseVReg32OrOff";
}
-let OperandType = "OPERAND_IMMEDIATE" in {
-def SendMsgImm : Operand<i32> {
- let PrintMethod = "printSendMsg";
- let ParserMatchClass = SendMsgMatchClass;
-}
+def SendMsg : CustomOperand<i32>;
-def SwizzleImm : Operand<i16> {
- let PrintMethod = "printSwizzle";
- let ParserMatchClass = SwizzleMatchClass;
-}
+def Swizzle : CustomOperand<i16, 1>;
-def EndpgmImm : Operand<i16> {
- let PrintMethod = "printEndpgm";
- let ParserMatchClass = EndpgmMatchClass;
-}
+def Endpgm : CustomOperand<i16, 1>;
-def WAIT_FLAG : Operand <i32> {
- let ParserMatchClass = SWaitMatchClass;
- let PrintMethod = "printWaitFlag";
-}
+def SWaitCnt : CustomOperand<i32>;
-def DepCtrImm : Operand <i32> {
- let ParserMatchClass = DepCtrMatchClass;
- let PrintMethod = "printDepCtr";
-}
+def DepCtr : CustomOperand<i32>;
-def DELAY_FLAG : Operand <i32> {
- let ParserMatchClass = SDelayMatchClass;
- let PrintMethod = "printDelayFlag";
-}
-} // End OperandType = "OPERAND_IMMEDIATE"
+def SDelayALU : CustomOperand<i32>;
include "SIInstrFormats.td"
include "VIInstrFormats.td"
@@ -1148,111 +1033,71 @@ def SDWAVopcDst : BoolRC {
let PrintMethod = "printVOPDst";
}
-class NamedMatchClass<string CName, bit Optional = 1> : AsmOperandClass {
- let Name = "Imm"#CName;
- let PredicateMethod = "is"#CName;
- let ParserMethod = !if(Optional, "", "parse"#CName);
- let RenderMethod = "addImmOperands";
- let IsOptional = Optional;
- let DefaultMethod = !if(Optional, "default"#CName, ?);
-}
-
-class CustomOperandClass<string CName, bit Optional> : AsmOperandClass {
- let Name = CName;
- let PredicateMethod = "is"#CName;
- let ParserMethod = "parse"#CName;
- let RenderMethod = "addImmOperands";
- let IsOptional = Optional;
- let DefaultMethod = "default"#CName;
-}
-
-class CustomOperandProps<bit Optional = 0, string Name = NAME,
- AsmOperandClass Class = CustomOperandClass<Name, Optional>> {
- string PrintMethod = "print"#Name;
- AsmOperandClass ParserMatchClass = Class;
-}
-
-class CustomOperand<ValueType Type, bit Optional = 0, string Name = NAME,
- AsmOperandClass Class = CustomOperandClass<Name, Optional>>
- : Operand<Type>, CustomOperandProps<Optional, Name, Class>;
-
-class NamedIntOperandClass<string Prefix, string Name, string ConvertMethod>
- : CustomOperandClass<Name, 1> {
- string ImmTy = "AMDGPUOperand::ImmTy"#Name;
- let ParserMethod =
- "[this](OperandVector &Operands) -> OperandMatchResultTy { "#
- "return parseIntWithPrefix(\""#Prefix#"\", Operands, "#ImmTy#", "#
- ConvertMethod#"); }";
-}
-
class NamedIntOperand<ValueType Type, string Prefix, string Name = NAME,
string ConvertMethod = "nullptr">
- : CustomOperand<Type, 1, Name, NamedIntOperandClass<Prefix, Name, ConvertMethod>>;
-
-class BitOperandClass<string Id, string Name>
- : CustomOperandClass<Name, 1> {
- string ImmTy = "AMDGPUOperand::ImmTy"#Name;
+ : CustomOperand<Type, 1, Name> {
let ParserMethod =
- "[this](OperandVector &Operands) -> OperandMatchResultTy { "#
- "return parseNamedBit(\""#Id#"\", Operands, "#ImmTy#"); }";
+ "[this](OperandVector &Operands) -> ParseStatus { "#
+ "return parseIntWithPrefix(\""#Prefix#"\", Operands, "#
+ "AMDGPUOperand::"#ImmTy#", "#ConvertMethod#"); }";
}
class NamedBitOperand<string Id, string Name = NAME>
- : CustomOperand<i1, 1, Name, BitOperandClass<Id, Name>>;
-
-class DefaultOperand_0<CustomOperand Op>
- : OperandWithDefaultOps<Op.Type, (ops (Op.Type 0))>,
- CustomOperandProps<1, Op.ParserMatchClass.Name, Op.ParserMatchClass>;
-
-class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
- let PrintMethod = "print"#Name;
- let ParserMatchClass = MatchClass;
+ : CustomOperand<i1, 1, Name> {
+ let ParserMethod =
+ "[this](OperandVector &Operands) -> ParseStatus { "#
+ "return parseNamedBit(\""#Id#"\", Operands, AMDGPUOperand::"#ImmTy#"); }";
+ let PrintMethod = "[this](const MCInst *MI, unsigned OpNo, "#
+ "const MCSubtargetInfo &STI, raw_ostream &O) { "#
+ "printNamedBit(MI, OpNo, O, \""#Id#"\"); }";
}
-class NamedOperandU32_0<string Name, AsmOperandClass MatchClass> :
- OperandWithDefaultOps<i32, (ops (i32 0))> {
- let PrintMethod = "print"#Name;
- let ParserMatchClass = MatchClass;
+class DefaultOperand<CustomOperand Op, int Value>
+ : OperandWithDefaultOps<Op.Type, (ops (Op.Type Value))>,
+ CustomOperandProps<1, Op.ParserMatchClass.Name> {
+ let ParserMethod = Op.ParserMatchClass.ParserMethod;
+ let PrintMethod = Op.PrintMethod;
}
-class NamedOperandU32Default0<string Name, AsmOperandClass MatchClass> :
- OperandWithDefaultOps<i32, (ops (i32 0))> {
- let PrintMethod = "print"#Name;
- let ParserMatchClass = MatchClass;
+class SDWAOperand<string Id, string Name = NAME>
+ : CustomOperand<i32, 1, Name> {
+ let ParserMethod =
+ "[this](OperandVector &Operands) -> ParseStatus { "#
+ "return parseSDWASel(Operands, \""#Id#"\", AMDGPUOperand::"#ImmTy#"); }";
}
-class NamedOperandU32Default1<string Name, AsmOperandClass MatchClass> :
- OperandWithDefaultOps<i32, (ops (i32 1))> {
- let PrintMethod = "print"#Name;
- let ParserMatchClass = MatchClass;
+class ArrayOperand0<string Id, string Name = NAME>
+ : OperandWithDefaultOps<i32, (ops (i32 0))>,
+ CustomOperandProps<1, Name> {
+ let ParserMethod =
+ "[this](OperandVector &Operands) -> ParseStatus { "#
+ "return parseOperandArrayWithPrefix(\""#Id#"\", Operands, "#
+ "AMDGPUOperand::"#ImmTy#"); }";
}
-let OperandType = "OPERAND_IMMEDIATE" in {
-
-def flat_offset : CustomOperand<i16, 1, "FlatOffset">;
-def offset : NamedIntOperand<i16, "offset", "Offset">;
+let ImmTy = "ImmTyOffset" in
+def flat_offset : CustomOperand<i32, 1, "FlatOffset">;
+def offset : NamedIntOperand<i32, "offset", "Offset">;
def offset0 : NamedIntOperand<i8, "offset0", "Offset0">;
def offset1 : NamedIntOperand<i8, "offset1", "Offset1">;
def gds : NamedBitOperand<"gds", "GDS">;
-def omod : NamedOperandU32<"OModSI", NamedMatchClass<"OModSI">>;
-def omod0 : NamedOperandU32_0<"OModSI", NamedMatchClass<"OModSI">>;
+def omod : CustomOperand<i32, 1, "OModSI">;
+def omod0 : DefaultOperand<omod, 0>;
// We need to make the cases with a default of 0 distinct from no
// default to help deal with some cases where the operand appears
// before a mandatory operand.
def clampmod : NamedBitOperand<"clamp", "ClampSI">;
-def clampmod0 : DefaultOperand_0<clampmod>;
+def clampmod0 : DefaultOperand<clampmod, 0>;
def highmod : NamedBitOperand<"high", "High">;
-def CPol : NamedOperandU32<"CPol", NamedMatchClass<"CPol">>;
-def CPol_0 : NamedOperandU32Default0<"CPol", NamedMatchClass<"CPol">>;
-def CPol_GLC1 : NamedOperandU32Default1<"CPol", NamedMatchClass<"CPol">>;
+def CPol : CustomOperand<i32, 1>;
+def CPol_0 : DefaultOperand<CPol, 0>;
+def CPol_GLC1 : DefaultOperand<CPol, 1>;
def TFE : NamedBitOperand<"tfe">;
-def SWZ : NamedBitOperand<"swz">;
-def SWZ_0 : DefaultOperand_0<SWZ>;
def UNorm : NamedBitOperand<"unorm">;
def DA : NamedBitOperand<"da">;
def R128A16 : CustomOperand<i1, 1>;
@@ -1267,62 +1112,51 @@ def FORMAT : CustomOperand<i8>;
def DMask : NamedIntOperand<i16, "dmask">;
def Dim : CustomOperand<i8>;
-def dst_sel : NamedOperandU32<"SDWADstSel", NamedMatchClass<"SDWADstSel">>;
-def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
-def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>;
-def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>;
-
-def op_sel0 : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>;
-def op_sel_hi0 : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>;
-def neg_lo0 : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>;
-def neg_hi0 : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>;
+def dst_sel : SDWAOperand<"dst_sel", "SDWADstSel">;
+def src0_sel : SDWAOperand<"src0_sel", "SDWASrc0Sel">;
+def src1_sel : SDWAOperand<"src1_sel", "SDWASrc1Sel">;
+def dst_unused : CustomOperand<i32, 1, "SDWADstUnused">;
-def dpp8 : NamedOperandU32<"DPP8", NamedMatchClass<"DPP8", 0>>;
-def dpp_ctrl : NamedOperandU32<"DPPCtrl", NamedMatchClass<"DPPCtrl", 0>>;
+def op_sel0 : ArrayOperand0<"op_sel", "OpSel">;
+def op_sel_hi0 : ArrayOperand0<"op_sel_hi", "OpSelHi">;
+def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">;
+def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">;
-def row_mask : NamedOperandU32<"RowMask", NamedMatchClass<"RowMask">>;
-def bank_mask : NamedOperandU32<"BankMask", NamedMatchClass<"BankMask">>;
-def bound_ctrl : NamedIntOperand<i1, "bound_ctrl", "DppBoundCtrl", "ConvertDppBoundCtrl">;
-def FI : NamedOperandU32<"FI", NamedMatchClass<"FI">>;
+def dpp8 : CustomOperand<i32, 0, "DPP8">;
+def dpp_ctrl : CustomOperand<i32, 0, "DPPCtrl">;
-def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>;
-def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>;
-def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>;
+let DefaultValue = "0xf" in {
+def row_mask : NamedIntOperand<i32, "row_mask", "DppRowMask">;
+def bank_mask : NamedIntOperand<i32, "bank_mask", "DppBankMask">;
+}
+def bound_ctrl : NamedIntOperand<i1, "bound_ctrl", "DppBoundCtrl",
+ "[this] (int64_t &BC) -> bool { return convertDppBoundCtrl(BC); }">;
+def FI : NamedIntOperand<i32, "fi", "DppFI">;
-def hwreg : NamedOperandU32<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
+def blgp : CustomOperand<i32, 1, "BLGP">;
+def cbsz : NamedIntOperand<i32, "cbsz", "CBSZ">;
+def abid : NamedIntOperand<i32, "abid", "ABID">;
-def exp_tgt : NamedOperandU32<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
+def hwreg : CustomOperand<i32, 0, "Hwreg">;
-}
+def exp_tgt : CustomOperand<i32, 0, "ExpTgt">;
def wait_vdst : NamedIntOperand<i8, "wait_vdst", "WaitVDST">;
def wait_exp : NamedIntOperand<i8, "wait_exp", "WaitEXP">;
-} // End OperandType = "OPERAND_IMMEDIATE"
-
-class KImmMatchClass<int size> : AsmOperandClass {
- let Name = "KImmFP"#size;
- let PredicateMethod = "isKImmFP"#size;
- let ParserMethod = "parseImm";
- let RenderMethod = "addKImmFP"#size#"Operands";
-}
-
-class kimmOperand<ValueType vt> : Operand<vt> {
+class KImmFPOperand<ValueType vt> : ImmOperand<vt> {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_KIMM"#vt.Size;
let PrintMethod = "printU"#vt.Size#"ImmOperand";
- let ParserMatchClass = !cast<AsmOperandClass>("KImmFP"#vt.Size#"MatchClass");
- let DecoderMethod = "decodeOperand_f"#vt.Size#"kimm";
+ let DecoderMethod = "decodeOperand_KImmFP";
}
// 32-bit VALU immediate operand that uses the constant bus.
-def KImmFP32MatchClass : KImmMatchClass<32>;
-def f32kimm : kimmOperand<i32>;
+def KImmFP32 : KImmFPOperand<i32>;
// 32-bit VALU immediate operand with a 16-bit value that uses the
// constant bus.
-def KImmFP16MatchClass : KImmMatchClass<16>;
-def f16kimm : kimmOperand<i16>;
+def KImmFP16 : KImmFPOperand<i16>;
class FPInputModsMatchClass <int opSize> : AsmOperandClass {
let Name = "RegOrImmWithFP"#opSize#"InputMods";
@@ -1506,7 +1340,16 @@ def DS128Bit8ByteAligned : ComplexPattern<iPTR, 3, "SelectDS128Bit8ByteAligned">
def MOVRELOffset : ComplexPattern<iPTR, 2, "SelectMOVRELOffset">;
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
+
+// Modifiers for floating point instructions.
def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
+
+// VOP3 modifiers used for instructions that do not read canonicalized
+// floating point values (i.e. integer operations with FP source
+// modifiers)
+def VOP3ModsNonCanonicalizing : ComplexPattern<untyped, 2,
+ "SelectVOP3ModsNonCanonicalizing">;
+
def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">;
def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
@@ -1521,7 +1364,8 @@ def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
-def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
+def VOP3PMadMixModsExt : ComplexPattern<untyped, 2, "SelectVOP3PMadMixModsExt">;
+def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
def VINTERPMods : ComplexPattern<untyped, 2, "SelectVINTERPMods">;
def VINTERPModsHi : ComplexPattern<untyped, 2, "SelectVINTERPModsHi">;
@@ -1717,7 +1561,7 @@ class getVOP3SrcForVT<ValueType VT> {
bit isFP = isFloatType<VT>.ret;
RegisterOperand ret =
!if(!eq(VT.Size, 128),
- VSrc_128,
+ VRegSrc_128,
!if(!eq(VT.Size, 64),
!if(isFP,
!if(!eq(VT.Value, v2f32.Value),
@@ -2390,14 +2234,6 @@ class getLdStRegisterOperand<RegisterClass RC> {
)))));
}
-class BitOr<bit a, bit b> {
- bit ret = !if(a, 1, !if(b, 1, 0));
-}
-
-class BitAnd<bit a, bit b> {
- bit ret = !if(a, !if(b, 1, 0), 0);
-}
-
class getHasVOP3DPP <ValueType DstVT = i32, ValueType Src0VT = i32,
ValueType Src1VT = i32, ValueType Src2VT = i32> {
bit ret = !if(!eq(DstVT.Size, 64),
@@ -2445,7 +2281,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret;
field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
- field RegisterOperand Src1VOP3DPP = VGPRSrc_32;
+ field RegisterOperand Src1VOP3DPP = VRegSrc_32;
field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
field RegisterOperand Src0SDWA = getSDWASrcForVT<Src0VT>.ret;
field RegisterOperand Src1SDWA = getSDWASrcForVT<Src0VT>.ret;
@@ -2509,8 +2345,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtVOP3DPP = getHasVOP3DPP<DstVT, Src0VT, Src1VT, Src2VT>.ret;
- field bit HasExtDPP = !if(!or(getHasDPP<NumSrcArgs>.ret,
- HasExtVOP3DPP), 1, 0);
+ field bit HasExtDPP = !or(getHasDPP<NumSrcArgs>.ret, HasExtVOP3DPP);
field bit HasExt32BitDPP = getHasExt32BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExt64BitDPP = getHasExt64BitDPP<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
field bit HasExtSDWA = getHasSDWA<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 0c2a13852fcb..7fe76b4c13ca 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -47,7 +47,7 @@ let Uses = [MODE, M0, EXEC] in {
multiclass V_INTERP_P1_F32_m : VINTRP_m <
0x00000000,
(outs VINTRPDst:$vdst),
- (ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
+ (ins VGPR_32:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan),
"v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc,
(i32 timm:$attrchan), (i32 timm:$attr), M0))]
@@ -73,7 +73,8 @@ let DisableEncoding = "$src0", Constraints = "$src0 = $vdst" in {
defm V_INTERP_P2_F32 : VINTRP_m <
0x00000001,
(outs VINTRPDst:$vdst),
- (ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
+ (ins VGPR_32:$src0, VGPR_32:$vsrc, InterpAttr:$attr,
+ InterpAttrChan:$attrchan),
"v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
(i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
@@ -83,7 +84,7 @@ defm V_INTERP_P2_F32 : VINTRP_m <
defm V_INTERP_MOV_F32 : VINTRP_m <
0x00000002,
(outs VINTRPDst:$vdst),
- (ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
+ (ins InterpSlot:$vsrc, InterpAttr:$attr, InterpAttrChan:$attrchan),
"v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
[(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
(i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
@@ -95,6 +96,16 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
+
+// Insert a branch to an endpgm block to use as a fallback trap.
+def ENDPGM_TRAP : SPseudoInstSI<
+ (outs), (ins),
+ [(AMDGPUendpgm_trap)],
+ "ENDPGM_TRAP"> {
+ let hasSideEffects = 1;
+ let usesCustomInserter = 1;
+}
+
def ATOMIC_FENCE : SPseudoInstSI<
(outs), (ins i32imm:$ordering, i32imm:$scope),
[(atomic_fence (i32 timm:$ordering), (i32 timm:$scope))],
@@ -161,6 +172,13 @@ def STRICT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
+def WWM_COPY : SPseudoInstSI <
+ (outs unknown:$dst), (ins unknown:$src)> {
+ let hasSideEffects = 0;
+ let isAsCheapAsAMove = 1;
+ let isConvergent = 1;
+}
+
def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
let Uses = [EXEC];
let Defs = [EXEC, SCC];
@@ -189,6 +207,12 @@ def EXIT_STRICT_WQM : SPseudoInstSI <(outs SReg_1:$sdst), (ins SReg_1:$src0)> {
let mayStore = 0;
}
+let usesCustomInserter = 1 in {
+def S_INVERSE_BALLOT_U32 : SPseudoInstSI <(outs SReg_32:$sdst), (ins SSrc_b32:$mask)>;
+
+def S_INVERSE_BALLOT_U64 : SPseudoInstSI <(outs SReg_64:$sdst), (ins SSrc_b64:$mask)>;
+} // End usesCustomInserter = 1
+
// PSEUDO_WM is treated like STRICT_WWM/STRICT_WQM without exec changes.
def ENTER_PSEUDO_WM : SPseudoInstSI <(outs), (ins)> {
let Uses = [EXEC];
@@ -222,7 +246,7 @@ def FPTRUNC_DOWNWARD_PSEUDO : VPseudoInstSI <(outs VGPR_32:$vdst),
// Invert the exec mask and overwrite the inactive lanes of dst with inactive,
// restoring it after we're done.
-let Defs = [SCC] in {
+let Defs = [SCC], isConvergent = 1 in {
def V_SET_INACTIVE_B32 : VPseudoInstSI <(outs VGPR_32:$vdst),
(ins VSrc_b32: $src, VSrc_b32:$inactive),
[(set i32:$vdst, (int_amdgcn_set_inactive i32:$src, i32:$inactive))]> {
@@ -234,6 +258,18 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
}
} // End Defs = [SCC]
+let usesCustomInserter = 1, hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
+ def WAVE_REDUCE_UMIN_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_umin i32:$src, i32:$strategy))]> {
+ }
+
+ def WAVE_REDUCE_UMAX_PSEUDO_U32 : VPseudoInstSI <(outs SGPR_32:$sdst),
+ (ins VSrc_b32: $src, VSrc_b32:$strategy),
+ [(set i32:$sdst, (int_amdgcn_wave_reduce_umax i32:$src, i32:$strategy))]> {
+ }
+}
+
let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
def V_ADD_U64_PSEUDO : VPseudoInstSI <
(outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
@@ -300,6 +336,7 @@ def S_XOR_B64_term : WrapTerminatorInst<S_XOR_B64>;
def S_OR_B64_term : WrapTerminatorInst<S_OR_B64>;
def S_ANDN2_B64_term : WrapTerminatorInst<S_ANDN2_B64>;
def S_AND_B64_term : WrapTerminatorInst<S_AND_B64>;
+def S_AND_SAVEEXEC_B64_term : WrapTerminatorInst<S_AND_SAVEEXEC_B64>;
}
let WaveSizePredicate = isWave32 in {
@@ -308,6 +345,7 @@ def S_XOR_B32_term : WrapTerminatorInst<S_XOR_B32>;
def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
def S_AND_B32_term : WrapTerminatorInst<S_AND_B32>;
+def S_AND_SAVEEXEC_B32_term : WrapTerminatorInst<S_AND_SAVEEXEC_B32>;
}
@@ -368,7 +406,13 @@ def IGLP_OPT : SPseudoInstSI<(outs), (ins i32imm:$mask),
// SI pseudo instructions. These are used by the CFG structurizer pass
// and should be lowered to ISA instructions prior to codegen.
-let isTerminator = 1 in {
+// As we have enhanced control flow intrinsics to work under unstructured CFG,
+// duplicating such intrinsics can be actually treated as legal. On the contrary,
+// by making them non-duplicable, we are observing better code generation result.
+// So we choose to mark them non-duplicable in hope of getting better code
+// generation as well as simplied CFG during Machine IR optimization stage.
+
+let isTerminator = 1, isNotDuplicable = 1 in {
let OtherPredicates = [EnableLateCFGStructurize] in {
def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
@@ -385,6 +429,7 @@ def SI_IF: CFPseudoInstSI <
let Constraints = "";
let Size = 12;
let hasSideEffects = 1;
+ let IsNeverUniform = 1;
}
def SI_ELSE : CFPseudoInstSI <
@@ -392,6 +437,7 @@ def SI_ELSE : CFPseudoInstSI <
(ins SReg_1:$src, brtarget:$target), [], 1, 1> {
let Size = 12;
let hasSideEffects = 1;
+ let IsNeverUniform = 1;
}
def SI_WATERFALL_LOOP : CFPseudoInstSI <
@@ -408,6 +454,7 @@ def SI_LOOP : CFPseudoInstSI <
let Size = 8;
let isBranch = 1;
let hasSideEffects = 1;
+ let IsNeverUniform = 1;
}
} // End isTerminator = 1
@@ -418,6 +465,7 @@ def SI_END_CF : CFPseudoInstSI <
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
let hasSideEffects = 1;
+ let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
let mayLoad = 1; // FIXME: Should not need memory flags
let mayStore = 1;
}
@@ -425,6 +473,7 @@ def SI_END_CF : CFPseudoInstSI <
def SI_IF_BREAK : CFPseudoInstSI <
(outs SReg_1:$dst), (ins SReg_1:$vcc, SReg_1:$src), []> {
let Size = 4;
+ let isNotDuplicable = 1; // Not a hard requirement, see long comments above for details.
let isAsCheapAsAMove = 1;
let isReMaterializable = 1;
}
@@ -470,7 +519,7 @@ def SI_ILLEGAL_COPY : SPseudoInstSI <
// Branch on undef scc. Used to avoid intermediate copy from
// IMPLICIT_DEF to SCC.
-def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins sopp_brtarget:$simm16)> {
+def SI_BR_UNDEF : SPseudoInstSI <(outs), (ins SOPPBrTarget:$simm16)> {
let isTerminator = 1;
let usesCustomInserter = 1;
let isBranch = 1;
@@ -543,7 +592,7 @@ def SI_RETURN_TO_EPILOG : SPseudoInstSI <
// Return for returning function calls.
def SI_RETURN : SPseudoInstSI <
- (outs), (ins), [(AMDGPUret_flag)],
+ (outs), (ins), [(AMDGPUret_glue)],
"; return"> {
let isTerminator = 1;
let isBarrier = 1;
@@ -584,10 +633,9 @@ def SI_CALL : SPseudoInstSI <
let isConvergent = 1;
}
-// Tail call handling pseudo
-def SI_TCRETURN : SPseudoInstSI <(outs),
- (ins SReg_64:$src0, unknown:$callee, i32imm:$fpdiff),
- [(AMDGPUtc_return i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
+class SI_TCRETURN_Pseudo<RegisterClass rc, SDNode sd> : SPseudoInstSI <(outs),
+ (ins rc:$src0, unknown:$callee, i32imm:$fpdiff),
+ [(sd i64:$src0, tglobaladdr:$callee, i32:$fpdiff)]> {
let Size = 4;
let FixedSize = 1;
let isCall = 1;
@@ -600,10 +648,20 @@ def SI_TCRETURN : SPseudoInstSI <(outs),
let isConvergent = 1;
}
+// Tail call handling pseudo
+def SI_TCRETURN : SI_TCRETURN_Pseudo<CCR_SGPR_64, AMDGPUtc_return>;
+def SI_TCRETURN_GFX : SI_TCRETURN_Pseudo<Gfx_CCR_SGPR_64, AMDGPUtc_return_gfx>;
+
// Handle selecting indirect tail calls
def : GCNPat<
(AMDGPUtc_return i64:$src0, (i64 0), (i32 timm:$fpdiff)),
- (SI_TCRETURN SReg_64:$src0, (i64 0), i32imm:$fpdiff)
+ (SI_TCRETURN CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
+>;
+
+// Handle selecting indirect tail calls for AMDGPU_gfx
+def : GCNPat<
+ (AMDGPUtc_return_gfx i64:$src0, (i64 0), (i32 timm:$fpdiff)),
+ (SI_TCRETURN_GFX Gfx_CCR_SGPR_64:$src0, (i64 0), i32imm:$fpdiff)
>;
def ADJCALLSTACKUP : SPseudoInstSI<
@@ -720,6 +778,10 @@ def S_INDIRECT_REG_WRITE_MOVREL_B32_V3 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<
def S_INDIRECT_REG_WRITE_MOVREL_B32_V4 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_128>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V5 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_160>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V8 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V9 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_288>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V10 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_320>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V11 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_352>;
+def S_INDIRECT_REG_WRITE_MOVREL_B32_V12 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_384>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V16 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_512>;
def S_INDIRECT_REG_WRITE_MOVREL_B32_V32 : S_INDIRECT_REG_WRITE_MOVREL_B32_pseudo<SReg_1024>;
@@ -890,6 +952,9 @@ defm SI_SPILL_AV384 : SI_SPILL_VGPR <AV_384, 1>;
defm SI_SPILL_AV512 : SI_SPILL_VGPR <AV_512, 1>;
defm SI_SPILL_AV1024 : SI_SPILL_VGPR <AV_1024, 1>;
+let isConvergent = 1 in
+defm SI_SPILL_WWM_V32 : SI_SPILL_VGPR <VGPR_32>;
+
def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
(outs SReg_64:$dst),
(ins si_ga:$ptr_lo, si_ga:$ptr_hi),
@@ -954,25 +1019,6 @@ def : Pat <
// VOP1 Patterns
//===----------------------------------------------------------------------===//
-let OtherPredicates = [UnsafeFPMath] in {
-
-// Convert (x - floor(x)) to fract(x)
-def : GCNPat <
- (f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
- (f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
- (V_FRACT_F32_e64 $mods, $x)
->;
-
-// Convert (x + (-floor(x))) to fract(x)
-def : GCNPat <
- (f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
- (f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_FRACT_F64_e64 $mods, $x)
->;
-
-} // End OtherPredicates = [UnsafeFPMath]
-
-
multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
// f16_to_fp patterns
def : GCNPat <
@@ -1094,8 +1140,8 @@ def : GCNPat <
>;
class VOPSelectModsPat <ValueType vt> : GCNPat <
- (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
- (VOP3Mods vt:$src2, i32:$src2_mods))),
+ (vt (select i1:$src0, (VOP3ModsNonCanonicalizing vt:$src1, i32:$src1_mods),
+ (VOP3ModsNonCanonicalizing vt:$src2, i32:$src2_mods))),
(V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
>;
@@ -1343,66 +1389,6 @@ foreach Index = 0-15 in {
}
-def : Pat <
- (extract_subvector v4i16:$vec, (i32 0)),
- (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
->;
-
-def : Pat <
- (extract_subvector v4i16:$vec, (i32 2)),
- (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
->;
-
-def : Pat <
- (extract_subvector v4f16:$vec, (i32 0)),
- (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
->;
-
-def : Pat <
- (extract_subvector v4f16:$vec, (i32 2)),
- (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
->;
-
-def : Pat <
- (extract_subvector v8i16:$vec, (i32 0)),
- (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub0_sub1))
->;
-
-def : Pat <
- (extract_subvector v8i16:$vec, (i32 4)),
- (v4i16 (EXTRACT_SUBREG v8i16:$vec, sub2_sub3))
->;
-
-def : Pat <
- (extract_subvector v8f16:$vec, (i32 0)),
- (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub0_sub1))
->;
-
-def : Pat <
- (extract_subvector v8f16:$vec, (i32 4)),
- (v4f16 (EXTRACT_SUBREG v8f16:$vec, sub2_sub3))
->;
-
-def : Pat <
- (extract_subvector v16i16:$vec, (i32 0)),
- (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub0_sub1_sub2_sub3))
->;
-
-def : Pat <
- (extract_subvector v16i16:$vec, (i32 8)),
- (v8i16 (EXTRACT_SUBREG v16i16:$vec, sub4_sub5_sub6_sub7))
->;
-
-def : Pat <
- (extract_subvector v16f16:$vec, (i32 0)),
- (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub0_sub1_sub2_sub3))
->;
-
-def : Pat <
- (extract_subvector v16f16:$vec, (i32 8)),
- (v8f16 (EXTRACT_SUBREG v16f16:$vec, sub4_sub5_sub6_sub7))
->;
-
foreach Index = 0-31 in {
def Extract_Element_v32i32_#Index : Extract_Element <
i32, v32i32, Index, !cast<SubRegIndex>(sub#Index)
@@ -2002,13 +1988,13 @@ def : GCNPat <
def : GCNPat <
(i32 (sext i1:$src0)),
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
- /*src1mod*/(i32 0), /*src1*/(i32 -1), $src0)
+ /*src1mod*/(i32 0), /*src1*/(i32 -1), i1:$src0)
>;
class Ext32Pat <SDNode ext> : GCNPat <
(i32 (ext i1:$src0)),
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
- /*src1mod*/(i32 0), /*src1*/(i32 1), $src0)
+ /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0)
>;
def : Ext32Pat <zext>;
@@ -2043,48 +2029,53 @@ def BFIImm32 : PatFrag<
}]
>;
+
// Definition from ISA doc:
// (y & x) | (z & ~x)
-def : AMDGPUPat <
+def : AMDGPUPatIgnoreCopies <
(DivergentBinFrag<or> (and i32:$y, i32:$x), (and i32:$z, (not i32:$x))),
- (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
+ (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32),
+ (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32),
+ (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
>;
// (y & C) | (z & ~C)
-def : AMDGPUPat <
+def : AMDGPUPatIgnoreCopies <
(BFIImm32 i32:$x, i32:$y, i32:$z),
(V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
>;
// 64-bit version
-def : AMDGPUPat <
+def : AMDGPUPatIgnoreCopies <
(DivergentBinFrag<or> (and i64:$y, i64:$x), (and i64:$z, (not i64:$x))),
(REG_SEQUENCE VReg_64,
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
- (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
- (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
- (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
- (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
// SHA-256 Ch function
// z ^ (x & (y ^ z))
-def : AMDGPUPat <
+def : AMDGPUPatIgnoreCopies <
(DivergentBinFrag<xor> i32:$z, (and i32:$x, (xor i32:$y, i32:$z))),
- (V_BFI_B32_e64 VSrc_b32:$x, VSrc_b32:$y, VSrc_b32:$z)
+ (V_BFI_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32),
+ (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32),
+ (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
>;
// 64-bit version
-def : AMDGPUPat <
+def : AMDGPUPatIgnoreCopies <
(DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
(REG_SEQUENCE VReg_64,
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
- (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
- (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0))), sub0,
(V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
- (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
- (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
>;
def : AMDGPUPat <
@@ -3185,24 +3176,27 @@ def : AMDGPUPat <
// SHA-256 Ma patterns
// ((x & z) | (y & (x | z))) -> BFI (XOR x, y), z, y
-def : AMDGPUPat <
+def : AMDGPUPatIgnoreCopies <
(DivergentBinFrag<or> (and i32:$x, i32:$z),
(and i32:$y, (or i32:$x, i32:$z))),
- (V_BFI_B32_e64 (V_XOR_B32_e64 VSrc_b32:$x, VSrc_b32:$y), VSrc_b32:$z, VSrc_b32:$y)
+ (V_BFI_B32_e64 (V_XOR_B32_e64 (COPY_TO_REGCLASS VSrc_b32:$x, VGPR_32),
+ (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32)),
+ (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32),
+ (COPY_TO_REGCLASS VSrc_b32:$y, VGPR_32))
>;
-def : AMDGPUPat <
+def : AMDGPUPatIgnoreCopies <
(DivergentBinFrag<or> (and i64:$x, i64:$z),
(and i64:$y, (or i64:$x, i64:$z))),
(REG_SEQUENCE VReg_64,
(V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
(i32 (EXTRACT_SUBREG VReg_64:$y, sub0))),
- (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)),
- (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0,
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub0))), sub0,
(V_BFI_B32_e64 (V_XOR_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub1)),
(i32 (EXTRACT_SUBREG VReg_64:$y, sub1))),
- (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)),
- (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1)
+ (i32 (EXTRACT_SUBREG VReg_64:$z, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$y, sub1))), sub1)
>;
multiclass IntMed3Pat<Instruction med3Inst,
@@ -3486,8 +3480,6 @@ def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
}
let Namespace = "AMDGPU" in {
-def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
-def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_FMIN : G_ATOMICRMW_OP;
def G_AMDGPU_ATOMIC_FMAX : G_ATOMICRMW_OP;
}
@@ -3614,15 +3606,6 @@ def G_FPTRUNC_ROUND_DOWNWARD : AMDGPUGenericInstruction {
// Dummy Instructions
//============================================================================//
-def V_ILLEGAL_gfx6_gfx7_gfx8_gfx9 : Enc32, InstSI<(outs), (ins), "v_illegal"> {
- let Inst{31-0} = 0xFFFFFFFF;
- let FixedSize = 1;
- let Size = 4;
- let Uses = [EXEC];
- let hasSideEffects = 1;
- let SubtargetPredicate = isGFX6GFX7GFX8GFX9;
-}
-
def V_ILLEGAL : Enc32, InstSI<(outs), (ins), "v_illegal"> {
let Inst{31-0} = 0x00000000;
let FixedSize = 1;
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index 2b5ca33b0e4f..c252d30e250e 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -331,7 +331,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
switch (Opc) {
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_LOAD_DWORD_IMM:
case AMDGPU::GLOBAL_LOAD_DWORD:
@@ -342,7 +341,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::FLAT_STORE_DWORD:
return 1;
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX2_IMM:
case AMDGPU::GLOBAL_LOAD_DWORDX2:
@@ -360,7 +358,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::FLAT_STORE_DWORDX3:
return 3;
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX4_IMM:
case AMDGPU::GLOBAL_LOAD_DWORDX4:
@@ -371,7 +368,6 @@ static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
case AMDGPU::FLAT_STORE_DWORDX4:
return 4;
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM:
case AMDGPU::S_LOAD_DWORDX8_IMM:
return 8;
@@ -432,6 +428,10 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFEN_exact:
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET:
case AMDGPU::TBUFFER_LOAD_FORMAT_X_OFFSET_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_IDXEN_exact:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN:
+ case AMDGPU::TBUFFER_LOAD_FORMAT_X_BOTHEN_exact:
return TBUFFER_LOAD;
case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN:
case AMDGPU::TBUFFER_STORE_FORMAT_X_OFFEN_exact:
@@ -446,12 +446,6 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return S_BUFFER_LOAD_IMM;
- // For the purposes of this optimization SGPR variants of buffer loads
- // are considered to be zero-offsetted SGPR_IMM loads.
- case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
@@ -533,12 +527,6 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX8_IMM:
return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
- // For the purposes of this optimization SGPR variants of buffer loads
- // are considered to be zero-offsetted SGPR_IMM loads.
- case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
@@ -641,10 +629,6 @@ static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
switch (Opc) {
default:
return Result;
- case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR:
- case AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR:
case AMDGPU::S_BUFFER_LOAD_DWORD_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM:
@@ -740,7 +724,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
Offset = 0;
} else {
int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
- Offset = OffsetIdx == -1 ? 0 : I->getOperand(OffsetIdx).getImm();
+ Offset = I->getOperand(OffsetIdx).getImm();
}
if (InstClass == TBUFFER_LOAD || InstClass == TBUFFER_STORE)
@@ -887,7 +871,7 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
unsigned MaxMask = std::max(CI.DMask, Paired.DMask);
unsigned MinMask = std::min(CI.DMask, Paired.DMask);
- unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
+ unsigned AllowedBitsForMin = llvm::countr_zero(MaxMask);
if ((1u << AllowedBitsForMin) <= MinMask)
return false;
@@ -926,7 +910,7 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
// - if Lo == 0, return 0 (even though the "- 1" below underflows
// - if Lo > Hi, return 0 (as if the range wrapped around)
static uint32_t mostAlignedValueInRange(uint32_t Lo, uint32_t Hi) {
- return Hi & maskLeadingOnes<uint32_t>(countLeadingZeros((Lo - 1) ^ Hi) + 1);
+ return Hi & maskLeadingOnes<uint32_t>(llvm::countl_zero((Lo - 1) ^ Hi) + 1);
}
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
@@ -975,9 +959,12 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
// Handle all non-DS instructions.
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
- return (EltOffset0 + CI.Width == EltOffset1 ||
- EltOffset1 + Paired.Width == EltOffset0) &&
- CI.CPol == Paired.CPol;
+ if (EltOffset0 + CI.Width != EltOffset1 &&
+ EltOffset1 + Paired.Width != EltOffset0)
+ return false;
+ if (CI.CPol != Paired.CPol)
+ return false;
+ return true;
}
// If the offset in elements doesn't fit in 8-bits, we might be able to use
@@ -1383,10 +1370,7 @@ MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSMemLoadImmPair(
.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase));
if (CI.InstClass == S_BUFFER_LOAD_SGPR_IMM)
New.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset));
- // For convenience, when SGPR_IMM buffer loads are merged into a
- // zero-offset load, we generate its SGPR variant.
- if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::offset))
- New.addImm(MergedOffset);
+ New.addImm(MergedOffset);
New.addImm(CI.CPol).addMemOperand(combineKnownAdjacentMMOs(CI, Paired));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI, Paired);
@@ -1697,14 +1681,11 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI,
default:
return 0;
case 2:
- return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR
- : AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
+ return AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR_IMM;
case 4:
- return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR
- : AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
+ return AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR_IMM;
case 8:
- return CI.Offset == 0 ? AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR
- : AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
+ return AMDGPU::S_BUFFER_LOAD_DWORDX8_SGPR_IMM;
}
case S_LOAD_IMM:
switch (Width) {
@@ -2092,7 +2073,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
// Step1: Find the base-registers and a 64bit constant offset.
MachineOperand &Base = *TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
MemAddress MAddr;
- if (Visited.find(&MI) == Visited.end()) {
+ if (!Visited.contains(&MI)) {
processBaseWithConstOffset(Base, MAddr);
Visited[&MI] = MAddr;
} else
@@ -2155,7 +2136,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
const MachineOperand &BaseNext =
*TII->getNamedOperand(MINext, AMDGPU::OpName::vaddr);
MemAddress MAddrNext;
- if (Visited.find(&MINext) == Visited.end()) {
+ if (!Visited.contains(&MINext)) {
processBaseWithConstOffset(BaseNext, MAddrNext);
Visited[&MINext] = MAddrNext;
} else
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 67077a2eaa6b..00cb5b2878f4 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -427,6 +427,8 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
BuildMI(MBB, &MI, DL, TII->get(Andn2TermOpc), Exec)
.addReg(Exec)
.add(MI.getOperand(0));
+ if (LV)
+ LV->replaceKillInstruction(MI.getOperand(0).getReg(), MI, *AndN2);
auto BranchPt = skipToUncondBrOrEnd(MBB, MI.getIterator());
MachineInstr *Branch =
@@ -514,13 +516,18 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
LV->replaceKillInstruction(DataReg, MI, *NewMI);
if (SplitBB != &MBB) {
- // Track the set of registers defined in the split block so we don't
- // accidentally add the original block to AliveBlocks.
- DenseSet<Register> SplitDefs;
- for (MachineInstr &X : *SplitBB) {
- for (MachineOperand &Op : X.operands()) {
- if (Op.isReg() && Op.isDef() && Op.getReg().isVirtual())
- SplitDefs.insert(Op.getReg());
+ // Track the set of registers defined in the original block so we don't
+ // accidentally add the original block to AliveBlocks. AliveBlocks only
+ // includes blocks which are live through, which excludes live outs and
+ // local defs.
+ DenseSet<Register> DefInOrigBlock;
+
+ for (MachineBasicBlock *BlockPiece : {&MBB, SplitBB}) {
+ for (MachineInstr &X : *BlockPiece) {
+ for (MachineOperand &Op : X.all_defs()) {
+ if (Op.getReg().isVirtual())
+ DefInOrigBlock.insert(Op.getReg());
+ }
}
}
@@ -532,7 +539,7 @@ MachineBasicBlock *SILowerControlFlow::emitEndCf(MachineInstr &MI) {
VI.AliveBlocks.set(SplitBB->getNumber());
else {
for (MachineInstr *Kill : VI.Kills) {
- if (Kill->getParent() == SplitBB && !SplitDefs.contains(Reg))
+ if (Kill->getParent() == SplitBB && !DefInOrigBlock.contains(Reg))
VI.AliveBlocks.set(MBB.getNumber());
}
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 3450a9f0681f..47d28d5d0eab 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -50,7 +50,9 @@ public:
SILowerSGPRSpills() : MachineFunctionPass(ID) {}
void calculateSaveRestoreBlocks(MachineFunction &MF);
- bool spillCalleeSavedRegs(MachineFunction &MF);
+ bool spillCalleeSavedRegs(MachineFunction &MF,
+ SmallVectorImpl<int> &CalleeSavedFIs);
+ void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS);
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -58,6 +60,13 @@ public:
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+ MachineFunctionProperties getClearedProperties() const override {
+ // SILowerSGPRSpills introduces new Virtual VGPRs for spilling SGPRs.
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA)
+ .set(MachineFunctionProperties::Property::NoVRegs);
+ }
};
} // end anonymous namespace
@@ -197,7 +206,8 @@ static void updateLiveness(MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI) {
EntryBB.sortUniqueLiveIns();
}
-bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
+bool SILowerSGPRSpills::spillCalleeSavedRegs(
+ MachineFunction &MF, SmallVectorImpl<int> &CalleeSavedFIs) {
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -228,6 +238,7 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
TRI->getSpillAlign(*RC), true);
CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
+ CalleeSavedFIs.push_back(JunkFI);
}
}
@@ -248,6 +259,50 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
return false;
}
+void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF,
+ LiveIntervals *LIS) {
+ // TODO: This is a workaround to avoid the unmodelled liveness computed with
+ // whole-wave virtual registers when allocated together with the regular VGPR
+ // virtual registers. Presently, the liveness computed during the regalloc is
+ // only uniform (or single lane aware) and it doesn't take into account the
+ // divergent control flow that exists for our GPUs. Since the WWM registers
+ // can modify inactive lanes, the wave-aware liveness should be computed for
+ // the virtual registers to accurately plot their interferences. Without
+ // having the divergent CFG for the function, it is difficult to implement the
+ // wave-aware liveness info. Until then, we conservatively extend the liveness
+ // of the wwm registers into the entire function so that they won't be reused
+ // without first spilling/splitting their liveranges.
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks.
+ for (auto Reg : MFI->getSGPRSpillVGPRs()) {
+ for (MachineBasicBlock *SaveBlock : SaveBlocks) {
+ MachineBasicBlock::iterator InsertBefore = SaveBlock->begin();
+ auto MIB = BuildMI(*SaveBlock, *InsertBefore, InsertBefore->getDebugLoc(),
+ TII->get(AMDGPU::IMPLICIT_DEF), Reg);
+ MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG);
+ if (LIS) {
+ LIS->InsertMachineInstrInMaps(*MIB);
+ }
+ }
+ }
+
+ // Insert the KILL in the return blocks to extend their liveness until the
+ // end of function. Insert a separate KILL for each VGPR.
+ for (MachineBasicBlock *RestoreBlock : RestoreBlocks) {
+ MachineBasicBlock::iterator InsertBefore =
+ RestoreBlock->getFirstTerminator();
+ for (auto Reg : MFI->getSGPRSpillVGPRs()) {
+ auto MIB =
+ BuildMI(*RestoreBlock, *InsertBefore, InsertBefore->getDebugLoc(),
+ TII->get(TargetOpcode::KILL));
+ MIB.addReg(Reg);
+ if (LIS)
+ LIS->InsertMachineInstrInMaps(*MIB);
+ }
+ }
+}
+
bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
@@ -261,7 +316,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// First, expose any CSR SGPR spills. This is mostly the same as what PEI
// does, but somewhat simpler.
calculateSaveRestoreBlocks(MF);
- bool HasCSRs = spillCalleeSavedRegs(MF);
+ SmallVector<int> CalleeSavedFIs;
+ bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs);
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -275,6 +331,7 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
bool MadeChange = false;
bool NewReservedRegs = false;
+ bool SpilledToVirtVGPRLanes = false;
// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
// handled as SpilledToReg in regular PrologEpilogInserter.
@@ -297,23 +354,53 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
- if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
- NewReservedRegs = true;
- bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
- MI, FI, nullptr, Indexes, LIS);
- (void)Spilled;
- assert(Spilled && "failed to spill SGPR to VGPR when allocated");
- SpillFIs.set(FI);
+
+ bool IsCalleeSaveSGPRSpill =
+ std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) !=
+ CalleeSavedFIs.end();
+ if (IsCalleeSaveSGPRSpill) {
+ // Spill callee-saved SGPRs into physical VGPR lanes.
+
+ // TODO: This is to ensure the CFIs are static for efficient frame
+ // unwinding in the debugger. Spilling them into virtual VGPR lanes
+ // involve regalloc to allocate the physical VGPRs and that might
+ // cause intermediate spill/split of such liveranges for successful
+ // allocation. This would result in broken CFI encoding unless the
+ // regalloc aware CFI generation to insert new CFIs along with the
+ // intermediate spills is implemented. No such support
+ // currently exists in the LLVM compiler.
+ if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
+ NewReservedRegs = true;
+ bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
+ MI, FI, nullptr, Indexes, LIS, true);
+ if (!Spilled)
+ llvm_unreachable(
+ "failed to spill SGPR to physical VGPR lane when allocated");
+ }
+ } else {
+ if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) {
+ bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
+ MI, FI, nullptr, Indexes, LIS);
+ if (!Spilled)
+ llvm_unreachable(
+ "failed to spill SGPR to virtual VGPR lane when allocated");
+ SpillFIs.set(FI);
+ SpilledToVirtVGPRLanes = true;
+ }
}
}
}
- // FIXME: Adding to live-ins redundant with reserving registers.
- for (MachineBasicBlock &MBB : MF) {
- for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
- MBB.addLiveIn(Reg);
- MBB.sortUniqueLiveIns();
+ if (SpilledToVirtVGPRLanes) {
+ extendWWMVirtRegLiveness(MF, LIS);
+ if (LIS) {
+ // Compute the LiveInterval for the newly created virtual registers.
+ for (auto Reg : FuncInfo->getSGPRSpillVGPRs())
+ LIS->createAndComputeVirtRegInterval(Reg);
+ }
+ }
+ for (MachineBasicBlock &MBB : MF) {
// FIXME: The dead frame indices are replaced with a null register from
// the debug value instructions. We should instead, update it with the
// correct register value. But not sure the register value alone is
@@ -337,12 +424,30 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
MadeChange = true;
}
+ if (SpilledToVirtVGPRLanes) {
+ const TargetRegisterClass *RC = TRI->getWaveMaskRegClass();
+ // Shift back the reserved SGPR for EXEC copy into the lowest range.
+ // This SGPR is reserved to handle the whole-wave spill/copy operations
+ // that might get inserted during vgpr regalloc.
+ Register UnusedLowSGPR = TRI->findUnusedRegister(MRI, RC, MF);
+ if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) <
+ TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy()))
+ FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR);
+ } else {
+ // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM
+ // spills/copies. Reset the SGPR reserved for EXEC copy.
+ FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister);
+ }
+
SaveBlocks.clear();
RestoreBlocks.clear();
- // Updated the reserved registers with any VGPRs added for SGPR spills.
- if (NewReservedRegs)
- MRI.freezeReservedRegs(MF);
+ // Updated the reserved registers with any physical VGPRs added for SGPR
+ // spills.
+ if (NewReservedRegs) {
+ for (Register Reg : FuncInfo->getWWMReservedRegs())
+ MRI.reserveReg(Reg, TRI);
+ }
return MadeChange;
}
diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
new file mode 100644
index 000000000000..9c3cd1bbd6b0
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp
@@ -0,0 +1,141 @@
+//===-- SILowerWWMCopies.cpp - Lower Copies after regalloc ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Lowering the WWM_COPY instructions for various register classes.
+/// AMDGPU target generates WWM_COPY instruction to differentiate WWM
+/// copy from COPY. This pass generates the necessary exec mask manipulation
+/// instructions to replicate 'Whole Wave Mode' and lowers WWM_COPY back to
+/// COPY.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-lower-wwm-copies"
+
+namespace {
+
+class SILowerWWMCopies : public MachineFunctionPass {
+public:
+ static char ID;
+
+ SILowerWWMCopies() : MachineFunctionPass(ID) {
+ initializeSILowerWWMCopiesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override { return "SI Lower WWM Copies"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ bool isSCCLiveAtMI(const MachineInstr &MI);
+ void addToWWMSpills(MachineFunction &MF, Register Reg);
+
+ LiveIntervals *LIS;
+ SlotIndexes *Indexes;
+ VirtRegMap *VRM;
+ const SIRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ SIMachineFunctionInfo *MFI;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", false,
+ false)
+
+char SILowerWWMCopies::ID = 0;
+
+char &llvm::SILowerWWMCopiesID = SILowerWWMCopies::ID;
+
+bool SILowerWWMCopies::isSCCLiveAtMI(const MachineInstr &MI) {
+ // We can't determine the liveness info if LIS isn't available. Early return
+ // in that case and always assume SCC is live.
+ if (!LIS)
+ return true;
+
+ LiveRange &LR =
+ LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+ SlotIndex Idx = LIS->getInstructionIndex(MI);
+ return LR.liveAt(Idx);
+}
+
+// If \p Reg is assigned with a physical VGPR, add the latter into wwm-spills
+// for preserving its entire lanes at function prolog/epilog.
+void SILowerWWMCopies::addToWWMSpills(MachineFunction &MF, Register Reg) {
+ if (Reg.isPhysical())
+ return;
+
+ Register PhysReg = VRM->getPhys(Reg);
+ assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
+ "should have allocated a physical register");
+
+ MFI->allocateWWMSpill(MF, PhysReg);
+}
+
+bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ MFI = MF.getInfo<SIMachineFunctionInfo>();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+ Indexes = getAnalysisIfAvailable<SlotIndexes>();
+ VRM = getAnalysisIfAvailable<VirtRegMap>();
+ TRI = ST.getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ if (!MFI->hasVRegFlags())
+ return false;
+
+ bool Changed = false;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (MI.getOpcode() != AMDGPU::WWM_COPY)
+ continue;
+
+ // TODO: Club adjacent WWM ops between same exec save/restore
+ assert(TII->isVGPRCopy(MI));
+
+ // For WWM vector copies, manipulate the exec mask around the copy
+ // instruction.
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock::iterator InsertPt = MI.getIterator();
+ Register RegForExecCopy = MFI->getSGPRForEXECCopy();
+ TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy,
+ isSCCLiveAtMI(MI), Indexes);
+ TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes);
+ addToWWMSpills(MF, MI.getOperand(0).getReg());
+ LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI);
+
+ // Lower WWM_COPY back to COPY
+ MI.setDesc(TII->get(AMDGPU::COPY));
+ Changed |= true;
+ }
+ }
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index b2a433dd3db9..219464eac9ec 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -65,6 +65,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
Occupancy = ST.computeOccupancy(F, getLDSSize());
CallingConv::ID CC = F.getCallingConv();
+ VRegFlags.reserve(1024);
+
// FIXME: Should have analysis or something rather than attribute to detect
// calls.
const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
@@ -119,7 +121,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
else if (ST.isMesaGfxShader(F))
ImplicitBufferPtr = true;
- if (!AMDGPU::isGraphics(CC)) {
+ if (!AMDGPU::isGraphics(CC) ||
+ (CC == CallingConv::AMDGPU_CS && ST.hasArchitectedSGPRs())) {
if (IsKernel || !F.hasFnAttribute("amdgpu-no-workgroup-id-x"))
WorkGroupIDX = true;
@@ -128,7 +131,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const Function &F,
if (!F.hasFnAttribute("amdgpu-no-workgroup-id-z"))
WorkGroupIDZ = true;
+ }
+ if (!AMDGPU::isGraphics(CC)) {
if (IsKernel || !F.hasFnAttribute("amdgpu-no-workitem-id-x"))
WorkItemIDX = true;
@@ -309,37 +314,23 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
return false;
}
-bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF,
- int FI,
- unsigned LaneIndex) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
+bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
+ MachineFunction &MF, int FI, unsigned LaneIndex) {
MachineRegisterInfo &MRI = MF.getRegInfo();
Register LaneVGPR;
if (!LaneIndex) {
- LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
- if (LaneVGPR == AMDGPU::NoRegister) {
- // We have no VGPRs left for spilling SGPRs. Reset because we will not
- // partially spill the SGPR to VGPRs.
- SGPRSpillToVGPRLanes.erase(FI);
- return false;
- }
-
+ LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
SpillVGPRs.push_back(LaneVGPR);
- // Add this register as live-in to all blocks to avoid machine verifier
- // complaining about use of an undefined physical register.
- for (MachineBasicBlock &BB : MF)
- BB.addLiveIn(LaneVGPR);
} else {
LaneVGPR = SpillVGPRs.back();
}
- SGPRSpillToVGPRLanes[FI].push_back(
+ SGPRSpillsToVirtualVGPRLanes[FI].push_back(
SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
return true;
}
-bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
+bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
MachineFunction &MF, int FI, unsigned LaneIndex) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -350,16 +341,21 @@ bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills(
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
// partially spill the SGPR to VGPRs.
- PrologEpilogSGPRSpillToVGPRLanes.erase(FI);
+ SGPRSpillsToPhysicalVGPRLanes.erase(FI);
return false;
}
allocateWWMSpill(MF, LaneVGPR);
+ reserveWWMRegister(LaneVGPR);
+ for (MachineBasicBlock &MBB : MF) {
+ MBB.addLiveIn(LaneVGPR);
+ MBB.sortUniqueLiveIns();
+ }
} else {
- LaneVGPR = WWMSpills.back().first;
+ LaneVGPR = WWMReservedRegs.back();
}
- PrologEpilogSGPRSpillToVGPRLanes[FI].push_back(
+ SGPRSpillsToPhysicalVGPRLanes[FI].push_back(
SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex));
return true;
}
@@ -368,8 +364,8 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
int FI,
bool IsPrologEpilog) {
std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
- IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI]
- : SGPRSpillToVGPRLanes[FI];
+ IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
+ : SGPRSpillsToVirtualVGPRLanes[FI];
// This has already been allocated.
if (!SpillLanes.empty())
@@ -390,15 +386,14 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
"not spilling SGPRs to VGPRs");
unsigned &NumSpillLanes =
- IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes;
+ IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
unsigned LaneIndex = (NumSpillLanes % WaveSize);
- bool Allocated =
- IsPrologEpilog
- ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex)
- : allocateVGPRForSGPRSpills(MF, FI, LaneIndex);
+ bool Allocated = IsPrologEpilog
+ ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
+ : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
if (!Allocated) {
NumSpillLanes -= I;
return false;
@@ -479,16 +474,25 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
bool SIMachineFunctionInfo::removeDeadFrameIndices(
MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
- // Remove dead frame indices from function frame. And also make sure to remove
- // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it
- // could result in an unexpected side effect and bug, in case of any
- // re-mapping of freed frame indices by later pass(es) like "stack slot
+ // Remove dead frame indices from function frame, however keep FP & BP since
+ // spills for them haven't been inserted yet. And also make sure to remove the
+ // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
+ // otherwise, it could result in an unexpected side effect and bug, in case of
+ // any re-mapping of freed frame indices by later pass(es) like "stack slot
// coloring".
- for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) {
+ for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) {
MFI.RemoveStackObject(R.first);
- SGPRSpillToVGPRLanes.erase(R.first);
+ SGPRSpillsToVirtualVGPRLanes.erase(R.first);
}
+ // Remove the dead frame indices of CSR SGPRs which are spilled to physical
+ // VGPR lanes during SILowerSGPRSpills pass.
+ if (!ResetSGPRSpillStackIDs) {
+ for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) {
+ MFI.RemoveStackObject(R.first);
+ SGPRSpillsToPhysicalVGPRLanes.erase(R.first);
+ }
+ }
bool HaveSGPRToMemory = false;
if (ResetSGPRSpillStackIDs) {
@@ -537,6 +541,16 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}
+void SIMachineFunctionInfo::MRI_NoteNewVirtualRegister(Register Reg) {
+ VRegFlags.grow(Reg);
+}
+
+void SIMachineFunctionInfo::MRI_NoteCloneVirtualRegister(Register NewReg,
+ Register SrcReg) {
+ VRegFlags.grow(NewReg);
+ VRegFlags[NewReg] = VRegFlags[SrcReg];
+}
+
Register
SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -638,12 +652,21 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
BytesInStackArgArea(MFI.getBytesInStackArgArea()),
ReturnsVoid(MFI.returnsVoid()),
- ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)), Mode(MFI.getMode()) {
+ ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
+ PSInputAddr(MFI.getPSInputAddr()),
+ PSInputEnable(MFI.getPSInputEnable()),
+ Mode(MFI.getMode()) {
for (Register Reg : MFI.getWWMReservedRegs())
WWMReservedRegs.push_back(regToString(Reg, TRI));
+ if (MFI.getLongBranchReservedReg())
+ LongBranchReservedReg = regToString(MFI.getLongBranchReservedReg(), TRI);
if (MFI.getVGPRForAGPRCopy())
VGPRForAGPRCopy = regToString(MFI.getVGPRForAGPRCopy(), TRI);
+
+ if (MFI.getSGPRForEXECCopy())
+ SGPRForEXECCopy = regToString(MFI.getSGPRForEXECCopy(), TRI);
+
auto SFI = MFI.getOptionalScavengeFI();
if (SFI)
ScavengeFI = yaml::FrameIndex(*SFI, MF.getFrameInfo());
@@ -661,6 +684,8 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
LDSSize = YamlMFI.LDSSize;
GDSSize = YamlMFI.GDSSize;
DynLDSAlign = YamlMFI.DynLDSAlign;
+ PSInputAddr = YamlMFI.PSInputAddr;
+ PSInputEnable = YamlMFI.PSInputEnable;
HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
Occupancy = YamlMFI.Occupancy;
IsEntryFunction = YamlMFI.IsEntryFunction;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index c869ee875711..37572d30dff6 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -18,6 +18,7 @@
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
+#include "SIModeRegisterDefaults.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
@@ -215,7 +216,7 @@ struct SIMode {
SIMode() = default;
- SIMode(const AMDGPU::SIModeRegisterDefaults &Mode) {
+ SIMode(const SIModeRegisterDefaults &Mode) {
IEEE = Mode.IEEE;
DX10Clamp = Mode.DX10Clamp;
FP32InputDenormals = Mode.FP32Denormals.Input != DenormalMode::PreserveSign;
@@ -275,9 +276,15 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool ReturnsVoid = true;
std::optional<SIArgumentInfo> ArgInfo;
+
+ unsigned PSInputAddr = 0;
+ unsigned PSInputEnable = 0;
+
SIMode Mode;
std::optional<FrameIndex> ScavengeFI;
StringValue VGPRForAGPRCopy;
+ StringValue SGPRForEXECCopy;
+ StringValue LongBranchReservedReg;
SIMachineFunctionInfo() = default;
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
@@ -311,6 +318,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("bytesInStackArgArea", MFI.BytesInStackArgArea, 0u);
YamlIO.mapOptional("returnsVoid", MFI.ReturnsVoid, true);
YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
+ YamlIO.mapOptional("psInputAddr", MFI.PSInputAddr, 0u);
+ YamlIO.mapOptional("psInputEnable", MFI.PSInputEnable, 0u);
YamlIO.mapOptional("mode", MFI.Mode, SIMode());
YamlIO.mapOptional("highBitsOf32BitAddress",
MFI.HighBitsOf32BitAddress, 0u);
@@ -319,6 +328,10 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("scavengeFI", MFI.ScavengeFI);
YamlIO.mapOptional("vgprForAGPRCopy", MFI.VGPRForAGPRCopy,
StringValue()); // Don't print out when it's empty.
+ YamlIO.mapOptional("sgprForEXECCopy", MFI.SGPRForEXECCopy,
+ StringValue()); // Don't print out when it's empty.
+ YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
+ StringValue());
}
};
@@ -355,11 +368,12 @@ public:
/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
/// tells the hardware which interpolation parameters to load.
-class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
+class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
+ private MachineRegisterInfo::Delegate {
friend class GCNTargetMachine;
// State of MODE register, assumed FP mode.
- AMDGPU::SIModeRegisterDefaults Mode;
+ SIModeRegisterDefaults Mode;
// Registers that may be reserved for spilling purposes. These may be the same
// as the input registers.
@@ -374,6 +388,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
// base to the beginning of the new function's frame.
Register StackPtrOffsetReg = AMDGPU::SP_REG;
+ // Registers that may be reserved when RA doesn't allocate enough
+ // registers to plan for the case where an indirect branch ends up
+ // being needed during branch relaxation.
+ Register LongBranchReservedReg;
+
AMDGPUFunctionArgInfo ArgInfo;
// Graphics info.
@@ -453,6 +472,9 @@ private:
unsigned HighBitsOf32BitAddress;
+ // Flags associated with the virtual registers.
+ IndexedMap<uint8_t, VirtReg2IndexFunctor> VRegFlags;
+
// Current recorded maximum possible occupancy.
unsigned Occupancy;
@@ -462,6 +484,10 @@ private:
MCPhysReg getNextSystemSGPR() const;
+ // MachineRegisterInfo callback functions to notify events.
+ void MRI_NoteNewVirtualRegister(Register Reg) override;
+ void MRI_NoteCloneVirtualRegister(Register NewReg, Register SrcReg) override;
+
public:
struct VGPRSpillToAGPR {
SmallVector<MCPhysReg, 32> Lanes;
@@ -470,15 +496,16 @@ public:
};
private:
- // To track VGPR + lane index for each subregister of the SGPR spilled to
- // frameindex key during SILowerSGPRSpills pass.
- DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>> SGPRSpillToVGPRLanes;
- // To track VGPR + lane index for spilling special SGPRs like Frame Pointer
- // identified during PrologEpilogInserter.
+ // To track virtual VGPR + lane index for each subregister of the SGPR spilled
+ // to frameindex key during SILowerSGPRSpills pass.
+ DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>>
+ SGPRSpillsToVirtualVGPRLanes;
+ // To track physical VGPR + lane index for CSR SGPR spills and special SGPRs
+ // like Frame Pointer identified during PrologEpilogInserter.
DenseMap<int, std::vector<SIRegisterInfo::SpilledReg>>
- PrologEpilogSGPRSpillToVGPRLanes;
- unsigned NumVGPRSpillLanes = 0;
- unsigned NumVGPRPrologEpilogSpillLanes = 0;
+ SGPRSpillsToPhysicalVGPRLanes;
+ unsigned NumVirtualVGPRSpillLanes = 0;
+ unsigned NumPhysicalVGPRSpillLanes = 0;
SmallVector<Register, 2> SpillVGPRs;
using WWMSpillsMap = MapVector<Register, int>;
// To track the registers used in instructions that can potentially modify the
@@ -504,6 +531,9 @@ private:
// PrologEpilogInserter.
PrologEpilogSGPRSpillsMap PrologEpilogSGPRSpills;
+ // To save/restore EXEC MASK around WWM spills and copies.
+ Register SGPRForEXECCopy;
+
DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;
// AGPRs used for VGPR spills.
@@ -519,10 +549,10 @@ private:
private:
Register VGPRForAGPRCopy;
- bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI,
- unsigned LaneIndex);
- bool allocateVGPRForPrologEpilogSGPRSpills(MachineFunction &MF, int FI,
- unsigned LaneIndex);
+ bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
+ unsigned LaneIndex);
+ bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI,
+ unsigned LaneIndex);
public:
Register getVGPRForAGPRCopy() const {
@@ -551,14 +581,12 @@ public:
void reserveWWMRegister(Register Reg) { WWMReservedRegs.insert(Reg); }
- AMDGPU::SIModeRegisterDefaults getMode() const {
- return Mode;
- }
+ SIModeRegisterDefaults getMode() const { return Mode; }
ArrayRef<SIRegisterInfo::SpilledReg>
- getSGPRSpillToVGPRLanes(int FrameIndex) const {
- auto I = SGPRSpillToVGPRLanes.find(FrameIndex);
- return (I == SGPRSpillToVGPRLanes.end())
+ getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const {
+ auto I = SGPRSpillsToVirtualVGPRLanes.find(FrameIndex);
+ return (I == SGPRSpillsToVirtualVGPRLanes.end())
? ArrayRef<SIRegisterInfo::SpilledReg>()
: ArrayRef(I->second);
}
@@ -579,7 +607,7 @@ public:
// Check if an entry created for \p Reg in PrologEpilogSGPRSpills. Return true
// on success and false otherwise.
bool hasPrologEpilogSGPRSpillEntry(Register Reg) const {
- return PrologEpilogSGPRSpills.find(Reg) != PrologEpilogSGPRSpills.end();
+ return PrologEpilogSGPRSpills.contains(Reg);
}
// Get the scratch SGPR if allocated to save/restore \p Reg.
@@ -620,13 +648,28 @@ public:
}
ArrayRef<SIRegisterInfo::SpilledReg>
- getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const {
- auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex);
- return (I == PrologEpilogSGPRSpillToVGPRLanes.end())
+ getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const {
+ auto I = SGPRSpillsToPhysicalVGPRLanes.find(FrameIndex);
+ return (I == SGPRSpillsToPhysicalVGPRLanes.end())
? ArrayRef<SIRegisterInfo::SpilledReg>()
: ArrayRef(I->second);
}
+ void setFlag(Register Reg, uint8_t Flag) {
+ assert(Reg.isVirtual());
+ if (VRegFlags.inBounds(Reg))
+ VRegFlags[Reg] |= Flag;
+ }
+
+ bool checkFlag(Register Reg, uint8_t Flag) const {
+ if (Reg.isPhysical())
+ return false;
+
+ return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & Flag;
+ }
+
+ bool hasVRegFlags() { return VRegFlags.size(); }
+
void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4,
Align Alignment = Align(4));
@@ -639,6 +682,10 @@ public:
return SpillAGPR;
}
+ Register getSGPRForEXECCopy() const { return SGPRForEXECCopy; }
+
+ void setSGPRForEXECCopy(Register Reg) { SGPRForEXECCopy = Reg; }
+
ArrayRef<MCPhysReg> getVGPRSpillAGPRs() const {
return SpillVGPR;
}
@@ -693,21 +740,35 @@ public:
}
// Add system SGPRs.
- Register addWorkGroupIDX() {
- ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
- NumSystemSGPRs += 1;
+ Register addWorkGroupIDX(bool HasArchitectedSGPRs) {
+ Register Reg =
+ HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR();
+ ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg);
+ if (!HasArchitectedSGPRs)
+ NumSystemSGPRs += 1;
+
return ArgInfo.WorkGroupIDX.getRegister();
}
- Register addWorkGroupIDY() {
- ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
- NumSystemSGPRs += 1;
+ Register addWorkGroupIDY(bool HasArchitectedSGPRs) {
+ Register Reg =
+ HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
+ unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u;
+ ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask);
+ if (!HasArchitectedSGPRs)
+ NumSystemSGPRs += 1;
+
return ArgInfo.WorkGroupIDY.getRegister();
}
- Register addWorkGroupIDZ() {
- ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
- NumSystemSGPRs += 1;
+ Register addWorkGroupIDZ(bool HasArchitectedSGPRs) {
+ Register Reg =
+ HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR();
+ unsigned Mask = HasArchitectedSGPRs ? 0xffff << 16 : ~0u;
+ ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask);
+ if (!HasArchitectedSGPRs)
+ NumSystemSGPRs += 1;
+
return ArgInfo.WorkGroupIDZ.getRegister();
}
@@ -872,6 +933,8 @@ public:
StackPtrOffsetReg = Reg;
}
+ void setLongBranchReservedReg(Register Reg) { LongBranchReservedReg = Reg; }
+
// Note the unset value for this is AMDGPU::SP_REG rather than
// NoRegister. This is mostly a workaround for MIR tests where state that
// can't be directly computed from the function is not preserved in serialized
@@ -880,6 +943,8 @@ public:
return StackPtrOffsetReg;
}
+ Register getLongBranchReservedReg() const { return LongBranchReservedReg; }
+
Register getQueuePtrUserSGPR() const {
return ArgInfo.QueuePtr.getRegister();
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 6d901d6783f0..677f1590287e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -1883,7 +1883,7 @@ void SIScheduleDAGMI::schedule()
LLVM_DEBUG(dbgs() << "Preparing Scheduling\n");
buildDAGWithRegPressure();
- postprocessDAG();
+ postProcessDAG();
LLVM_DEBUG(dump());
if (PrintDAGs)
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 5f2707317984..bc48f7b76c6d 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -22,7 +22,7 @@
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/TargetParser/TargetParser.h"
using namespace llvm;
using namespace llvm::AMDGPU;
@@ -351,6 +351,10 @@ public:
/// Virtual destructor to allow derivations to be deleted.
virtual ~SICacheControl() = default;
+ virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) const {
+ return false;
+ }
};
class SIGfx6CacheControl : public SICacheControl {
@@ -509,6 +513,20 @@ public:
bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
Position Pos) const override;
+
+ bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI,
+ MachineBasicBlock::iterator &MI) const override {
+ bool Changed = false;
+ if (ST.hasForceStoreSC0SC1() &&
+ (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH |
+ SIAtomicAddrSpace::GLOBAL |
+ SIAtomicAddrSpace::OTHER)) !=
+ SIAtomicAddrSpace::NONE) {
+ Changed |= enableSC0Bit(MI);
+ Changed |= enableSC1Bit(MI);
+ }
+ return Changed;
+ }
};
class SIGfx10CacheControl : public SIGfx7CacheControl {
@@ -2209,8 +2227,13 @@ bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
bool Changed = false;
if (MOI.isAtomic()) {
- if (MOI.getOrdering() == AtomicOrdering::Acquire ||
- MOI.getOrdering() == AtomicOrdering::Release ||
+ if (MOI.getOrdering() == AtomicOrdering::Acquire)
+ Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
+ SIMemOp::LOAD | SIMemOp::STORE,
+ MOI.getIsCrossAddressSpaceOrdering(),
+ Position::BEFORE);
+
+ if (MOI.getOrdering() == AtomicOrdering::Release ||
MOI.getOrdering() == AtomicOrdering::AcquireRelease ||
MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
/// TODO: This relies on a barrier always generating a waitcnt
@@ -2319,9 +2342,10 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
if (const auto &MOI = MOA.getLoadInfo(MI))
Changed |= expandLoad(*MOI, MI);
- else if (const auto &MOI = MOA.getStoreInfo(MI))
+ else if (const auto &MOI = MOA.getStoreInfo(MI)) {
Changed |= expandStore(*MOI, MI);
- else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
+ Changed |= CC->tryForceStoreSC0SC1(*MOI, MI);
+ } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI))
Changed |= expandAtomicFence(*MOI, MI);
else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI))
Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index 0d48c3159c6f..be395d53c34e 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -29,10 +29,10 @@ using namespace llvm;
struct Status {
// Mask is a bitmask where a '1' indicates the corresponding Mode bit has a
// known value
- unsigned Mask;
- unsigned Mode;
+ unsigned Mask = 0;
+ unsigned Mode = 0;
- Status() : Mask(0), Mode(0){};
+ Status() = default;
Status(unsigned NewMask, unsigned NewMode) : Mask(NewMask), Mode(NewMode) {
Mode &= Mask;
@@ -96,13 +96,13 @@ public:
// In Phase 1 we record the first instruction that has a mode requirement,
// which is used in Phase 3 if we need to insert a mode change.
- MachineInstr *FirstInsertionPoint;
+ MachineInstr *FirstInsertionPoint = nullptr;
// A flag to indicate whether an Exit value has been set (we can't tell by
// examining the Exit value itself as all values may be valid results).
- bool ExitSet;
+ bool ExitSet = false;
- BlockData() : FirstInsertionPoint(nullptr), ExitSet(false){};
+ BlockData() = default;
};
namespace {
@@ -222,8 +222,8 @@ Status SIModeRegister::getInstructionMode(MachineInstr &MI,
void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
const SIInstrInfo *TII, Status InstrMode) {
while (InstrMode.Mask) {
- unsigned Offset = countTrailingZeros<unsigned>(InstrMode.Mask);
- unsigned Width = countTrailingOnes<unsigned>(InstrMode.Mask >> Offset);
+ unsigned Offset = llvm::countr_zero<unsigned>(InstrMode.Mask);
+ unsigned Width = llvm::countr_one<unsigned>(InstrMode.Mask >> Offset);
unsigned Value = (InstrMode.Mode >> Offset) & ((1 << Width) - 1);
BuildMI(MBB, MI, nullptr, TII->get(AMDGPU::S_SETREG_IMM32_B32))
.addImm(Value)
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
new file mode 100644
index 000000000000..413ef5d162a7
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.cpp
@@ -0,0 +1,38 @@
+//===-- SIModeRegisterDefaults.cpp ------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "SIModeRegisterDefaults.h"
+
+using namespace llvm;
+
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
+ *this = getDefaultForCallingConv(F.getCallingConv());
+
+ StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
+ if (!IEEEAttr.empty())
+ IEEE = IEEEAttr == "true";
+
+ StringRef DX10ClampAttr =
+ F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
+ if (!DX10ClampAttr.empty())
+ DX10Clamp = DX10ClampAttr == "true";
+
+ StringRef DenormF32Attr =
+ F.getFnAttribute("denormal-fp-math-f32").getValueAsString();
+ if (!DenormF32Attr.empty())
+ FP32Denormals = parseDenormalFPAttribute(DenormF32Attr);
+
+ StringRef DenormAttr =
+ F.getFnAttribute("denormal-fp-math").getValueAsString();
+ if (!DenormAttr.empty()) {
+ DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr);
+ if (DenormF32Attr.empty())
+ FP32Denormals = DenormMode;
+ FP64FP16Denormals = DenormMode;
+ }
+}
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
new file mode 100644
index 000000000000..df2e3f9bff32
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIModeRegisterDefaults.h
@@ -0,0 +1,90 @@
+//===-- SIModeRegisterDefaults.h --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H
+
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/FloatingPointMode.h"
+
+namespace llvm {
+
+// Track defaults for fields in the MODE register.
+struct SIModeRegisterDefaults {
+ /// Floating point opcodes that support exception flag gathering quiet and
+ /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10
+ /// become IEEE 754- 2008 compliant due to signaling NaN propagation and
+ /// quieting.
+ bool IEEE : 1;
+
+ /// Used by the vector ALU to force DX10-style treatment of NaNs: when set,
+ /// clamp NaN to zero; otherwise, pass NaN through.
+ bool DX10Clamp : 1;
+
+ /// If this is set, neither input or output denormals are flushed for most f32
+ /// instructions.
+ DenormalMode FP32Denormals;
+
+ /// If this is set, neither input or output denormals are flushed for both f64
+ /// and f16/v2f16 instructions.
+ DenormalMode FP64FP16Denormals;
+
+ SIModeRegisterDefaults() :
+ IEEE(true),
+ DX10Clamp(true),
+ FP32Denormals(DenormalMode::getIEEE()),
+ FP64FP16Denormals(DenormalMode::getIEEE()) {}
+
+ SIModeRegisterDefaults(const Function &F);
+
+ static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
+ SIModeRegisterDefaults Mode;
+ Mode.IEEE = !AMDGPU::isShader(CC);
+ return Mode;
+ }
+
+ bool operator==(const SIModeRegisterDefaults Other) const {
+ return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp &&
+ FP32Denormals == Other.FP32Denormals &&
+ FP64FP16Denormals == Other.FP64FP16Denormals;
+ }
+
+ /// Get the encoding value for the FP_DENORM bits of the mode register for the
+ /// FP32 denormal mode.
+ uint32_t fpDenormModeSPValue() const {
+ if (FP32Denormals == DenormalMode::getPreserveSign())
+ return FP_DENORM_FLUSH_IN_FLUSH_OUT;
+ if (FP32Denormals.Output == DenormalMode::PreserveSign)
+ return FP_DENORM_FLUSH_OUT;
+ if (FP32Denormals.Input == DenormalMode::PreserveSign)
+ return FP_DENORM_FLUSH_IN;
+ return FP_DENORM_FLUSH_NONE;
+ }
+
+ /// Get the encoding value for the FP_DENORM bits of the mode register for the
+ /// FP64/FP16 denormal mode.
+ uint32_t fpDenormModeDPValue() const {
+ if (FP64FP16Denormals == DenormalMode::getPreserveSign())
+ return FP_DENORM_FLUSH_IN_FLUSH_OUT;
+ if (FP64FP16Denormals.Output == DenormalMode::PreserveSign)
+ return FP_DENORM_FLUSH_OUT;
+ if (FP64FP16Denormals.Input == DenormalMode::PreserveSign)
+ return FP_DENORM_FLUSH_IN;
+ return FP_DENORM_FLUSH_NONE;
+ }
+
+ // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
+ // be able to override.
+ bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
+ return DX10Clamp == CalleeMode.DX10Clamp && IEEE == CalleeMode.IEEE;
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_SIMODEREGISTERDEFAULTS_H
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 85de3a548411..d2a5eb89da12 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -96,8 +96,8 @@ static bool isDefBetween(const SIRegisterInfo &TRI,
if (Reg.isVirtual())
return isDefBetween(LIS->getInterval(Reg), AndIdx, SelIdx);
- for (MCRegUnitIterator UI(Reg.asMCReg(), &TRI); UI.isValid(); ++UI) {
- if (isDefBetween(LIS->getRegUnit(*UI), AndIdx, SelIdx))
+ for (MCRegUnit Unit : TRI.regunits(Reg.asMCReg())) {
+ if (isDefBetween(LIS->getRegUnit(Unit), AndIdx, SelIdx))
return true;
}
@@ -106,7 +106,7 @@ static bool isDefBetween(const SIRegisterInfo &TRI,
// Optimize sequence
// %sel = V_CNDMASK_B32_e64 0, 1, %cc
-// %cmp = V_CMP_NE_U32 1, %1
+// %cmp = V_CMP_NE_U32 1, %sel
// $vcc = S_AND_B64 $exec, %cmp
// S_CBRANCH_VCC[N]Z
// =>
@@ -218,46 +218,11 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
// and their associated liveness information.
SlotIndex CmpIdx = LIS->getInstructionIndex(*Cmp);
if (CCReg.isVirtual()) {
- // Apply live ranges from SelLI to CCReg potentially matching splits
- // and extending to loop boundaries.
-
- auto applyLiveRanges = [&](LiveRange &Dst, VNInfo *VNI) {
- // Copy live ranges from SelLI, adjusting start and end as required
- auto DefSegment = SelLI->FindSegmentContaining(SelIdx.getRegSlot());
- assert(DefSegment != SelLI->end() &&
- "No live interval segment covering definition?");
- for (auto I = DefSegment; I != SelLI->end() && I->start <= AndIdx; ++I) {
- SlotIndex Start = I->start < SelIdx.getRegSlot() ?
- SelIdx.getRegSlot() : I->start;
- SlotIndex End = I->end < AndIdx.getRegSlot() || I->end.isBlock() ?
- I->end : AndIdx.getRegSlot();
- Dst.addSegment(LiveRange::Segment(Start, End, VNI));
- }
- // If SelLI does not cover AndIdx (because Cmp killed Sel) then extend.
- if (!SelLI->getSegmentContaining(AndIdx.getRegSlot()))
- Dst.addSegment(LiveRange::Segment(CmpIdx.getRegSlot(), AndIdx.getRegSlot(), VNI));
- };
-
LiveInterval &CCLI = LIS->getInterval(CCReg);
auto CCQ = CCLI.Query(SelIdx.getRegSlot());
- if (CCQ.valueIn())
- applyLiveRanges(CCLI, CCQ.valueIn());
-
- if (CC->getSubReg()) {
- LaneBitmask Mask = TRI->getSubRegIndexLaneMask(CC->getSubReg());
- BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
- CCLI.refineSubRanges(
- Allocator, Mask,
- [=](LiveInterval::SubRange &SR) {
- auto CCQS = SR.Query(SelIdx.getRegSlot());
- if (CCQS.valueIn())
- applyLiveRanges(SR, CCQS.valueIn());
- },
- *LIS->getSlotIndexes(), *TRI);
- CCLI.removeEmptySubRanges();
-
- SmallVector<LiveInterval *> SplitLIs;
- LIS->splitSeparateComponents(CCLI, SplitLIs);
+ if (CCQ.valueIn()) {
+ LIS->removeInterval(CCReg);
+ LIS->createAndComputeVirtRegInterval(CCReg);
}
} else
LIS->removeAllRegUnitsForPhysReg(CCReg);
@@ -287,7 +252,13 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
LIS->RemoveMachineInstrFromMaps(*Sel);
+ bool ShrinkSel = Sel->getOperand(0).readsReg();
Sel->eraseFromParent();
+ if (ShrinkSel) {
+ // The result of the V_CNDMASK was a subreg def which counted as a read
+ // from the other parts of the reg. Shrink their live ranges.
+ LIS->shrinkToUses(SelLI);
+ }
}
}
@@ -349,8 +320,8 @@ bool SIOptimizeExecMaskingPreRA::optimizeElseBranch(MachineBasicBlock &MBB) {
// Instead just check that the def segments are adjacent.
SlotIndex StartIdx = LIS->getInstructionIndex(SaveExecMI);
SlotIndex EndIdx = LIS->getInstructionIndex(*AndExecMI);
- for (MCRegUnitIterator UI(ExecReg, TRI); UI.isValid(); ++UI) {
- LiveRange &RegUnit = LIS->getRegUnit(*UI);
+ for (MCRegUnit Unit : TRI->regunits(ExecReg)) {
+ LiveRange &RegUnit = LIS->getRegUnit(Unit);
if (RegUnit.find(StartIdx) != std::prev(RegUnit.find(EndIdx)))
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index ae2c10116de8..e95abae88d7a 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -357,8 +357,8 @@ void SIOptimizeVGPRLiveRange::collectWaterfallCandidateRegisters(
for (auto *I : Instructions) {
auto &MI = *I;
- for (auto &MO : MI.operands()) {
- if (!MO.isReg() || !MO.getReg() || MO.isDef())
+ for (auto &MO : MI.all_uses()) {
+ if (!MO.getReg())
continue;
Register MOReg = MO.getReg();
@@ -522,8 +522,15 @@ void SIOptimizeVGPRLiveRange::optimizeLiveRange(
auto *UseBlock = UseMI->getParent();
// Replace uses in Endif block
if (UseBlock == Endif) {
- assert(UseMI->isPHI() && "Uses should be PHI in Endif block");
- O.setReg(NewReg);
+ if (UseMI->isPHI()) {
+ O.setReg(NewReg);
+ } else {
+ // DetectDeadLanes may mark register uses as undef without removing
+ // them, in which case a non-phi instruction using the original register
+ // may exist in the Endif block even though the register is not live
+ // into it.
+ assert(!O.readsReg());
+ }
continue;
}
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index c21ff06454da..97b3161c7f98 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -759,7 +759,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
break;
SdwaSel DstSel = static_cast<SdwaSel>(
- TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));;
+ TII->getNamedImmOperand(*SDWAInst, AMDGPU::OpName::dst_sel));
SdwaSel OtherDstSel = static_cast<SdwaSel>(
TII->getNamedImmOperand(*OtherInst, AMDGPU::OpName::dst_sel));
@@ -1158,7 +1158,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
if (!Op.isImm() && !(Op.isReg() && !TRI->isVGPR(*MRI, Op.getReg())))
continue;
- unsigned I = MI.getOperandNo(&Op);
+ unsigned I = Op.getOperandNo();
if (Desc.operands()[I].RegClass == -1 ||
!TRI->isVSSuperClass(TRI->getRegClass(Desc.operands()[I].RegClass)))
continue;
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
index 8553a0ab2a68..8464cb3d6fc4 100644
--- a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -101,8 +101,8 @@ void SIPostRABundler::collectUsedRegUnits(const MachineInstr &MI,
assert(!Op.getSubReg() &&
"subregister indexes should not be present after RA");
- for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
- UsedRegUnits.set(*Units);
+ for (MCRegUnit Unit : TRI->regunits(Reg))
+ UsedRegUnits.set(Unit);
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
index 877c8b81b2c0..b6839c8308d8 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.cpp
@@ -54,3 +54,23 @@ uint64_t SIProgramInfo::getPGMRSrc1(CallingConv::ID CC) const {
}
return Reg;
}
+
+uint64_t SIProgramInfo::getComputePGMRSrc2() const {
+ uint64_t Reg =
+ S_00B84C_SCRATCH_EN(ScratchEnable) | S_00B84C_USER_SGPR(UserSGPR) |
+ S_00B84C_TRAP_HANDLER(TrapHandlerEnable) |
+ S_00B84C_TGID_X_EN(TGIdXEnable) | S_00B84C_TGID_Y_EN(TGIdYEnable) |
+ S_00B84C_TGID_Z_EN(TGIdZEnable) | S_00B84C_TG_SIZE_EN(TGSizeEnable) |
+ S_00B84C_TIDIG_COMP_CNT(TIdIGCompCount) |
+ S_00B84C_EXCP_EN_MSB(EXCPEnMSB) | S_00B84C_LDS_SIZE(LdsSize) |
+ S_00B84C_EXCP_EN(EXCPEnable);
+
+ return Reg;
+}
+
+uint64_t SIProgramInfo::getPGMRSrc2(CallingConv::ID CC) const {
+ if (AMDGPU::isCompute(CC))
+ return getComputePGMRSrc2();
+
+ return 0;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIProgramInfo.h b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
index 553fb4cf496c..aab127e49463 100644
--- a/llvm/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIProgramInfo.h
@@ -36,11 +36,23 @@ struct SIProgramInfo {
uint32_t MemOrdered = 0; // GFX10+
uint64_t ScratchSize = 0;
- // Fields set in PGM_RSRC2 pm4 packet.
+ // State used to calculate fields set in PGM_RSRC2 pm4 packet.
uint32_t LDSBlocks = 0;
uint32_t ScratchBlocks = 0;
- uint64_t ComputePGMRSrc2 = 0;
+ // Fields set in PGM_RSRC2 pm4 packet
+ uint32_t ScratchEnable = 0;
+ uint32_t UserSGPR = 0;
+ uint32_t TrapHandlerEnable = 0;
+ uint32_t TGIdXEnable = 0;
+ uint32_t TGIdYEnable = 0;
+ uint32_t TGIdZEnable = 0;
+ uint32_t TGSizeEnable = 0;
+ uint32_t TIdIGCompCount = 0;
+ uint32_t EXCPEnMSB = 0;
+ uint32_t LdsSize = 0;
+ uint32_t EXCPEnable = 0;
+
uint64_t ComputePGMRSrc3GFX90A = 0;
uint32_t NumVGPR = 0;
@@ -75,6 +87,10 @@ struct SIProgramInfo {
/// Compute the value of the ComputePGMRsrc1 register.
uint64_t getComputePGMRSrc1() const;
uint64_t getPGMRSrc1(CallingConv::ID CC) const;
+
+ /// Compute the value of the ComputePGMRsrc2 register.
+ uint64_t getComputePGMRSrc2() const;
+ uint64_t getPGMRSrc2(CallingConv::ID CC) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f7ce581f9736..1d50dff4a7d9 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
static cl::opt<bool> EnableSpillSGPRToVGPR(
"amdgpu-spill-sgpr-to-vgpr",
- cl::desc("Enable spilling VGPRs to SGPRs"),
+ cl::desc("Enable spilling SGPRs to VGPRs"),
cl::ReallyHidden,
cl::init(true));
@@ -170,7 +170,8 @@ struct SGPRSpillBuilder {
// a register as actually in use in another lane, so we need to save all
// used lanes of the chosen VGPR.
assert(RS && "Cannot spill SGPR to memory without RegScavenger");
- TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0, false);
+ TmpVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass, MI, false,
+ 0, false);
// Reserve temporary stack slot
TmpVGPRIndex = MFI.getScavengeFI(MF.getFrameInfo(), TRI);
@@ -199,7 +200,7 @@ struct SGPRSpillBuilder {
const TargetRegisterClass &RC =
IsWave32 ? AMDGPU::SGPR_32RegClass : AMDGPU::SGPR_64RegClass;
RS->setRegUsed(SuperReg);
- SavedExecReg = RS->scavengeRegister(&RC, MI, 0, false);
+ SavedExecReg = RS->scavengeRegisterBackwards(RC, MI, false, 0, false);
int64_t VGPRLanes = getPerVGPRData().VGPRLanes;
@@ -328,10 +329,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
"getNumCoveredRegs() will not work with generated subreg masks!");
RegPressureIgnoredUnits.resize(getNumRegUnits());
- RegPressureIgnoredUnits.set(
- *MCRegUnitIterator(MCRegister::from(AMDGPU::M0), this));
+ RegPressureIgnoredUnits.set(*regunits(MCRegister::from(AMDGPU::M0)).begin());
for (auto Reg : AMDGPU::VGPR_HI16RegClass)
- RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
+ RegPressureIgnoredUnits.set(*regunits(Reg).begin());
// HACK: Until this is fully tablegen'd.
static llvm::once_flag InitializeRegSplitPartsFlag;
@@ -380,9 +380,7 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
MCRegister Reg) const {
- MCRegAliasIterator R(Reg, this, true);
-
- for (; R.isValid(); ++R)
+ for (MCRegAliasIterator R(Reg, this, true); R.isValid(); ++R)
Reserved.set(*R);
}
@@ -535,11 +533,18 @@ unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
return SubRegFromChannelTable[NumRegIndex - 1][Channel];
}
+MCRegister
+SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
+ const unsigned Align,
+ const TargetRegisterClass *RC) const {
+ unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), Align) - Align;
+ MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, RC);
+}
+
MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
- MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
- return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
+ return getAlignedHighSGPRForRC(MF, /*Align=*/4, &AMDGPU::SGPR_128RegClass);
}
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -609,14 +614,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}
- for (auto Reg : AMDGPU::SReg_32RegClass) {
- Reserved.set(getSubReg(Reg, AMDGPU::hi16));
- Register Low = getSubReg(Reg, AMDGPU::lo16);
- // This is to prevent BB vcc liveness errors.
- if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
- Reserved.set(Low);
- }
-
Register ScratchRSrcReg = MFI->getScratchRSrcReg();
if (ScratchRSrcReg != AMDGPU::NoRegister) {
// Reserve 4 SGPRs for the scratch buffer resource descriptor in case we
@@ -625,6 +622,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, ScratchRSrcReg);
}
+ Register LongBranchReservedReg = MFI->getLongBranchReservedReg();
+ if (LongBranchReservedReg)
+ reserveRegisterTuples(Reserved, LongBranchReservedReg);
+
// We have to assume the SP is needed in case there are calls in the function,
// which is detected after the function is lowered. If we aren't really going
// to need SP, don't bother reserving it.
@@ -646,24 +647,18 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
}
+ // FIXME: Use same reserved register introduced in D149775
+ // SGPR used to preserve EXEC MASK around WWM spill/copy instructions.
+ Register ExecCopyReg = MFI->getSGPRForEXECCopy();
+ if (ExecCopyReg)
+ reserveRegisterTuples(Reserved, ExecCopyReg);
+
// Reserve VGPRs/AGPRs.
//
unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
unsigned MaxNumAGPRs = MaxNumVGPRs;
unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
- // Reserve all the AGPRs if there are no instructions to use it.
- if (!ST.hasMAIInsts()) {
- for (unsigned i = 0; i < MaxNumAGPRs; ++i) {
- unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
- reserveRegisterTuples(Reserved, Reg);
- }
- }
-
- for (auto Reg : AMDGPU::AGPR_32RegClass) {
- Reserved.set(getSubReg(Reg, AMDGPU::hi16));
- }
-
// On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
// a wave may have up to 512 total vector registers combining together both
// VGPRs and AGPRs. Hence, in an entry function without calls and without
@@ -690,9 +685,15 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}
- for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
- unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
- reserveRegisterTuples(Reserved, Reg);
+ if (ST.hasMAIInsts()) {
+ for (unsigned i = MaxNumAGPRs; i < TotalNumVGPRs; ++i) {
+ unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
+ }
+ } else {
+ // Reserve all the AGPRs if there are no instructions to use it.
+ for (MCRegister Reg : AMDGPU::AGPR_32RegClass)
+ reserveRegisterTuples(Reserved, Reg);
}
// On GFX908, in order to guarantee copying between AGPRs, we need a scratch
@@ -711,9 +712,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
reserveRegisterTuples(Reserved, Reg);
- for (auto Reg : MFI->getSGPRSpillVGPRs())
- reserveRegisterTuples(Reserved, Reg);
-
return Reserved;
}
@@ -1065,6 +1063,8 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_A32_RESTORE:
case AMDGPU::SI_SPILL_AV32_SAVE:
case AMDGPU::SI_SPILL_AV32_RESTORE:
+ case AMDGPU::SI_SPILL_WWM_V32_SAVE:
+ case AMDGPU::SI_SPILL_WWM_V32_RESTORE:
return 1;
default: llvm_unreachable("Invalid spill opcode");
}
@@ -1326,7 +1326,7 @@ void SIRegisterInfo::buildSpillLoadStore(
const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
// On gfx90a+ AGPR is a regular VGPR acceptable for loads and stores.
const bool IsAGPR = !ST.hasGFX90AInsts() && isAGPRClass(RC);
- const unsigned RegWidth = AMDGPU::getRegBitWidth(RC->getID()) / 8;
+ const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8;
// Always use 4 byte operations for AGPRs because we need to scavenge
// a temporary VGPR.
@@ -1607,7 +1607,8 @@ void SIRegisterInfo::buildSpillLoadStore(
} else if (UseVGPROffset) {
// FIXME: change to scavengeRegisterBackwards()
if (!TmpOffsetVGPR) {
- TmpOffsetVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ TmpOffsetVGPR = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
+ MI, false, 0);
RS->setRegUsed(TmpOffsetVGPR);
}
}
@@ -1660,6 +1661,33 @@ void SIRegisterInfo::buildSpillLoadStore(
if (NeedSuperRegImpOperand && (IsFirstSubReg || IsLastSubReg))
MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
+
+ // The epilog restore of a wwm-scratch register can cause undesired
+ // optimization during machine-cp post PrologEpilogInserter if the same
+ // register was assigned for return value ABI lowering with a COPY
+ // instruction. As given below, with the epilog reload, the earlier COPY
+ // appeared to be dead during machine-cp.
+ // ...
+ // v0 in WWM operation, needs the WWM spill at prolog/epilog.
+ // $vgpr0 = V_WRITELANE_B32 $sgpr20, 0, $vgpr0
+ // ...
+ // Epilog block:
+ // $vgpr0 = COPY $vgpr1 // outgoing value moved to v0
+ // ...
+ // WWM spill restore to preserve the inactive lanes of v0.
+ // $sgpr4_sgpr5 = S_XOR_SAVEEXEC_B64 -1
+ // $vgpr0 = BUFFER_LOAD $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0
+ // $exec = S_MOV_B64 killed $sgpr4_sgpr5
+ // ...
+ // SI_RETURN implicit $vgpr0
+ // ...
+ // To fix it, mark the same reg as a tied op for such restore instructions
+ // so that it marks a usage for the preceding COPY.
+ if (!IsStore && MI != MBB.end() && MI->isReturn() &&
+ MI->readsRegister(SubReg, this)) {
+ MIB.addReg(SubReg, RegState::Implicit);
+ MIB->tieOperands(0, MIB->getNumOperands() - 1);
+ }
}
if (ScratchOffsetRegDelta != 0) {
@@ -1705,10 +1733,13 @@ void SIRegisterInfo::buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index,
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
RegScavenger *RS, SlotIndexes *Indexes,
- LiveIntervals *LIS, bool OnlyToVGPR) const {
+ LiveIntervals *LIS, bool OnlyToVGPR,
+ bool SpillToPhysVGPRLane) const {
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index);
+ ArrayRef<SpilledReg> VGPRSpills =
+ SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
+ : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
@@ -1825,10 +1856,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index,
bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index,
RegScavenger *RS, SlotIndexes *Indexes,
- LiveIntervals *LIS, bool OnlyToVGPR) const {
+ LiveIntervals *LIS, bool OnlyToVGPR,
+ bool SpillToPhysVGPRLane) const {
SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS);
- ArrayRef<SpilledReg> VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index);
+ ArrayRef<SpilledReg> VGPRSpills =
+ SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index)
+ : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index);
bool SpillToVGPR = !VGPRSpills.empty();
if (OnlyToVGPR && !SpillToVGPR)
return false;
@@ -1974,7 +2008,7 @@ bool SIRegisterInfo::spillEmergencySGPR(MachineBasicBlock::iterator MI,
/// handled.
bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
- SlotIndexes *Indexes, LiveIntervals *LIS) const {
+ SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const {
switch (MI->getOpcode()) {
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
@@ -1990,7 +2024,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S96_SAVE:
case AMDGPU::SI_SPILL_S64_SAVE:
case AMDGPU::SI_SPILL_S32_SAVE:
- return spillSGPR(MI, FI, RS, Indexes, LIS, true);
+ return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S384_RESTORE:
@@ -2005,7 +2039,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S96_RESTORE:
case AMDGPU::SI_SPILL_S64_RESTORE:
case AMDGPU::SI_SPILL_S32_RESTORE:
- return restoreSGPR(MI, FI, RS, Indexes, LIS, true);
+ return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane);
default:
llvm_unreachable("not an SGPR spill instruction");
}
@@ -2109,7 +2143,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_AV128_SAVE:
case AMDGPU::SI_SPILL_AV96_SAVE:
case AMDGPU::SI_SPILL_AV64_SAVE:
- case AMDGPU::SI_SPILL_AV32_SAVE: {
+ case AMDGPU::SI_SPILL_AV32_SAVE:
+ case AMDGPU::SI_SPILL_WWM_V32_SAVE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
@@ -2118,11 +2153,19 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_STORE_DWORD_SADDR
: AMDGPU::BUFFER_STORE_DWORD_OFFSET;
auto *MBB = MI->getParent();
+ bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
+ if (IsWWMRegSpill) {
+ TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
+ RS->isRegUsed(AMDGPU::SCC));
+ }
buildSpillLoadStore(
*MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(), RS);
MFI->addToSpilledVGPRs(getNumSubRegsForSpillOp(MI->getOpcode()));
+ if (IsWWMRegSpill)
+ TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
+
MI->eraseFromParent();
return true;
}
@@ -2167,7 +2210,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_AV352_RESTORE:
case AMDGPU::SI_SPILL_AV384_RESTORE:
case AMDGPU::SI_SPILL_AV512_RESTORE:
- case AMDGPU::SI_SPILL_AV1024_RESTORE: {
+ case AMDGPU::SI_SPILL_AV1024_RESTORE:
+ case AMDGPU::SI_SPILL_WWM_V32_RESTORE: {
const MachineOperand *VData = TII->getNamedOperand(*MI,
AMDGPU::OpName::vdata);
assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
@@ -2176,10 +2220,19 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
unsigned Opc = ST.enableFlatScratch() ? AMDGPU::SCRATCH_LOAD_DWORD_SADDR
: AMDGPU::BUFFER_LOAD_DWORD_OFFSET;
auto *MBB = MI->getParent();
+ bool IsWWMRegSpill = TII->isWWMRegSpillOpcode(MI->getOpcode());
+ if (IsWWMRegSpill) {
+ TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
+ RS->isRegUsed(AMDGPU::SCC));
+ }
buildSpillLoadStore(
*MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
*MI->memoperands_begin(), RS);
+
+ if (IsWWMRegSpill)
+ TII->restoreExec(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy());
+
MI->eraseFromParent();
return true;
}
@@ -2271,7 +2324,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
const TargetRegisterClass *RC = UseSGPR ? &AMDGPU::SReg_32_XM0RegClass
: &AMDGPU::VGPR_32RegClass;
- Register TmpReg = RS->scavengeRegister(RC, MI, 0, !UseSGPR);
+ Register TmpReg =
+ RS->scavengeRegisterBackwards(*RC, MI, false, 0, !UseSGPR);
FIOp.setReg(TmpReg);
FIOp.setIsKill();
@@ -2291,8 +2345,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
Register TmpSReg =
UseSGPR ? TmpReg
- : RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0,
- !UseSGPR);
+ : RS->scavengeRegisterBackwards(AMDGPU::SReg_32_XM0RegClass,
+ MI, false, 0, !UseSGPR);
// TODO: for flat scratch another attempt can be made with a VGPR index
// if no SGPRs can be scavenged.
@@ -2366,8 +2420,9 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
: &AMDGPU::VGPR_32RegClass;
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32 ||
MI->getOpcode() == AMDGPU::V_MOV_B32_e64;
- Register ResultReg = IsCopy ? MI->getOperand(0).getReg()
- : RS->scavengeRegister(RC, MI, 0);
+ Register ResultReg =
+ IsCopy ? MI->getOperand(0).getReg()
+ : RS->scavengeRegisterBackwards(*RC, MI, false, 0);
int64_t Offset = FrameInfo.getObjectOffset(Index);
if (Offset == 0) {
@@ -2380,8 +2435,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (IsSALU && !LiveSCC)
Shift.getInstr()->getOperand(3).setIsDead(); // Mark SCC as dead.
if (IsSALU && LiveSCC) {
- Register NewDest =
- RS->scavengeRegister(&AMDGPU::SReg_32RegClass, Shift, 0);
+ Register NewDest = RS->scavengeRegisterBackwards(
+ AMDGPU::SReg_32RegClass, Shift, false, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
NewDest)
.addReg(ResultReg);
@@ -2435,8 +2490,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// We may have 1 free scratch SGPR even though a carry out is
// unavailable. Only one additional mov is needed.
- Register TmpScaledReg =
- RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+ Register TmpScaledReg = RS->scavengeRegisterBackwards(
+ AMDGPU::SReg_32_XM0RegClass, MI, false, 0, false);
Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
@@ -2501,7 +2556,8 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
- Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ Register TmpReg = RS->scavengeRegisterBackwards(AMDGPU::VGPR_32RegClass,
+ MI, false, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
FIOp.ChangeToRegister(TmpReg, false, false, true);
@@ -2517,31 +2573,31 @@ StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
static const TargetRegisterClass *
getAnyVGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::VReg_64RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::VReg_96RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::VReg_128RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::VReg_160RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::VReg_192RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::VReg_224RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::VReg_256RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::VReg_288RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::VReg_320RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::VReg_352RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::VReg_384RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::VReg_512RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::VReg_1024RegClass;
return nullptr;
@@ -2549,31 +2605,31 @@ getAnyVGPRClassForBitWidth(unsigned BitWidth) {
static const TargetRegisterClass *
getAlignedVGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::VReg_64_Align2RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::VReg_96_Align2RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::VReg_128_Align2RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::VReg_160_Align2RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::VReg_192_Align2RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::VReg_224_Align2RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::VReg_256_Align2RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::VReg_288_Align2RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::VReg_320_Align2RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::VReg_352_Align2RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::VReg_384_Align2RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::VReg_512_Align2RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::VReg_1024_Align2RegClass;
return nullptr;
@@ -2583,9 +2639,9 @@ const TargetRegisterClass *
SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
if (BitWidth == 1)
return &AMDGPU::VReg_1RegClass;
- if (BitWidth <= 16)
+ if (BitWidth == 16)
return &AMDGPU::VGPR_LO16RegClass;
- if (BitWidth <= 32)
+ if (BitWidth == 32)
return &AMDGPU::VGPR_32RegClass;
return ST.needsAlignedVGPRs() ? getAlignedVGPRClassForBitWidth(BitWidth)
: getAnyVGPRClassForBitWidth(BitWidth);
@@ -2593,31 +2649,31 @@ SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) const {
static const TargetRegisterClass *
getAnyAGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::AReg_64RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::AReg_96RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::AReg_128RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::AReg_160RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::AReg_192RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::AReg_224RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::AReg_256RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::AReg_288RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::AReg_320RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::AReg_352RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::AReg_384RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::AReg_512RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::AReg_1024RegClass;
return nullptr;
@@ -2625,31 +2681,31 @@ getAnyAGPRClassForBitWidth(unsigned BitWidth) {
static const TargetRegisterClass *
getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::AReg_64_Align2RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::AReg_96_Align2RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::AReg_128_Align2RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::AReg_160_Align2RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::AReg_192_Align2RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::AReg_224_Align2RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::AReg_256_Align2RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::AReg_288_Align2RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::AReg_320_Align2RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::AReg_352_Align2RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::AReg_384_Align2RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::AReg_512_Align2RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::AReg_1024_Align2RegClass;
return nullptr;
@@ -2657,9 +2713,9 @@ getAlignedAGPRClassForBitWidth(unsigned BitWidth) {
const TargetRegisterClass *
SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
- if (BitWidth <= 16)
+ if (BitWidth == 16)
return &AMDGPU::AGPR_LO16RegClass;
- if (BitWidth <= 32)
+ if (BitWidth == 32)
return &AMDGPU::AGPR_32RegClass;
return ST.needsAlignedVGPRs() ? getAlignedAGPRClassForBitWidth(BitWidth)
: getAnyAGPRClassForBitWidth(BitWidth);
@@ -2667,31 +2723,31 @@ SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) const {
static const TargetRegisterClass *
getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::AV_64RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::AV_96RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::AV_128RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::AV_160RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::AV_192RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::AV_224RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::AV_256RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::AV_288RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::AV_320RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::AV_352RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::AV_384RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::AV_512RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::AV_1024RegClass;
return nullptr;
@@ -2699,31 +2755,31 @@ getAnyVectorSuperClassForBitWidth(unsigned BitWidth) {
static const TargetRegisterClass *
getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::AV_64_Align2RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::AV_96_Align2RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::AV_128_Align2RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::AV_160_Align2RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::AV_192_Align2RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::AV_224_Align2RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::AV_256_Align2RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::AV_288_Align2RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::AV_320_Align2RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::AV_352_Align2RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::AV_384_Align2RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::AV_512_Align2RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::AV_1024_Align2RegClass;
return nullptr;
@@ -2731,9 +2787,9 @@ getAlignedVectorSuperClassForBitWidth(unsigned BitWidth) {
const TargetRegisterClass *
SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
- if (BitWidth <= 16)
+ if (BitWidth == 16)
return &AMDGPU::VGPR_LO16RegClass;
- if (BitWidth <= 32)
+ if (BitWidth == 32)
return &AMDGPU::AV_32RegClass;
return ST.needsAlignedVGPRs()
? getAlignedVectorSuperClassForBitWidth(BitWidth)
@@ -2742,35 +2798,35 @@ SIRegisterInfo::getVectorSuperClassForBitWidth(unsigned BitWidth) const {
const TargetRegisterClass *
SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
- if (BitWidth <= 16)
+ if (BitWidth == 16)
return &AMDGPU::SGPR_LO16RegClass;
- if (BitWidth <= 32)
+ if (BitWidth == 32)
return &AMDGPU::SReg_32RegClass;
- if (BitWidth <= 64)
+ if (BitWidth == 64)
return &AMDGPU::SReg_64RegClass;
- if (BitWidth <= 96)
+ if (BitWidth == 96)
return &AMDGPU::SGPR_96RegClass;
- if (BitWidth <= 128)
+ if (BitWidth == 128)
return &AMDGPU::SGPR_128RegClass;
- if (BitWidth <= 160)
+ if (BitWidth == 160)
return &AMDGPU::SGPR_160RegClass;
- if (BitWidth <= 192)
+ if (BitWidth == 192)
return &AMDGPU::SGPR_192RegClass;
- if (BitWidth <= 224)
+ if (BitWidth == 224)
return &AMDGPU::SGPR_224RegClass;
- if (BitWidth <= 256)
+ if (BitWidth == 256)
return &AMDGPU::SGPR_256RegClass;
- if (BitWidth <= 288)
+ if (BitWidth == 288)
return &AMDGPU::SGPR_288RegClass;
- if (BitWidth <= 320)
+ if (BitWidth == 320)
return &AMDGPU::SGPR_320RegClass;
- if (BitWidth <= 352)
+ if (BitWidth == 352)
return &AMDGPU::SGPR_352RegClass;
- if (BitWidth <= 384)
+ if (BitWidth == 384)
return &AMDGPU::SGPR_384RegClass;
- if (BitWidth <= 512)
+ if (BitWidth == 512)
return &AMDGPU::SGPR_512RegClass;
- if (BitWidth <= 1024)
+ if (BitWidth == 1024)
return &AMDGPU::SGPR_1024RegClass;
return nullptr;
@@ -2863,13 +2919,12 @@ bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
/// Returns a lowest register that is not used at any point in the function.
/// If all registers are used, then this function will return
-/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return
+/// AMDGPU::NoRegister. If \p ReserveHighestRegister = true, then return
/// highest unused register.
-MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC,
- const MachineFunction &MF,
- bool ReserveHighestVGPR) const {
- if (ReserveHighestVGPR) {
+MCRegister SIRegisterInfo::findUnusedRegister(
+ const MachineRegisterInfo &MRI, const TargetRegisterClass *RC,
+ const MachineFunction &MF, bool ReserveHighestRegister) const {
+ if (ReserveHighestRegister) {
for (MCRegister Reg : reverse(*RC))
if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
return Reg;
@@ -2881,9 +2936,19 @@ MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
return MCRegister();
}
+bool SIRegisterInfo::isUniformReg(const MachineRegisterInfo &MRI,
+ const RegisterBankInfo &RBI,
+ Register Reg) const {
+ auto *RB = RBI.getRegBank(Reg, MRI, *MRI.getTargetRegisterInfo());
+ if (!RB)
+ return false;
+
+ return !RBI.isDivergentRegBank(RB);
+}
+
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const {
- const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
+ const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC);
assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
const unsigned RegDWORDs = RegBitWidth / 32;
@@ -3084,9 +3149,8 @@ MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
DefIdx = V->def;
} else {
// Find last def.
- for (MCRegUnitIterator Units(Reg.asMCReg(), this); Units.isValid();
- ++Units) {
- LiveRange &LR = LIS->getRegUnit(*Units);
+ for (MCRegUnit Unit : regunits(Reg.asMCReg())) {
+ LiveRange &LR = LIS->getRegUnit(Unit);
if (VNInfo *V = LR.getVNInfoAt(UseIdx)) {
if (!DefIdx.isValid() ||
MDT.dominates(LIS->getInstructionFromIndex(DefIdx),
@@ -3173,3 +3237,19 @@ ArrayRef<MCPhysReg>
SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
return ArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
}
+
+unsigned
+SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
+ unsigned SubReg) const {
+ switch (RC->TSFlags & SIRCFlags::RegKindMask) {
+ case SIRCFlags::HasSGPR:
+ return std::min(128u, getSubRegIdxSize(SubReg));
+ case SIRCFlags::HasAGPR:
+ case SIRCFlags::HasVGPR:
+ case SIRCFlags::HasVGPR | SIRCFlags::HasAGPR:
+ return std::min(32u, getSubRegIdxSize(SubReg));
+ default:
+ break;
+ }
+ return 0;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index e9ddf82fb5c8..17fce43891c5 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -70,6 +70,12 @@ public:
return SpillSGPRToVGPR;
}
+ /// Return the largest available SGPR aligned to \p Align for the register
+ /// class \p RC.
+ MCRegister getAlignedHighSGPRForRC(const MachineFunction &MF,
+ const unsigned Align,
+ const TargetRegisterClass *RC) const;
+
/// Return the end register initially reserved for the scratch buffer in case
/// spilling is needed.
MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
@@ -136,14 +142,17 @@ public:
void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset,
bool IsLoad, bool IsKill = true) const;
- /// If \p OnlyToVGPR is true, this will only succeed if this
+ /// If \p OnlyToVGPR is true, this will only succeed if this manages to find a
+ /// free VGPR lane to spill.
bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
- bool OnlyToVGPR = false) const;
+ bool OnlyToVGPR = false,
+ bool SpillToPhysVGPRLane = false) const;
bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
- bool OnlyToVGPR = false) const;
+ bool OnlyToVGPR = false,
+ bool SpillToPhysVGPRLane = false) const;
bool spillEmergencySGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock &RestoreMBB, Register SGPR,
@@ -157,10 +166,10 @@ public:
unsigned FIOperandNum,
RegScavenger *RS) const override;
- bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
- int FI, RegScavenger *RS,
- SlotIndexes *Indexes = nullptr,
- LiveIntervals *LIS = nullptr) const;
+ bool eliminateSGPRToVGPRSpillFrameIndex(
+ MachineBasicBlock::iterator MI, int FI, RegScavenger *RS,
+ SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr,
+ bool SpillToPhysVGPRLane = false) const;
StringRef getRegAsmName(MCRegister Reg) const override;
@@ -286,10 +295,17 @@ public:
return isVGPR(MRI, Reg) || isAGPR(MRI, Reg);
}
+ // FIXME: SGPRs are assumed to be uniform, but this is not true for i1 SGPRs
+ // (such as VCC) which hold a wave-wide vector of boolean values. Examining
+ // just the register class is not suffcient; it needs to be combined with a
+ // value type. The next predicate isUniformReg() does this correctly.
bool isDivergentRegClass(const TargetRegisterClass *RC) const override {
return !isSGPRClass(RC);
}
+ bool isUniformReg(const MachineRegisterInfo &MRI, const RegisterBankInfo &RBI,
+ Register Reg) const override;
+
ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const;
@@ -411,6 +427,25 @@ public:
int64_t InstrOffset, MachineMemOperand *MMO,
RegScavenger *RS,
LivePhysRegs *LiveRegs = nullptr) const;
+
+ // Return alignment in register file of first register in a register tuple.
+ unsigned getRegClassAlignmentNumBits(const TargetRegisterClass *RC) const {
+ return (RC->TSFlags & SIRCFlags::RegTupleAlignUnitsMask) * 32;
+ }
+
+ // Check if register class RC has required alignment.
+ bool isRegClassAligned(const TargetRegisterClass *RC,
+ unsigned AlignNumBits) const {
+ assert(AlignNumBits != 0);
+ unsigned RCAlign = getRegClassAlignmentNumBits(RC);
+ return RCAlign == AlignNumBits ||
+ (RCAlign > AlignNumBits && (RCAlign % AlignNumBits) == 0);
+ }
+
+ // Return alignment of a SubReg relative to start of a register in RC class.
+ // No check if the subreg is supported by the current RC is made.
+ unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
+ unsigned SubReg) const;
};
} // End namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 12053c4b8724..b2b1b458a63a 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -10,16 +10,6 @@
// Subregister declarations
//===----------------------------------------------------------------------===//
-class Indexes<int N> {
- list<int> all = [0, 1, 2, 3, 4, 5, 6 , 7,
- 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23,
- 24, 25, 26, 27, 28, 29, 30, 31];
-
- // Returns list of indexes [0..N)
- list<int> slice = !filter(i, all, !lt(i, N));
-}
-
let Namespace = "AMDGPU" in {
def lo16 : SubRegIndex<16, 0>;
@@ -35,13 +25,11 @@ foreach Index = 1...31 in {
}
foreach Size = {2...6,8,16} in {
- foreach Index = Indexes<!sub(33, Size)>.slice in {
- def !interleave(!foreach(cur, Indexes<Size>.slice, "sub"#!add(cur, Index)),
- "_") :
+ foreach Index = !range(!sub(33, Size)) in {
+ def !interleave(!foreach(cur, !range(Size), "sub"#!add(cur, Index)), "_") :
SubRegIndex<!mul(Size, 32), !shl(Index, 5)> {
let CoveringSubRegIndices =
- !foreach(cur, Indexes<Size>.slice,
- !cast<SubRegIndex>(sub#!add(cur, Index)));
+ !foreach(cur, !range(Size), !cast<SubRegIndex>(sub#!add(cur, Index)));
}
}
}
@@ -150,10 +138,14 @@ class SIRegisterClass <string n, list<ValueType> rTypes, int Align, dag rList>
// For scalar register classes.
field bit HasSGPR = 0;
+ // Alignment of the first register in tuple (in 32-bit units).
+ field int RegTupleAlignUnits = 1;
+
// These need to be kept in sync with the enum SIRCFlags.
- let TSFlags{0} = HasVGPR;
- let TSFlags{1} = HasAGPR;
- let TSFlags{2} = HasSGPR;
+ let TSFlags{1-0} = RegTupleAlignUnits;
+ let TSFlags{2} = HasVGPR;
+ let TSFlags{3} = HasAGPR;
+ let TSFlags{4} = HasSGPR;
}
multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1,
@@ -421,7 +413,7 @@ def SGPR_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">;
// SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs.
-def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 3, 3, "s">;
+def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 4, 3, "s">;
// SGPR 128-bit registers
def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">;
@@ -774,7 +766,7 @@ def SReg_LO16 : SIRegisterClass<"AMDGPU", [i16, f16], 16,
SRC_PRIVATE_LIMIT_HI_LO16, SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16,
SRC_EXECZ_LO16, SRC_SCC_LO16, EXEC_LO_LO16, EXEC_HI_LO16, M0_CLASS_LO16)> {
let Size = 16;
- let AllocationPriority = 0;
+ let isAllocatable = 0;
let BaseClassOrder = 16;
}
@@ -817,6 +809,21 @@ def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16],
let HasSGPR = 1;
}
+// CCR (call clobbered registers) SGPR 64-bit registers
+def CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32, (add (trunc SGPR_64, 15))> {
+ let CopyCost = SGPR_64.CopyCost;
+ let AllocationPriority = SGPR_64.AllocationPriority;
+ let HasSGPR = 1;
+}
+
+// Call clobbered 64-bit SGPRs for AMDGPU_Gfx CC
+def Gfx_CCR_SGPR_64 : SIRegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
+ (add (trunc (shl SGPR_64, 18), 14))> { // s[36:37]-s[s62:63]
+ let CopyCost = SGPR_64.CopyCost;
+ let AllocationPriority = SGPR_64.AllocationPriority;
+ let HasSGPR = 1;
+}
+
def TTMP_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
(add TTMP_64Regs)> {
let isAllocatable = 0;
@@ -931,6 +938,7 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)> {
// Give aligned class higher priority in base class resolution
let BaseClassOrder = !sub(!mul(numRegs, 32), 1);
+ let RegTupleAlignUnits = 2;
}
}
}
@@ -965,6 +973,7 @@ multiclass ARegClass<int numRegs, list<ValueType> regTypes, dag regList> {
def _Align2 : VRegClassBase<numRegs, regTypes, (decimate regList, 2)> {
// Give aligned class higher priority in base class resolution
let BaseClassOrder = !sub(!mul(numRegs, 32), 1);
+ let RegTupleAlignUnits = 2;
}
}
}
@@ -1033,10 +1042,12 @@ multiclass AVRegClass<int numRegs, list<ValueType> regTypes,
// Define the regular class.
def "" : VRegClassBase<numRegs, regTypes, (add vregList, aregList)>;
- // Define 2-aligned variant
+ // Define 2-aligned variant
def _Align2 : VRegClassBase<numRegs, regTypes,
(add (decimate vregList, 2),
- (decimate aregList, 2))>;
+ (decimate aregList, 2))> {
+ let RegTupleAlignUnits = 2;
+ }
}
}
@@ -1066,185 +1077,123 @@ class RegImmMatcher<string name> : AsmOperandClass {
let RenderMethod = "addRegOrImmOperands";
}
-// For VOP1,2,C True16 instructions. Uses first 128 32-bit VGPRs only
-multiclass SIRegOperand16 <string rc, string MatchName, string opType,
- string rc_suffix = "_32"> {
- let OperandNamespace = "AMDGPU" in {
- def _b16_Lo128 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix#"_Lo128")> {
- let OperandType = opType#"_INT16";
- let ParserMatchClass = RegImmMatcher<MatchName#"B16_Lo128">;
- let DecoderMethod = "decodeOperand_VSrc16";
- }
-
- def _f16_Lo128 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix#"_Lo128")> {
- let OperandType = opType#"_FP16";
- let ParserMatchClass = RegImmMatcher<MatchName#"F16_Lo128">;
- let DecoderMethod = "decodeOperand_" # rc # "_16";
- }
- }
-}
-
-
-multiclass SIRegOperand32 <string rc, string MatchName, string opType,
- string rc_suffix = "_32"> {
- let OperandNamespace = "AMDGPU" in {
- def _b16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_INT16";
- let ParserMatchClass = RegImmMatcher<MatchName#"B16">;
- let DecoderMethod = "decodeOperand_VSrc16";
- }
-
- def _f16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_FP16";
- let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
- let DecoderMethod = "decodeOperand_" # rc # "_16";
- }
+class RegOrImmOperand <string RegisterClassName, string OperandTypeName,
+ string ParserMatchClassName, string decoderImmSize>
+ : RegisterOperand<!cast<RegisterClass>(RegisterClassName)> {
+ let OperandNamespace = "AMDGPU";
+ let OperandType = OperandTypeName;
+ let ParserMatchClass = RegImmMatcher<ParserMatchClassName>;
+ let DecoderMethod = "decodeOperand_" # RegisterClassName # decoderImmSize;
+ }
- def _b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_INT32";
- let ParserMatchClass = RegImmMatcher<MatchName#"B32">;
- let DecoderMethod = "decodeOperand_" # rc # rc_suffix;
- }
+class RegOrB16 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT16",
+ !subst("_b16", "B16", NAME), "_Imm16">;
- def _f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_FP32";
- let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
- let DecoderMethod = "decodeOperand_" # rc # rc_suffix;
- }
+class RegOrF16 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16",
+ !subst("_f16", "F16", NAME), "_Imm16">;
- def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_V2INT16";
- let ParserMatchClass = RegImmMatcher<MatchName#"V2B16">;
- let DecoderMethod = "decodeOperand_VSrcV216";
- }
+class RegOrB32 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT32",
+ !subst("_b32", "B32", NAME), "_Imm32">;
- def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_V2FP16";
- let ParserMatchClass = RegImmMatcher<MatchName#"V2F16">;
- let DecoderMethod = "decodeOperand_VSrcV216";
- }
- }
-}
+class RegOrF32 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP32",
+ !subst("_f32", "F32", NAME), "_Imm32">;
-multiclass SIRegOperand64 <string rc, string MatchName, string opType,
- string rc_suffix = "_64", bit Vectors = 1> {
- let OperandNamespace = "AMDGPU" in {
- def _b64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_INT64";
- let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
- }
+class RegOrV2B16 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT16",
+ !subst("_v2b16", "V2B16", NAME), "_Imm16">;
- def _f64 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_FP64";
- let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
- }
+class RegOrV2F16 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2FP16",
+ !subst("_v2f16", "V2F16", NAME), "_Imm16">;
- if Vectors then
- def _v2f32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_V2FP32";
- let ParserMatchClass = RegImmMatcher<MatchName#"V2FP32">;
- let DecoderMethod = "decodeOperand_VSrcV232";
- }
- if Vectors then
- def _v2b32 : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_V2INT32";
- let ParserMatchClass = RegImmMatcher<MatchName#"V2INT32">;
- let DecoderMethod = "decodeOperand_VSrcV232";
- }
- }
-}
+class RegOrF64 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP64",
+ !subst("_f64", "F64", NAME), "_Imm64">;
-multiclass SIRegOperand <string rc, string MatchName, string opType> :
- SIRegOperand32<rc, MatchName, opType>,
- SIRegOperand64<rc, MatchName, opType>;
+class RegOrB64 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT64",
+ !subst("_b64", "B64", NAME), "_Imm64">;
-// FIXME: 64-bit sources can sometimes use 32-bit constants.
-multiclass RegImmOperand <string rc, string MatchName>
- : SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
+class RegOrV2F32 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2FP32",
+ !subst("_v2f32", "V2FP32", NAME), "_Imm32">;
-multiclass RegInlineOperand <string rc, string MatchName>
- : SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">;
+class RegOrV2B32 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_V2INT32",
+ !subst("_v2b32", "V2INT32", NAME), "_Imm32">;
-multiclass RegInlineOperand32 <string rc, string MatchName,
- string rc_suffix = "_32">
- : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;
+// For VOP1,2,C True16 instructions. _Lo128 use first 128 32-bit VGPRs only.
+class RegOrB16_Lo128 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_INT16",
+ !subst("_b16_Lo128", "B16_Lo128", NAME), "_Imm16">;
-multiclass RegInlineOperand64 <string rc, string MatchName,
- string rc_suffix = "_64">
- : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_C", rc_suffix>;
+class RegOrF16_Lo128 <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16",
+ !subst("_f16_Lo128", "F16_Lo128", NAME), "_Imm16">;
-multiclass RegInlineOperandAC <string rc, string MatchName,
- string rc_suffix = "_32">
- : SIRegOperand32<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix>;
+// Deferred operands
+class RegOrF16_Deferred <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16_DEFERRED",
+ !subst("_f16_Deferred", "F16", NAME), "_Deferred_Imm16">;
-multiclass RegInlineOperandAC64 <string rc, string MatchName,
- string rc_suffix = "_64">
- : SIRegOperand64<rc, MatchName, "OPERAND_REG_INLINE_AC", rc_suffix, 0>;
+class RegOrF32_Deferred <string RegisterClass, string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP32_DEFERRED",
+ !subst("_f32_Deferred", "F32", NAME), "_Deferred_Imm32">;
+class RegOrF16_Lo128_Deferred <string RegisterClass,
+ string OperandTypePrefix>
+ : RegOrImmOperand <RegisterClass, OperandTypePrefix # "_FP16_DEFERRED",
+ !subst("_f16_Lo128_Deferred", "F16_Lo128", NAME),
+ "_Deferred_Imm16">;
//===----------------------------------------------------------------------===//
// SSrc_* Operands with an SGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
-defm SSrc : RegImmOperand<"SReg", "SSrc">;
+def SSrc_b32 : RegOrB32 <"SReg_32", "OPERAND_REG_IMM">;
+def SSrc_f32 : RegOrF32 <"SReg_32", "OPERAND_REG_IMM">;
+def SSrc_b64 : RegOrB64 <"SReg_64", "OPERAND_REG_IMM">;
-def SSrcOrLds_b32 : RegisterOperand<SRegOrLds_32> {
- let OperandNamespace = "AMDGPU";
- let OperandType = "OPERAND_REG_IMM_INT32";
- let ParserMatchClass = RegImmMatcher<"SSrcOrLdsB32">;
-}
+def SSrcOrLds_b32 : RegOrB32 <"SRegOrLds_32", "OPERAND_REG_IMM">;
//===----------------------------------------------------------------------===//
// SCSrc_* Operands with an SGPR or a inline constant
//===----------------------------------------------------------------------===//
-defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ;
+def SCSrc_b32 : RegOrB32 <"SReg_32", "OPERAND_REG_INLINE_C">;
+def SCSrc_b64 : RegOrB64 <"SReg_64", "OPERAND_REG_INLINE_C">;
//===----------------------------------------------------------------------===//
// VSrc_* Operands with an SGPR, VGPR or a 32-bit immediate
//===----------------------------------------------------------------------===//
-defm VSrc : RegImmOperand<"VS", "VSrc">;
-defm VSrcT : SIRegOperand16<"VS", "VSrcT", "OPERAND_REG_IMM">;
+def VSrc_b16 : RegOrB16 <"VS_32", "OPERAND_REG_IMM">;
+def VSrc_f16 : RegOrF16 <"VS_32", "OPERAND_REG_IMM">;
+def VSrc_b32 : RegOrB32 <"VS_32", "OPERAND_REG_IMM">;
+def VSrc_f32 : RegOrF32 <"VS_32", "OPERAND_REG_IMM">;
+def VSrc_v2b16 : RegOrV2B16 <"VS_32", "OPERAND_REG_IMM">;
+def VSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_IMM">;
+def VSrc_b64 : RegOrB64 <"VS_64", "OPERAND_REG_IMM">;
+def VSrc_f64 : RegOrF64 <"VS_64", "OPERAND_REG_IMM">;
+def VSrc_v2b32 : RegOrV2B32 <"VS_64", "OPERAND_REG_IMM">;
+def VSrc_v2f32 : RegOrV2F32 <"VS_64", "OPERAND_REG_IMM">;
-def VSrc_128 : RegisterOperand<VReg_128> {
- let DecoderMethod = "DecodeVS_128RegisterClass";
-}
+def VSrcT_b16_Lo128 : RegOrB16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
+def VSrcT_f16_Lo128 : RegOrF16_Lo128 <"VS_32_Lo128", "OPERAND_REG_IMM">;
//===----------------------------------------------------------------------===//
// VSrc_*_Deferred Operands with an SGPR, VGPR or a 32-bit immediate for use
// with FMAMK/FMAAK
//===----------------------------------------------------------------------===//
-multiclass SIRegOperand16_Deferred <string rc, string MatchName, string opType,
- string rc_suffix = "_32"> {
- let OperandNamespace = "AMDGPU" in {
- def _f16_Lo128_Deferred : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix#"_Lo128")> {
- let OperandType = opType#"_FP16_DEFERRED";
- let ParserMatchClass = RegImmMatcher<MatchName#"F16_Lo128">;
- let DecoderMethod = "decodeOperand_" # rc # "_16_Deferred";
- }
- }
-}
-
-multiclass SIRegOperand32_Deferred <string rc, string MatchName, string opType,
- string rc_suffix = "_32"> {
- let OperandNamespace = "AMDGPU" in {
- def _f16_Deferred : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_FP16_DEFERRED";
- let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
- let DecoderMethod = "decodeOperand_" # rc # "_16_Deferred";
- }
-
- def _f32_Deferred : RegisterOperand<!cast<RegisterClass>(rc#rc_suffix)> {
- let OperandType = opType#"_FP32_DEFERRED";
- let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
- let DecoderMethod = "decodeOperand_" # rc # "_32_Deferred";
- }
- }
-}
+def VSrc_f16_Deferred : RegOrF16_Deferred<"VS_32", "OPERAND_REG_IMM">;
+def VSrc_f32_Deferred : RegOrF32_Deferred<"VS_32", "OPERAND_REG_IMM">;
-defm VSrc : SIRegOperand32_Deferred<"VS", "VSrc", "OPERAND_REG_IMM">;
-defm VSrcT : SIRegOperand16_Deferred<"VS", "VSrcT", "OPERAND_REG_IMM">;
+def VSrcT_f16_Lo128_Deferred : RegOrF16_Lo128_Deferred<"VS_32_Lo128",
+ "OPERAND_REG_IMM">;
//===----------------------------------------------------------------------===//
// VRegSrc_* Operands with a VGPR
@@ -1253,8 +1202,7 @@ defm VSrcT : SIRegOperand16_Deferred<"VS", "VSrcT", "OPERAND_REG_IMM">;
// This is for operands with the enum(9), VSrc encoding restriction,
// but only allows VGPRs.
def VRegSrc_32 : RegisterOperand<VGPR_32> {
- //let ParserMatchClass = RegImmMatcher<"VRegSrc32">;
- let DecoderMethod = "DecodeVS_32RegisterClass";
+ let DecoderMethod = "decodeOperand_VGPR_32";
}
def VRegSrc_64 : RegisterOperand<VReg_64> {
@@ -1269,6 +1217,10 @@ def VRegSrc_256 : RegisterOperand<VReg_256> {
let DecoderMethod = "decodeOperand_VReg_256";
}
+def VRegOrLdsSrc_32 : RegisterOperand<VRegOrLds_32> {
+ let DecoderMethod = "decodeOperand_VRegOrLds_32";
+}
+
//===----------------------------------------------------------------------===//
// VGPRSrc_*
//===----------------------------------------------------------------------===//
@@ -1286,7 +1238,7 @@ def VGPRSrc_32_Lo128 : RegisterOperand<VGPR_32_Lo128> {
//===----------------------------------------------------------------------===//
def ARegSrc_32 : RegisterOperand<AGPR_32> {
- let DecoderMethod = "DecodeAGPR_32RegisterClass";
+ let DecoderMethod = "decodeOperand_AGPR_32";
let EncoderMethod = "getAVOperandEncoding";
}
@@ -1294,38 +1246,42 @@ def ARegSrc_32 : RegisterOperand<AGPR_32> {
// VCSrc_* Operands with an SGPR, VGPR or an inline constant
//===----------------------------------------------------------------------===//
-defm VCSrc : RegInlineOperand<"VS", "VCSrc">;
-defm VCSrcT : SIRegOperand16<"VS", "VCSrcT", "OPERAND_REG_INLINE_C">;
+def VCSrc_b16 : RegOrB16 <"VS_32", "OPERAND_REG_INLINE_C">;
+def VCSrc_f16 : RegOrF16 <"VS_32", "OPERAND_REG_INLINE_C">;
+def VCSrc_b32 : RegOrB32 <"VS_32", "OPERAND_REG_INLINE_C">;
+def VCSrc_f32 : RegOrF32 <"VS_32", "OPERAND_REG_INLINE_C">;
+def VCSrc_v2b16 : RegOrV2B16 <"VS_32", "OPERAND_REG_INLINE_C">;
+def VCSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_INLINE_C">;
//===----------------------------------------------------------------------===//
// VISrc_* Operands with a VGPR or an inline constant
//===----------------------------------------------------------------------===//
-defm VISrc : RegInlineOperand32<"VGPR", "VISrc">;
-let DecoderMethod = "decodeOperand_VReg_64" in
-defm VISrc_64 : RegInlineOperand64<"VReg", "VISrc_64", "_64">;
-defm VISrc_128 : RegInlineOperandAC<"VReg", "VISrc_128", "_128">;
-let DecoderMethod = "decodeOperand_VReg_256" in
-defm VISrc_256 : RegInlineOperand64<"VReg", "VISrc_256", "_256">;
-defm VISrc_512 : RegInlineOperandAC<"VReg", "VISrc_512", "_512">;
-defm VISrc_1024 : RegInlineOperandAC<"VReg", "VISrc_1024", "_1024">;
+def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">;
+def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">;
+def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">;
+def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">;
+def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">;
+def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">;
+def VISrc_1024_b32 : RegOrB32 <"VReg_1024", "OPERAND_REG_INLINE_C">;
+def VISrc_1024_f32 : RegOrF32 <"VReg_1024", "OPERAND_REG_INLINE_C">;
//===----------------------------------------------------------------------===//
// AVSrc_*, AVDst_*, AVLdSt_* Operands with an AGPR or VGPR
//===----------------------------------------------------------------------===//
def AVSrc_32 : RegisterOperand<AV_32> {
- let DecoderMethod = "DecodeAV_32RegisterClass";
+ let DecoderMethod = "decodeOperand_AV_32";
let EncoderMethod = "getAVOperandEncoding";
}
def AVSrc_64 : RegisterOperand<AV_64> {
- let DecoderMethod = "DecodeAV_64RegisterClass";
+ let DecoderMethod = "decodeOperand_AV_64";
let EncoderMethod = "getAVOperandEncoding";
}
def AVSrc_128 : RegisterOperand<AV_128> {
- let DecoderMethod = "DecodeAV_128RegisterClass";
+ let DecoderMethod = "decodeOperand_AV_128";
let EncoderMethod = "getAVOperandEncoding";
}
@@ -1368,12 +1324,11 @@ def AVLdSt_160 : RegisterOperand<AV_160> {
// ACSrc_* Operands with an AGPR or an inline constant
//===----------------------------------------------------------------------===//
-defm AISrc : RegInlineOperandAC<"AGPR", "AISrc">;
-defm AISrc_128 : RegInlineOperandAC<"AReg", "AISrc_128", "_128">;
-defm AISrc_512 : RegInlineOperandAC<"AReg", "AISrc_512", "_512">;
-defm AISrc_1024 : RegInlineOperandAC<"AReg", "AISrc_1024", "_1024">;
-
-let DecoderMethod = "decodeOperand_AReg_64" in
-defm AISrc_64 : RegInlineOperandAC64<"AReg", "AISrc_64", "_64">;
-let DecoderMethod = "decodeOperand_AReg_256" in
-defm AISrc_256 : RegInlineOperandAC64<"AReg", "AISrc_256", "_256">;
+def AISrc_64_f64 : RegOrF64 <"AReg_64", "OPERAND_REG_INLINE_AC">;
+def AISrc_128_f32 : RegOrF32 <"AReg_128", "OPERAND_REG_INLINE_AC">;
+def AISrc_128_b32 : RegOrB32 <"AReg_128", "OPERAND_REG_INLINE_AC">;
+def AISrc_256_f64 : RegOrF64 <"AReg_256", "OPERAND_REG_INLINE_AC">;
+def AISrc_512_f32 : RegOrF32 <"AReg_512", "OPERAND_REG_INLINE_AC">;
+def AISrc_512_b32 : RegOrB32 <"AReg_512", "OPERAND_REG_INLINE_AC">;
+def AISrc_1024_f32 : RegOrF32 <"AReg_1024", "OPERAND_REG_INLINE_AC">;
+def AISrc_1024_b32 : RegOrB32 <"AReg_1024", "OPERAND_REG_INLINE_AC">;
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index bec07d990380..4159dc694c1e 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -161,14 +161,12 @@ bool SIShrinkInstructions::shouldShrinkTrue16(MachineInstr &MI) const {
bool SIShrinkInstructions::isKImmOperand(const MachineOperand &Src) const {
return isInt<16>(Src.getImm()) &&
- !TII->isInlineConstant(*Src.getParent(),
- Src.getParent()->getOperandNo(&Src));
+ !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}
bool SIShrinkInstructions::isKUImmOperand(const MachineOperand &Src) const {
return isUInt<16>(Src.getImm()) &&
- !TII->isInlineConstant(*Src.getParent(),
- Src.getParent()->getOperandNo(&Src));
+ !TII->isInlineConstant(*Src.getParent(), Src.getOperandNo());
}
bool SIShrinkInstructions::isKImmOrKUImmOperand(const MachineOperand &Src,
@@ -310,7 +308,10 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
unsigned NextVgpr = 0;
bool IsUndef = true;
bool IsKill = NewAddrDwords == Info->VAddrDwords;
- for (unsigned Idx = 0; Idx < Info->VAddrOperands; ++Idx) {
+ const unsigned NSAMaxSize = ST->getNSAMaxSize();
+ const bool IsPartialNSA = NewAddrDwords > NSAMaxSize;
+ const unsigned EndVAddr = IsPartialNSA ? NSAMaxSize : Info->VAddrOperands;
+ for (unsigned Idx = 0; Idx < EndVAddr; ++Idx) {
const MachineOperand &Op = MI.getOperand(VAddr0Idx + Idx);
unsigned Vgpr = TRI->getHWRegIndex(Op.getReg());
unsigned Dwords = TRI->getRegSizeInBits(Op.getReg(), *MRI) / 32;
@@ -363,13 +364,13 @@ void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) const {
MI.getOperand(VAddr0Idx).setIsUndef(IsUndef);
MI.getOperand(VAddr0Idx).setIsKill(IsKill);
- for (int i = 1; i < Info->VAddrOperands; ++i)
+ for (unsigned i = 1; i < EndVAddr; ++i)
MI.removeOperand(VAddr0Idx + 1);
if (ToUntie >= 0) {
MI.tieOperands(
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata),
- ToUntie - (Info->VAddrOperands - 1));
+ ToUntie - (EndVAddr - 1));
}
}
@@ -475,7 +476,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const {
}
}
-/// Attempt to shink AND/OR/XOR operations requiring non-inlineable literals.
+/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
/// XNOR (as a ^ b == ~(a ^ ~b)).
@@ -497,7 +498,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
if (Opc == AMDGPU::S_AND_B32) {
if (isPowerOf2_32(~Imm)) {
- NewImm = countTrailingOnes(Imm);
+ NewImm = llvm::countr_one(Imm);
Opc = AMDGPU::S_BITSET0_B32;
} else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
NewImm = ~Imm;
@@ -505,7 +506,7 @@ bool SIShrinkInstructions::shrinkScalarLogicOp(MachineInstr &MI) const {
}
} else if (Opc == AMDGPU::S_OR_B32) {
if (isPowerOf2_32(Imm)) {
- NewImm = countTrailingZeros(Imm);
+ NewImm = llvm::countr_zero(Imm);
Opc = AMDGPU::S_BITSET1_B32;
} else if (AMDGPU::isInlinableLiteral32(~Imm, ST->hasInv2PiInlineImm())) {
NewImm = ~Imm;
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 4d6669f8f94d..3143d437e370 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -158,10 +158,11 @@ private:
MachinePostDominatorTree *PDT;
unsigned AndOpc;
+ unsigned AndTermOpc;
unsigned AndN2Opc;
unsigned XorOpc;
unsigned AndSaveExecOpc;
- unsigned OrSaveExecOpc;
+ unsigned AndSaveExecTermOpc;
unsigned WQMOpc;
Register Exec;
Register LiveMaskReg;
@@ -380,8 +381,8 @@ void SIWholeQuadMode::markDefs(const MachineInstr &UseMI, LiveRange &LR,
if (Reg.isVirtual()) {
// Iterate over all operands to find relevant definitions
bool HasDef = false;
- for (const MachineOperand &Op : MI->operands()) {
- if (!(Op.isReg() && Op.isDef() && Op.getReg() == Reg))
+ for (const MachineOperand &Op : MI->all_defs()) {
+ if (Op.getReg() != Reg)
continue;
// Compute lanes defined and overlap with use
@@ -453,14 +454,13 @@ void SIWholeQuadMode::markOperand(const MachineInstr &MI,
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
- for (MCRegUnitIterator RegUnit(Reg.asMCReg(), TRI); RegUnit.isValid();
- ++RegUnit) {
- LiveRange &LR = LIS->getRegUnit(*RegUnit);
+ for (MCRegUnit Unit : TRI->regunits(Reg.asMCReg())) {
+ LiveRange &LR = LIS->getRegUnit(Unit);
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
if (!Value)
continue;
- markDefs(MI, LR, *RegUnit, AMDGPU::NoSubRegister, Flag, Worklist);
+ markDefs(MI, LR, Unit, AMDGPU::NoSubRegister, Flag, Worklist);
}
}
}
@@ -471,11 +471,8 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
LLVM_DEBUG(dbgs() << "markInstructionUses " << PrintState(Flag) << ": "
<< MI);
- for (const MachineOperand &Use : MI.uses()) {
- if (!Use.isReg() || !Use.isUse())
- continue;
+ for (const MachineOperand &Use : MI.all_uses())
markOperand(MI, Use, Flag, Worklist);
- }
}
// Scan instructions to determine which ones require an Exact execmask and
@@ -1139,7 +1136,7 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
return PreferLast ? Last : First;
LiveRange &LR =
- LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI));
+ LIS->getRegUnit(*TRI->regunits(MCRegister::from(AMDGPU::SCC)).begin());
auto MBBE = MBB.end();
SlotIndex FirstIdx = First != MBBE ? LIS->getInstructionIndex(*First)
: LIS->getMBBEndIdx(&MBB);
@@ -1185,11 +1182,9 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
// does not need to be preserved.
while (MBBI != Last) {
bool IsExecDef = false;
- for (const MachineOperand &MO : MBBI->operands()) {
- if (MO.isReg() && MO.isDef()) {
- IsExecDef |=
- MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
- }
+ for (const MachineOperand &MO : MBBI->all_defs()) {
+ IsExecDef |=
+ MO.getReg() == AMDGPU::EXEC_LO || MO.getReg() == AMDGPU::EXEC;
}
if (!IsExecDef)
break;
@@ -1206,13 +1201,25 @@ MachineBasicBlock::iterator SIWholeQuadMode::prepareInsertion(
void SIWholeQuadMode::toExact(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before,
Register SaveWQM) {
+ bool IsTerminator = Before == MBB.end();
+ if (!IsTerminator) {
+ auto FirstTerm = MBB.getFirstTerminator();
+ if (FirstTerm != MBB.end()) {
+ SlotIndex FirstTermIdx = LIS->getInstructionIndex(*FirstTerm);
+ SlotIndex BeforeIdx = LIS->getInstructionIndex(*Before);
+ IsTerminator = BeforeIdx > FirstTermIdx;
+ }
+ }
+
MachineInstr *MI;
if (SaveWQM) {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndSaveExecOpc), SaveWQM)
+ unsigned Opcode = IsTerminator ? AndSaveExecTermOpc : AndSaveExecOpc;
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), SaveWQM)
.addReg(LiveMaskReg);
} else {
- MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AndOpc), Exec)
+ unsigned Opcode = IsTerminator ? AndTermOpc : AndOpc;
+ MI = BuildMI(MBB, Before, DebugLoc(), TII->get(Opcode), Exec)
.addReg(Exec)
.addReg(LiveMaskReg);
}
@@ -1365,7 +1372,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
Needs = StateExact | StateWQM | StateStrict;
}
- if (MI.isTerminator() && OutNeeds == StateExact)
+ // Exact mode exit can occur in terminators, but must be before branches.
+ if (MI.isBranch() && OutNeeds == StateExact)
Needs = StateExact;
++Next;
@@ -1539,7 +1547,11 @@ void SIWholeQuadMode::lowerCopyInstrs() {
assert(MI->getNumExplicitOperands() == 2);
}
- MI->setDesc(TII->get(AMDGPU::COPY));
+ unsigned CopyOp = MI->getOperand(1).isReg()
+ ? (unsigned)AMDGPU::COPY
+ : TII->getMovOpcode(TRI->getRegClassForOperandReg(
+ *MRI, MI->getOperand(0)));
+ MI->setDesc(TII->get(CopyOp));
}
}
@@ -1587,18 +1599,20 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (ST->isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
+ AndTermOpc = AMDGPU::S_AND_B32_term;
AndN2Opc = AMDGPU::S_ANDN2_B32;
XorOpc = AMDGPU::S_XOR_B32;
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
- OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B32;
+ AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B32_term;
WQMOpc = AMDGPU::S_WQM_B32;
Exec = AMDGPU::EXEC_LO;
} else {
AndOpc = AMDGPU::S_AND_B64;
+ AndTermOpc = AMDGPU::S_AND_B64_term;
AndN2Opc = AMDGPU::S_ANDN2_B64;
XorOpc = AMDGPU::S_XOR_B64;
AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
- OrSaveExecOpc = AMDGPU::S_OR_SAVEEXEC_B64;
+ AndSaveExecTermOpc = AMDGPU::S_AND_SAVEEXEC_B64_term;
WQMOpc = AMDGPU::S_WQM_B64;
Exec = AMDGPU::EXEC;
}
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index f271f6d42857..7ca685a0cc5d 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -6,22 +6,12 @@
//
//===----------------------------------------------------------------------===//
-def smrd_offset_8 : NamedOperandU32<"SMRDOffset8",
- NamedMatchClass<"SMRDOffset8">> {
- let OperandType = "OPERAND_IMMEDIATE";
-}
-
-class SMEMOffset : NamedOperandU32<"SMEMOffset",
- NamedMatchClass<"SMEMOffset">> {
- let OperandType = "OPERAND_IMMEDIATE";
- let EncoderMethod = "getSMEMOffsetEncoding";
- let DecoderMethod = "decodeSMEMOffset";
-}
+def smrd_offset_8 : ImmOperand<i32, "SMRDOffset8", 1>;
-def smem_offset : SMEMOffset;
-
-def smem_offset_mod : SMEMOffset {
- let PrintMethod = "printSMEMOffsetMod";
+let EncoderMethod = "getSMEMOffsetEncoding",
+ DecoderMethod = "decodeSMEMOffset" in {
+def smem_offset : ImmOperand<i32, "SMEMOffset", 1>;
+def smem_offset_mod : NamedIntOperand<i32, "offset", "SMEMOffsetMod">;
}
//===----------------------------------------------------------------------===//
@@ -124,6 +114,7 @@ class SM_Load_Pseudo <string opName, RegisterClass baseClass,
" $sdst, $sbase, " # offsets.Asm # "$cpol", []> {
RegisterClass BaseClass = baseClass;
let mayLoad = 1;
+ let isReMaterializable = 1;
let mayStore = 0;
let has_glc = 1;
let has_dlc = 1;
@@ -138,7 +129,6 @@ class SM_Store_Pseudo <string opName, RegisterClass baseClass,
offsets.Ins, (ins CPol:$cpol)),
" $sdata, $sbase, " # offsets.Asm # "$cpol"> {
RegisterClass BaseClass = baseClass;
- RegisterClass SrcClass = srcClass;
let mayLoad = 0;
let mayStore = 1;
let has_glc = 1;
@@ -163,23 +153,24 @@ class SM_Discard_Pseudo <string opName, OffsetMode offsets>
let PseudoInstr = opName # offsets.Variant;
}
-multiclass SM_Pseudo_Loads<string opName,
- RegisterClass baseClass,
+multiclass SM_Pseudo_Loads<RegisterClass baseClass,
RegisterClass dstClass> {
+ defvar opName = !tolower(NAME);
def _IMM : SM_Load_Pseudo <opName, baseClass, dstClass, IMM_Offset>;
def _SGPR : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_Offset>;
def _SGPR_IMM : SM_Load_Pseudo <opName, baseClass, dstClass, SGPR_IMM_Offset>;
}
-multiclass SM_Pseudo_Stores<string opName,
- RegisterClass baseClass,
- RegisterClass srcClass> {
+multiclass SM_Pseudo_Stores<RegisterClass baseClass,
+ RegisterClass srcClass> {
+ defvar opName = !tolower(NAME);
def _IMM : SM_Store_Pseudo <opName, baseClass, srcClass, IMM_Offset>;
def _SGPR : SM_Store_Pseudo <opName, baseClass, srcClass, SGPR_Offset>;
def _SGPR_IMM : SM_Store_Pseudo <opName, baseClass, srcClass, SGPR_IMM_Offset>;
}
-multiclass SM_Pseudo_Discards<string opName> {
+multiclass SM_Pseudo_Discards {
+ defvar opName = !tolower(NAME);
def _IMM : SM_Discard_Pseudo <opName, IMM_Offset>;
def _SGPR : SM_Discard_Pseudo <opName, SGPR_Offset>;
def _SGPR_IMM : SM_Discard_Pseudo <opName, SGPR_IMM_Offset>;
@@ -204,7 +195,8 @@ class SM_Inval_Pseudo <string opName, SDPatternOperator node = null_frag> : SM_P
let has_sbase = 0;
}
-multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> {
+multiclass SM_Pseudo_Probe<RegisterClass baseClass> {
+ defvar opName = !tolower(NAME);
def _IMM : SM_Probe_Pseudo <opName, baseClass, IMM_Offset>;
def _SGPR : SM_Probe_Pseudo <opName, baseClass, SGPR_Offset>;
def _SGPR_IMM : SM_Probe_Pseudo <opName, baseClass, SGPR_IMM_Offset>;
@@ -270,9 +262,9 @@ class SM_Pseudo_Atomic<string opName,
let DisableEncoding = !if(isRet, "$sdata", "");
}
-multiclass SM_Pseudo_Atomics<string opName,
- RegisterClass baseClass,
+multiclass SM_Pseudo_Atomics<RegisterClass baseClass,
RegisterClass dataClass> {
+ defvar opName = !tolower(NAME);
def _IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, IMM_Offset, 0>;
def _SGPR : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_Offset, 0>;
def _SGPR_IMM : SM_Pseudo_Atomic <opName, baseClass, dataClass, SGPR_IMM_Offset, 0>;
@@ -291,53 +283,31 @@ multiclass SM_Pseudo_Atomics<string opName,
// XXX - SMEM instructions do not allow exec for data operand, but
// does sdst for SMRD on SI/CI?
-defm S_LOAD_DWORD : SM_Pseudo_Loads <"s_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_load_dwordx2", SReg_64, SReg_64_XEXEC>;
-defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_load_dwordx4", SReg_64, SReg_128>;
-defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <"s_load_dwordx8", SReg_64, SReg_256>;
-defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <"s_load_dwordx16", SReg_64, SReg_512>;
+defm S_LOAD_DWORD : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_64, SReg_64_XEXEC>;
+defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_64, SReg_512>;
let is_buffer = 1 in {
-defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <
- "s_buffer_load_dword", SReg_128, SReg_32_XM0_XEXEC
->;
-
+defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <SReg_128, SReg_32_XM0_XEXEC>;
// FIXME: exec_lo/exec_hi appear to be allowed for SMRD loads on
// SI/CI, bit disallowed for SMEM on VI.
-defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <
- "s_buffer_load_dwordx2", SReg_128, SReg_64_XEXEC
->;
-
-defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <
- "s_buffer_load_dwordx4", SReg_128, SReg_128
->;
-
-defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <
- "s_buffer_load_dwordx8", SReg_128, SReg_256
->;
-
-defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <
- "s_buffer_load_dwordx16", SReg_128, SReg_512
->;
+defm S_BUFFER_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_128, SReg_128>;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <SReg_128, SReg_256>;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <SReg_128, SReg_512>;
}
let SubtargetPredicate = HasScalarStores in {
-defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>;
-defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>;
+defm S_STORE_DWORD : SM_Pseudo_Stores <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_64, SReg_64_XEXEC>;
+defm S_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_64, SReg_128>;
let is_buffer = 1 in {
-defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <
- "s_buffer_store_dword", SReg_128, SReg_32_XM0_XEXEC
->;
-
-defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <
- "s_buffer_store_dwordx2", SReg_128, SReg_64_XEXEC
->;
-
-defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <
- "s_buffer_store_dwordx4", SReg_128, SReg_128
->;
+defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_128, SReg_128>;
}
} // End SubtargetPredicate = HasScalarStores
@@ -355,9 +325,9 @@ def S_DCACHE_WB : SM_Inval_Pseudo <"s_dcache_wb", int_amdgcn_s_dcache_wb>;
def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb_vol>;
} // End OtherPredicates = [HasScalarStores]
-defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>;
+defm S_ATC_PROBE : SM_Pseudo_Probe <SReg_64>;
let is_buffer = 1 in {
-defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>;
+defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <SReg_128>;
}
} // SubtargetPredicate = isGFX8Plus
@@ -371,80 +341,80 @@ def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", i
let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in {
-defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <"s_scratch_load_dwordx2", SReg_64, SReg_64_XEXEC>;
-defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_scratch_load_dwordx4", SReg_64, SReg_128>;
+defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Pseudo_Loads <SReg_64, SReg_64_XEXEC>;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Pseudo_Loads <SReg_64, SReg_128>;
-defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <"s_scratch_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <"s_scratch_store_dwordx2", SReg_64, SReg_64_XEXEC>;
-defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg_64, SReg_128>;
+defm S_SCRATCH_STORE_DWORD : SM_Pseudo_Stores <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Pseudo_Stores <SReg_64, SReg_64_XEXEC>;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <SReg_64, SReg_128>;
} // SubtargetPredicate = HasScalarFlatScratchInsts
let SubtargetPredicate = HasScalarAtomics in {
let is_buffer = 1 in {
-defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_buffer_atomic_swap", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <"s_buffer_atomic_add", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics <"s_buffer_atomic_sub", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_buffer_atomic_smin", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_buffer_atomic_umin", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_buffer_atomic_smax", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_buffer_atomic_umax", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <"s_buffer_atomic_and", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <"s_buffer_atomic_or", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <"s_buffer_atomic_xor", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <"s_buffer_atomic_inc", SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <"s_buffer_atomic_dec", SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SUB : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SMIN : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_UMIN : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_SMAX : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_UMAX : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_AND : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_OR : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_XOR : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_INC : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
+defm S_BUFFER_ATOMIC_DEC : SM_Pseudo_Atomics <SReg_128, SReg_32_XM0_XEXEC>;
-defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_swap_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap_x2", SReg_128, SReg_128>;
-defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_add_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_sub_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smin_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umin_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_smax_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_umax_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_and_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_or_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_xor_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_inc_x2", SReg_128, SReg_64_XEXEC>;
-defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_dec_x2", SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <SReg_128, SReg_128>;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <SReg_128, SReg_64_XEXEC>;
}
-defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_atomic_swap", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_atomic_cmpswap", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_ADD : SM_Pseudo_Atomics <"s_atomic_add", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_SUB : SM_Pseudo_Atomics <"s_atomic_sub", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_SMIN : SM_Pseudo_Atomics <"s_atomic_smin", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_UMIN : SM_Pseudo_Atomics <"s_atomic_umin", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_SMAX : SM_Pseudo_Atomics <"s_atomic_smax", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_UMAX : SM_Pseudo_Atomics <"s_atomic_umax", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_AND : SM_Pseudo_Atomics <"s_atomic_and", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_OR : SM_Pseudo_Atomics <"s_atomic_or", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_XOR : SM_Pseudo_Atomics <"s_atomic_xor", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_INC : SM_Pseudo_Atomics <"s_atomic_inc", SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_DEC : SM_Pseudo_Atomics <"s_atomic_dec", SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_ADD : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SUB : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SMIN : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_UMIN : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_SMAX : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_UMAX : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_AND : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_OR : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_XOR : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_INC : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
+defm S_ATOMIC_DEC : SM_Pseudo_Atomics <SReg_64, SReg_32_XM0_XEXEC>;
-defm S_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <"s_atomic_swap_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <"s_atomic_cmpswap_x2", SReg_64, SReg_128>;
-defm S_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <"s_atomic_add_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <"s_atomic_sub_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <"s_atomic_smin_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <"s_atomic_umin_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <"s_atomic_smax_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <"s_atomic_umax_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_AND_X2 : SM_Pseudo_Atomics <"s_atomic_and_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_atomic_or_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_atomic_xor_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_atomic_inc_x2", SReg_64, SReg_64_XEXEC>;
-defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_atomic_dec_x2", SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SWAP_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Pseudo_Atomics <SReg_64, SReg_128>;
+defm S_ATOMIC_ADD_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SUB_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SMIN_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_UMIN_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_SMAX_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_UMAX_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_AND_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_OR_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_INC_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
+defm S_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <SReg_64, SReg_64_XEXEC>;
} // let SubtargetPredicate = HasScalarAtomics
let SubtargetPredicate = HasScalarAtomics in {
-defm S_DCACHE_DISCARD : SM_Pseudo_Discards <"s_dcache_discard">;
-defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards <"s_dcache_discard_x2">;
+defm S_DCACHE_DISCARD : SM_Pseudo_Discards;
+defm S_DCACHE_DISCARD_X2 : SM_Pseudo_Discards;
}
//===----------------------------------------------------------------------===//
@@ -471,30 +441,27 @@ class SMRD_Real_si <bits<5> op, SM_Pseudo ps>
let Inst{31-27} = 0x18; //encoding
}
-multiclass SM_Real_Loads_si<bits<5> op, string ps,
- SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
- SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
-
+multiclass SM_Real_Loads_si<bits<5> op> {
+ defvar ps = NAME;
+ defvar immPs = !cast<SM_Load_Pseudo>(ps#_IMM);
def _IMM_si : SMRD_Real_si <op, immPs> {
let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_8:$offset, CPol:$cpol);
}
- def _SGPR_si : SMRD_Real_si <op, sgprPs> {
- let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol);
- }
-
+ defvar sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR);
+ def _SGPR_si : SMRD_Real_si <op, sgprPs>;
}
-defm S_LOAD_DWORD : SM_Real_Loads_si <0x00, "S_LOAD_DWORD">;
-defm S_LOAD_DWORDX2 : SM_Real_Loads_si <0x01, "S_LOAD_DWORDX2">;
-defm S_LOAD_DWORDX4 : SM_Real_Loads_si <0x02, "S_LOAD_DWORDX4">;
-defm S_LOAD_DWORDX8 : SM_Real_Loads_si <0x03, "S_LOAD_DWORDX8">;
-defm S_LOAD_DWORDX16 : SM_Real_Loads_si <0x04, "S_LOAD_DWORDX16">;
-defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_si <0x08, "S_BUFFER_LOAD_DWORD">;
-defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_si <0x09, "S_BUFFER_LOAD_DWORDX2">;
-defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_si <0x0a, "S_BUFFER_LOAD_DWORDX4">;
-defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_si <0x0b, "S_BUFFER_LOAD_DWORDX8">;
-defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_si <0x0c, "S_BUFFER_LOAD_DWORDX16">;
+defm S_LOAD_DWORD : SM_Real_Loads_si <0x00>;
+defm S_LOAD_DWORDX2 : SM_Real_Loads_si <0x01>;
+defm S_LOAD_DWORDX4 : SM_Real_Loads_si <0x02>;
+defm S_LOAD_DWORDX8 : SM_Real_Loads_si <0x03>;
+defm S_LOAD_DWORDX16 : SM_Real_Loads_si <0x04>;
+defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_si <0x08>;
+defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_si <0x09>;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_si <0x0a>;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_si <0x0b>;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_si <0x0c>;
def S_MEMTIME_si : SMRD_Real_si <0x1e, S_MEMTIME>;
def S_DCACHE_INV_si : SMRD_Real_si <0x1f, S_DCACHE_INV>;
@@ -548,11 +515,8 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
soffset{6-0}, ?);
}
-class SMEM_Real_Load_vi<bits<8> op, string ps, OffsetMode offsets>
- : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps # offsets.Variant)> {
- RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
- let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
-}
+class SMEM_Real_Load_vi<bits<8> op, string ps>
+ : SMEM_Real_vi<op, !cast<SM_Pseudo>(ps)>;
// The alternative GFX9 SGPR encoding using soffset to encode the
// offset register. Not available in assembler and goes to the GFX9
@@ -565,13 +529,14 @@ class SMEM_Real_SGPR_alt_gfx9 {
string AsmVariantName = "NonParsable";
}
-multiclass SM_Real_Loads_vi<bits<8> op, string ps> {
- def _IMM_vi : SMEM_Real_Load_vi <op, ps, IMM_Offset>;
- def _SGPR_vi : SMEM_Real_Load_vi <op, ps, SGPR_Offset>;
- def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps, SGPR_Offset>,
+multiclass SM_Real_Loads_vi<bits<8> op> {
+ defvar ps = NAME;
+ def _IMM_vi : SMEM_Real_Load_vi <op, ps#"_IMM">;
+ def _SGPR_vi : SMEM_Real_Load_vi <op, ps#"_SGPR">;
+ def _SGPR_alt_gfx9 : SMEM_Real_Load_vi <op, ps#"_SGPR">,
SMEM_Real_SGPR_alt_gfx9;
let IsGFX9SpecificEncoding = true in
- def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi <op, ps, SGPR_IMM_Offset>;
+ def _SGPR_IMM_gfx9 : SMEM_Real_Load_vi <op, ps#"_SGPR_IMM">;
}
class SMEM_Real_Store_Base_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps> {
@@ -582,24 +547,21 @@ class SMEM_Real_Store_Base_vi <bits<8> op, SM_Pseudo ps> : SMEM_Real_vi <op, ps>
let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?);
}
-class SMEM_Real_Store_vi <bits<8> op, string ps, OffsetMode offsets>
- : SMEM_Real_Store_Base_vi <op, !cast<SM_Pseudo>(ps # offsets.Variant)> {
- RegisterClass SrcClass = !cast<SM_Store_Pseudo>(ps # offsets.Variant).SrcClass;
- RegisterClass BaseClass = !cast<SM_Store_Pseudo>(ps # offsets.Variant).BaseClass;
- let InOperandList = !con((ins SrcClass:$sdata, BaseClass:$sbase),
- offsets.Ins, (ins CPol:$cpol));
-}
+class SMEM_Real_Store_vi <bits<8> op, string ps>
+ : SMEM_Real_Store_Base_vi <op, !cast<SM_Pseudo>(ps)>;
-multiclass SM_Real_Stores_vi<bits<8> op, string ps> {
- def _IMM_vi : SMEM_Real_Store_vi <op, ps, IMM_Offset>;
- def _SGPR_vi : SMEM_Real_Store_vi <op, ps, SGPR_Offset>;
- def _SGPR_alt_gfx9 : SMEM_Real_Store_vi <op, ps, SGPR_Offset>,
+multiclass SM_Real_Stores_vi<bits<8> op> {
+ defvar ps = NAME;
+ def _IMM_vi : SMEM_Real_Store_vi <op, ps#"_IMM">;
+ def _SGPR_vi : SMEM_Real_Store_vi <op, ps#"_SGPR">;
+ def _SGPR_alt_gfx9 : SMEM_Real_Store_vi <op, ps#"_SGPR">,
SMEM_Real_SGPR_alt_gfx9;
let IsGFX9SpecificEncoding = true in
- def _SGPR_IMM_gfx9 : SMEM_Real_Store_vi <op, ps, SGPR_IMM_Offset>;
+ def _SGPR_IMM_gfx9 : SMEM_Real_Store_vi <op, ps#"_SGPR_IMM">;
}
-multiclass SM_Real_Probe_vi<bits<8> op, string ps> {
+multiclass SM_Real_Probe_vi<bits<8> op> {
+ defvar ps = NAME;
def _IMM_vi : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
def _SGPR_vi : SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>;
def _SGPR_alt_gfx9
@@ -610,24 +572,24 @@ multiclass SM_Real_Probe_vi<bits<8> op, string ps> {
: SMEM_Real_Store_Base_vi <op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>;
}
-defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00, "S_LOAD_DWORD">;
-defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01, "S_LOAD_DWORDX2">;
-defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02, "S_LOAD_DWORDX4">;
-defm S_LOAD_DWORDX8 : SM_Real_Loads_vi <0x03, "S_LOAD_DWORDX8">;
-defm S_LOAD_DWORDX16 : SM_Real_Loads_vi <0x04, "S_LOAD_DWORDX16">;
-defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_vi <0x08, "S_BUFFER_LOAD_DWORD">;
-defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_vi <0x09, "S_BUFFER_LOAD_DWORDX2">;
-defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_vi <0x0a, "S_BUFFER_LOAD_DWORDX4">;
-defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_vi <0x0b, "S_BUFFER_LOAD_DWORDX8">;
-defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_vi <0x0c, "S_BUFFER_LOAD_DWORDX16">;
+defm S_LOAD_DWORD : SM_Real_Loads_vi <0x00>;
+defm S_LOAD_DWORDX2 : SM_Real_Loads_vi <0x01>;
+defm S_LOAD_DWORDX4 : SM_Real_Loads_vi <0x02>;
+defm S_LOAD_DWORDX8 : SM_Real_Loads_vi <0x03>;
+defm S_LOAD_DWORDX16 : SM_Real_Loads_vi <0x04>;
+defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_vi <0x08>;
+defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_vi <0x09>;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_vi <0x0a>;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_vi <0x0b>;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_vi <0x0c>;
-defm S_STORE_DWORD : SM_Real_Stores_vi <0x10, "S_STORE_DWORD">;
-defm S_STORE_DWORDX2 : SM_Real_Stores_vi <0x11, "S_STORE_DWORDX2">;
-defm S_STORE_DWORDX4 : SM_Real_Stores_vi <0x12, "S_STORE_DWORDX4">;
+defm S_STORE_DWORD : SM_Real_Stores_vi <0x10>;
+defm S_STORE_DWORDX2 : SM_Real_Stores_vi <0x11>;
+defm S_STORE_DWORDX4 : SM_Real_Stores_vi <0x12>;
-defm S_BUFFER_STORE_DWORD : SM_Real_Stores_vi <0x18, "S_BUFFER_STORE_DWORD">;
-defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_vi <0x19, "S_BUFFER_STORE_DWORDX2">;
-defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_vi <0x1a, "S_BUFFER_STORE_DWORDX4">;
+defm S_BUFFER_STORE_DWORD : SM_Real_Stores_vi <0x18>;
+defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_vi <0x19>;
+defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_vi <0x1a>;
// These instructions use same encoding
def S_DCACHE_INV_vi : SMEM_Real_vi <0x20, S_DCACHE_INV>;
@@ -637,16 +599,16 @@ def S_DCACHE_WB_VOL_vi : SMEM_Real_vi <0x23, S_DCACHE_WB_VOL>;
def S_MEMTIME_vi : SMEM_Real_vi <0x24, S_MEMTIME>;
def S_MEMREALTIME_vi : SMEM_Real_vi <0x25, S_MEMREALTIME>;
-defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_vi <0x05, "S_SCRATCH_LOAD_DWORD">;
-defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_vi <0x06, "S_SCRATCH_LOAD_DWORDX2">;
-defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_vi <0x07, "S_SCRATCH_LOAD_DWORDX4">;
+defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_vi <0x05>;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_vi <0x06>;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_vi <0x07>;
-defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_vi <0x15, "S_SCRATCH_STORE_DWORD">;
-defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_vi <0x16, "S_SCRATCH_STORE_DWORDX2">;
-defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_vi <0x17, "S_SCRATCH_STORE_DWORDX4">;
+defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_vi <0x15>;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_vi <0x16>;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_vi <0x17>;
-defm S_ATC_PROBE : SM_Real_Probe_vi <0x26, "S_ATC_PROBE">;
-defm S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27, "S_ATC_PROBE_BUFFER">;
+defm S_ATC_PROBE : SM_Real_Probe_vi <0x26>;
+defm S_ATC_PROBE_BUFFER : SM_Real_Probe_vi <0x27>;
//===----------------------------------------------------------------------===//
// GFX9
@@ -665,7 +627,8 @@ class SMEM_Atomic_Real_vi <bits<8> op, SM_Atomic_Pseudo ps>
let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0});
}
-multiclass SM_Real_Atomics_vi<bits<8> op, string ps> {
+multiclass SM_Real_Atomics_vi<bits<8> op> {
+ defvar ps = NAME;
def _IMM_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>;
def _SGPR_vi : SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>;
def _SGPR_alt_gfx9
@@ -684,63 +647,64 @@ multiclass SM_Real_Atomics_vi<bits<8> op, string ps> {
: SMEM_Atomic_Real_vi <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM_RTN)>;
}
-defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40, "S_BUFFER_ATOMIC_SWAP">;
-defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x41, "S_BUFFER_ATOMIC_CMPSWAP">;
-defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_vi <0x42, "S_BUFFER_ATOMIC_ADD">;
-defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_vi <0x43, "S_BUFFER_ATOMIC_SUB">;
-defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_vi <0x44, "S_BUFFER_ATOMIC_SMIN">;
-defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_vi <0x45, "S_BUFFER_ATOMIC_UMIN">;
-defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_vi <0x46, "S_BUFFER_ATOMIC_SMAX">;
-defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_vi <0x47, "S_BUFFER_ATOMIC_UMAX">;
-defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_vi <0x48, "S_BUFFER_ATOMIC_AND">;
-defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_vi <0x49, "S_BUFFER_ATOMIC_OR">;
-defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_vi <0x4a, "S_BUFFER_ATOMIC_XOR">;
-defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_vi <0x4b, "S_BUFFER_ATOMIC_INC">;
-defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_vi <0x4c, "S_BUFFER_ATOMIC_DEC">;
+defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_vi <0x40>;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x41>;
+defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_vi <0x42>;
+defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_vi <0x43>;
+defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_vi <0x44>;
+defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_vi <0x45>;
+defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_vi <0x46>;
+defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_vi <0x47>;
+defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_vi <0x48>;
+defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_vi <0x49>;
+defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_vi <0x4a>;
+defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_vi <0x4b>;
+defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_vi <0x4c>;
-defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0x60, "S_BUFFER_ATOMIC_SWAP_X2">;
-defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">;
-defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0x62, "S_BUFFER_ATOMIC_ADD_X2">;
-defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0x63, "S_BUFFER_ATOMIC_SUB_X2">;
-defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0x64, "S_BUFFER_ATOMIC_SMIN_X2">;
-defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0x65, "S_BUFFER_ATOMIC_UMIN_X2">;
-defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0x66, "S_BUFFER_ATOMIC_SMAX_X2">;
-defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0x67, "S_BUFFER_ATOMIC_UMAX_X2">;
-defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0x68, "S_BUFFER_ATOMIC_AND_X2">;
-defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0x69, "S_BUFFER_ATOMIC_OR_X2">;
-defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0x6a, "S_BUFFER_ATOMIC_XOR_X2">;
-defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0x6b, "S_BUFFER_ATOMIC_INC_X2">;
-defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0x6c, "S_BUFFER_ATOMIC_DEC_X2">;
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0x60>;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0x61>;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0x62>;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0x63>;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0x64>;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0x65>;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0x66>;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0x67>;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0x68>;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0x69>;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0x6a>;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0x6b>;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0x6c>;
-defm S_ATOMIC_SWAP : SM_Real_Atomics_vi <0x80, "S_ATOMIC_SWAP">;
-defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x81, "S_ATOMIC_CMPSWAP">;
-defm S_ATOMIC_ADD : SM_Real_Atomics_vi <0x82, "S_ATOMIC_ADD">;
-defm S_ATOMIC_SUB : SM_Real_Atomics_vi <0x83, "S_ATOMIC_SUB">;
-defm S_ATOMIC_SMIN : SM_Real_Atomics_vi <0x84, "S_ATOMIC_SMIN">;
-defm S_ATOMIC_UMIN : SM_Real_Atomics_vi <0x85, "S_ATOMIC_UMIN">;
-defm S_ATOMIC_SMAX : SM_Real_Atomics_vi <0x86, "S_ATOMIC_SMAX">;
-defm S_ATOMIC_UMAX : SM_Real_Atomics_vi <0x87, "S_ATOMIC_UMAX">;
-defm S_ATOMIC_AND : SM_Real_Atomics_vi <0x88, "S_ATOMIC_AND">;
-defm S_ATOMIC_OR : SM_Real_Atomics_vi <0x89, "S_ATOMIC_OR">;
-defm S_ATOMIC_XOR : SM_Real_Atomics_vi <0x8a, "S_ATOMIC_XOR">;
-defm S_ATOMIC_INC : SM_Real_Atomics_vi <0x8b, "S_ATOMIC_INC">;
-defm S_ATOMIC_DEC : SM_Real_Atomics_vi <0x8c, "S_ATOMIC_DEC">;
+defm S_ATOMIC_SWAP : SM_Real_Atomics_vi <0x80>;
+defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_vi <0x81>;
+defm S_ATOMIC_ADD : SM_Real_Atomics_vi <0x82>;
+defm S_ATOMIC_SUB : SM_Real_Atomics_vi <0x83>;
+defm S_ATOMIC_SMIN : SM_Real_Atomics_vi <0x84>;
+defm S_ATOMIC_UMIN : SM_Real_Atomics_vi <0x85>;
+defm S_ATOMIC_SMAX : SM_Real_Atomics_vi <0x86>;
+defm S_ATOMIC_UMAX : SM_Real_Atomics_vi <0x87>;
+defm S_ATOMIC_AND : SM_Real_Atomics_vi <0x88>;
+defm S_ATOMIC_OR : SM_Real_Atomics_vi <0x89>;
+defm S_ATOMIC_XOR : SM_Real_Atomics_vi <0x8a>;
+defm S_ATOMIC_INC : SM_Real_Atomics_vi <0x8b>;
+defm S_ATOMIC_DEC : SM_Real_Atomics_vi <0x8c>;
-defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0xa0, "S_ATOMIC_SWAP_X2">;
-defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0xa1, "S_ATOMIC_CMPSWAP_X2">;
-defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0xa2, "S_ATOMIC_ADD_X2">;
-defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0xa3, "S_ATOMIC_SUB_X2">;
-defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0xa4, "S_ATOMIC_SMIN_X2">;
-defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0xa5, "S_ATOMIC_UMIN_X2">;
-defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0xa6, "S_ATOMIC_SMAX_X2">;
-defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0xa7, "S_ATOMIC_UMAX_X2">;
-defm S_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0xa8, "S_ATOMIC_AND_X2">;
-defm S_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0xa9, "S_ATOMIC_OR_X2">;
-defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0xaa, "S_ATOMIC_XOR_X2">;
-defm S_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0xab, "S_ATOMIC_INC_X2">;
-defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac, "S_ATOMIC_DEC_X2">;
+defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_vi <0xa0>;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_vi <0xa1>;
+defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_vi <0xa2>;
+defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_vi <0xa3>;
+defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_vi <0xa4>;
+defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_vi <0xa5>;
+defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_vi <0xa6>;
+defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_vi <0xa7>;
+defm S_ATOMIC_AND_X2 : SM_Real_Atomics_vi <0xa8>;
+defm S_ATOMIC_OR_X2 : SM_Real_Atomics_vi <0xa9>;
+defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_vi <0xaa>;
+defm S_ATOMIC_INC_X2 : SM_Real_Atomics_vi <0xab>;
+defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_vi <0xac>;
-multiclass SM_Real_Discard_vi<bits<8> op, string ps> {
+multiclass SM_Real_Discard_vi<bits<8> op> {
+ defvar ps = NAME;
def _IMM_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_IMM)>;
def _SGPR_vi : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>;
def _SGPR_alt_gfx9 : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR)>,
@@ -749,17 +713,14 @@ multiclass SM_Real_Discard_vi<bits<8> op, string ps> {
def _SGPR_IMM_gfx9 : SMEM_Real_vi <op, !cast<SM_Discard_Pseudo>(ps#_SGPR_IMM)>;
}
-defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28, "S_DCACHE_DISCARD">;
-defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_vi <0x29, "S_DCACHE_DISCARD_X2">;
+defm S_DCACHE_DISCARD : SM_Real_Discard_vi <0x28>;
+defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_vi <0x29>;
//===----------------------------------------------------------------------===//
// CI
//===----------------------------------------------------------------------===//
-def smrd_literal_offset : NamedOperandU32<"SMRDLiteralOffset",
- NamedMatchClass<"SMRDLiteralOffset">> {
- let OperandType = "OPERAND_IMMEDIATE";
-}
+def smrd_literal_offset : ImmOperand<i32, "SMRDLiteralOffset">;
class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> :
SM_Real<ps>,
@@ -854,8 +815,14 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
// 3. SGPR offset
def : GCNPat <
(smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))
- >;
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $soffset, 0))> {
+ let OtherPredicates = [isNotGFX9Plus];
+ }
+ def : GCNPat <
+ (smrd_load (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
// 4. SGPR+IMM offset
def : GCNPat <
@@ -891,8 +858,14 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> {
// 3. Offset loaded in an 32bit SGPR
def : GCNPat <
(SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$soffset, (extract_cpol $cachepolicy)))
- >;
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$soffset, (extract_cpol $cachepolicy)))> {
+ let OtherPredicates = [isNotGFX9Plus];
+ }
+ def : GCNPat <
+ (SIsbuffer_load v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
+ let OtherPredicates = [isGFX9Plus];
+ }
// 4. Offset as an 32-bit SGPR + immediate
def : GCNPat <
@@ -929,6 +902,8 @@ foreach vt = SReg_512.RegTypes in {
defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>;
}
+} // End let AddedComplexity = 100
+
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4i32>;
@@ -940,7 +915,6 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX4", v4f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
-} // End let AddedComplexity = 100
let OtherPredicates = [HasSMemTimeInst] in {
def : GCNPat <
@@ -987,16 +961,14 @@ class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps>
let Inst{16} = !if(ps.has_glc, cpol{CPolBit.GLC}, ?);
}
-class SMEM_Real_Load_gfx10<bits<8> op, string ps, OffsetMode offsets>
- : SMEM_Real_gfx10<op, !cast<SM_Pseudo>(ps # offsets.Variant)> {
- RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
- let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
-}
+class SMEM_Real_Load_gfx10<bits<8> op, string ps>
+ : SMEM_Real_gfx10<op, !cast<SM_Pseudo>(ps)>;
-multiclass SM_Real_Loads_gfx10<bits<8> op, string ps> {
- def _IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps, IMM_Offset>;
- def _SGPR_gfx10 : SMEM_Real_Load_gfx10<op, ps, SGPR_Offset>;
- def _SGPR_IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps, SGPR_IMM_Offset>;
+multiclass SM_Real_Loads_gfx10<bits<8> op> {
+ defvar ps = NAME;
+ def _IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps#"_IMM">;
+ def _SGPR_gfx10 : SMEM_Real_Load_gfx10<op, ps#"_SGPR">;
+ def _SGPR_IMM_gfx10 : SMEM_Real_Load_gfx10<op, ps#"_SGPR_IMM">;
}
class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps> {
@@ -1006,53 +978,48 @@ class SMEM_Real_Store_gfx10<bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx10<op, ps>
let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?);
}
-multiclass SM_Real_Stores_gfx10<bits<8> op, string ps,
- SM_Store_Pseudo immPs = !cast<SM_Store_Pseudo>(ps#_IMM),
- SM_Store_Pseudo sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR)> {
- def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, CPol:$cpol);
- }
+multiclass SM_Real_Stores_gfx10<bits<8> op> {
+ defvar ps = NAME;
+ defvar immPs = !cast<SM_Store_Pseudo>(ps#_IMM);
+ def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs>;
- def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> {
- let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase, SReg_32:$soffset, CPol:$cpol);
- }
+ defvar sgprPs = !cast<SM_Store_Pseudo>(ps#_SGPR);
+ def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs>;
- def _SGPR_IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Store_Pseudo>(ps#_SGPR_IMM)> {
- let InOperandList = (ins sgprPs.SrcClass:$sdata, sgprPs.BaseClass:$sbase,
- SReg_32:$soffset, smem_offset_mod:$offset, CPol:$cpol);
- }
+ defvar sgprImmPs = !cast<SM_Store_Pseudo>(ps#_SGPR_IMM);
+ def _SGPR_IMM_gfx10 : SMEM_Real_Store_gfx10 <op, sgprImmPs>;
}
-defm S_LOAD_DWORD : SM_Real_Loads_gfx10<0x000, "S_LOAD_DWORD">;
-defm S_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x001, "S_LOAD_DWORDX2">;
-defm S_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x002, "S_LOAD_DWORDX4">;
-defm S_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x003, "S_LOAD_DWORDX8">;
-defm S_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x004, "S_LOAD_DWORDX16">;
+defm S_LOAD_DWORD : SM_Real_Loads_gfx10<0x000>;
+defm S_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x001>;
+defm S_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x002>;
+defm S_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x003>;
+defm S_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x004>;
let SubtargetPredicate = HasScalarFlatScratchInsts in {
-defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_gfx10<0x005, "S_SCRATCH_LOAD_DWORD">;
-defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x006, "S_SCRATCH_LOAD_DWORDX2">;
-defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x007, "S_SCRATCH_LOAD_DWORDX4">;
+defm S_SCRATCH_LOAD_DWORD : SM_Real_Loads_gfx10<0x005>;
+defm S_SCRATCH_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x006>;
+defm S_SCRATCH_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x007>;
} // End SubtargetPredicate = HasScalarFlatScratchInsts
-defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_gfx10<0x008, "S_BUFFER_LOAD_DWORD">;
-defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x009, "S_BUFFER_LOAD_DWORDX2">;
-defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x00a, "S_BUFFER_LOAD_DWORDX4">;
-defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x00b, "S_BUFFER_LOAD_DWORDX8">;
-defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x00c, "S_BUFFER_LOAD_DWORDX16">;
+defm S_BUFFER_LOAD_DWORD : SM_Real_Loads_gfx10<0x008>;
+defm S_BUFFER_LOAD_DWORDX2 : SM_Real_Loads_gfx10<0x009>;
+defm S_BUFFER_LOAD_DWORDX4 : SM_Real_Loads_gfx10<0x00a>;
+defm S_BUFFER_LOAD_DWORDX8 : SM_Real_Loads_gfx10<0x00b>;
+defm S_BUFFER_LOAD_DWORDX16 : SM_Real_Loads_gfx10<0x00c>;
let SubtargetPredicate = HasScalarStores in {
-defm S_STORE_DWORD : SM_Real_Stores_gfx10<0x010, "S_STORE_DWORD">;
-defm S_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x011, "S_STORE_DWORDX2">;
-defm S_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x012, "S_STORE_DWORDX4">;
+defm S_STORE_DWORD : SM_Real_Stores_gfx10<0x010>;
+defm S_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x011>;
+defm S_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x012>;
let OtherPredicates = [HasScalarFlatScratchInsts] in {
-defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_gfx10<0x015, "S_SCRATCH_STORE_DWORD">;
-defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x016, "S_SCRATCH_STORE_DWORDX2">;
-defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x017, "S_SCRATCH_STORE_DWORDX4">;
+defm S_SCRATCH_STORE_DWORD : SM_Real_Stores_gfx10<0x015>;
+defm S_SCRATCH_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x016>;
+defm S_SCRATCH_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x017>;
} // End OtherPredicates = [HasScalarFlatScratchInsts]
-defm S_BUFFER_STORE_DWORD : SM_Real_Stores_gfx10<0x018, "S_BUFFER_STORE_DWORD">;
-defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x019, "S_BUFFER_STORE_DWORDX2">;
-defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x01a, "S_BUFFER_STORE_DWORDX4">;
+defm S_BUFFER_STORE_DWORD : SM_Real_Stores_gfx10<0x018>;
+defm S_BUFFER_STORE_DWORDX2 : SM_Real_Stores_gfx10<0x019>;
+defm S_BUFFER_STORE_DWORDX4 : SM_Real_Stores_gfx10<0x01a>;
} // End SubtargetPredicate = HasScalarStores
def S_MEMREALTIME_gfx10 : SMEM_Real_gfx10<0x025, S_MEMREALTIME>;
@@ -1065,15 +1032,16 @@ let SubtargetPredicate = HasScalarStores in {
def S_DCACHE_WB_gfx10 : SMEM_Real_gfx10<0x021, S_DCACHE_WB>;
} // End SubtargetPredicate = HasScalarStores
-multiclass SM_Real_Probe_gfx10<bits<8> op, string ps> {
+multiclass SM_Real_Probe_gfx10<bits<8> op> {
+ defvar ps = NAME;
def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>;
def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>;
def _SGPR_IMM_gfx10
: SMEM_Real_Store_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR_IMM)>;
}
-defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26, "S_ATC_PROBE">;
-defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27, "S_ATC_PROBE_BUFFER">;
+defm S_ATC_PROBE : SM_Real_Probe_gfx10 <0x26>;
+defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx10 <0x27>;
class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps>
: SMEM_Real_gfx10 <op, ps>,
@@ -1090,7 +1058,8 @@ class SMEM_Atomic_Real_gfx10 <bits<8> op, SM_Atomic_Pseudo ps>
let Inst{12-6} = !if(ps.glc, sdst{6-0}, sdata{6-0});
}
-multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> {
+multiclass SM_Real_Atomics_gfx10<bits<8> op> {
+ defvar ps = NAME;
def _IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_IMM)>;
def _SGPR_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR)>;
def _SGPR_IMM_gfx10 : SMEM_Atomic_Real_gfx10 <op, !cast<SM_Atomic_Pseudo>(ps#_SGPR_IMM)>;
@@ -1101,70 +1070,71 @@ multiclass SM_Real_Atomics_gfx10<bits<8> op, string ps> {
let SubtargetPredicate = HasScalarAtomics in {
-defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x40, "S_BUFFER_ATOMIC_SWAP">;
-defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x41, "S_BUFFER_ATOMIC_CMPSWAP">;
-defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x42, "S_BUFFER_ATOMIC_ADD">;
-defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x43, "S_BUFFER_ATOMIC_SUB">;
-defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x44, "S_BUFFER_ATOMIC_SMIN">;
-defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x45, "S_BUFFER_ATOMIC_UMIN">;
-defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x46, "S_BUFFER_ATOMIC_SMAX">;
-defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x47, "S_BUFFER_ATOMIC_UMAX">;
-defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x48, "S_BUFFER_ATOMIC_AND">;
-defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x49, "S_BUFFER_ATOMIC_OR">;
-defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x4a, "S_BUFFER_ATOMIC_XOR">;
-defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x4b, "S_BUFFER_ATOMIC_INC">;
-defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x4c, "S_BUFFER_ATOMIC_DEC">;
+defm S_BUFFER_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x40>;
+defm S_BUFFER_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x41>;
+defm S_BUFFER_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x42>;
+defm S_BUFFER_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x43>;
+defm S_BUFFER_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x44>;
+defm S_BUFFER_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x45>;
+defm S_BUFFER_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x46>;
+defm S_BUFFER_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x47>;
+defm S_BUFFER_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x48>;
+defm S_BUFFER_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x49>;
+defm S_BUFFER_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x4a>;
+defm S_BUFFER_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x4b>;
+defm S_BUFFER_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x4c>;
-defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0x60, "S_BUFFER_ATOMIC_SWAP_X2">;
-defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0x61, "S_BUFFER_ATOMIC_CMPSWAP_X2">;
-defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0x62, "S_BUFFER_ATOMIC_ADD_X2">;
-defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0x63, "S_BUFFER_ATOMIC_SUB_X2">;
-defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0x64, "S_BUFFER_ATOMIC_SMIN_X2">;
-defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0x65, "S_BUFFER_ATOMIC_UMIN_X2">;
-defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0x66, "S_BUFFER_ATOMIC_SMAX_X2">;
-defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0x67, "S_BUFFER_ATOMIC_UMAX_X2">;
-defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0x68, "S_BUFFER_ATOMIC_AND_X2">;
-defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0x69, "S_BUFFER_ATOMIC_OR_X2">;
-defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0x6a, "S_BUFFER_ATOMIC_XOR_X2">;
-defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0x6b, "S_BUFFER_ATOMIC_INC_X2">;
-defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0x6c, "S_BUFFER_ATOMIC_DEC_X2">;
+defm S_BUFFER_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0x60>;
+defm S_BUFFER_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0x61>;
+defm S_BUFFER_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0x62>;
+defm S_BUFFER_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0x63>;
+defm S_BUFFER_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0x64>;
+defm S_BUFFER_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0x65>;
+defm S_BUFFER_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0x66>;
+defm S_BUFFER_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0x67>;
+defm S_BUFFER_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0x68>;
+defm S_BUFFER_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0x69>;
+defm S_BUFFER_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0x6a>;
+defm S_BUFFER_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0x6b>;
+defm S_BUFFER_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0x6c>;
-defm S_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x80, "S_ATOMIC_SWAP">;
-defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x81, "S_ATOMIC_CMPSWAP">;
-defm S_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x82, "S_ATOMIC_ADD">;
-defm S_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x83, "S_ATOMIC_SUB">;
-defm S_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x84, "S_ATOMIC_SMIN">;
-defm S_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x85, "S_ATOMIC_UMIN">;
-defm S_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x86, "S_ATOMIC_SMAX">;
-defm S_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x87, "S_ATOMIC_UMAX">;
-defm S_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x88, "S_ATOMIC_AND">;
-defm S_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x89, "S_ATOMIC_OR">;
-defm S_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x8a, "S_ATOMIC_XOR">;
-defm S_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x8b, "S_ATOMIC_INC">;
-defm S_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x8c, "S_ATOMIC_DEC">;
+defm S_ATOMIC_SWAP : SM_Real_Atomics_gfx10 <0x80>;
+defm S_ATOMIC_CMPSWAP : SM_Real_Atomics_gfx10 <0x81>;
+defm S_ATOMIC_ADD : SM_Real_Atomics_gfx10 <0x82>;
+defm S_ATOMIC_SUB : SM_Real_Atomics_gfx10 <0x83>;
+defm S_ATOMIC_SMIN : SM_Real_Atomics_gfx10 <0x84>;
+defm S_ATOMIC_UMIN : SM_Real_Atomics_gfx10 <0x85>;
+defm S_ATOMIC_SMAX : SM_Real_Atomics_gfx10 <0x86>;
+defm S_ATOMIC_UMAX : SM_Real_Atomics_gfx10 <0x87>;
+defm S_ATOMIC_AND : SM_Real_Atomics_gfx10 <0x88>;
+defm S_ATOMIC_OR : SM_Real_Atomics_gfx10 <0x89>;
+defm S_ATOMIC_XOR : SM_Real_Atomics_gfx10 <0x8a>;
+defm S_ATOMIC_INC : SM_Real_Atomics_gfx10 <0x8b>;
+defm S_ATOMIC_DEC : SM_Real_Atomics_gfx10 <0x8c>;
-defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0xa0, "S_ATOMIC_SWAP_X2">;
-defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0xa1, "S_ATOMIC_CMPSWAP_X2">;
-defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0xa2, "S_ATOMIC_ADD_X2">;
-defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0xa3, "S_ATOMIC_SUB_X2">;
-defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0xa4, "S_ATOMIC_SMIN_X2">;
-defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0xa5, "S_ATOMIC_UMIN_X2">;
-defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0xa6, "S_ATOMIC_SMAX_X2">;
-defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0xa7, "S_ATOMIC_UMAX_X2">;
-defm S_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0xa8, "S_ATOMIC_AND_X2">;
-defm S_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0xa9, "S_ATOMIC_OR_X2">;
-defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0xaa, "S_ATOMIC_XOR_X2">;
-defm S_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0xab, "S_ATOMIC_INC_X2">;
-defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac, "S_ATOMIC_DEC_X2">;
+defm S_ATOMIC_SWAP_X2 : SM_Real_Atomics_gfx10 <0xa0>;
+defm S_ATOMIC_CMPSWAP_X2 : SM_Real_Atomics_gfx10 <0xa1>;
+defm S_ATOMIC_ADD_X2 : SM_Real_Atomics_gfx10 <0xa2>;
+defm S_ATOMIC_SUB_X2 : SM_Real_Atomics_gfx10 <0xa3>;
+defm S_ATOMIC_SMIN_X2 : SM_Real_Atomics_gfx10 <0xa4>;
+defm S_ATOMIC_UMIN_X2 : SM_Real_Atomics_gfx10 <0xa5>;
+defm S_ATOMIC_SMAX_X2 : SM_Real_Atomics_gfx10 <0xa6>;
+defm S_ATOMIC_UMAX_X2 : SM_Real_Atomics_gfx10 <0xa7>;
+defm S_ATOMIC_AND_X2 : SM_Real_Atomics_gfx10 <0xa8>;
+defm S_ATOMIC_OR_X2 : SM_Real_Atomics_gfx10 <0xa9>;
+defm S_ATOMIC_XOR_X2 : SM_Real_Atomics_gfx10 <0xaa>;
+defm S_ATOMIC_INC_X2 : SM_Real_Atomics_gfx10 <0xab>;
+defm S_ATOMIC_DEC_X2 : SM_Real_Atomics_gfx10 <0xac>;
-multiclass SM_Real_Discard_gfx10<bits<8> op, string ps> {
+multiclass SM_Real_Discard_gfx10<bits<8> op> {
+ defvar ps = NAME;
def _IMM_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_IMM)>;
def _SGPR_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR)>;
def _SGPR_IMM_gfx10 : SMEM_Real_gfx10 <op, !cast<SM_Pseudo>(ps#_SGPR_IMM)>;
}
-defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">;
-defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29, "S_DCACHE_DISCARD_X2">;
+defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28>;
+defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29>;
} // End SubtargetPredicate = HasScalarAtomics
@@ -1190,31 +1160,29 @@ class SMEM_Real_gfx11<bits<8> op, SM_Pseudo ps, string opName = ps.Mnemonic> :
let Inst{14} = !if(ps.has_glc, cpol{CPolBit.GLC}, 0);
}
-class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName, OffsetMode offsets> :
- SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps # offsets.Variant), opName> {
- RegisterClass BaseClass = !cast<SM_Load_Pseudo>(ps # offsets.Variant).BaseClass;
- let InOperandList = !con((ins BaseClass:$sbase), offsets.Ins, (ins CPol:$cpol));
-}
+class SMEM_Real_Load_gfx11<bits<8> op, string ps, string opName> :
+ SMEM_Real_gfx11<op, !cast<SM_Pseudo>(ps), opName>;
-multiclass SM_Real_Loads_gfx11<bits<8> op, string ps, string opName> {
- def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, IMM_Offset>;
- def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, SGPR_Offset>;
- def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps, opName, SGPR_IMM_Offset>;
+multiclass SM_Real_Loads_gfx11<bits<8> op, string ps> {
+ defvar opName = !tolower(NAME);
+ def _IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_IMM", opName>;
+ def _SGPR_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR", opName>;
+ def _SGPR_IMM_gfx11 : SMEM_Real_Load_gfx11<op, ps#"_SGPR_IMM", opName>;
def : MnemonicAlias<!cast<SM_Pseudo>(ps#"_IMM").Mnemonic, opName>,
Requires<[isGFX11Plus]>;
}
-defm S_LOAD_B32 : SM_Real_Loads_gfx11<0x000, "S_LOAD_DWORD", "s_load_b32">;
-defm S_LOAD_B64 : SM_Real_Loads_gfx11<0x001, "S_LOAD_DWORDX2", "s_load_b64">;
-defm S_LOAD_B128 : SM_Real_Loads_gfx11<0x002, "S_LOAD_DWORDX4", "s_load_b128">;
-defm S_LOAD_B256 : SM_Real_Loads_gfx11<0x003, "S_LOAD_DWORDX8", "s_load_b256">;
-defm S_LOAD_B512 : SM_Real_Loads_gfx11<0x004, "S_LOAD_DWORDX16", "s_load_b512">;
+defm S_LOAD_B32 : SM_Real_Loads_gfx11<0x000, "S_LOAD_DWORD">;
+defm S_LOAD_B64 : SM_Real_Loads_gfx11<0x001, "S_LOAD_DWORDX2">;
+defm S_LOAD_B128 : SM_Real_Loads_gfx11<0x002, "S_LOAD_DWORDX4">;
+defm S_LOAD_B256 : SM_Real_Loads_gfx11<0x003, "S_LOAD_DWORDX8">;
+defm S_LOAD_B512 : SM_Real_Loads_gfx11<0x004, "S_LOAD_DWORDX16">;
-defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx11<0x008, "S_BUFFER_LOAD_DWORD", "s_buffer_load_b32">;
-defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx11<0x009, "S_BUFFER_LOAD_DWORDX2", "s_buffer_load_b64">;
-defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx11<0x00a, "S_BUFFER_LOAD_DWORDX4", "s_buffer_load_b128">;
-defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx11<0x00b, "S_BUFFER_LOAD_DWORDX8", "s_buffer_load_b256">;
-defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx11<0x00c, "S_BUFFER_LOAD_DWORDX16", "s_buffer_load_b512">;
+defm S_BUFFER_LOAD_B32 : SM_Real_Loads_gfx11<0x008, "S_BUFFER_LOAD_DWORD">;
+defm S_BUFFER_LOAD_B64 : SM_Real_Loads_gfx11<0x009, "S_BUFFER_LOAD_DWORDX2">;
+defm S_BUFFER_LOAD_B128 : SM_Real_Loads_gfx11<0x00a, "S_BUFFER_LOAD_DWORDX4">;
+defm S_BUFFER_LOAD_B256 : SM_Real_Loads_gfx11<0x00b, "S_BUFFER_LOAD_DWORDX8">;
+defm S_BUFFER_LOAD_B512 : SM_Real_Loads_gfx11<0x00c, "S_BUFFER_LOAD_DWORDX16">;
def S_GL1_INV_gfx11 : SMEM_Real_gfx11<0x020, S_GL1_INV>;
def S_DCACHE_INV_gfx11 : SMEM_Real_gfx11<0x021, S_DCACHE_INV>;
@@ -1227,12 +1195,13 @@ class SMEM_Real_Store_gfx11 <bits<8> op, SM_Pseudo ps> : SMEM_Real_gfx11<op, ps>
let Inst{12-6} = !if(ps.has_sdst, sdata{6-0}, ?);
}
-multiclass SM_Real_Probe_gfx11<bits<8> op, string ps> {
+multiclass SM_Real_Probe_gfx11<bits<8> op> {
+ defvar ps = NAME;
def _IMM_gfx11 : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_IMM)>;
def _SGPR_gfx11 : SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_SGPR)>;
def _SGPR_IMM_gfx11
: SMEM_Real_Store_gfx11 <op, !cast<SM_Probe_Pseudo>(ps#_SGPR_IMM)>;
}
-defm S_ATC_PROBE : SM_Real_Probe_gfx11 <0x22, "S_ATC_PROBE">;
-defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23, "S_ATC_PROBE_BUFFER">;
+defm S_ATC_PROBE : SM_Real_Probe_gfx11 <0x22>;
+defm S_ATC_PROBE_BUFFER : SM_Real_Probe_gfx11 <0x23>;
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index ad9af662307f..bee996d1b0df 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -6,18 +6,7 @@
//
//===----------------------------------------------------------------------===//
-def GPRIdxModeMatchClass : AsmOperandClass {
- let Name = "GPRIdxMode";
- let PredicateMethod = "isGPRIdxMode";
- let ParserMethod = "parseGPRIdxMode";
- let RenderMethod = "addImmOperands";
-}
-
-def GPRIdxMode : Operand<i32> {
- let PrintMethod = "printVGPRIndexMode";
- let ParserMatchClass = GPRIdxModeMatchClass;
- let OperandType = "OPERAND_IMMEDIATE";
-}
+def GPRIdxMode : CustomOperand<i32>;
class SOP_Pseudo<string opName, dag outs, dag ins, string asmOps,
list<dag> pattern=[]> :
@@ -402,11 +391,11 @@ let SubtargetPredicate = isGFX11Plus in {
// For s_sendmsg_rtn_* the src0 field encodes the message type directly; it
// is not an SGPR number.
def S_SENDMSG_RTN_B32 : SOP1_Pseudo<
- "s_sendmsg_rtn_b32", (outs SReg_32:$sdst), (ins SendMsgImm:$src0),
+ "s_sendmsg_rtn_b32", (outs SReg_32:$sdst), (ins SendMsg:$src0),
"$sdst, $src0", [(set i32:$sdst, (int_amdgcn_s_sendmsg_rtn timm:$src0))]
>;
def S_SENDMSG_RTN_B64 : SOP1_Pseudo<
- "s_sendmsg_rtn_b64", (outs SReg_64:$sdst), (ins SendMsgImm:$src0),
+ "s_sendmsg_rtn_b64", (outs SReg_64:$sdst), (ins SendMsg:$src0),
"$sdst, $src0", [(set i64:$sdst, (int_amdgcn_s_sendmsg_rtn timm:$src0))]
>;
}
@@ -795,7 +784,7 @@ class SOPK_32 <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
class SOPK_32_BR <string opName, list<dag> pattern=[]> : SOPK_Pseudo <
opName,
(outs),
- (ins sopp_brtarget:$simm16, SReg_32:$sdst),
+ (ins SOPPBrTarget:$simm16, SReg_32:$sdst),
"$sdst, $simm16",
pattern> {
let Defs = [EXEC];
@@ -875,7 +864,7 @@ let isCommutable = 1, DisableEncoding = "$src0",
let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in
def S_CBRANCH_I_FORK : SOPK_Pseudo <
"s_cbranch_i_fork",
- (outs), (ins SReg_64:$sdst, sopp_brtarget:$simm16),
+ (outs), (ins SReg_64:$sdst, SOPPBrTarget:$simm16),
"$sdst, $simm16"
>;
@@ -953,7 +942,7 @@ let SubtargetPredicate = isGFX9Plus in {
def S_CALL_B64 : SOPK_Pseudo<
"s_call_b64",
(outs SReg_64:$sdst),
- (ins sopp_brtarget:$simm16),
+ (ins SOPPBrTarget:$simm16),
"$sdst, $simm16"> {
let isCall = 1;
}
@@ -1175,7 +1164,7 @@ multiclass SOPP_With_Relaxation <string opName, dag ins,
def S_NOP : SOPP_Pseudo<"s_nop" , (ins i16imm:$simm16), "$simm16">;
let isTerminator = 1 in {
-def S_ENDPGM : SOPP_Pseudo<"s_endpgm", (ins EndpgmImm:$simm16), "$simm16", [], ""> {
+def S_ENDPGM : SOPP_Pseudo<"s_endpgm", (ins Endpgm:$simm16), "$simm16", [], ""> {
let isBarrier = 1;
let isReturn = 1;
let hasSideEffects = 1;
@@ -1206,60 +1195,60 @@ let SubtargetPredicate = isGFX10Plus in {
let isBranch = 1, SchedRW = [WriteBranch] in {
let isBarrier = 1 in {
defm S_BRANCH : SOPP_With_Relaxation<
- "s_branch" , (ins sopp_brtarget:$simm16), "$simm16",
+ "s_branch" , (ins SOPPBrTarget:$simm16), "$simm16",
[(br bb:$simm16)]>;
}
let Uses = [SCC] in {
defm S_CBRANCH_SCC0 : SOPP_With_Relaxation<
- "s_cbranch_scc0" , (ins sopp_brtarget:$simm16),
+ "s_cbranch_scc0" , (ins SOPPBrTarget:$simm16),
"$simm16"
>;
defm S_CBRANCH_SCC1 : SOPP_With_Relaxation <
- "s_cbranch_scc1" , (ins sopp_brtarget:$simm16),
+ "s_cbranch_scc1" , (ins SOPPBrTarget:$simm16),
"$simm16"
>;
} // End Uses = [SCC]
let Uses = [VCC] in {
defm S_CBRANCH_VCCZ : SOPP_With_Relaxation <
- "s_cbranch_vccz" , (ins sopp_brtarget:$simm16),
+ "s_cbranch_vccz" , (ins SOPPBrTarget:$simm16),
"$simm16"
>;
defm S_CBRANCH_VCCNZ : SOPP_With_Relaxation <
- "s_cbranch_vccnz" , (ins sopp_brtarget:$simm16),
+ "s_cbranch_vccnz" , (ins SOPPBrTarget:$simm16),
"$simm16"
>;
} // End Uses = [VCC]
let Uses = [EXEC] in {
defm S_CBRANCH_EXECZ : SOPP_With_Relaxation <
- "s_cbranch_execz" , (ins sopp_brtarget:$simm16),
+ "s_cbranch_execz" , (ins SOPPBrTarget:$simm16),
"$simm16"
>;
defm S_CBRANCH_EXECNZ : SOPP_With_Relaxation <
- "s_cbranch_execnz" , (ins sopp_brtarget:$simm16),
+ "s_cbranch_execnz" , (ins SOPPBrTarget:$simm16),
"$simm16"
>;
} // End Uses = [EXEC]
defm S_CBRANCH_CDBGSYS : SOPP_With_Relaxation <
- "s_cbranch_cdbgsys" , (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbgsys" , (ins SOPPBrTarget:$simm16),
"$simm16"
>;
defm S_CBRANCH_CDBGSYS_AND_USER : SOPP_With_Relaxation <
- "s_cbranch_cdbgsys_and_user" , (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbgsys_and_user" , (ins SOPPBrTarget:$simm16),
"$simm16"
>;
defm S_CBRANCH_CDBGSYS_OR_USER : SOPP_With_Relaxation <
- "s_cbranch_cdbgsys_or_user" , (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbgsys_or_user" , (ins SOPPBrTarget:$simm16),
"$simm16"
>;
defm S_CBRANCH_CDBGUSER : SOPP_With_Relaxation <
- "s_cbranch_cdbguser" , (ins sopp_brtarget:$simm16),
+ "s_cbranch_cdbguser" , (ins SOPPBrTarget:$simm16),
"$simm16"
>;
@@ -1284,7 +1273,7 @@ def S_WAKEUP : SOPP_Pseudo <"s_wakeup", (ins) > {
}
let hasSideEffects = 1 in
-def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins WAIT_FLAG:$simm16), "$simm16",
+def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
@@ -1305,12 +1294,12 @@ def S_SETPRIO : SOPP_Pseudo <"s_setprio", (ins i16imm:$simm16), "$simm16",
}
let Uses = [EXEC, M0] in {
-def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsgImm:$simm16), "$simm16",
+def S_SENDMSG : SOPP_Pseudo <"s_sendmsg" , (ins SendMsg:$simm16), "$simm16",
[(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]> {
let hasSideEffects = 1;
}
-def S_SENDMSGHALT : SOPP_Pseudo <"s_sendmsghalt" , (ins SendMsgImm:$simm16), "$simm16",
+def S_SENDMSGHALT : SOPP_Pseudo <"s_sendmsghalt" , (ins SendMsg:$simm16), "$simm16",
[(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]> {
let hasSideEffects = 1;
}
@@ -1367,7 +1356,7 @@ let SubtargetPredicate = isGFX10Plus in {
let fixed_imm = 1;
}
def S_WAITCNT_DEPCTR :
- SOPP_Pseudo <"s_waitcnt_depctr" , (ins DepCtrImm:$simm16), "$simm16">;
+ SOPP_Pseudo <"s_waitcnt_depctr" , (ins DepCtr:$simm16), "$simm16">;
let hasSideEffects = 0, Uses = [MODE], Defs = [MODE] in {
def S_ROUND_MODE :
@@ -1386,7 +1375,7 @@ let SubtargetPredicate = isGFX11Plus in {
"$simm16"> {
let hasSideEffects = 1;
}
- def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins DELAY_FLAG:$simm16),
+ def S_DELAY_ALU : SOPP_Pseudo<"s_delay_alu", (ins SDelayALU:$simm16),
"$simm16">;
} // End SubtargetPredicate = isGFX11Plus
diff --git a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
index 7573af597056..98fd16e59bf1 100644
--- a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp
@@ -15,13 +15,13 @@
using namespace llvm;
-/// The target which supports all AMD GPUs. This will eventually
-/// be deprecated and there will be a R600 target and a GCN target.
-Target &llvm::getTheAMDGPUTarget() {
+/// The target for R600 GPUs.
+Target &llvm::getTheR600Target() {
static Target TheAMDGPUTarget;
return TheAMDGPUTarget;
}
-/// The target for GCN GPUs
+
+/// The target for GCN GPUs.
Target &llvm::getTheGCNTarget() {
static Target TheGCNTarget;
return TheGCNTarget;
@@ -29,7 +29,7 @@ Target &llvm::getTheGCNTarget() {
/// Extern function to initialize the targets for the AMDGPU backend
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTargetInfo() {
- RegisterTarget<Triple::r600, false> R600(getTheAMDGPUTarget(), "r600",
+ RegisterTarget<Triple::r600, false> R600(getTheR600Target(), "r600",
"AMD GPUs HD2XXX-HD6XXX", "AMDGPU");
RegisterTarget<Triple::amdgcn, false> GCN(getTheGCNTarget(), "amdgcn",
"AMD GCN GPUs", "AMDGPU");
diff --git a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h
index 1e6dbd90b0c1..45470167a331 100644
--- a/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h
+++ b/llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.h
@@ -17,11 +17,10 @@ namespace llvm {
class Target;
-/// The target which supports all AMD GPUs. This will eventually
-/// be deprecated and there will be a R600 target and a GCN target.
-Target &getTheAMDGPUTarget();
+/// The target for R600 GPUs.
+Target &getTheR600Target();
-/// The target for GCN GPUs
+/// The target for GCN GPUs.
Target &getTheGCNTarget();
}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index c0fd5bc69325..ce40d82021cf 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -44,7 +44,7 @@ const CustomOperand<const MCSubtargetInfo &> Msg[] = {
{{"MSG_SAVEWAVE"}, ID_SAVEWAVE, isGFX8_GFX9_GFX10},
{{"MSG_STALL_WAVE_GEN"}, ID_STALL_WAVE_GEN, isGFX9Plus},
{{"MSG_HALT_WAVES"}, ID_HALT_WAVES, isGFX9Plus},
- {{"MSG_ORDERED_PS_DONE"}, ID_ORDERED_PS_DONE, isGFX9Plus},
+ {{"MSG_ORDERED_PS_DONE"}, ID_ORDERED_PS_DONE, isGFX9_GFX10},
{{"MSG_EARLY_PRIM_DEALLOC"}, ID_EARLY_PRIM_DEALLOC, isGFX9_GFX10},
{{"MSG_GS_ALLOC_REQ"}, ID_GS_ALLOC_REQ, isGFX9Plus},
{{"MSG_GET_DOORBELL"}, ID_GET_DOORBELL, isGFX9_GFX10},
@@ -115,10 +115,14 @@ const CustomOperand<const MCSubtargetInfo &> Opr[] = {
{{"HW_REG_HW_ID2"}, ID_HW_ID2, isGFX10Plus},
{{"HW_REG_POPS_PACKER"}, ID_POPS_PACKER, isGFX10},
{{""}},
- {{""}},
+ {{"HW_REG_PERF_SNAPSHOT_DATA"}, ID_PERF_SNAPSHOT_DATA, isGFX11Plus},
{{""}},
{{"HW_REG_SHADER_CYCLES"}, ID_SHADER_CYCLES, isGFX10_BEncoding},
+ // Register numbers reused in GFX11+
+ {{"HW_REG_PERF_SNAPSHOT_PC_LO"}, ID_PERF_SNAPSHOT_PC_LO, isGFX11Plus},
+ {{"HW_REG_PERF_SNAPSHOT_PC_HI"}, ID_PERF_SNAPSHOT_PC_HI, isGFX11Plus},
+
// GFX940 specific registers
{{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940},
{{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940},
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 4263e3e9eeac..296ea18b2a8d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -10,19 +10,22 @@
#include "AMDGPU.h"
#include "AMDGPUAsmUtils.h"
#include "AMDKernelCodeT.h"
-#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetParser.h"
+#include "llvm/TargetParser/TargetParser.h"
#include <optional>
#define GET_INSTRINFO_NAMED_OPS
@@ -92,6 +95,24 @@ unsigned getVmcntBitWidthHi(unsigned VersionMajor) {
return (VersionMajor == 9 || VersionMajor == 10) ? 2 : 0;
}
+/// \returns VmVsrc bit width
+inline unsigned getVmVsrcBitWidth() { return 3; }
+
+/// \returns VmVsrc bit shift
+inline unsigned getVmVsrcBitShift() { return 2; }
+
+/// \returns VaVdst bit width
+inline unsigned getVaVdstBitWidth() { return 4; }
+
+/// \returns VaVdst bit shift
+inline unsigned getVaVdstBitShift() { return 12; }
+
+/// \returns SaSdst bit width
+inline unsigned getSaSdstBitWidth() { return 1; }
+
+/// \returns SaSdst bit shift
+inline unsigned getSaSdstBitShift() { return 0; }
+
} // end namespace anonymous
namespace llvm {
@@ -150,56 +171,62 @@ unsigned getAmdhsaCodeObjectVersion() {
return AmdhsaCodeObjectVersion;
}
-unsigned getMultigridSyncArgImplicitArgPosition() {
- switch (AmdhsaCodeObjectVersion) {
- case 2:
- case 3:
- case 4:
+unsigned getCodeObjectVersion(const Module &M) {
+ if (auto Ver = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("amdgpu_code_object_version"))) {
+ return (unsigned)Ver->getZExtValue() / 100;
+ }
+
+ // Default code object version.
+ return AMDHSA_COV4;
+}
+
+unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) {
+ switch (CodeObjectVersion) {
+ case AMDHSA_COV2:
+ case AMDHSA_COV3:
+ case AMDHSA_COV4:
return 48;
- case 5:
- return AMDGPU::ImplicitArg::MULTIGRID_SYNC_ARG_OFFSET;
+ case AMDHSA_COV5:
default:
- llvm_unreachable("Unexpected code object version");
- return 0;
+ return AMDGPU::ImplicitArg::MULTIGRID_SYNC_ARG_OFFSET;
}
}
// FIXME: All such magic numbers about the ABI should be in a
// central TD file.
-unsigned getHostcallImplicitArgPosition() {
- switch (AmdhsaCodeObjectVersion) {
- case 2:
- case 3:
- case 4:
+unsigned getHostcallImplicitArgPosition(unsigned CodeObjectVersion) {
+ switch (CodeObjectVersion) {
+ case AMDHSA_COV2:
+ case AMDHSA_COV3:
+ case AMDHSA_COV4:
return 24;
- case 5:
- return AMDGPU::ImplicitArg::HOSTCALL_PTR_OFFSET;
+ case AMDHSA_COV5:
default:
- llvm_unreachable("Unexpected code object version");
- return 0;
+ return AMDGPU::ImplicitArg::HOSTCALL_PTR_OFFSET;
}
}
-unsigned getDefaultQueueImplicitArgPosition() {
- switch (AmdhsaCodeObjectVersion) {
- case 2:
- case 3:
- case 4:
+unsigned getDefaultQueueImplicitArgPosition(unsigned CodeObjectVersion) {
+ switch (CodeObjectVersion) {
+ case AMDHSA_COV2:
+ case AMDHSA_COV3:
+ case AMDHSA_COV4:
return 32;
- case 5:
+ case AMDHSA_COV5:
default:
return AMDGPU::ImplicitArg::DEFAULT_QUEUE_OFFSET;
}
}
-unsigned getCompletionActionImplicitArgPosition() {
- switch (AmdhsaCodeObjectVersion) {
- case 2:
- case 3:
- case 4:
+unsigned getCompletionActionImplicitArgPosition(unsigned CodeObjectVersion) {
+ switch (CodeObjectVersion) {
+ case AMDHSA_COV2:
+ case AMDHSA_COV3:
+ case AMDHSA_COV4:
return 40;
- case 5:
+ case AMDHSA_COV5:
default:
return AMDGPU::ImplicitArg::COMPLETION_ACTION_OFFSET;
}
@@ -568,9 +595,10 @@ std::optional<unsigned> InstInfo::getInvalidCompOperandIndex(
unsigned CompOprIdx;
for (CompOprIdx = 0; CompOprIdx < Component::MAX_OPR_NUM; ++CompOprIdx) {
- unsigned BanksNum = BANKS_NUM[CompOprIdx];
+ unsigned BanksMasks = VOPD_VGPR_BANK_MASKS[CompOprIdx];
if (OpXRegs[CompOprIdx] && OpYRegs[CompOprIdx] &&
- (OpXRegs[CompOprIdx] % BanksNum == OpYRegs[CompOprIdx] % BanksNum))
+ ((OpXRegs[CompOprIdx] & BanksMasks) ==
+ (OpYRegs[CompOprIdx] & BanksMasks)))
return CompOprIdx;
}
@@ -624,7 +652,7 @@ namespace IsaInfo {
AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
: STI(STI), XnackSetting(TargetIDSetting::Any),
- SramEccSetting(TargetIDSetting::Any) {
+ SramEccSetting(TargetIDSetting::Any), CodeObjectVersion(0) {
if (!STI.getFeatureBits().test(FeatureSupportsXNACK))
XnackSetting = TargetIDSetting::Unsupported;
if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC))
@@ -735,9 +763,9 @@ std::string AMDGPUTargetID::toString() const {
.str();
std::string Features;
- if (std::optional<uint8_t> HsaAbiVersion = getHsaAbiVersion(&STI)) {
- switch (*HsaAbiVersion) {
- case ELF::ELFABIVERSION_AMDGPU_HSA_V2:
+ if (STI.getTargetTriple().getOS() == Triple::AMDHSA) {
+ switch (CodeObjectVersion) {
+ case AMDGPU::AMDHSA_COV2:
// Code object V2 only supported specific processors and had fixed
// settings for the XNACK.
if (Processor == "gfx600") {
@@ -785,7 +813,7 @@ std::string AMDGPUTargetID::toString() const {
Twine(Processor));
}
break;
- case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
+ case AMDGPU::AMDHSA_COV3:
// xnack.
if (isXnackOnOrAny())
Features += "+xnack";
@@ -794,8 +822,8 @@ std::string AMDGPUTargetID::toString() const {
if (isSramEccOnOrAny())
Features += "+sram-ecc";
break;
- case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
- case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
+ case AMDGPU::AMDHSA_COV4:
+ case AMDGPU::AMDHSA_COV5:
// sramecc.
if (getSramEccSetting() == TargetIDSetting::Off)
Features += ":sramecc-";
@@ -1205,16 +1233,16 @@ bool shouldEmitConstantsToTextSection(const Triple &TT) {
return TT.getArch() == Triple::r600;
}
-std::pair<int, int> getIntegerPairAttribute(const Function &F,
- StringRef Name,
- std::pair<int, int> Default,
- bool OnlyFirstRequired) {
+std::pair<unsigned, unsigned>
+getIntegerPairAttribute(const Function &F, StringRef Name,
+ std::pair<unsigned, unsigned> Default,
+ bool OnlyFirstRequired) {
Attribute A = F.getFnAttribute(Name);
if (!A.isStringAttribute())
return Default;
LLVMContext &Ctx = F.getContext();
- std::pair<int, int> Ints = Default;
+ std::pair<unsigned, unsigned> Ints = Default;
std::pair<StringRef, StringRef> Strs = A.getValueAsString().split(',');
if (Strs.first.trim().getAsInteger(0, Ints.first)) {
Ctx.emitError("can't parse first integer attribute " + Name);
@@ -1491,6 +1519,42 @@ int encodeDepCtr(const StringRef Name, int64_t Val, unsigned &UsedOprMask,
STI);
}
+unsigned decodeFieldVmVsrc(unsigned Encoded) {
+ return unpackBits(Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth());
+}
+
+unsigned decodeFieldVaVdst(unsigned Encoded) {
+ return unpackBits(Encoded, getVaVdstBitShift(), getVaVdstBitWidth());
+}
+
+unsigned decodeFieldSaSdst(unsigned Encoded) {
+ return unpackBits(Encoded, getSaSdstBitShift(), getSaSdstBitWidth());
+}
+
+unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc) {
+ return packBits(VmVsrc, Encoded, getVmVsrcBitShift(), getVmVsrcBitWidth());
+}
+
+unsigned encodeFieldVmVsrc(unsigned VmVsrc) {
+ return encodeFieldVmVsrc(0xffff, VmVsrc);
+}
+
+unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst) {
+ return packBits(VaVdst, Encoded, getVaVdstBitShift(), getVaVdstBitWidth());
+}
+
+unsigned encodeFieldVaVdst(unsigned VaVdst) {
+ return encodeFieldVaVdst(0xffff, VaVdst);
+}
+
+unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst) {
+ return packBits(SaSdst, Encoded, getSaSdstBitShift(), getSaSdstBitWidth());
+}
+
+unsigned encodeFieldSaSdst(unsigned SaSdst) {
+ return encodeFieldSaSdst(0xffff, SaSdst);
+}
+
} // namespace DepCtr
//===----------------------------------------------------------------------===//
@@ -1913,44 +1977,53 @@ bool isKernelCC(const Function *Func) {
}
bool hasXNACK(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureXNACK];
+ return STI.hasFeature(AMDGPU::FeatureXNACK);
}
bool hasSRAMECC(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureSRAMECC];
+ return STI.hasFeature(AMDGPU::FeatureSRAMECC);
}
bool hasMIMG_R128(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128] && !STI.getFeatureBits()[AMDGPU::FeatureR128A16];
+ return STI.hasFeature(AMDGPU::FeatureMIMG_R128) && !STI.hasFeature(AMDGPU::FeatureR128A16);
}
bool hasA16(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureA16];
+ return STI.hasFeature(AMDGPU::FeatureA16);
}
bool hasG16(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureG16];
+ return STI.hasFeature(AMDGPU::FeatureG16);
}
bool hasPackedD16(const MCSubtargetInfo &STI) {
- return !STI.getFeatureBits()[AMDGPU::FeatureUnpackedD16VMem] && !isCI(STI) &&
+ return !STI.hasFeature(AMDGPU::FeatureUnpackedD16VMem) && !isCI(STI) &&
!isSI(STI);
}
+unsigned getNSAMaxSize(const MCSubtargetInfo &STI) {
+ auto Version = getIsaVersion(STI.getCPU());
+ if (Version.Major == 10)
+ return Version.Minor >= 3 ? 13 : 5;
+ if (Version.Major == 11)
+ return 5;
+ return 0;
+}
+
bool isSI(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
+ return STI.hasFeature(AMDGPU::FeatureSouthernIslands);
}
bool isCI(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureSeaIslands];
+ return STI.hasFeature(AMDGPU::FeatureSeaIslands);
}
bool isVI(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureVolcanicIslands];
+ return STI.hasFeature(AMDGPU::FeatureVolcanicIslands);
}
bool isGFX9(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX9];
+ return STI.hasFeature(AMDGPU::FeatureGFX9);
}
bool isGFX9_GFX10(const MCSubtargetInfo &STI) {
@@ -1970,7 +2043,7 @@ bool isGFX9Plus(const MCSubtargetInfo &STI) {
}
bool isGFX10(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX10];
+ return STI.hasFeature(AMDGPU::FeatureGFX10);
}
bool isGFX10Plus(const MCSubtargetInfo &STI) {
@@ -1978,7 +2051,7 @@ bool isGFX10Plus(const MCSubtargetInfo &STI) {
}
bool isGFX11(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX11];
+ return STI.hasFeature(AMDGPU::FeatureGFX11);
}
bool isGFX11Plus(const MCSubtargetInfo &STI) {
@@ -1998,39 +2071,39 @@ bool isGFX10Before1030(const MCSubtargetInfo &STI) {
}
bool isGCN3Encoding(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
+ return STI.hasFeature(AMDGPU::FeatureGCN3Encoding);
}
bool isGFX10_AEncoding(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX10_AEncoding];
+ return STI.hasFeature(AMDGPU::FeatureGFX10_AEncoding);
}
bool isGFX10_BEncoding(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding];
+ return STI.hasFeature(AMDGPU::FeatureGFX10_BEncoding);
}
bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
+ return STI.hasFeature(AMDGPU::FeatureGFX10_3Insts);
}
bool isGFX90A(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX90AInsts];
+ return STI.hasFeature(AMDGPU::FeatureGFX90AInsts);
}
bool isGFX940(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureGFX940Insts];
+ return STI.hasFeature(AMDGPU::FeatureGFX940Insts);
}
bool hasArchitectedFlatScratch(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureArchitectedFlatScratch];
+ return STI.hasFeature(AMDGPU::FeatureArchitectedFlatScratch);
}
bool hasMAIInsts(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureMAIInsts];
+ return STI.hasFeature(AMDGPU::FeatureMAIInsts);
}
bool hasVOPD(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureVOPD];
+ return STI.hasFeature(AMDGPU::FeatureVOPD);
}
int32_t getTotalNumVGPRs(bool has90AInsts, int32_t ArgNumAGPR,
@@ -2350,11 +2423,15 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) {
return getRegBitWidth(RC.getID());
}
+unsigned getRegBitWidth(const TargetRegisterClass &RC) {
+ return getRegBitWidth(RC.getID());
+}
+
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo) {
assert(OpNo < Desc.NumOperands);
unsigned RCID = Desc.operands()[OpNo].RegClass;
- return getRegBitWidth(MRI->getRegClass(RCID)) / 8;
+ return getRegBitWidth(RCID) / 8;
}
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
@@ -2362,15 +2439,15 @@ bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
return true;
uint64_t Val = static_cast<uint64_t>(Literal);
- return (Val == DoubleToBits(0.0)) ||
- (Val == DoubleToBits(1.0)) ||
- (Val == DoubleToBits(-1.0)) ||
- (Val == DoubleToBits(0.5)) ||
- (Val == DoubleToBits(-0.5)) ||
- (Val == DoubleToBits(2.0)) ||
- (Val == DoubleToBits(-2.0)) ||
- (Val == DoubleToBits(4.0)) ||
- (Val == DoubleToBits(-4.0)) ||
+ return (Val == llvm::bit_cast<uint64_t>(0.0)) ||
+ (Val == llvm::bit_cast<uint64_t>(1.0)) ||
+ (Val == llvm::bit_cast<uint64_t>(-1.0)) ||
+ (Val == llvm::bit_cast<uint64_t>(0.5)) ||
+ (Val == llvm::bit_cast<uint64_t>(-0.5)) ||
+ (Val == llvm::bit_cast<uint64_t>(2.0)) ||
+ (Val == llvm::bit_cast<uint64_t>(-2.0)) ||
+ (Val == llvm::bit_cast<uint64_t>(4.0)) ||
+ (Val == llvm::bit_cast<uint64_t>(-4.0)) ||
(Val == 0x3fc45f306dc9c882 && HasInv2Pi);
}
@@ -2388,15 +2465,15 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
// floating-point, so it is a legal inline immediate.
uint32_t Val = static_cast<uint32_t>(Literal);
- return (Val == FloatToBits(0.0f)) ||
- (Val == FloatToBits(1.0f)) ||
- (Val == FloatToBits(-1.0f)) ||
- (Val == FloatToBits(0.5f)) ||
- (Val == FloatToBits(-0.5f)) ||
- (Val == FloatToBits(2.0f)) ||
- (Val == FloatToBits(-2.0f)) ||
- (Val == FloatToBits(4.0f)) ||
- (Val == FloatToBits(-4.0f)) ||
+ return (Val == llvm::bit_cast<uint32_t>(0.0f)) ||
+ (Val == llvm::bit_cast<uint32_t>(1.0f)) ||
+ (Val == llvm::bit_cast<uint32_t>(-1.0f)) ||
+ (Val == llvm::bit_cast<uint32_t>(0.5f)) ||
+ (Val == llvm::bit_cast<uint32_t>(-0.5f)) ||
+ (Val == llvm::bit_cast<uint32_t>(2.0f)) ||
+ (Val == llvm::bit_cast<uint32_t>(-2.0f)) ||
+ (Val == llvm::bit_cast<uint32_t>(4.0f)) ||
+ (Val == llvm::bit_cast<uint32_t>(-4.0f)) ||
(Val == 0x3e22f983 && HasInv2Pi);
}
@@ -2475,10 +2552,35 @@ bool isArgPassedInSGPR(const Argument *A) {
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
case CallingConv::AMDGPU_Gfx:
- // For non-compute shaders, SGPR inputs are marked with either inreg or byval.
- // Everything else is in VGPRs.
- return F->getAttributes().hasParamAttr(A->getArgNo(), Attribute::InReg) ||
- F->getAttributes().hasParamAttr(A->getArgNo(), Attribute::ByVal);
+ // For non-compute shaders, SGPR inputs are marked with either inreg or
+ // byval. Everything else is in VGPRs.
+ return A->hasAttribute(Attribute::InReg) ||
+ A->hasAttribute(Attribute::ByVal);
+ default:
+ // TODO: Should calls support inreg for SGPR inputs?
+ return false;
+ }
+}
+
+bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo) {
+ // Arguments to compute shaders are never a source of divergence.
+ CallingConv::ID CC = CB->getCallingConv();
+ switch (CC) {
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::SPIR_KERNEL:
+ return true;
+ case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_LS:
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_ES:
+ case CallingConv::AMDGPU_GS:
+ case CallingConv::AMDGPU_PS:
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_Gfx:
+ // For non-compute shaders, SGPR inputs are marked with either inreg or
+ // byval. Everything else is in VGPRs.
+ return CB->paramHasAttr(ArgNo, Attribute::InReg) ||
+ CB->paramHasAttr(ArgNo, Attribute::ByVal);
default:
// TODO: Should calls support inreg for SGPR inputs?
return false;
@@ -2556,77 +2658,6 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST) {
return 13;
}
-// Given Imm, split it into the values to put into the SOffset and ImmOffset
-// fields in an MUBUF instruction. Return false if it is not possible (due to a
-// hardware bug needing a workaround).
-//
-// The required alignment ensures that individual address components remain
-// aligned if they are aligned to begin with. It also ensures that additional
-// offsets within the given alignment can be added to the resulting ImmOffset.
-bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
- const GCNSubtarget *Subtarget, Align Alignment) {
- const uint32_t MaxImm = alignDown(4095, Alignment.value());
- uint32_t Overflow = 0;
-
- if (Imm > MaxImm) {
- if (Imm <= MaxImm + 64) {
- // Use an SOffset inline constant for 4..64
- Overflow = Imm - MaxImm;
- Imm = MaxImm;
- } else {
- // Try to keep the same value in SOffset for adjacent loads, so that
- // the corresponding register contents can be re-used.
- //
- // Load values with all low-bits (except for alignment bits) set into
- // SOffset, so that a larger range of values can be covered using
- // s_movk_i32.
- //
- // Atomic operations fail to work correctly when individual address
- // components are unaligned, even if their sum is aligned.
- uint32_t High = (Imm + Alignment.value()) & ~4095;
- uint32_t Low = (Imm + Alignment.value()) & 4095;
- Imm = Low;
- Overflow = High - Alignment.value();
- }
- }
-
- // There is a hardware bug in SI and CI which prevents address clamping in
- // MUBUF instructions from working correctly with SOffsets. The immediate
- // offset is unaffected.
- if (Overflow > 0 &&
- Subtarget->getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS)
- return false;
-
- ImmOffset = Imm;
- SOffset = Overflow;
- return true;
-}
-
-SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
- *this = getDefaultForCallingConv(F.getCallingConv());
-
- StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
- if (!IEEEAttr.empty())
- IEEE = IEEEAttr == "true";
-
- StringRef DX10ClampAttr
- = F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
- if (!DX10ClampAttr.empty())
- DX10Clamp = DX10ClampAttr == "true";
-
- StringRef DenormF32Attr = F.getFnAttribute("denormal-fp-math-f32").getValueAsString();
- if (!DenormF32Attr.empty())
- FP32Denormals = parseDenormalFPAttribute(DenormF32Attr);
-
- StringRef DenormAttr = F.getFnAttribute("denormal-fp-math").getValueAsString();
- if (!DenormAttr.empty()) {
- DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr);
- if (DenormF32Attr.empty())
- FP32Denormals = DenormMode;
- FP64FP16Denormals = DenormMode;
- }
-}
-
namespace {
struct SourceOfDivergence {
@@ -2634,7 +2665,13 @@ struct SourceOfDivergence {
};
const SourceOfDivergence *lookupSourceOfDivergence(unsigned Intr);
+struct AlwaysUniform {
+ unsigned Intr;
+};
+const AlwaysUniform *lookupAlwaysUniform(unsigned Intr);
+
#define GET_SourcesOfDivergence_IMPL
+#define GET_UniformIntrinsics_IMPL
#define GET_Gfx9BufferFormat_IMPL
#define GET_Gfx10BufferFormat_IMPL
#define GET_Gfx11PlusBufferFormat_IMPL
@@ -2646,6 +2683,10 @@ bool isIntrinsicSourceOfDivergence(unsigned IntrID) {
return lookupSourceOfDivergence(IntrID);
}
+bool isIntrinsicAlwaysUniform(unsigned IntrID) {
+ return lookupAlwaysUniform(IntrID);
+}
+
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
uint8_t NumFormat,
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 4d3423592353..bdf7ccad9c76 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -10,8 +10,9 @@
#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
#include "SIDefines.h"
-#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
#include <array>
#include <functional>
@@ -24,13 +25,13 @@ namespace llvm {
struct Align;
class Argument;
class Function;
-class GCNSubtarget;
class GlobalValue;
class MCInstrInfo;
class MCRegisterClass;
class MCRegisterInfo;
class MCSubtargetInfo;
class StringRef;
+class TargetRegisterClass;
class Triple;
class raw_ostream;
@@ -42,6 +43,13 @@ namespace AMDGPU {
struct IsaVersion;
+enum {
+ AMDHSA_COV2 = 2,
+ AMDHSA_COV3 = 3,
+ AMDHSA_COV4 = 4,
+ AMDHSA_COV5 = 5
+};
+
/// \returns HSA OS ABI Version identification.
std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI);
/// \returns True if HSA OS ABI Version identification is 2,
@@ -61,17 +69,20 @@ bool isHsaAbiVersion5(const MCSubtargetInfo *STI);
bool isHsaAbiVersion3AndAbove(const MCSubtargetInfo *STI);
/// \returns The offset of the multigrid_sync_arg argument from implicitarg_ptr
-unsigned getMultigridSyncArgImplicitArgPosition();
+unsigned getMultigridSyncArgImplicitArgPosition(unsigned COV);
/// \returns The offset of the hostcall pointer argument from implicitarg_ptr
-unsigned getHostcallImplicitArgPosition();
+unsigned getHostcallImplicitArgPosition(unsigned COV);
-unsigned getDefaultQueueImplicitArgPosition();
-unsigned getCompletionActionImplicitArgPosition();
+unsigned getDefaultQueueImplicitArgPosition(unsigned COV);
+unsigned getCompletionActionImplicitArgPosition(unsigned COV);
/// \returns Code object version.
unsigned getAmdhsaCodeObjectVersion();
+/// \returns Code object version.
+unsigned getCodeObjectVersion(const Module &M);
+
struct GcnBufferFormatInfo {
unsigned Format;
unsigned BitsPerComp;
@@ -116,6 +127,7 @@ private:
const MCSubtargetInfo &STI;
TargetIDSetting XnackSetting;
TargetIDSetting SramEccSetting;
+ unsigned CodeObjectVersion;
public:
explicit AMDGPUTargetID(const MCSubtargetInfo &STI);
@@ -145,6 +157,10 @@ public:
return XnackSetting;
}
+ void setCodeObjectVersion(unsigned COV) {
+ CodeObjectVersion = COV;
+ }
+
/// Sets xnack setting to \p NewXnackSetting.
void setXnackSetting(TargetIDSetting NewXnackSetting) {
XnackSetting = NewXnackSetting;
@@ -339,6 +355,7 @@ struct MIMGBaseOpcodeInfo {
bool HasD16;
bool MSAA;
bool BVH;
+ bool A16;
};
LLVM_READONLY
@@ -544,8 +561,9 @@ enum Component : unsigned {
MAX_OPR_NUM = DST_NUM + MAX_SRC_NUM
};
-// Number of VGPR banks per VOPD component operand.
-constexpr unsigned BANKS_NUM[] = {2, 4, 4, 2};
+// LSB mask for VGPR banks per VOPD component operand.
+// 4 banks result in a mask 3, setting 2 lower bits.
+constexpr unsigned VOPD_VGPR_BANK_MASKS[] = {1, 3, 3, 1};
enum ComponentIndex : unsigned { X = 0, Y = 1 };
constexpr unsigned COMPONENTS[] = {ComponentIndex::X, ComponentIndex::Y};
@@ -555,7 +573,7 @@ constexpr unsigned COMPONENTS_NUM = 2;
class ComponentProps {
private:
unsigned SrcOperandsNum = 0;
- std::optional<unsigned> MandatoryLiteralIdx;
+ unsigned MandatoryLiteralIdx = ~0u;
bool HasSrc2Acc = false;
public:
@@ -571,13 +589,13 @@ public:
}
// Return true iif this component has a mandatory literal.
- bool hasMandatoryLiteral() const { return MandatoryLiteralIdx.has_value(); }
+ bool hasMandatoryLiteral() const { return MandatoryLiteralIdx != ~0u; }
// If this component has a mandatory literal, return component operand
// index of this literal (i.e. either Component::SRC1 or Component::SRC2).
unsigned getMandatoryLiteralCompOperandIndex() const {
assert(hasMandatoryLiteral());
- return *MandatoryLiteralIdx;
+ return MandatoryLiteralIdx;
}
// Return true iif this component has operand
@@ -593,8 +611,7 @@ public:
private:
bool hasMandatoryLiteralAt(unsigned CompSrcIdx) const {
assert(CompSrcIdx < Component::MAX_SRC_NUM);
- return hasMandatoryLiteral() &&
- *MandatoryLiteralIdx == Component::DST_NUM + CompSrcIdx;
+ return MandatoryLiteralIdx == Component::DST_NUM + CompSrcIdx;
}
};
@@ -811,10 +828,10 @@ int getIntegerAttribute(const Function &F, StringRef Name, int Default);
/// \returns \p Default and emits error if one of the requested values cannot be
/// converted to integer, or \p OnlyFirstRequired is false and "second" value is
/// not present.
-std::pair<int, int> getIntegerPairAttribute(const Function &F,
- StringRef Name,
- std::pair<int, int> Default,
- bool OnlyFirstRequired = false);
+std::pair<unsigned, unsigned>
+getIntegerPairAttribute(const Function &F, StringRef Name,
+ std::pair<unsigned, unsigned> Default,
+ bool OnlyFirstRequired = false);
/// Represents the counter values to wait for in an s_waitcnt instruction.
///
@@ -847,11 +864,6 @@ struct Waitcnt {
return VsCnt != ~0u;
}
- bool dominates(const Waitcnt &Other) const {
- return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
- LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
- }
-
Waitcnt combined(const Waitcnt &Other) const {
return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt),
std::min(LgkmCnt, Other.LgkmCnt),
@@ -965,6 +977,33 @@ bool isSymbolicDepCtrEncoding(unsigned Code, bool &HasNonDefaultVal,
bool decodeDepCtr(unsigned Code, int &Id, StringRef &Name, unsigned &Val,
bool &IsDefault, const MCSubtargetInfo &STI);
+/// \returns Decoded VaVdst from given immediate \p Encoded.
+unsigned decodeFieldVaVdst(unsigned Encoded);
+
+/// \returns Decoded VmVsrc from given immediate \p Encoded.
+unsigned decodeFieldVmVsrc(unsigned Encoded);
+
+/// \returns Decoded SaSdst from given immediate \p Encoded.
+unsigned decodeFieldSaSdst(unsigned Encoded);
+
+/// \returns \p VmVsrc as an encoded Depctr immediate.
+unsigned encodeFieldVmVsrc(unsigned VmVsrc);
+
+/// \returns \p Encoded combined with encoded \p VmVsrc.
+unsigned encodeFieldVmVsrc(unsigned Encoded, unsigned VmVsrc);
+
+/// \returns \p VaVdst as an encoded Depctr immediate.
+unsigned encodeFieldVaVdst(unsigned VaVdst);
+
+/// \returns \p Encoded combined with encoded \p VaVdst.
+unsigned encodeFieldVaVdst(unsigned Encoded, unsigned VaVdst);
+
+/// \returns \p SaSdst as an encoded Depctr immediate.
+unsigned encodeFieldSaSdst(unsigned SaSdst);
+
+/// \returns \p Encoded combined with encoded \p SaSdst.
+unsigned encodeFieldSaSdst(unsigned Encoded, unsigned SaSdst);
+
} // namespace DepCtr
namespace Exp {
@@ -1102,6 +1141,7 @@ bool hasMIMG_R128(const MCSubtargetInfo &STI);
bool hasA16(const MCSubtargetInfo &STI);
bool hasG16(const MCSubtargetInfo &STI);
bool hasPackedD16(const MCSubtargetInfo &STI);
+unsigned getNSAMaxSize(const MCSubtargetInfo &STI);
bool isSI(const MCSubtargetInfo &STI);
bool isCI(const MCSubtargetInfo &STI);
@@ -1162,6 +1202,9 @@ unsigned getRegBitWidth(unsigned RCID);
/// Get the size in bits of a register from the register class \p RC.
unsigned getRegBitWidth(const MCRegisterClass &RC);
+/// Get the size in bits of a register from the register class \p RC.
+unsigned getRegBitWidth(const TargetRegisterClass &RC);
+
/// Get size of register operand
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
unsigned OpNo);
@@ -1244,6 +1287,8 @@ bool isFoldableLiteralV216(int32_t Literal, bool HasInv2Pi);
bool isArgPassedInSGPR(const Argument *Arg);
+bool isArgPassedInSGPR(const CallBase *CB, unsigned ArgNo);
+
LLVM_READONLY
bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
int64_t EncodedOffset);
@@ -1282,10 +1327,6 @@ unsigned getNumFlatOffsetBits(const MCSubtargetInfo &ST);
/// not the encoded offset.
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
-bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
- const GCNSubtarget *Subtarget,
- Align Alignment = Align(4));
-
LLVM_READNONE
inline bool isLegal64BitDPPControl(unsigned DC) {
return DC >= DPP::ROW_NEWBCAST_FIRST && DC <= DPP::ROW_NEWBCAST_LAST;
@@ -1294,109 +1335,8 @@ inline bool isLegal64BitDPPControl(unsigned DC) {
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
-// Track defaults for fields in the MODE register.
-struct SIModeRegisterDefaults {
- /// Floating point opcodes that support exception flag gathering quiet and
- /// propagate signaling NaN inputs per IEEE 754-2008. Min_dx10 and max_dx10
- /// become IEEE 754- 2008 compliant due to signaling NaN propagation and
- /// quieting.
- bool IEEE : 1;
-
- /// Used by the vector ALU to force DX10-style treatment of NaNs: when set,
- /// clamp NaN to zero; otherwise, pass NaN through.
- bool DX10Clamp : 1;
-
- /// If this is set, neither input or output denormals are flushed for most f32
- /// instructions.
- DenormalMode FP32Denormals;
-
- /// If this is set, neither input or output denormals are flushed for both f64
- /// and f16/v2f16 instructions.
- DenormalMode FP64FP16Denormals;
-
- SIModeRegisterDefaults() :
- IEEE(true),
- DX10Clamp(true),
- FP32Denormals(DenormalMode::getIEEE()),
- FP64FP16Denormals(DenormalMode::getIEEE()) {}
-
- SIModeRegisterDefaults(const Function &F);
-
- static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
- SIModeRegisterDefaults Mode;
- Mode.IEEE = !AMDGPU::isShader(CC);
- return Mode;
- }
-
- bool operator ==(const SIModeRegisterDefaults Other) const {
- return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp &&
- FP32Denormals == Other.FP32Denormals &&
- FP64FP16Denormals == Other.FP64FP16Denormals;
- }
-
- bool allFP32Denormals() const {
- return FP32Denormals == DenormalMode::getIEEE();
- }
-
- bool allFP64FP16Denormals() const {
- return FP64FP16Denormals == DenormalMode::getIEEE();
- }
-
- /// Get the encoding value for the FP_DENORM bits of the mode register for the
- /// FP32 denormal mode.
- uint32_t fpDenormModeSPValue() const {
- if (FP32Denormals == DenormalMode::getPreserveSign())
- return FP_DENORM_FLUSH_IN_FLUSH_OUT;
- if (FP32Denormals.Output == DenormalMode::PreserveSign)
- return FP_DENORM_FLUSH_OUT;
- if (FP32Denormals.Input == DenormalMode::PreserveSign)
- return FP_DENORM_FLUSH_IN;
- return FP_DENORM_FLUSH_NONE;
- }
-
- /// Get the encoding value for the FP_DENORM bits of the mode register for the
- /// FP64/FP16 denormal mode.
- uint32_t fpDenormModeDPValue() const {
- if (FP64FP16Denormals == DenormalMode::getPreserveSign())
- return FP_DENORM_FLUSH_IN_FLUSH_OUT;
- if (FP64FP16Denormals.Output == DenormalMode::PreserveSign)
- return FP_DENORM_FLUSH_OUT;
- if (FP64FP16Denormals.Input == DenormalMode::PreserveSign)
- return FP_DENORM_FLUSH_IN;
- return FP_DENORM_FLUSH_NONE;
- }
-
- /// Returns true if a flag is compatible if it's enabled in the callee, but
- /// disabled in the caller.
- static bool oneWayCompatible(bool CallerMode, bool CalleeMode) {
- return CallerMode == CalleeMode || (!CallerMode && CalleeMode);
- }
-
- // FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
- // be able to override.
- bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
- if (DX10Clamp != CalleeMode.DX10Clamp)
- return false;
- if (IEEE != CalleeMode.IEEE)
- return false;
-
- // Allow inlining denormals enabled into denormals flushed functions.
- return oneWayCompatible(FP64FP16Denormals.Input !=
- DenormalMode::PreserveSign,
- CalleeMode.FP64FP16Denormals.Input !=
- DenormalMode::PreserveSign) &&
- oneWayCompatible(FP64FP16Denormals.Output !=
- DenormalMode::PreserveSign,
- CalleeMode.FP64FP16Denormals.Output !=
- DenormalMode::PreserveSign) &&
- oneWayCompatible(FP32Denormals.Input != DenormalMode::PreserveSign,
- CalleeMode.FP32Denormals.Input !=
- DenormalMode::PreserveSign) &&
- oneWayCompatible(FP32Denormals.Output != DenormalMode::PreserveSign,
- CalleeMode.FP32Denormals.Output !=
- DenormalMode::PreserveSign);
- }
-};
+/// \returns true if the intrinsic is uniform
+bool isIntrinsicAlwaysUniform(unsigned IntrID);
} // end namespace AMDGPU
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
index b1418253fd13..cbdbf1c16f9f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.cpp
@@ -31,65 +31,25 @@ Align getAlign(DataLayout const &DL, const GlobalVariable *GV) {
GV->getValueType());
}
-static bool shouldLowerLDSToStruct(const GlobalVariable &GV,
- const Function *F) {
- // We are not interested in kernel LDS lowering for module LDS itself.
- if (F && GV.getName() == "llvm.amdgcn.module.lds")
+bool isDynamicLDS(const GlobalVariable &GV) {
+ // external zero size addrspace(3) without initializer implies cuda/hip extern
+ // __shared__ the semantics for such a variable appears to be that all extern
+ // __shared__ variables alias one another. This hits different handling.
+ const Module *M = GV.getParent();
+ const DataLayout &DL = M->getDataLayout();
+ if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
return false;
-
- bool Ret = false;
- SmallPtrSet<const User *, 8> Visited;
- SmallVector<const User *, 16> Stack(GV.users());
-
- assert(!F || isKernelCC(F));
-
- while (!Stack.empty()) {
- const User *V = Stack.pop_back_val();
- Visited.insert(V);
-
- if (isa<GlobalValue>(V)) {
- // This use of the LDS variable is the initializer of a global variable.
- // This is ill formed. The address of an LDS variable is kernel dependent
- // and unknown until runtime. It can't be written to a global variable.
- continue;
- }
-
- if (auto *I = dyn_cast<Instruction>(V)) {
- const Function *UF = I->getFunction();
- if (UF == F) {
- // Used from this kernel, we want to put it into the structure.
- Ret = true;
- } else if (!F) {
- // For module LDS lowering, lowering is required if the user instruction
- // is from non-kernel function.
- Ret |= !isKernelCC(UF);
- }
- continue;
- }
-
- // User V should be a constant, recursively visit users of V.
- assert(isa<Constant>(V) && "Expected a constant.");
- append_range(Stack, V->users());
}
-
- return Ret;
+ uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
+ return GV.hasExternalLinkage() && AllocSize == 0;
}
bool isLDSVariableToLower(const GlobalVariable &GV) {
if (GV.getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS) {
return false;
}
- if (!GV.hasInitializer()) {
- // addrspace(3) without initializer implies cuda/hip extern __shared__
- // the semantics for such a variable appears to be that all extern
- // __shared__ variables alias one another, in which case this transform
- // is not required
- return false;
- }
- if (!isa<UndefValue>(GV.getInitializer())) {
- // Initializers are unimplemented for LDS address space.
- // Leave such variables in place for consistent error reporting.
- return false;
+ if (isDynamicLDS(GV)) {
+ return true;
}
if (GV.isConstant()) {
// A constant undef variable can't be written to, and any load is
@@ -97,22 +57,12 @@ bool isLDSVariableToLower(const GlobalVariable &GV) {
// dropped by the back end if not. This pass skips over it.
return false;
}
- return true;
-}
-
-std::vector<GlobalVariable *> findLDSVariablesToLower(Module &M,
- const Function *F) {
- std::vector<llvm::GlobalVariable *> LocalVars;
- for (auto &GV : M.globals()) {
- if (!isLDSVariableToLower(GV)) {
- continue;
- }
- if (!shouldLowerLDSToStruct(GV, F)) {
- continue;
- }
- LocalVars.push_back(&GV);
+ if (GV.hasInitializer() && !isa<UndefValue>(GV.getInitializer())) {
+ // Initializers are unimplemented for LDS address space.
+ // Leave such variables in place for consistent error reporting.
+ return false;
}
- return LocalVars;
+ return true;
}
bool isReallyAClobber(const Value *Ptr, MemoryDef *Def, AAResults *AA) {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
index 92373fc14a98..df37c420fa72 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUMemoryUtils.h
@@ -28,9 +28,8 @@ namespace AMDGPU {
Align getAlign(DataLayout const &DL, const GlobalVariable *GV);
+bool isDynamicLDS(const GlobalVariable &GV);
bool isLDSVariableToLower(const GlobalVariable &GV);
-std::vector<GlobalVariable *> findLDSVariablesToLower(Module &M,
- const Function *F);
/// Given a \p Def clobbering a load from \p Ptr according to the MSSA check
/// if this is actually a memory update or an artificial clobber to facilitate
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 4ad93f7b0b68..a92d574b1848 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -811,6 +811,38 @@ msgpack::MapDocNode AMDGPUPALMetadata::getShaderFunction(StringRef Name) {
return Functions[Name].getMap(/*Convert=*/true);
}
+msgpack::DocNode &AMDGPUPALMetadata::refComputeRegisters() {
+ auto &N =
+ MsgPackDoc.getRoot()
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+ .getArray(/*Convert=*/true)[0]
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".compute_registers")];
+ N.getMap(/*Convert=*/true);
+ return N;
+}
+
+msgpack::MapDocNode AMDGPUPALMetadata::getComputeRegisters() {
+ if (ComputeRegisters.isEmpty())
+ ComputeRegisters = refComputeRegisters();
+ return ComputeRegisters.getMap();
+}
+
+msgpack::DocNode &AMDGPUPALMetadata::refGraphicsRegisters() {
+ auto &N =
+ MsgPackDoc.getRoot()
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+ .getArray(/*Convert=*/true)[0]
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".graphics_registers")];
+ N.getMap(/*Convert=*/true);
+ return N;
+}
+
+msgpack::MapDocNode AMDGPUPALMetadata::getGraphicsRegisters() {
+ if (GraphicsRegisters.isEmpty())
+ GraphicsRegisters = refGraphicsRegisters();
+ return GraphicsRegisters.getMap();
+}
+
// Return the PAL metadata hardware shader stage name.
static const char *getStageName(CallingConv::ID CC) {
switch (CC) {
@@ -833,15 +865,21 @@ static const char *getStageName(CallingConv::ID CC) {
}
}
+msgpack::DocNode &AMDGPUPALMetadata::refHwStage() {
+ auto &N =
+ MsgPackDoc.getRoot()
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode("amdpal.pipelines")]
+ .getArray(/*Convert=*/true)[0]
+ .getMap(/*Convert=*/true)[MsgPackDoc.getNode(".hardware_stages")];
+ N.getMap(/*Convert=*/true);
+ return N;
+}
+
// Get (create if necessary) the .hardware_stages entry for the given calling
// convention.
msgpack::MapDocNode AMDGPUPALMetadata::getHwStage(unsigned CC) {
if (HwStages.isEmpty())
- HwStages = MsgPackDoc.getRoot()
- .getMap(/*Convert=*/true)["amdpal.pipelines"]
- .getArray(/*Convert=*/true)[0]
- .getMap(/*Convert=*/true)[".hardware_stages"]
- .getMap(/*Convert=*/true);
+ HwStages = refHwStage();
return HwStages.getMap()[getStageName(CC)].getMap(/*Convert=*/true);
}
@@ -874,3 +912,78 @@ void AMDGPUPALMetadata::reset() {
Registers = MsgPackDoc.getEmptyNode();
HwStages = MsgPackDoc.getEmptyNode();
}
+
+unsigned AMDGPUPALMetadata::getPALVersion(unsigned idx) {
+ assert(idx < 2 &&
+ "illegal index to PAL version - should be 0 (major) or 1 (minor)");
+ if (!VersionChecked) {
+ if (Version.isEmpty()) {
+ auto &M = MsgPackDoc.getRoot().getMap(/*Convert=*/true);
+ auto I = M.find(MsgPackDoc.getNode("amdpal.version"));
+ if (I != M.end())
+ Version = I->second;
+ }
+ VersionChecked = true;
+ }
+ if (Version.isEmpty())
+ // Default to 2.6 if there's no version info
+ return idx ? 6 : 2;
+ return Version.getArray()[idx].getUInt();
+}
+
+unsigned AMDGPUPALMetadata::getPALMajorVersion() { return getPALVersion(0); }
+
+unsigned AMDGPUPALMetadata::getPALMinorVersion() { return getPALVersion(1); }
+
+// Set the field in a given .hardware_stages entry
+void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, unsigned Val) {
+ getHwStage(CC)[field] = Val;
+}
+
+void AMDGPUPALMetadata::setHwStage(unsigned CC, StringRef field, bool Val) {
+ getHwStage(CC)[field] = Val;
+}
+
+void AMDGPUPALMetadata::setComputeRegisters(StringRef field, unsigned Val) {
+ getComputeRegisters()[field] = Val;
+}
+
+void AMDGPUPALMetadata::setComputeRegisters(StringRef field, bool Val) {
+ getComputeRegisters()[field] = Val;
+}
+
+msgpack::DocNode *AMDGPUPALMetadata::refComputeRegister(StringRef field) {
+ auto M = getComputeRegisters();
+ auto I = M.find(field);
+ return I == M.end() ? nullptr : &I->second;
+}
+
+bool AMDGPUPALMetadata::checkComputeRegisters(StringRef field, unsigned Val) {
+ if (auto N = refComputeRegister(field))
+ return N->getUInt() == Val;
+ return false;
+}
+
+bool AMDGPUPALMetadata::checkComputeRegisters(StringRef field, bool Val) {
+ if (auto N = refComputeRegister(field))
+ return N->getBool() == Val;
+ return false;
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field, unsigned Val) {
+ getGraphicsRegisters()[field] = Val;
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field, bool Val) {
+ getGraphicsRegisters()[field] = Val;
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field1, StringRef field2,
+ unsigned Val) {
+ getGraphicsRegisters()[field1].getMap(true)[field2] = Val;
+}
+
+void AMDGPUPALMetadata::setGraphicsRegisters(StringRef field1, StringRef field2,
+ bool Val) {
+ getGraphicsRegisters()[field1].getMap(true)[field2] = Val;
+}
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index a45a799e38a9..e477904cb81f 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -27,6 +27,11 @@ class AMDGPUPALMetadata {
msgpack::DocNode Registers;
msgpack::DocNode HwStages;
msgpack::DocNode ShaderFunctions;
+ bool VersionChecked = false;
+ msgpack::DocNode Version;
+ // From PAL version >= 3.0
+ msgpack::DocNode ComputeRegisters;
+ msgpack::DocNode GraphicsRegisters;
public:
// Read the amdgpu.pal.metadata supplied by the frontend, ready for
@@ -129,6 +134,26 @@ public:
// Set legacy PAL metadata format.
void setLegacy();
+ unsigned getPALMajorVersion();
+ unsigned getPALMinorVersion();
+
+ void setHwStage(unsigned CC, StringRef field, unsigned Val);
+ void setHwStage(unsigned CC, StringRef field, bool Val);
+
+ void setComputeRegisters(StringRef field, unsigned Val);
+ void setComputeRegisters(StringRef field, bool Val);
+
+ // If the field does not exist will return nullptr rather than creating a new
+ // entry (which is the behaviour of the other functions).
+ msgpack::DocNode *refComputeRegister(StringRef field);
+ bool checkComputeRegisters(StringRef field, unsigned Val);
+ bool checkComputeRegisters(StringRef field, bool Val);
+
+ void setGraphicsRegisters(StringRef field, unsigned Val);
+ void setGraphicsRegisters(StringRef field, bool Val);
+ void setGraphicsRegisters(StringRef field1, StringRef field2, unsigned Val);
+ void setGraphicsRegisters(StringRef field1, StringRef field2, bool Val);
+
// Erase all PAL metadata.
void reset();
@@ -151,10 +176,29 @@ private:
// Get (create if necessary) a function in the shader functions map.
msgpack::MapDocNode getShaderFunction(StringRef Name);
+ // Reference (create if necessary) the node for the compute_registers map.
+ msgpack::DocNode &refComputeRegisters();
+
+ // Get (create if necessary) the .compute_registers entry.
+ msgpack::MapDocNode getComputeRegisters();
+
+ // Reference (create if necessary) the node for the graphics registers map.
+ msgpack::DocNode &refGraphicsRegisters();
+
+ // Get (create if necessary) the .graphics_registers entry.
+ msgpack::MapDocNode getGraphicsRegisters();
+
+ // Reference (create if necessary) the node for the hardware_stages map.
+ msgpack::DocNode &refHwStage();
+
// Get (create if necessary) the .hardware_stages entry for the given calling
// convention.
msgpack::MapDocNode getHwStage(unsigned CC);
+ // Get the PAL version major (idx 0) or minor (idx 1). This is an internal
+ // helper for the public wrapper functions that request Major or Minor
+ unsigned getPALVersion(unsigned idx);
+
bool setFromLegacyBlob(StringRef Blob);
bool setFromMsgPackBlob(StringRef Blob);
void toLegacyBlob(std::string &Blob);
diff --git a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
index 71de20223e9f..7d03150bf5b1 100644
--- a/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VINTERPInstructions.td
@@ -23,7 +23,6 @@ class VINTERPe_gfx11 <bits<7> op, VOPProfile P> : Enc64 {
let Inst{31-26} = 0x33; // VOP3P encoding
let Inst{25-24} = 0x1; // VINTERP sub-encoding
- let Inst{23} = 0; // reserved
let Inst{7-0} = vdst;
let Inst{10-8} = waitexp;
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 285499ad6984..1a8efc6e3df2 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -120,28 +120,28 @@ multiclass VOP1Inst <string opName, VOPProfile P,
def _e32 : VOP1_Pseudo <opName, P>;
else
// Only for V_MOV_B32
- def _e32 : VOP1_Pseudo <opName, P>, VOPD_Component<VOPDOp, "v_mov_b32">;
+ def _e32 : VOP1_Pseudo <opName, P>, VOPD_Component<VOPDOp, opName>;
def _e64 : VOP3InstBase <opName, P, node>;
}
- foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ if P.HasExtSDWA then
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
- foreach _ = BoolToList<P.HasExtDPP>.ret in
+ if P.HasExtDPP then
def _dpp : VOP1_DPP_Pseudo <opName, P>;
let SubtargetPredicate = isGFX11Plus in {
- foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in
+ if P.HasExtVOP3DPP then
def _e64_dpp : VOP3_DPP_Pseudo <opName, P>;
} // End SubtargetPredicate = isGFX11Plus
def : MnemonicAlias<opName#"_e32", opName>, LetDummies;
def : MnemonicAlias<opName#"_e64", opName>, LetDummies;
- foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ if P.HasExtSDWA then
def : MnemonicAlias<opName#"_sdwa", opName>, LetDummies;
- foreach _ = BoolToList<P.HasExtDPP>.ret in
+ if P.HasExtDPP then
def : MnemonicAlias<opName#"_dpp", opName, AMDGPUAsmVariants.DPP>, LetDummies;
}
@@ -229,9 +229,9 @@ defm V_MOV_B64 : VOP1Inst <"v_mov_b64", VOP_I64_I64>;
// TODO: Make profile for this, there is VOP3 encoding also
def V_READFIRSTLANE_B32 :
InstSI <(outs SReg_32:$vdst),
- (ins VRegOrLds_32:$src0),
+ (ins VRegOrLdsSrc_32:$src0),
"v_readfirstlane_b32 $vdst, $src0",
- [(set i32:$vdst, (int_amdgcn_readfirstlane (i32 VRegOrLds_32:$src0)))]>,
+ [(set i32:$vdst, (int_amdgcn_readfirstlane (i32 VRegOrLdsSrc_32:$src0)))]>,
Enc32 {
let isCodeGenOnly = 0;
@@ -266,7 +266,7 @@ defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
}
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
-defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
+defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, any_fpextend>;
// OMod clears exceptions when set in this instruction
defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64_SPECIAL_OMOD, fp_to_uint>;
@@ -290,15 +290,15 @@ defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_
defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32_SPECIAL_OMOD, fp_to_sint>;
let FPDPRounding = 1, isReMaterializable = 0 in {
let OtherPredicates = [NotHasTrue16BitInsts] in
- defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+ defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, any_fpround>;
let OtherPredicates = [HasTrue16BitInsts] in
- defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_True16<VOP_F16_F32>, fpround>;
+ defm V_CVT_F16_F32_t16 : VOP1Inst <"v_cvt_f16_f32_t16", VOPProfile_True16<VOP_F16_F32>, any_fpround>;
} // End FPDPRounding = 1, isReMaterializable = 0
let OtherPredicates = [NotHasTrue16BitInsts] in
-defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
+defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, any_fpextend>;
let OtherPredicates = [HasTrue16BitInsts] in
-defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_True16<VOP_F32_F16>, fpextend>;
+defm V_CVT_F32_F16_t16 : VOP1Inst <"v_cvt_f32_f16_t16", VOPProfile_True16<VOP_F32_F16>, any_fpextend>;
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
@@ -321,8 +321,8 @@ defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
let TRANS = 1, SchedRW = [WriteTrans32] in {
-defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
-defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
+defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, AMDGPUexp>;
+defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, AMDGPUlog>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
@@ -332,7 +332,7 @@ defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>;
let TRANS = 1, SchedRW = [WriteTrans64] in {
defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
-defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>;
+defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, int_amdgcn_sqrt>;
} // End TRANS = 1, SchedRW = [WriteTrans64]
let TRANS = 1, SchedRW = [WriteTrans32] in {
@@ -487,8 +487,8 @@ let TRANS = 1, SchedRW = [WriteTrans32] in {
defm V_RCP_F16 : VOP1Inst_t16 <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
defm V_SQRT_F16 : VOP1Inst_t16 <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
defm V_RSQ_F16 : VOP1Inst_t16 <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
-defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, flog2>;
-defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, fexp2>;
+defm V_LOG_F16 : VOP1Inst_t16 <"v_log_f16", VOP_F16_F16, AMDGPUlogf16>;
+defm V_EXP_F16 : VOP1Inst_t16 <"v_exp_f16", VOP_F16_F16, AMDGPUexpf16>;
defm V_SIN_F16 : VOP1Inst_t16 <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst_t16 <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
} // End TRANS = 1, SchedRW = [WriteTrans32]
@@ -528,13 +528,10 @@ def : GCNPat<
>;
}
-def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> {
- let Outs32 = (outs VGPR_32:$vdst, VGPR_32:$vdst1);
- let Ins32 = (ins VGPR_32:$src0, VGPR_32:$src1);
- let Outs64 = Outs32;
+def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> {
+ let Outs32 = (outs VGPR_32:$vdst, VRegSrc_32:$vdst1);
+ let Ins32 = (ins VRegSrc_32:$src0, VGPR_32:$src1);
let Asm32 = " $vdst, $src0";
- let Asm64 = "";
- let Ins64 = (ins);
}
let SubtargetPredicate = isGFX9Plus in {
@@ -633,7 +630,7 @@ let SubtargetPredicate = isGFX10Plus in {
def VOPProfileAccMov : VOP_NO_EXT<VOP_I32_I32> {
let DstRC = RegisterOperand<AGPR_32>;
- let Src0RC32 = RegisterOperand<AGPR_32>;
+ let Src0RC32 = ARegSrc_32;
let Asm32 = " $vdst, $src0";
}
@@ -847,7 +844,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP1_Real_sdwa_gfx10<bits<9> op> {
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae<op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
@@ -855,13 +852,13 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP1_Real_dpp_gfx10<bits<9> op> {
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then
def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX10> {
let DecoderNamespace = "SDWA10";
}
}
multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then
def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
let DecoderNamespace = "DPP8";
}
@@ -1067,17 +1064,17 @@ multiclass VOP1_Real_e32e64_vi <bits<10> op> {
multiclass VOP1_Real_vi <bits<10> op> {
defm NAME : VOP1_Real_e32e64_vi <op>;
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA then
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
def _dpp_vi :
VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
@@ -1241,12 +1238,12 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
defm NAME : VOP1_Real_e32e64_vi <op>;
}
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
@@ -1258,14 +1255,14 @@ multiclass VOP1_Real_NoDstSel_SDWA_gfx9 <bits<10> op> {
defm NAME : VOP1_Real_e32e64_vi <op>;
}
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
let Inst{42-40} = 6;
}
- foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP1_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
VOP1_DPPe<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")>;
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index a1f99ca3aefa..481a162748e6 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -157,7 +157,7 @@ multiclass VOP2Inst_e64<string opName,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
let SubtargetPredicate = isGFX11Plus in {
- foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in
+ if P.HasExtVOP3DPP then
def _e64_dpp : VOP3_DPP_Pseudo <opName, P>;
} // End SubtargetPredicate = isGFX11Plus
} // End renamedInGFX9 = GFX9Renamed
@@ -167,7 +167,7 @@ multiclass VOP2Inst_sdwa<string opName,
VOPProfile P,
bit GFX9Renamed = 0> {
let renamedInGFX9 = GFX9Renamed in {
- foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ if P.HasExtSDWA then
def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
} // End renamedInGFX9 = GFX9Renamed
}
@@ -181,7 +181,7 @@ multiclass VOP2Inst<string opName,
VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
VOP2Inst_sdwa<opName, P, GFX9Renamed> {
let renamedInGFX9 = GFX9Renamed in {
- foreach _ = BoolToList<P.HasExtDPP>.ret in
+ if P.HasExtDPP then
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
}
@@ -227,7 +227,7 @@ multiclass VOP2Inst_VOPD<string opName,
VOP2Inst_e64<opName, P, node, revOp, GFX9Renamed>,
VOP2Inst_sdwa<opName, P, GFX9Renamed> {
let renamedInGFX9 = GFX9Renamed in {
- foreach _ = BoolToList<P.HasExtDPP>.ret in
+ if P.HasExtDPP then
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
}
@@ -246,11 +246,11 @@ multiclass VOP2bInst <string opName,
let usesCustomInserter = true;
}
- foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ if P.HasExtSDWA then
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
let AsmMatchConverter = "cvtSdwaVOP2b";
}
- foreach _ = BoolToList<P.HasExtDPP>.ret in
+ if P.HasExtDPP then
def _dpp : VOP2_DPP_Pseudo <opName, P>;
} // End Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC]
@@ -258,7 +258,7 @@ multiclass VOP2bInst <string opName,
Commutable_REV<revOp#"_e64", !eq(revOp, opName)>;
let SubtargetPredicate = isGFX11Plus in {
- foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in
+ if P.HasExtVOP3DPP then
def _e64_dpp : VOP3_DPP_Pseudo <opName, P>;
} // End SubtargetPredicate = isGFX11Plus
}
@@ -297,12 +297,12 @@ multiclass
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>,
VOPD_Component<VOPDOp, VOPDName>;
- foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ if P.HasExtSDWA then
def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
let AsmMatchConverter = "cvtSdwaVOP2e";
}
- foreach _ = BoolToList<P.HasExtDPP>.ret in
+ if P.HasExtDPP then
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
@@ -312,7 +312,7 @@ multiclass
}
let SubtargetPredicate = isGFX11Plus in {
- foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in
+ if P.HasExtVOP3DPP then
def _e64_dpp : VOP3_DPP_Pseudo <opName, P>;
} // End SubtargetPredicate = isGFX11Plus
}
@@ -357,7 +357,7 @@ class VOP_MADK_Base<ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
}
class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> {
- field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
+ field Operand ImmOpType = !if(!eq(vt.Size, 32), KImmFP32, KImmFP16);
field dag Ins32 = !if(!eq(vt.Size, 32),
(ins VSrc_f32_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm),
(ins VSrc_f16_Deferred:$src0, VGPR_32:$src1, ImmOpType:$imm));
@@ -383,7 +383,7 @@ def VOP_MADAK_F16_t16 : VOP_MADAK <f16> {
def VOP_MADAK_F32 : VOP_MADAK <f32>;
class VOP_MADMK <ValueType vt> : VOP_MADK_Base<vt> {
- field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
+ field Operand ImmOpType = !if(!eq(vt.Size, 32), KImmFP32, KImmFP16);
field dag Ins32 = !if(!eq(vt.Size, 32),
(ins VSrc_f32_Deferred:$src0, ImmOpType:$imm, VGPR_32:$src1),
(ins VSrc_f16_Deferred:$src0, ImmOpType:$imm, VGPR_32:$src1));
@@ -660,7 +660,7 @@ def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>;
def VOP_READLANE : VOPProfile<[i32, i32, i32, untyped]> {
let Outs32 = (outs SReg_32:$vdst);
let Outs64 = Outs32;
- let Ins32 = (ins VRegOrLds_32:$src0, SCSrc_b32:$src1);
+ let Ins32 = (ins VRegOrLdsSrc_32:$src0, SCSrc_b32:$src1);
let Ins64 = Ins32;
let Asm32 = " $vdst, $src0, $src1";
let Asm64 = Asm32;
@@ -764,19 +764,20 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag,
let isConvergent = 1, Uses = []<Register> in {
def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE,
[(set i32:$vdst, (int_amdgcn_readlane i32:$src0, i32:$src1))]>;
-
-let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
+let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
[(set i32:$vdst, (int_amdgcn_writelane i32:$src0, i32:$src1, i32:$vdst_in))]>;
-} // End $vdst = $vdst_in, DisableEncoding $vdst_in
+} // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in
} // End isConvergent = 1
let isReMaterializable = 1 in {
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_I32_I32_I32>;
defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_I32_I32_I32, add_ctpop>;
+let IsNeverUniform = 1 in {
defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_lo>;
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_I32_I32_I32, int_amdgcn_mbcnt_hi>;
-defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
+} // End IsNeverUniform = 1
+defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, any_fldexp>;
let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_V2I16_F32_F32, AMDGPUpknorm_i16_f32>;
@@ -862,9 +863,18 @@ def : divergent_i64_BinOp <xor, V_XOR_B32_e64>;
// 16-Bit Operand Instructions
//===----------------------------------------------------------------------===//
-def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_I32> {
- // The ldexp.f16 intrinsic expects a i32 src1 operand, though the hardware
- // encoding treats src1 as an f16
+// The ldexp.f16 intrinsic expects a integer src1 operand, though the hardware
+// encoding treats src1 as an f16
+def LDEXP_F16_VOPProfile : VOPProfile <[f16, f16, f16, untyped]> {
+ let Src1Mod = Int32InputMods;
+ let Src1ModDPP = IntVRegInputMods;
+ let Src1ModVOP3DPP = IntVRegInputMods;
+ // SDWA sext is the only modifier allowed.
+ let HasSrc1IntMods = 1;
+ let HasSrc1FloatMods = 0;
+ let Src1ModSDWA = Int16SDWAInputMods;
+}
+def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_F16> {
let Src1RC32 = RegisterOperand<VGPR_32_Lo128>;
let Src1DPP = VGPR_32_Lo128;
let Src1ModDPP = IntT16VRegInputMods;
@@ -873,9 +883,9 @@ def LDEXP_F16_VOPProfile_True16 : VOPProfile_True16<VOP_F16_F16_I32> {
let isReMaterializable = 1 in {
let FPDPRounding = 1 in {
let SubtargetPredicate = NotHasTrue16BitInsts, OtherPredicates = [Has16BitInsts] in
- defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
+ defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", LDEXP_F16_VOPProfile>;
let SubtargetPredicate = HasTrue16BitInsts in
- defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16, AMDGPUldexp>;
+ defm V_LDEXP_F16_t16 : VOP2Inst <"v_ldexp_f16_t16", LDEXP_F16_VOPProfile_True16>;
} // End FPDPRounding = 1
// FIXME VOP3 Only instructions. NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instuctions
defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>;
@@ -898,6 +908,21 @@ defm V_MIN_I16 : VOP2Inst_e64_t16 <"v_min_i16", VOP_I16_I16_I16, smin>;
} // End isCommutable = 1
} // End isReMaterializable = 1
+class LDEXP_F16_Pat <SDPatternOperator op, VOP_Pseudo inst, VOPProfile P = inst.Pfl> : GCNPat <
+ (P.DstVT (op (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (i16 (VOP3Mods0 P.Src1VT:$src1, i32:$src1_modifiers)))),
+ (inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ $clamp, /* clamp */
+ $omod /* omod */)
+>;
+
+let OtherPredicates = [NotHasTrue16BitInsts] in
+def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_e64>;
+
+let OtherPredicates = [HasTrue16BitInsts] in
+def : LDEXP_F16_Pat<any_fldexp, V_LDEXP_F16_t16_e64>;
+
let SubtargetPredicate = isGFX11Plus in {
let isCommutable = 1 in {
defm V_AND_B16_t16 : VOP2Inst_e64 <"v_and_b16_t16", VOPProfile_True16<VOP_I16_I16_I16>, and>;
@@ -1266,13 +1291,13 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
VOP3e_gfx11<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP2_Real_dpp_gfx11<bits<6> op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX11> {
let DecoderNamespace = "DPPGFX11";
}
}
multiclass VOP2_Real_dpp8_gfx11<bits<6> op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
def _dpp8_gfx11 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
let DecoderNamespace = "DPP8GFX11";
}
@@ -1302,7 +1327,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
multiclass VOP2_Real_dpp_with_name_gfx11<bits<6> op, string opName,
string asmName> {
defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
- foreach _ = BoolToList<ps.Pfl.HasExtDPP>.ret in
+ if ps.Pfl.HasExtDPP then
def _dpp_gfx11 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"),
SIEncodingFamily.GFX11> {
let AsmString = asmName # ps.Pfl.AsmDPP16;
@@ -1312,7 +1337,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
multiclass VOP2_Real_dpp8_with_name_gfx11<bits<6> op, string opName,
string asmName> {
defvar ps = !cast<VOP2_Pseudo>(opName#"_e32");
- foreach _ = BoolToList<ps.Pfl.HasExtDPP>.ret in
+ if ps.Pfl.HasExtDPP then
def _dpp8_gfx11 : VOP2_DPP8<op, ps> {
let AsmString = asmName # ps.Pfl.AsmDPP8;
let DecoderNamespace = "DPP8GFX11";
@@ -1329,14 +1354,14 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
}
}
multiclass VOP2be_Real_dpp_gfx11<bits<6> op, string opName, string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
def _dpp_gfx11 :
VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX11, asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
let AsmString = asmName # !subst(", vcc", "", AsmDPP);
let DecoderNamespace = "DPPGFX11";
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
def _dpp_w32_gfx11 :
Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
@@ -1344,7 +1369,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
def _dpp_w64_gfx11 :
Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
@@ -1354,14 +1379,14 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
}
}
multiclass VOP2be_Real_dpp8_gfx11<bits<6> op, string opName, string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
def _dpp8_gfx11 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
let DecoderNamespace = "DPP8GFX11";
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
def _dpp8_w32_gfx11 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
@@ -1369,7 +1394,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP then
def _dpp8_w64_gfx11 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
@@ -1477,19 +1502,19 @@ defm V_FMAMK_F16_t16 : VOP2Only_Real_MADK_gfx11_with_name<0x037, "v_fmamk_
defm V_FMAAK_F16_t16 : VOP2Only_Real_MADK_gfx11_with_name<0x038, "v_fmaak_f16">;
// VOP3 only.
-defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11<0x25d>;
-defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11<0x31c>;
-defm V_BFM_B32 : VOP3Only_Realtriple_gfx11<0x31d>;
-defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11<0x31e>;
-defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11<0x31f>;
-defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11<0x320>;
-defm V_CVT_PKNORM_I16_F32 : VOP3Only_Realtriple_gfx11<0x321>;
-defm V_CVT_PKNORM_U16_F32 : VOP3Only_Realtriple_gfx11<0x322>;
-defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11<0x323>;
-defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11<0x324>;
-defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x300>;
-defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x301>;
-defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x302>;
+defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11<0x25d>;
+defm V_LDEXP_F32 : VOP3Only_Realtriple_gfx11<0x31c>;
+defm V_BFM_B32 : VOP3Only_Realtriple_gfx11<0x31d>;
+defm V_BCNT_U32_B32 : VOP3Only_Realtriple_gfx11<0x31e>;
+defm V_MBCNT_LO_U32_B32 : VOP3Only_Realtriple_gfx11<0x31f>;
+defm V_MBCNT_HI_U32_B32 : VOP3Only_Realtriple_gfx11<0x320>;
+defm V_CVT_PK_NORM_I16_F32 : VOP3Only_Realtriple_with_name_gfx11<0x321, "V_CVT_PKNORM_I16_F32", "v_cvt_pk_norm_i16_f32">;
+defm V_CVT_PK_NORM_U16_F32 : VOP3Only_Realtriple_with_name_gfx11<0x322, "V_CVT_PKNORM_U16_F32", "v_cvt_pk_norm_u16_f32">;
+defm V_CVT_PK_U16_U32 : VOP3Only_Realtriple_gfx11<0x323>;
+defm V_CVT_PK_I16_I32 : VOP3Only_Realtriple_gfx11<0x324>;
+defm V_ADD_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x300>;
+defm V_SUB_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x301>;
+defm V_SUBREV_CO_U32 : VOP3beOnly_Realtriple_gfx11<0x302>;
let SubtargetPredicate = isGFX11Plus in {
defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx11>;
@@ -1533,7 +1558,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP2_Real_sdwa_gfx10<bits<6> op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
@@ -1541,13 +1566,13 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP2_Real_dpp_gfx10<bits<6> op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then
def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX10> {
let DecoderNamespace = "SDWA10";
}
}
multiclass VOP2_Real_dpp8_gfx10<bits<6> op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExt32BitDPP then
def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
let DecoderNamespace = "DPP8";
}
@@ -1576,7 +1601,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
let DecoderNamespace = "SDWA10" in {
multiclass VOP2_Real_sdwa_gfx10_with_name<bits<6> op, string opName,
string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
@@ -1586,7 +1611,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName,
string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX10> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP16;
@@ -1594,7 +1619,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName,
string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP8;
@@ -1622,7 +1647,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP2be_Real_sdwa_gfx10<bits<6> op, string opName, string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
@@ -1630,7 +1655,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands);
let DecoderNamespace = "SDWA10";
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_w32_gfx10 :
Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
@@ -1640,7 +1665,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
let DecoderNamespace = "SDWA10";
let WaveSizePredicate = isWave32;
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_w64_gfx10 :
Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
@@ -1652,14 +1677,14 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP2be_Real_dpp_gfx10<bits<6> op, string opName, string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
def _dpp_gfx10 :
VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), SIEncodingFamily.GFX10, asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
let AsmString = asmName # !subst(", vcc", "", AsmDPP);
let DecoderNamespace = "SDWA10";
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
def _dpp_w32_gfx10 :
Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
@@ -1667,7 +1692,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
def _dpp_w64_gfx10 :
Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
@@ -1677,14 +1702,14 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP2be_Real_dpp8_gfx10<bits<6> op, string opName, string asmName> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
def _dpp8_gfx10 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
let DecoderNamespace = "DPP8";
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
def _dpp8_w32_gfx10 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
@@ -1692,7 +1717,7 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in {
let isAsmParserOnly = 1;
let WaveSizePredicate = isWave32;
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP>.ret in
+ if !cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExt32BitDPP then
def _dpp8_w64_gfx10 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
@@ -2014,14 +2039,14 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
} // End AssemblerPredicate = isGFX8GFX9, DecoderNamespace = "GFX8"
multiclass VOP2_SDWA_Real <bits<6> op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA then
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
multiclass VOP2_SDWA9_Real <bits<6> op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -2044,14 +2069,14 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
let AsmString = AsmName # ps.AsmOperands;
let DecoderNamespace = "GFX8";
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA>.ret in
+ if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA then
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
let AsmString = AsmName # ps.AsmOperands;
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP then
def _dpp_vi :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>,
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
@@ -2078,14 +2103,14 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
let AsmString = AsmName # ps.AsmOperands;
let DecoderNamespace = "GFX9";
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
let AsmString = AsmName # ps.AsmOperands;
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP then
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>,
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(OpName#"_dpp")> {
@@ -2106,12 +2131,12 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
let DecoderNamespace = "GFX9";
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
}
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
@@ -2124,7 +2149,7 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
Base_VOP2_Real_e32e64_vi<op>, VOP2_SDWA_Real<op>, VOP2_SDWA9_Real<op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
def _dpp_vi :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.VI>,
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
@@ -2271,7 +2296,7 @@ let AssemblerPredicate = isGFX90APlus, DecoderNamespace = "GFX90A" in {
multiclass VOP2_Real_e32e64_gfx90a <bits<6> op> :
Base_VOP2_Real_e32e64_gfx90a<op> {
- foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
def _dpp_gfx90a :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX90A>,
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 848d1ad1f6c7..c0e0ac1b4ec8 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -61,7 +61,7 @@ class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
let Src0Mod = FPVRegInputMods;
let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
- Attr:$attr, AttrChan:$attrchan,
+ InterpAttr:$attr, InterpAttrChan:$attrchan,
clampmod0:$clamp, omod0:$omod);
let Asm64 = "$vdst, $src0_modifiers, $attr$attrchan$clamp$omod";
@@ -69,7 +69,7 @@ def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> {
let Ins64 = (ins InterpSlot:$src0,
- Attr:$attr, AttrChan:$attrchan,
+ InterpAttr:$attr, InterpAttrChan:$attrchan,
clampmod0:$clamp, omod0:$omod);
let Asm64 = "$vdst, $src0, $attr$attrchan$clamp$omod";
@@ -90,16 +90,16 @@ class getInterp16Ins <bit HasSrc2, bit HasOMod,
dag ret = !if(HasSrc2,
!if(HasOMod,
(ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
- Attr:$attr, AttrChan:$attrchan,
+ InterpAttr:$attr, InterpAttrChan:$attrchan,
Src2Mod:$src2_modifiers, VRegSrc_32:$src2,
highmod:$high, clampmod0:$clamp, omod0:$omod),
(ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
- Attr:$attr, AttrChan:$attrchan,
+ InterpAttr:$attr, InterpAttrChan:$attrchan,
Src2Mod:$src2_modifiers, VRegSrc_32:$src2,
highmod:$high, clampmod0:$clamp)
),
(ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
- Attr:$attr, AttrChan:$attrchan,
+ InterpAttr:$attr, InterpAttrChan:$attrchan,
highmod:$high, clampmod0:$clamp, omod0:$omod)
);
}
@@ -219,7 +219,7 @@ defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdi
let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, AMDGPUdiv_fixup>;
- defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp>;
+ defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, any_fldexp>;
} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
} // End isReMaterializable = 1
@@ -263,7 +263,7 @@ let SchedRW = [Write64Bit] in {
def : GCNPat<
(i32 (DivergentUnaryFrag<sext> i16:$src)),
- (i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+ (i32 (V_BFE_I32_e64 i16:$src, (i32 0), (i32 0x10)))
>;
let isReMaterializable = 1 in {
@@ -308,11 +308,11 @@ let FPDPRounding = 1 in {
defm V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma>;
} // End Predicates = [Has16BitInsts, isGFX8Only]
- let renamedInGFX9 = 1, Predicates = [Has16BitInsts, isGFX9Plus] in {
+ let renamedInGFX9 = 1, SubtargetPredicate = isGFX9Plus in {
defm V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, AMDGPUdiv_fixup>;
defm V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma>;
- } // End renamedInGFX9 = 1, Predicates = [Has16BitInsts, isGFX9Plus]
+ } // End renamedInGFX9 = 1, SubtargetPredicate = isGFX9Plus
} // End FPDPRounding = 1
let SubtargetPredicate = Has16BitInsts, isCommutable = 1 in {
@@ -381,36 +381,43 @@ def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus]
-let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
+// Note: 16-bit instructions produce a 0 result in the high 16-bits
+// on GFX8 and GFX9 and preserve high 16 bits on GFX10+
+multiclass Arithmetic_i16_0Hi_TernaryPats <SDPatternOperator op, Instruction inst> {
+ def : GCNPat<
+ (i32 (zext (op i16:$src0, i16:$src1, i16:$src2))),
+ (inst VSrc_b16:$src0, VSrc_b16:$src1, VSrc_b16:$src2)
+ >;
+}
-multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst> {
-def : GCNPat <
- (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
- (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
->;
+let Predicates = [Has16BitInsts, isGFX8GFX9] in {
+defm : Arithmetic_i16_0Hi_TernaryPats<imad, V_MAD_U16_e64>;
+}
+
+let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
+// FIXME: Should be able to just pass imad to the instruction
+// definition pattern, but the implied clamp input interferes.
+multiclass Ternary_i16_Pats <SDPatternOperator op, Instruction inst> {
+ def : GCNPat <
+ (op i16:$src0, i16:$src1, i16:$src2),
+ (inst i16:$src0, i16:$src1, i16:$src2, (i1 0))
+ >;
}
-defm: Ternary_i16_Pats<mul, add, V_MAD_U16_e64>;
-defm: Ternary_i16_Pats<mul, add, V_MAD_I16_e64>;
+defm: Ternary_i16_Pats<imad, V_MAD_U16_e64>;
} // End Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9]
-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-multiclass Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
- Instruction inst> {
-def : GCNPat <
+class Ternary_i16_Pats_gfx9<SDPatternOperator op1, SDPatternOperator op2,
+ Instruction inst> : GCNPat <
(op2 (op1 i16:$src0, i16:$src1), i16:$src2),
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE)
>;
-}
-
-defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
-defm: Ternary_i16_Pats_gfx9<mul, add, V_MAD_I16_gfx9_e64>;
-
+let Predicates = [Has16BitInsts, isGFX10Plus] in {
+def: Ternary_i16_Pats_gfx9<mul, add, V_MAD_U16_gfx9_e64>;
} // End Predicates = [Has16BitInsts, isGFX10Plus]
class ThreeOpFragSDAG<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
@@ -673,11 +680,19 @@ def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3
let HasExtDPP = 0;
}
+def opsel_i1timm : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(
+ N->getZExtValue() ? SISrcMods::OP_SEL_0 : SISrcMods::NONE,
+ SDLoc(N), MVT::i32);
+}]>;
+def gi_opsel_i1timm : GICustomOperandRenderer<"renderOpSelTImm">,
+ GISDNodeXFormEquiv<opsel_i1timm>;
+
class PermlanePat<SDPatternOperator permlane,
Instruction inst> : GCNPat<
(permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2,
timm:$fi, timm:$bc),
- (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc),
+ (inst (opsel_i1timm $fi), VGPR_32:$src0, (opsel_i1timm $bc),
SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
>;
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index da12515c817b..71e09611e74e 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -35,7 +35,7 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
dag dpp_srcs =
(ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0,
- FPVRegInputMods:$src1_modifiers, VGPRSrc_32:$src1,
+ FPVRegInputMods:$src1_modifiers, VRegSrc_32:$src1,
FP16InputMods:$src2_modifiers, VCSrc_f16:$src2);
// FIXME: clampmod0 misbehaves with the non-default vdst_in
@@ -142,9 +142,34 @@ def : VOP3PSatPat<usubsat, V_PK_SUB_U16>;
def : VOP3PSatPat<ssubsat, V_PK_SUB_I16>;
} // End SubtargetPredicate = HasVOP3PInsts
+// TODO: Make sure we're doing the right thing with denormals. Note
+// that FMA and MAD will differ.
multiclass MadFmaMixPats<SDPatternOperator fma_like,
+ Instruction mix_inst,
Instruction mixlo_inst,
Instruction mixhi_inst> {
+ // At least one of the operands needs to be an fpextend of an f16
+ // for this to be worthwhile, so we need three patterns here.
+ // TODO: Could we use a predicate to inspect src1/2/3 instead?
+ def : GCNPat <
+ (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_mods)))),
+ (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+ DSTCLAMP.NONE)>;
+ def : GCNPat <
+ (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixModsExt f16:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixMods f32:$src2, i32:$src2_mods)))),
+ (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+ DSTCLAMP.NONE)>;
+ def : GCNPat <
+ (f32 (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_mods)),
+ (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_mods)),
+ (f32 (VOP3PMadMixModsExt f16:$src2, i32:$src2_mods)))),
+ (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
+ DSTCLAMP.NONE)>;
+
def : GCNPat <
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
@@ -201,9 +226,29 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
DSTCLAMP.ENABLE,
(i32 (IMPLICIT_DEF)))))
>;
+
+ def : GCNPat <
+ (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+ (mixlo_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ (i32 0), (i32 0),
+ DSTCLAMP.NONE,
+ (i32 (IMPLICIT_DEF)))
+ >;
+
+ def : GCNPat <
+ (build_vector f16:$elt0, (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
+ (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
+ $src1_modifiers, $src1,
+ (i32 0), (i32 0),
+ DSTCLAMP.NONE,
+ VGPR_32:$elt0))
+ >;
}
-let SubtargetPredicate = HasMadMixInsts in {
+let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in {
// These are VOP3a-like opcodes which accept no omod.
// Size of src arguments (16/32) is controlled by op_sel.
@@ -222,8 +267,8 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
} // End FPDPRounding = 1
}
-defm : MadFmaMixPats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
-} // End SubtargetPredicate = HasMadMixInsts
+defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
+} // End SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals]
// Essentially the same as the mad_mix versions
@@ -243,7 +288,7 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
} // End FPDPRounding = 1
}
-defm : MadFmaMixPats<fma, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
+defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
}
// Defines patterns that extract signed 4bit from each Idx[0].
@@ -337,11 +382,12 @@ defm V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
} // End SubtargetPredicate = HasDot2Insts
-let SubtargetPredicate = HasDot7Insts in {
-
+let SubtargetPredicate = HasDot10Insts in
defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
VOP3P_Profile<VOP_F32_V2F16_V2F16_F32, VOP3_REGULAR, /*HasDPP*/ 1>,
AMDGPUfdot2, 1/*ExplicitClamp*/>;
+
+let SubtargetPredicate = HasDot7Insts in {
defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
VOP3P_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
@@ -581,7 +627,7 @@ multiclass MAIInst<string OpName, string P, SDPatternOperator node,
MFMATable<0, NAME # "_vgprcd_e64">;
}
- foreach _ = BoolToList<NoDstOverlap>.ret in {
+ if NoDstOverlap then {
let Constraints = !if(NoDstOverlap, "$vdst = $src2", ""),
isConvertibleToThreeAddress = NoDstOverlap,
Mnemonic = OpName in {
@@ -989,7 +1035,7 @@ multiclass VOP3P_Real_MFMA_gfx940_aliases<string NameFrom, string NameTo, string
VOPProfile Pfl_ACD = PS_ACD.Pfl,
VOPProfile Pfl_VCD = PS_VCD.Pfl> {
let Predicates = [isGFX940Plus] in {
- foreach _ = BoolToList<!ne(NameFrom, NameTo)>.ret in {
+ if !ne(NameFrom, NameTo) then {
def : InstAlias <NameTo # " " # PS_ACD.AsmOperands,
(!cast<VOP3P_Real>(Op # "_gfx940_acd") Pfl_ACD.DstRC:$vdst,
Pfl_ACD.Src0RC64:$src0, Pfl_ACD.Src1RC64:$src1, Pfl_ACD.Src2RC64:$src2,
@@ -1017,7 +1063,7 @@ multiclass VOP3P_Real_MFMA_gfx940<bits<7> op, string Name = !cast<VOP3_Pseudo>(N
defm : VOP3P_Real_MFMA_gfx940_aliases<Name, PS_ACD.Mnemonic, NAME>;
- foreach _ = BoolToList<!ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic)>.ret in
+ if !ne(!subst("_1k", "", PS_ACD.Mnemonic), PS_ACD.Mnemonic) then
defm : VOP3P_Real_MFMA_gfx940_aliases<Name, !subst("_1k", "", PS_ACD.Mnemonic), NAME>;
}
@@ -1081,28 +1127,16 @@ defm V_FMA_MIXHI_F16 : VOP3P_Real_vi <0x22>;
}
-let SubtargetPredicate = HasDot2Insts in {
-
defm V_DOT2_I32_I16 : VOP3P_Real_vi <0x26>;
defm V_DOT2_U32_U16 : VOP3P_Real_vi <0x27>;
-} // End SubtargetPredicate = HasDot2Insts
-
-let SubtargetPredicate = HasDot7Insts in {
-
defm V_DOT2_F32_F16 : VOP3P_Real_vi <0x23>;
defm V_DOT4_U32_U8 : VOP3P_Real_vi <0x29>;
defm V_DOT8_U32_U4 : VOP3P_Real_vi <0x2b>;
-} // End SubtargetPredicate = HasDot7Insts
-
-let SubtargetPredicate = HasDot1Insts in {
-
defm V_DOT4_I32_I8 : VOP3P_Real_vi <0x28>;
defm V_DOT8_I32_I4 : VOP3P_Real_vi <0x2a>;
-} // End SubtargetPredicate = HasDot1Insts
-
let SubtargetPredicate = HasMAIInsts in {
defm V_ACCVGPR_READ_B32 : VOP3P_Real_MAI <0x58>;
@@ -1225,24 +1259,12 @@ defm V_FMA_MIX_F32 : VOP3P_Real_gfx10_gfx11_Triple <0x20>;
defm V_FMA_MIXLO_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x21>;
defm V_FMA_MIXHI_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x22>;
-let SubtargetPredicate = HasDot2Insts in {
-
defm V_DOT2_I32_I16 : VOP3P_Real_gfx10 <0x14>;
defm V_DOT2_U32_U16 : VOP3P_Real_gfx10 <0x15>;
-} // End SubtargetPredicate = HasDot2Insts
-
-let SubtargetPredicate = HasDot7Insts in {
-
defm V_DOT2_F32_F16 : VOP3P_Real_gfx10_gfx11_Triple <0x13>;
defm V_DOT4_U32_U8 : VOP3P_Real_gfx10_gfx11 <0x17>;
defm V_DOT8_U32_U4 : VOP3P_Real_gfx10_gfx11 <0x19>;
-} // End SubtargetPredicate = HasDot7Insts
-
-let SubtargetPredicate = HasDot1Insts in {
-
defm V_DOT4_I32_I8 : VOP3P_Real_gfx10 <0x16>;
defm V_DOT8_I32_I4 : VOP3P_Real_gfx10 <0x18>;
-
-} // End SubtargetPredicate = HasDot1Insts
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 439ca40ae3fb..6fc3d0957dce 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -299,7 +299,7 @@ multiclass VOPC_Pseudos <string opName,
let isCommutable = 1;
}
- foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ if P.HasExtSDWA then
def _sdwa : VOPC_SDWA_Pseudo <opName, P> {
let Defs = !if(DefExec, [EXEC], []);
let SchedRW = P.Schedule;
@@ -360,7 +360,7 @@ multiclass VOPCX_Pseudos <string opName,
let IsVCMPX = 1;
}
- foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
+ if P_NoSDst.HasExtSDWA then
def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
@@ -770,7 +770,7 @@ class VOPC_Class_Profile<list<SchedReadWrite> sched, ValueType src0VT, ValueType
// DPP8 forbids modifiers and can inherit from VOPC_Profile
let Ins64 = (ins Src0Mod:$src0_modifiers, Src0RC64:$src0, Src1RC64:$src1);
- dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VGPRSrc_32:$src1);
+ dag InsPartVOP3DPP = (ins FPVRegInputMods:$src0_modifiers, VGPRSrc_32:$src0, VRegSrc_32:$src1);
let InsVOP3Base = !con(InsPartVOP3DPP, !if(HasOpSel, (ins op_sel0:$op_sel),
(ins)));
let AsmVOP3Base = "$sdst, $src0_modifiers, $src1";
@@ -831,7 +831,7 @@ class getVOPCClassPat64 <VOPProfile P> {
list<dag> ret =
[(set i1:$sdst,
(AMDGPUfp_class
- (P.Src0VT (VOP3Mods P.Src0VT:$src0, i32:$src0_modifiers)),
+ (P.Src0VT (VOP3ModsNonCanonicalizing P.Src0VT:$src0, i32:$src0_modifiers)),
i32:$src1))];
}
@@ -854,7 +854,7 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
let SchedRW = p.Schedule;
}
- foreach _ = BoolToList<p.HasExtSDWA>.ret in
+ if p.HasExtSDWA then
def _sdwa : VOPC_SDWA_Pseudo <opName, p> {
let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]),
!if(DefVcc, [VCC], []));
@@ -902,7 +902,7 @@ multiclass VOPCX_Class_Pseudos <string opName,
let SubtargetPredicate = HasNoSdstCMPX;
}
- foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
+ if P_NoSDst.HasExtSDWA then
def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
@@ -992,11 +992,18 @@ multiclass ICMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
(i64 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_64))
>;
- let WaveSizePredicate = isWave32 in
- def : GCNPat <
- (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
- (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
- >;
+ let WaveSizePredicate = isWave32 in {
+ def : GCNPat <
+ (i32 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+ (i32 (COPY_TO_REGCLASS (inst $src0, $src1), SReg_32))
+ >;
+
+ // Support codegen of i64 setcc in wave32 mode.
+ def : GCNPat <
+ (i64 (AMDGPUsetcc vt:$src0, vt:$src1, cond)),
+ (i64 (REG_SEQUENCE SReg_64, (inst $src0, $src1), sub0, (S_MOV_B32 (i32 0)), sub1))
+ >;
+ }
}
defm : ICMP_Pattern <COND_EQ, V_CMP_EQ_U32_e64, i32>;
@@ -1056,13 +1063,22 @@ multiclass FCMP_Pattern <PatFrags cond, Instruction inst, ValueType vt> {
DSTCLAMP.NONE), SReg_64))
>;
- let WaveSizePredicate = isWave32 in
- def : GCNPat <
- (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
- (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
- (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
- DSTCLAMP.NONE), SReg_32))
- >;
+ let WaveSizePredicate = isWave32 in {
+ def : GCNPat <
+ (i32 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+ (i32 (COPY_TO_REGCLASS (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ DSTCLAMP.NONE), SReg_32))
+ >;
+
+ def : GCNPat <
+ (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)),
+ (i64 (REG_SEQUENCE SReg_64, (inst $src0_modifiers, $src0, $src1_modifiers, $src1,
+ DSTCLAMP.NONE), sub0,
+ (S_MOV_B32 (i32 0)), sub1))
+ >;
+ }
}
defm : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>;
@@ -1320,7 +1336,7 @@ let AssemblerPredicate = isGFX11Only in {
defm : VOPCInstAliases<NAME, "gfx11">;
- foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in {
+ if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
let DecoderNamespace = "DPPGFX11" in {
@@ -1352,7 +1368,7 @@ let AssemblerPredicate = isGFX11Only in {
}
}
}
- foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in {
+ if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
let DecoderNamespace = "DPPGFX11" in {
@@ -1419,7 +1435,7 @@ let AssemblerPredicate = isGFX11Only in {
defm : VOPCInstAliases<OpName, "gfx11", NAME, asm_name>;
- foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in {
+ if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
let DecoderNamespace = "DPPGFX11" in {
@@ -1456,7 +1472,7 @@ let AssemblerPredicate = isGFX11Only in {
}
}
- foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in {
+ if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName #"_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
let DecoderNamespace = "DPPGFX11" in {
@@ -1518,7 +1534,7 @@ let AssemblerPredicate = isGFX11Only in {
defm : VOPCXInstAliases<NAME, "gfx11">;
- foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in {
+ if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e32" #"_dpp");
defvar AsmDPP = ps32.Pfl.AsmDPP16;
let DecoderNamespace = "DPPGFX11" in {
@@ -1535,7 +1551,7 @@ let AssemblerPredicate = isGFX11Only in {
}
}
- foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in {
+ if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(NAME #"_nosdst_e64" #"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
let DecoderNamespace = "DPPGFX11" in {
@@ -1584,7 +1600,7 @@ let AssemblerPredicate = isGFX11Only in {
defm : VOPCXInstAliases<OpName, "gfx11", NAME, asm_name>;
- foreach _ = BoolToList<ps32.Pfl.HasExtDPP>.ret in {
+ if ps32.Pfl.HasExtDPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e32"#"_dpp");
let DecoderNamespace = "DPPGFX11" in {
def _e32_dpp_gfx11 : VOPC_DPP16_SIMC<op{7-0}, psDPP,
@@ -1594,7 +1610,7 @@ let AssemblerPredicate = isGFX11Only in {
def _e32_dpp8_gfx11 : VOPC_DPP8<op{7-0}, ps32, asm_name>;
}
}
- foreach _ = BoolToList<ps64.Pfl.HasExtVOP3DPP>.ret in {
+ if ps64.Pfl.HasExtVOP3DPP then {
defvar psDPP = !cast<VOP_DPP_Pseudo>(OpName#"_nosdst_e64"#"_dpp");
defvar AsmDPP = ps64.Pfl.AsmVOP3DPP16;
let DecoderNamespace = "DPPGFX11" in {
@@ -1821,7 +1837,7 @@ let AssemblerPredicate = isGFX10Only in {
}
} // End DecoderNamespace = "GFX10"
- foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -1847,7 +1863,7 @@ let AssemblerPredicate = isGFX10Only in {
}
} // End DecoderNamespace = "GFX10"
- foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa")>,
VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Pfl> {
@@ -2174,12 +2190,12 @@ multiclass VOPC_Real_vi <bits<10> op> {
}
}
- foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
+ if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA then
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
- foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ if !cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9 then
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index d5c662ac0574..3755daf4f9b1 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1268,7 +1268,7 @@ class VOP3InstBase<string OpName, VOPProfile P, SDPatternOperator node = null_fr
multiclass VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> {
def _e64 : VOP3InstBase<OpName, P, node>;
let SubtargetPredicate = isGFX11Plus in {
- foreach _ = BoolToList<P.HasExtVOP3DPP>.ret in
+ if P.HasExtVOP3DPP then
def _e64_dpp : VOP3_DPP_Pseudo <OpName, P>;
} // end SubtargetPredicate = isGFX11Plus
}
@@ -1329,11 +1329,11 @@ let AssemblerPredicate = isGFX11Only,
bit isSingle = 0> {
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
- foreach _ = BoolToList<ps.Pfl.HasOpSel>.ret in
+ if ps.Pfl.HasOpSel then
def _e64_gfx11 :
VOP3_Real<ps, SIEncodingFamily.GFX11>,
VOP3OpSel_gfx11<op, ps.Pfl>;
- foreach _ = BoolToList<!not(ps.Pfl.HasOpSel)>.ret in
+ if !not(ps.Pfl.HasOpSel) then
def _e64_gfx11 :
VOP3_Real<ps, SIEncodingFamily.GFX11>,
VOP3e_gfx11<op, ps.Pfl>;
@@ -1353,11 +1353,11 @@ let AssemblerPredicate = isGFX11Only,
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.AsmOperands,
IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
- foreach _ = BoolToList<ps.Pfl.HasOpSel>.ret in
+ if ps.Pfl.HasOpSel then
def _e64_gfx11 :
VOP3_Real<ps, SIEncodingFamily.GFX11>,
VOP3OpSel_gfx11<op, ps.Pfl>;
- foreach _ = BoolToList<!not(ps.Pfl.HasOpSel)>.ret in
+ if !not(ps.Pfl.HasOpSel) then
def _e64_gfx11 :
VOP3_Real<ps, SIEncodingFamily.GFX11>,
VOP3e_gfx11<op, ps.Pfl>;
@@ -1487,7 +1487,7 @@ include "VOP3PInstructions.td"
include "VOPDInstructions.td"
class ClassPat<Instruction inst, ValueType vt> : GCNPat <
- (is_fpclass (vt (VOP3Mods vt:$src0, i32:$src0_mods)), (i32 timm:$mask)),
+ (is_fpclass (vt (VOP3ModsNonCanonicalizing vt:$src0, i32:$src0_mods)), (i32 timm:$mask)),
(inst i32:$src0_mods, vt:$src0, (V_MOV_B32_e32 timm:$mask))
>;