Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td | 54
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 26
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 17
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 16
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 21
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 35
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 38
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 28
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 63
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp | 142
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 6
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 42
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 144
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td | 96
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td | 41
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td | 189
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 28
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 31
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h | 49
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 14
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 19
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 17
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 43
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 134
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h | 59
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td | 5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 13
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 262
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h | 3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 1095
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 44
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h | 41
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td | 291
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td | 24
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 54
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 25
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 180
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp | 2
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 9
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td | 77
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td | 47
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 233
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 154
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td | 8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td | 45
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td | 4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 83
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td | 44
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td | 19
73 files changed, 3023 insertions, 1221 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
index 35d33cb60bc4..36af767a70b0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -166,6 +166,9 @@ extern char &SILowerI1CopiesID;
void initializeAMDGPUGlobalISelDivergenceLoweringPass(PassRegistry &);
extern char &AMDGPUGlobalISelDivergenceLoweringID;
+void initializeAMDGPUMarkLastScratchLoadPass(PassRegistry &);
+extern char &AMDGPUMarkLastScratchLoadID;
+
void initializeSILowerSGPRSpillsPass(PassRegistry &);
extern char &SILowerSGPRSpillsID;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
index df8c35ffd457..cb29d5d94759 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -646,6 +646,12 @@ def FeatureFP8Insts : SubtargetFeature<"fp8-insts",
"Has fp8 and bf8 instructions"
>;
+def FeatureFP8ConversionInsts : SubtargetFeature<"fp8-conversion-insts",
+ "HasFP8ConversionInsts",
+ "true",
+ "Has fp8 and bf8 conversion instructions"
+>;
+
def FeaturePkFmacF16Inst : SubtargetFeature<"pk-fmac-f16-inst",
"HasPkFmacF16Inst",
"true",
@@ -719,6 +725,18 @@ def FeatureFlatAtomicFaddF32Inst
"Has flat_atomic_add_f32 instruction"
>;
+def FeatureDefaultComponentZero : SubtargetFeature<"default-component-zero",
+ "HasDefaultComponentZero",
+ "true",
+ "BUFFER/IMAGE store instructions set unspecified components to zero (before GFX12)"
+>;
+
+def FeatureDefaultComponentBroadcast : SubtargetFeature<"default-component-broadcast",
+ "HasDefaultComponentBroadcast",
+ "true",
+ "BUFFER/IMAGE store instructions set unspecified components to x component (GFX12)"
+>;
+
def FeatureSupportsSRAMECC : SubtargetFeature<"sramecc-support",
"SupportsSRAMECC",
"true",
@@ -1003,7 +1021,7 @@ def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
FeatureTrigReducedRange, FeatureExtendedImageInsts, FeatureImageInsts,
- FeatureGDS, FeatureGWS
+ FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
]
>;
@@ -1014,7 +1032,7 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureUnalignedBufferAccess,
- FeatureImageInsts, FeatureGDS, FeatureGWS
+ FeatureImageInsts, FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
]
>;
@@ -1029,7 +1047,8 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
FeatureDsSrc2Insts, FeatureExtendedImageInsts, FeatureFastDenormalF32,
- FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS
+ FeatureUnalignedBufferAccess, FeatureImageInsts, FeatureGDS, FeatureGWS,
+ FeatureDefaultComponentZero
]
>;
@@ -1047,7 +1066,7 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureSupportsXNACK,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
- FeatureNegativeScratchOffsetBug, FeatureGWS
+ FeatureNegativeScratchOffsetBug, FeatureGWS, FeatureDefaultComponentZero
]
>;
@@ -1067,7 +1086,7 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureSMemTimeInst, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureImageInsts,
- FeatureGDS, FeatureGWS
+ FeatureGDS, FeatureGWS, FeatureDefaultComponentZero
]
>;
@@ -1087,7 +1106,7 @@ def FeatureGFX11 : GCNSubtargetFeatureGeneration<"GFX11",
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
- FeatureGWS
+ FeatureGWS, FeatureDefaultComponentZero
]
>;
@@ -1107,7 +1126,7 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
- FeatureTrue16BitInsts
+ FeatureTrue16BitInsts, FeatureDefaultComponentBroadcast
]
>;
@@ -1311,6 +1330,7 @@ def FeatureISAVersion9_4_Common : FeatureSet<
FeaturePackedFP32Ops,
FeatureMAIInsts,
FeatureFP8Insts,
+ FeatureFP8ConversionInsts,
FeaturePkFmacF16Inst,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
@@ -1467,7 +1487,6 @@ def FeatureISAVersion12 : FeatureSet<
[FeatureGFX12,
FeatureLDSBankCount32,
FeatureDLInsts,
- FeatureDot5Insts,
FeatureDot7Insts,
FeatureDot8Insts,
FeatureDot9Insts,
@@ -1477,8 +1496,13 @@ def FeatureISAVersion12 : FeatureSet<
FeatureWavefrontSize32,
FeatureShaderCyclesHiLoRegisters,
FeatureArchitectedFlatScratch,
+ FeatureArchitectedSGPRs,
FeatureAtomicFaddRtnInsts,
FeatureAtomicFaddNoRtnInsts,
+ FeatureAtomicDsPkAdd16Insts,
+ FeatureAtomicFlatPkAdd16Insts,
+ FeatureAtomicBufferGlobalPkAddF16Insts,
+ FeatureAtomicGlobalPkAddBF16Inst,
FeatureFlatAtomicFaddF32Inst,
FeatureImageInsts,
FeatureExtendedImageInsts,
@@ -1488,8 +1512,8 @@ def FeatureISAVersion12 : FeatureSet<
FeaturePseudoScalarTrans,
FeatureHasRestrictedSOffset,
FeatureVGPRSingleUseHintInsts,
- FeatureMADIntraFwdBug,
- FeatureScalarDwordx3Loads]>;
+ FeatureScalarDwordx3Loads,
+ FeatureDPPSrc1SGPR]>;
//===----------------------------------------------------------------------===//
@@ -1981,6 +2005,9 @@ def HasShaderCyclesHiLoRegisters : Predicate<"Subtarget->hasShaderCyclesHiLoRegi
def HasFP8Insts : Predicate<"Subtarget->hasFP8Insts()">,
AssemblerPredicate<(all_of FeatureFP8Insts)>;
+def HasFP8ConversionInsts : Predicate<"Subtarget->hasFP8ConversionInsts()">,
+ AssemblerPredicate<(all_of FeatureFP8ConversionInsts)>;
+
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
@@ -2013,6 +2040,13 @@ def HasFlatAtomicFaddF32Inst
: Predicate<"Subtarget->hasFlatAtomicFaddF32Inst()">,
AssemblerPredicate<(all_of FeatureFlatAtomicFaddF32Inst)>;
+def HasDefaultComponentZero
+ : Predicate<"Subtarget->hasDefaultComponentZero()">,
+ AssemblerPredicate<(all_of FeatureDefaultComponentZero)>;
+def HasDefaultComponentBroadcast
+ : Predicate<"Subtarget->hasDefaultComponentBroadcast()">,
+ AssemblerPredicate<(all_of FeatureDefaultComponentBroadcast)>;
+
def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d317a733d433..279ef8ca2751 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -123,8 +123,11 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) {
getTargetStreamer()->EmitDirectiveAMDGCNTarget();
- if (TM.getTargetTriple().getOS() == Triple::AMDHSA)
+ if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
+ getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion(
+ CodeObjectVersion);
HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID());
+ }
if (TM.getTargetTriple().getOS() == Triple::AMDPAL)
getTargetStreamer()->getPALMetadata()->readFromIR(M);
@@ -230,8 +233,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
IsaInfo::getNumExtraSGPRs(
&STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
getTargetStreamer()->getTargetID()->isXnackOnOrAny()),
- CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed,
- CodeObjectVersion);
+ CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed);
Streamer.popSection();
}
@@ -323,7 +325,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
}
bool AMDGPUAsmPrinter::doInitialization(Module &M) {
- CodeObjectVersion = AMDGPU::getCodeObjectVersion(M);
+ CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M);
if (TM.getTargetTriple().getOS() == Triple::AMDHSA) {
switch (CodeObjectVersion) {
@@ -631,8 +633,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
void AMDGPUAsmPrinter::initializeTargetID(const Module &M) {
// In the beginning all features are either 'Any' or 'NotSupported',
// depending on global target features. This will cover empty modules.
- getTargetStreamer()->initializeTargetID(
- *getGlobalSTI(), getGlobalSTI()->getFeatureString(), CodeObjectVersion);
+ getTargetStreamer()->initializeTargetID(*getGlobalSTI(),
+ getGlobalSTI()->getFeatureString());
// If module is empty, we are done.
if (M.empty())
@@ -981,8 +983,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
OutStreamer->emitInt32(
- STM.getGeneration() >= AMDGPUSubtarget::GFX11
- ? S_00B860_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+ STM.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? S_00B860_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
+ : STM.getGeneration() == AMDGPUSubtarget::GFX11
+ ? S_00B860_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
: S_00B860_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
@@ -993,8 +997,10 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
OutStreamer->emitInt32(
- STM.getGeneration() >= AMDGPUSubtarget::GFX11
- ? S_0286E8_WAVESIZE_GFX11Plus(CurrentProgramInfo.ScratchBlocks)
+ STM.getGeneration() >= AMDGPUSubtarget::GFX12
+ ? S_0286E8_WAVESIZE_GFX12Plus(CurrentProgramInfo.ScratchBlocks)
+ : STM.getGeneration() == AMDGPUSubtarget::GFX11
+ ? S_0286E8_WAVESIZE_GFX11(CurrentProgramInfo.ScratchBlocks)
: S_0286E8_WAVESIZE_PreGFX11(CurrentProgramInfo.ScratchBlocks));
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
index 5fd9e571282d..d7f5110427ec 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp
@@ -144,7 +144,7 @@ public:
BumpPtrAllocator &Allocator,
SetVector<Function *> *CGSCC, TargetMachine &TM)
: InformationCache(M, AG, Allocator, CGSCC), TM(TM),
- CodeObjectVersion(AMDGPU::getCodeObjectVersion(M)) {}
+ CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {}
TargetMachine &TM;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index cf2896f80f19..6d05c3678bf0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -474,7 +474,7 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
const Module *M = MF.getFunction().getParent();
if (UserSGPRInfo.hasQueuePtr() &&
- AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
+ AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
@@ -632,10 +632,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
const bool InReg = Arg.hasAttribute(Attribute::InReg);
- // SGPR arguments to functions not implemented.
- if (!IsGraphics && InReg)
- return false;
-
if (Arg.hasAttribute(Attribute::SwiftSelf) ||
Arg.hasAttribute(Attribute::SwiftError) ||
Arg.hasAttribute(Attribute::Nest))
@@ -719,6 +715,10 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!IsEntryFunc && !IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
+
+ if (!Subtarget.enableFlatScratch())
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+ TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
IncomingValueAssigner Assigner(AssignFn);
@@ -732,13 +732,8 @@ bool AMDGPUCallLowering::lowerFormalArguments(
uint64_t StackSize = Assigner.StackSize;
// Start adding system SGPRs.
- if (IsEntryFunc) {
+ if (IsEntryFunc)
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsGraphics);
- } else {
- if (!Subtarget.enableFlatScratch())
- CCInfo.AllocateReg(Info->getScratchRSrcReg());
- TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
- }
// When we tail call, we need to check if the callee's arguments will fit on
// the caller's stack. So, whenever we lower formal arguments, we should keep
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 2b85024a9b40..a19b03b92923 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -51,9 +51,9 @@ def gi_vop3pmodsdot :
GIComplexOperandMatcher<s32, "selectVOP3PModsDOT">,
GIComplexPatternEquiv<VOP3PModsDOT>;
-def gi_dotiuvop3pmods :
- GIComplexOperandMatcher<s32, "selectDotIUVOP3PMods">,
- GIComplexPatternEquiv<DotIUVOP3PMods>;
+def gi_vop3pmodsneg :
+ GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">,
+ GIComplexPatternEquiv<VOP3PModsNeg>;
def gi_wmmaopselvop3pmods :
GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
@@ -261,10 +261,16 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SSHORT, SIsbuffer_load_short>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_USHORT, SIsbuffer_load_ushort>;
def : GINodeEquiv<G_FPTRUNC_ROUND_UPWARD, SIfptrunc_round_upward>;
def : GINodeEquiv<G_FPTRUNC_ROUND_DOWNWARD, SIfptrunc_round_downward>;
@@ -379,8 +385,8 @@ def gi_extract_cpol : GICustomOperandRenderer<"renderExtractCPol">,
def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
GISDNodeXFormEquiv<extract_swz>;
-def gi_set_glc : GICustomOperandRenderer<"renderSetGLC">,
- GISDNodeXFormEquiv<set_glc>;
+def gi_extract_cpol_set_glc : GICustomOperandRenderer<"renderExtractCpolSetGLC">,
+ GISDNodeXFormEquiv<extract_cpol_set_glc>;
def gi_frameindex_to_targetframeindex : GICustomOperandRenderer<"renderFrameIndex">,
GISDNodeXFormEquiv<frameindex_to_targetframeindex>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 74e9cd7d0965..186fa58524b9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -532,7 +532,8 @@ void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF,
Func.getCallingConv() != CallingConv::SPIR_KERNEL)
return;
- auto CodeObjectVersion = AMDGPU::getCodeObjectVersion(*Func.getParent());
+ auto CodeObjectVersion =
+ AMDGPU::getAMDHSACodeObjectVersion(*Func.getParent());
auto Kern = getHSAKernelProps(MF, ProgramInfo, CodeObjectVersion);
auto Kernels =
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 41462d7a133e..4c35649cec6c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -1159,7 +1159,7 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegal(SDValue Addr) const {
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
// values.
- if (AMDGPU::isGFX12Plus(*Subtarget))
+ if (Subtarget->hasSignedScratchOffsets())
return true;
auto LHS = Addr.getOperand(0);
@@ -1184,6 +1184,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
if (isNoUnsignedWrap(Addr))
return true;
+ // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+ // values.
+ if (Subtarget->hasSignedScratchOffsets())
+ return true;
+
auto LHS = Addr.getOperand(0);
auto RHS = Addr.getOperand(1);
return CurDAG->SignBitIsZero(RHS) && CurDAG->SignBitIsZero(LHS);
@@ -1192,6 +1197,11 @@ bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSV(SDValue Addr) const {
// Check address value in SGPR/VGPR are legal for flat scratch in the form
// of: SGPR + VGPR + Imm.
bool AMDGPUDAGToDAGISel::isFlatScratchBaseLegalSVImm(SDValue Addr) const {
+ // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+ // values.
+ if (AMDGPU::isGFX12Plus(*Subtarget))
+ return true;
+
auto Base = Addr.getOperand(0);
auto *RHSImm = cast<ConstantSDNode>(Addr.getOperand(1));
// If the immediate offset is negative and within certain range, the base
@@ -3009,7 +3019,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src,
return SelectVOP3PMods(In, Src, SrcMods, true);
}
-bool AMDGPUDAGToDAGISel::SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const {
+bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const {
const ConstantSDNode *C = cast<ConstantSDNode>(In);
// Literal i1 value set in intrinsic, represents SrcMods for the next operand.
// 1 promotes packed values to signed, 0 treats them as unsigned.
@@ -3183,13 +3193,14 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const {
return !AllUsesAcceptSReg && (Limit < 10);
}
-bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
+bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const {
auto Ld = cast<LoadSDNode>(N);
- if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(Ld->getMemOperand()))
+ const MachineMemOperand *MMO = Ld->getMemOperand();
+ if (N->isDivergent() && !AMDGPUInstrInfo::isUniformMMO(MMO))
return false;
- return Ld->getAlign() >= Align(4) &&
+ return Ld->getAlign() >= Align(std::min(MMO->getSize(), uint64_t(4))) &&
((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ||
(Subtarget->getScalarizeGlobalBehavior() &&
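
As a quick illustration of the isUniformLoad() change above: the alignment
requirement is relaxed from a fixed 4 bytes to min(load size, 4), so 1- and
2-byte uniform loads only need their natural alignment once the subtarget has
scalar sub-dword loads (GFX12). A minimal standalone sketch of that rule,
using illustrative names that are not part of the patch:

    #include <algorithm>
    #include <cstdint>

    // Sketch of the relaxed scalar-load alignment rule from the hunk above:
    // a uniform load can be selected as a scalar (SMEM) load if its alignment
    // is at least min(its size in bytes, 4).
    static bool hasScalarLoadAlignment(uint64_t SizeInBytes, uint64_t AlignInBytes) {
      // Dword and wider loads still need 4-byte alignment; byte and halfword
      // loads only need 1- and 2-byte alignment respectively.
      return AlignInBytes >= std::min(SizeInBytes, uint64_t(4));
    }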
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
index df4a211d42a0..8645490f0b16 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
@@ -237,7 +237,7 @@ private:
bool IsDOT = false) const;
bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectDotIUVOP3PMods(SDValue In, SDValue &Src) const;
+ bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const;
bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const;
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 0dbcaf5a1b13..55d95154c758 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -446,6 +446,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
{ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF},
MVT::i64, Custom);
+ for (auto VT : {MVT::i8, MVT::i16})
+ setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom);
+
static const MVT::SimpleValueType VectorIntTypes[] = {
MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
@@ -784,6 +787,7 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
unsigned AS = MN->getAddressSpace();
// Do not shrink an aligned scalar load to sub-dword.
// Scalar engine cannot do sub-dword loads.
+ // TODO: Update this for GFX12 which does have scalar sub-dword loads.
if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
(AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
@@ -1397,6 +1401,11 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
Results.push_back(Lowered);
return;
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
+ Results.push_back(Lowered);
+ return;
default:
return;
}
@@ -3062,6 +3071,26 @@ static bool isCttzOpc(unsigned Opc) {
return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
}
+SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
+ SelectionDAG &DAG) const {
+ auto SL = SDLoc(Op);
+ auto Arg = Op.getOperand(0u);
+ auto ResultVT = Op.getValueType();
+
+ if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
+ return {};
+
+ assert(isCtlzOpc(Op.getOpcode()));
+ assert(ResultVT == Arg.getValueType());
+
+ auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
+ auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
+ auto ShiftVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
+ NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, ShiftVal);
+ NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
+ return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
+}
+
SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -5453,6 +5482,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
NODE_NAME_CASE(SBUFFER_LOAD)
+ NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
+ NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
+ NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
+ NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
NODE_NAME_CASE(BUFFER_STORE)
NODE_NAME_CASE(BUFFER_STORE_BYTE)
NODE_NAME_CASE(BUFFER_STORE_SHORT)
@@ -5473,8 +5506,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
+ NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
+ NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
}
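
The new lowerCTLZResults() above promotes an i8/i16 CTLZ to the 32-bit
operation by zero-extending the operand and shifting it up against bit 31, so
the leading-zero count is unchanged for non-zero inputs. A small sketch of that
identity, assuming a GCC/Clang-style __builtin_clz and leaving aside the zero
input case (which CTLZ_ZERO_UNDEF treats as undefined anyway):

    #include <cassert>
    #include <cstdint>

    // ctlz of a non-zero i16 value computed through the i32 operation, mirroring
    // the zext + shl-by-16 + ctlz + trunc sequence built in lowerCTLZResults().
    static unsigned ctlz16ViaI32(uint16_t X) {
      assert(X != 0 && "zero handled by the CTLZ vs CTLZ_ZERO_UNDEF semantics");
      return __builtin_clz(static_cast<uint32_t>(X) << 16);
    }

    int main() {
      assert(ctlz16ViaI32(1) == 15);      // 15 leading zeros in a 16-bit 1
      assert(ctlz16ViaI32(0x8000) == 0);  // top bit set
      assert(ctlz16ViaI32(0x00ff) == 8);
      return 0;
    }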
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 827fb106b551..f10a357125e5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -84,6 +84,8 @@ protected:
SDNodeFlags Flags) const;
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const;
@@ -567,6 +569,10 @@ enum NodeType : unsigned {
BUFFER_LOAD_FORMAT_TFE,
BUFFER_LOAD_FORMAT_D16,
SBUFFER_LOAD,
+ SBUFFER_LOAD_BYTE,
+ SBUFFER_LOAD_UBYTE,
+ SBUFFER_LOAD_SHORT,
+ SBUFFER_LOAD_USHORT,
BUFFER_STORE,
BUFFER_STORE_BYTE,
BUFFER_STORE_SHORT,
@@ -587,8 +593,10 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_CMPSWAP,
BUFFER_ATOMIC_CSUB,
BUFFER_ATOMIC_FADD,
+ BUFFER_ATOMIC_FADD_BF16,
BUFFER_ATOMIC_FMIN,
BUFFER_ATOMIC_FMAX,
+ BUFFER_ATOMIC_COND_SUB_U32,
LAST_AMDGPU_ISD_NUMBER
};
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 2bb7b6bd0674..898289019c71 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -402,6 +402,35 @@ static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
return DemandedElts;
}
+// Trim elements from the end of the vector \p V if they are
+// equal to the first element of the vector.
+static APInt defaultComponentBroadcast(Value *V) {
+ auto *VTy = cast<FixedVectorType>(V->getType());
+ unsigned VWidth = VTy->getNumElements();
+ APInt DemandedElts = APInt::getAllOnes(VWidth);
+ Value *FirstComponent = findScalarElement(V, 0);
+
+ SmallVector<int> ShuffleMask;
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
+ SVI->getShuffleMask(ShuffleMask);
+
+ for (int I = VWidth - 1; I > 0; --I) {
+ if (ShuffleMask.empty()) {
+ auto *Elt = findScalarElement(V, I);
+ if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
+ break;
+ } else {
+ // Detect identical elements in the shufflevector result, even though
+ // findScalarElement cannot tell us what that element is.
+ if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
+ break;
+ }
+ DemandedElts.clearBit(I);
+ }
+
+ return DemandedElts;
+}
+
static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
IntrinsicInst &II,
APInt DemandedElts,
@@ -1140,8 +1169,13 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
break;
- APInt DemandedElts =
- trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+ APInt DemandedElts;
+ if (ST->hasDefaultComponentBroadcast())
+ DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
+ else if (ST->hasDefaultComponentZero())
+ DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
+ else
+ break;
int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
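
For context on defaultComponentBroadcast() above: on GFX12 (the
default-component-broadcast targets) an image/buffer store fills any
unspecified component with the x component, so trailing components that merely
repeat element 0 (or are undef) can be dropped from the demanded-elements set.
A simplified model of that trim, using plain containers rather than the LLVM
types:

    #include <vector>

    // Returns a demanded-components mask: trailing components equal to
    // component 0 are redundant on a default-component-broadcast target and
    // are marked not-demanded, mirroring the loop in defaultComponentBroadcast().
    static std::vector<bool> demandedComponents(const std::vector<int> &Comps) {
      std::vector<bool> Demanded(Comps.size(), true);
      for (int I = static_cast<int>(Comps.size()) - 1; I > 0; --I) {
        if (Comps[I] != Comps[0])
          break;               // first mismatch from the end stops the trim
        Demanded[I] = false;   // duplicate of x; the hardware re-broadcasts it
      }
      return Demanded;
    }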
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index ad8dcda93c36..fdee74d58d26 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -1917,7 +1917,8 @@ bool AMDGPUInstructionSelector::selectImageIntrinsic(
unsigned CPol = MI.getOperand(ArgOffset + Intr->CachePolicyIndex).getImm();
if (BaseOpcode->Atomic)
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
- if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
+ if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
+ AMDGPU::CPol::VOLATILE))
return false;
int NumVAddrRegs = 0;
@@ -3927,7 +3928,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectDotIUVOP3PMods(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const {
// Literal i1 value set in intrinsic, represents SrcMods for the next operand.
// Value is in Imm operand as i1 sign extended to int64_t.
// 1(-1) promotes packed values to signed, 0 treats them as unsigned.
@@ -4556,7 +4557,7 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegal(Register Addr) const {
// Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
// values.
- if (AMDGPU::isGFX12Plus(STI))
+ if (STI.hasSignedScratchOffsets())
return true;
Register LHS = AddrMI->getOperand(1).getReg();
@@ -4585,6 +4586,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
if (isNoUnsignedWrap(AddrMI))
return true;
+ // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+ // values.
+ if (STI.hasSignedScratchOffsets())
+ return true;
+
Register LHS = AddrMI->getOperand(1).getReg();
Register RHS = AddrMI->getOperand(2).getReg();
return KB->signBitIsZero(RHS) && KB->signBitIsZero(LHS);
@@ -4594,6 +4600,11 @@ bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSV(Register Addr) const {
// of: SGPR + VGPR + Imm.
bool AMDGPUInstructionSelector::isFlatScratchBaseLegalSVImm(
Register Addr) const {
+ // Starting with GFX12, VADDR and SADDR fields in VSCRATCH can use negative
+ // values.
+ if (STI.hasSignedScratchOffsets())
+ return true;
+
MachineInstr *AddrMI = getDefIgnoringCopies(Addr, *MRI);
Register Base = AddrMI->getOperand(1).getReg();
std::optional<DefinitionAndSourceRegister> BaseDef =
@@ -5411,6 +5422,7 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst(
I.eraseFromParent();
return true;
}
+
bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -5496,11 +5508,13 @@ void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
MIB.addImm(Swizzle);
}
-void AMDGPUInstructionSelector::renderSetGLC(MachineInstrBuilder &MIB,
- const MachineInstr &MI,
- int OpIdx) const {
+void AMDGPUInstructionSelector::renderExtractCpolSetGLC(
+ MachineInstrBuilder &MIB, const MachineInstr &MI, int OpIdx) const {
assert(OpIdx >= 0 && "expected to match an immediate operand");
- MIB.addImm(MI.getOperand(OpIdx).getImm() | AMDGPU::CPol::GLC);
+ const uint32_t Cpol = MI.getOperand(OpIdx).getImm() &
+ (AMDGPU::isGFX12Plus(STI) ? AMDGPU::CPol::ALL
+ : AMDGPU::CPol::ALL_pregfx12);
+ MIB.addImm(Cpol | AMDGPU::CPol::GLC);
}
void AMDGPUInstructionSelector::renderFrameIndex(MachineInstrBuilder &MIB,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index ab7cc0a6beb8..12ea46c2895b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -194,7 +194,7 @@ private:
selectVOP3PModsDOT(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
- selectDotIUVOP3PMods(MachineOperand &Root) const;
+ selectVOP3PModsNeg(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectWMMAOpSelVOP3PMods(MachineOperand &Root) const;
@@ -331,8 +331,8 @@ private:
int OpIdx) const;
void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
- void renderSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
- int OpIdx) const;
+ void renderExtractCpolSetGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
void renderFrameIndex(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 36e07d944c94..360aafedc522 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -647,6 +647,9 @@ defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
defm int_amdgcn_global_atomic_fmin_num : noret_op;
defm int_amdgcn_global_atomic_fmax_num : noret_op;
+defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op;
+defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op;
+defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op;
multiclass noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
let HasNoUse = true in
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
index fb7148ba10ac..69fdeaebe0a0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp
@@ -13,9 +13,11 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "AMDGPUTargetMachine.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/UniformityAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/InitializePasses.h"
@@ -58,6 +60,7 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<UniformityInfoWrapperPass>();
AU.setPreservesAll();
@@ -90,6 +93,12 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
+ const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+ const TargetMachine &TM = TPC.getTM<TargetMachine>();
+ const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
+ if (ST.hasScalarSubwordLoads())
+ return false;
+
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
@@ -179,6 +188,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) {
INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
"AMDGPU IR late optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index aa235c07e995..8e74d4c0e945 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -2139,7 +2139,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
// For code object version 5, private_base and shared_base are passed through
// implicit kernargs.
- if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
+ if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
AMDGPU::AMDHSA_COV5) {
AMDGPUTargetLowering::ImplicitParameter Param =
AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
@@ -5883,6 +5883,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
@@ -5893,6 +5896,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -6090,6 +6096,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
Register VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
LLT Ty = MRI->getType(VData);
+ const bool IsAtomicPacked16Bit =
+ (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
+ BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
+
// Check for 16 bit addresses and pack if true.
LLT GradTy =
MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
@@ -6098,7 +6108,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
const bool IsG16 =
ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
const bool IsA16 = AddrTy == S16;
- const bool IsD16 = Ty.getScalarType() == S16;
+ const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
@@ -6140,7 +6150,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
LLT Ty = MRI->getType(VData0);
// TODO: Allow atomic swap and bit ops for v2s16/v4s16
- if (Ty.isVector())
+ if (Ty.isVector() && !IsAtomicPacked16Bit)
return false;
if (BaseOpcode->AtomicX2) {
@@ -6276,9 +6286,18 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (NumElts > 4 || DMaskLanes > 4)
return false;
+ // Image atomic instructions use DMask to specify how many bits the
+ // input/output data has: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
+ // DMaskLanes defaults to '0' for image atomics.
+ // We must make sure that atomic variants (especially packed ones) are not
+ // truncated from v2s16 or v4s16 to s16.
+ //
+ // ChangeElementCount will be needed for image load where Ty is always scalar.
const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
const LLT AdjustedTy =
- Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
+ DMaskLanes == 0
+ ? Ty
+ : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
// The raw dword aligned data component of the load. The only legal cases
// where this matters should be when using the packed D16 format, for
@@ -6443,15 +6462,28 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
return true;
}
-bool AMDGPULegalizerInfo::legalizeSBufferLoad(
- LegalizerHelper &Helper, MachineInstr &MI) const {
+bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
GISelChangeObserver &Observer = Helper.Observer;
- Register Dst = MI.getOperand(0).getReg();
- LLT Ty = B.getMRI()->getType(Dst);
+ Register OrigDst = MI.getOperand(0).getReg();
+ Register Dst;
+ LLT Ty = B.getMRI()->getType(OrigDst);
unsigned Size = Ty.getSizeInBits();
MachineFunction &MF = B.getMF();
+ unsigned Opc = 0;
+ if (Size < 32 && ST.hasScalarSubwordLoads()) {
+ assert(Size == 8 || Size == 16);
+ Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
+ : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
+ // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
+ // destination register.
+ Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
+ } else {
+ Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
+ Dst = OrigDst;
+ }
Observer.changingInstr(MI);
@@ -6469,19 +6501,24 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
// FIXME: We don't really need this intermediate instruction. The intrinsic
// should be fixed to have a memory operand. Since it's readnone, we're not
// allowed to add one.
- MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
+ MI.setDesc(B.getTII().get(Opc));
MI.removeOperand(1); // Remove intrinsic ID
// FIXME: When intrinsic definition is fixed, this should have an MMO already.
// TODO: Should this use datalayout alignment?
const unsigned MemSize = (Size + 7) / 8;
- const Align MemAlign(4);
+ const Align MemAlign(std::min(MemSize, 4u));
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
MemSize, MemAlign);
MI.addMemOperand(MF, MMO);
+ if (Dst != OrigDst) {
+ MI.getOperand(0).setReg(Dst);
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+ B.buildTrunc(OrigDst, Dst);
+ }
// If we don't have 96-bit result scalar loads, widening to 128-bit should
// always be legal. We may need to restore this to a 96-bit result if it turns
@@ -6545,7 +6582,7 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
Register SGPR01(AMDGPU::SGPR0_SGPR1);
// For code object version 5, queue_ptr is passed through implicit kernarg.
- if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >=
+ if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >=
AMDGPU::AMDHSA_COV5) {
AMDGPUTargetLowering::ImplicitParameter Param =
AMDGPUTargetLowering::QUEUE_PTR;
@@ -7080,6 +7117,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
return legalizeBufferAtomic(MI, B, IntrID);
case Intrinsic::trap:
return legalizeTrapIntrinsic(MI, MRI, B);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index c32303defe7f..015c71080d67 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -106,7 +106,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
LLVMContext &Ctx = F.getParent()->getContext();
const DataLayout &DL = F.getParent()->getDataLayout();
BasicBlock &EntryBlock = *F.begin();
- IRBuilder<> Builder(&*getInsertPt(EntryBlock));
+ IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock));
const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset();
@@ -202,6 +202,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) {
// Since we don't have sub-dword scalar loads, avoid doing an extload by
// loading earlier than the argument address, and extracting the relevant
// bits.
+ // TODO: Update this for GFX12 which does have scalar sub-dword loads.
//
// Additionally widen any sub-dword load to i32 even if suitably aligned,
// so that CSE between different argument loads works easily.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
index 097722157d41..bf7f67c086f2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp
@@ -323,7 +323,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) {
// TargetPassConfig for subtarget.
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) {
bool MadeChange = false;
- bool IsV5OrAbove = AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
+ bool IsV5OrAbove =
+ AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5;
Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove);
if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
@@ -356,7 +357,7 @@ ModulePass *llvm::createAMDGPULowerKernelAttributesPass() {
PreservedAnalyses
AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) {
bool IsV5OrAbove =
- AMDGPU::getCodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
+ AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5;
Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove);
if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index d90fcac87540..289c35e11beb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -1721,7 +1721,7 @@ void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
unsigned CodeSourceRegister,
bool IsUndefIfSource) {
// If this is the function exit block, we don't need a phi.
- if (MergeBB->succ_begin() == MergeBB->succ_end()) {
+ if (MergeBB->succ_empty()) {
return;
}
LLVM_DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp
new file mode 100644
index 000000000000..0692a12a4061
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUMarkLastScratchLoad.cpp
@@ -0,0 +1,142 @@
+//===-- AMDGPUMarkLastScratchLoad.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Mark scratch load/spill instructions that are guaranteed to be the last use
+// of their scratch slot, so the slot can be evicted from caches.
+//
+// TODO: Handle general stack accesses not just spilling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveStacks.h"
+#include "llvm/CodeGen/MachineOperand.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-mark-last-scratch-load"
+
+namespace {
+
+class AMDGPUMarkLastScratchLoad : public MachineFunctionPass {
+private:
+ LiveStacks *LS = nullptr;
+ LiveIntervals *LIS = nullptr;
+ SlotIndexes *SI = nullptr;
+ const SIInstrInfo *SII = nullptr;
+
+public:
+ static char ID;
+
+ AMDGPUMarkLastScratchLoad() : MachineFunctionPass(ID) {
+ initializeAMDGPUMarkLastScratchLoadPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<SlotIndexes>();
+ AU.addRequired<LiveIntervals>();
+ AU.addRequired<LiveStacks>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return "AMDGPU Mark Last Scratch Load";
+ }
+};
+
+} // end anonymous namespace
+
+bool AMDGPUMarkLastScratchLoad::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (ST.getGeneration() < AMDGPUSubtarget::GFX12)
+ return false;
+
+ LS = &getAnalysis<LiveStacks>();
+ LIS = &getAnalysis<LiveIntervals>();
+ SI = &getAnalysis<SlotIndexes>();
+ SII = ST.getInstrInfo();
+ SlotIndexes &Slots = *LIS->getSlotIndexes();
+
+ const unsigned NumSlots = LS->getNumIntervals();
+ if (NumSlots == 0) {
+ LLVM_DEBUG(dbgs() << "No live slots, skipping\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << LS->getNumIntervals() << " intervals\n");
+
+ bool Changed = false;
+
+ for (auto &[SS, LI] : *LS) {
+ for (const LiveRange::Segment &Segment : LI.segments) {
+
+ // Ignore segments that run to the end of the basic block, because in that
+ // case the slot is still live at the end of it.
+ if (Segment.end.isBlock())
+ continue;
+
+ const int FrameIndex = Register::stackSlot2Index(LI.reg());
+ MachineInstr *LastLoad = nullptr;
+
+ MachineInstr *MISegmentEnd = SI->getInstructionFromIndex(Segment.end);
+
+ // If there is no instruction at this slot because it was deleted, take the
+ // instruction from the next slot.
+ if (!MISegmentEnd) {
+ SlotIndex NextSlot = Slots.getNextNonNullIndex(Segment.end);
+ MISegmentEnd = SI->getInstructionFromIndex(NextSlot);
+ }
+
+ MachineInstr *MISegmentStart = SI->getInstructionFromIndex(Segment.start);
+ MachineBasicBlock *BB = MISegmentEnd->getParent();
+
+ // Start iteration backwards from segment end until the start of basic
+ // block or start of segment if it is in the same basic block.
+ auto End = BB->rend();
+ if (MISegmentStart && MISegmentStart->getParent() == BB)
+ End = MISegmentStart->getReverseIterator();
+
+ for (auto MI = MISegmentEnd->getReverseIterator(); MI != End; ++MI) {
+ int LoadFI = 0;
+
+ if (SII->isLoadFromStackSlot(*MI, LoadFI) && LoadFI == FrameIndex) {
+ LastLoad = &*MI;
+ break;
+ }
+ }
+
+ if (LastLoad && !LastLoad->memoperands_empty()) {
+ MachineMemOperand *MMO = *LastLoad->memoperands_begin();
+ MMO->setFlags(MOLastUse);
+ Changed = true;
+ LLVM_DEBUG(dbgs() << " Found last load: " << *LastLoad);
+ }
+ }
+ }
+
+ return Changed;
+}
+
+char AMDGPUMarkLastScratchLoad::ID = 0;
+
+char &llvm::AMDGPUMarkLastScratchLoadID = AMDGPUMarkLastScratchLoad::ID;
+
+INITIALIZE_PASS_BEGIN(AMDGPUMarkLastScratchLoad, DEBUG_TYPE,
+ "AMDGPU Mark last scratch load", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_END(AMDGPUMarkLastScratchLoad, DEBUG_TYPE,
+ "AMDGPU Mark last scratch load", false, false)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index bb1d6cb72e80..a1c34e92a57f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -411,6 +411,12 @@ bool AMDGPUPostLegalizerCombinerImpl::matchCombineSignExtendInReg(
case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
MatchData = {LoadMI, AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT};
return Width == 16;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE};
+ return Width == 8;
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ MatchData = {LoadMI, AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT};
+ return Width == 16;
}
return false;
}
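
The two new cases in the post-legalizer combiner above let a G_SEXT_INREG that
wraps an unsigned 8/16-bit scalar buffer load be folded into the signed load
variant. The arithmetic being relied on, shown in plain C++ (names and values
here are only for illustration):

    #include <cassert>
    #include <cstdint>

    // sign_extend_inreg(zero-extended byte, 8) produces the same 32-bit value
    // as loading the byte with sign extension in the first place, which is why
    // ..._S_BUFFER_LOAD_UBYTE + G_SEXT_INREG can be rewritten as ..._SBYTE.
    static int32_t sextInReg8(uint32_t ZExtLoadedByte) {
      return static_cast<int8_t>(ZExtLoadedByte & 0xff);
    }

    int main() {
      assert(sextInReg8(0x000000f0u) == -16); // 0xf0 reinterpreted as signed
      assert(sextInReg8(0x0000007fu) == 127);
      return 0;
    }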
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 391c2b9ec256..bdd4e891f158 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -449,8 +449,13 @@ bool AMDGPURegisterBankInfo::isScalarLoadLegal(const MachineInstr &MI) const {
const unsigned AS = MMO->getAddrSpace();
const bool IsConst = AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
+ const unsigned MemSize = 8 * MMO->getSize();
+
// Require 4-byte alignment.
- return MMO->getAlign() >= Align(4) &&
+ return (MMO->getAlign() >= Align(4) ||
+ (Subtarget.hasScalarSubwordLoads() &&
+ ((MemSize == 16 && MMO->getAlign() >= Align(2)) ||
+ (MemSize == 8 && MMO->getAlign() >= Align(1))))) &&
// Can't do a scalar atomic load.
!MMO->isAtomic() &&
// Don't use scalar loads for volatile accesses to non-constant address
@@ -1074,6 +1079,13 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(
(MemSize == 32 || LoadTy.isVector() || !isScalarLoadLegal(MI)))
return false;
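+  // On subtargets with scalar sub-word loads (GFX12+), a naturally aligned
+  // 8- or 16-bit scalar load widened to 32 bits is already selectable, so no
+  // custom lowering is needed here.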
+ if (LoadSize == 32 &&
+ ((MemSize == 8 && MMO->getAlign() >= Align(1)) ||
+ (MemSize == 16 && MMO->getAlign() >= Align(2))) &&
+ isScalarLoadLegal(MI) &&
+ Subtarget.getGeneration() >= AMDGPUSubtarget::GFX12)
+ return false;
+
Register PtrReg = MI.getOperand(1).getReg();
ApplyRegBankMapping ApplyBank(B, *this, MRI, DstBank);
@@ -3062,6 +3074,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
applyDefaultMapping(OpdMapper);
@@ -3073,7 +3086,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
executeInWaterfallLoop(B, MI, {3, 6});
return;
}
- case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
applyMappingSBufferLoad(B, OpdMapper);
return;
}
@@ -3765,16 +3782,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// properly.
//
// TODO: There are additional exec masking dependencies to analyze.
- if (MI.getOpcode() == TargetOpcode::G_PHI) {
+ if (auto *PHI = dyn_cast<GPhi>(&MI)) {
unsigned ResultBank = AMDGPU::InvalidRegBankID;
- Register DstReg = MI.getOperand(0).getReg();
+ Register DstReg = PHI->getReg(0);
// Sometimes the result may have already been assigned a bank.
if (const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI))
ResultBank = DstBank->getID();
- for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
- Register Reg = MI.getOperand(I).getReg();
+ for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I) {
+ Register Reg = PHI->getIncomingValue(I);
const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
// FIXME: Assuming VGPR for any undetermined inputs.
@@ -4346,6 +4363,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
// vdata_out
@@ -4396,7 +4414,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// initialized.
break;
}
- case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT:
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD_SSHORT: {
// Lie and claim everything is legal, even though some need to be
// SGPRs. applyMapping will have to deal with it as a waterfall loop.
OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
@@ -4471,6 +4493,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_fdot2_f32_bf16:
case Intrinsic::amdgcn_sudot4:
case Intrinsic::amdgcn_sudot8:
+ case Intrinsic::amdgcn_dot4_f32_fp8_bf8:
+ case Intrinsic::amdgcn_dot4_f32_bf8_fp8:
+ case Intrinsic::amdgcn_dot4_f32_fp8_fp8:
+ case Intrinsic::amdgcn_dot4_f32_bf8_bf8:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16:
case Intrinsic::amdgcn_wmma_f16_16x16x16_f16:
case Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied:
@@ -4836,7 +4862,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
+ case Intrinsic::amdgcn_global_load_tr:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
index 552380d54dfd..6f1236fd3b7d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPURemoveIncompatibleFunctions.cpp
@@ -105,7 +105,8 @@ constexpr unsigned FeaturesToCheck[] = {AMDGPU::FeatureGFX11Insts,
AMDGPU::FeatureDot8Insts,
AMDGPU::FeatureExtendedImageInsts,
AMDGPU::FeatureSMemRealTime,
- AMDGPU::FeatureSMemTimeInst};
+ AMDGPU::FeatureSMemTimeInst,
+ AMDGPU::FeatureGWS};
FeatureBitset expandImpliedFeatures(const FeatureBitset &Features) {
FeatureBitset Result = Features;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
index fc47b02c98e0..0c759e7f3b09 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp
@@ -112,7 +112,7 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) {
// By default, for code object v5 and later, track only the minimum scratch
// size
- if (AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
+ if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 ||
STI.getTargetTriple().getOS() == Triple::AMDPAL) {
if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences())
AssumedStackSizeForDynamicSizeObjects = 0;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 4cc8871a00fe..67263f23b983 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -237,6 +237,7 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
def : SourceOfDivergence<int_r600_read_tidig_y>;
def : SourceOfDivergence<int_r600_read_tidig_z>;
+def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
@@ -279,9 +280,11 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_sub>;
@@ -295,9 +298,11 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>;
@@ -311,9 +316,11 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_sub>;
@@ -327,9 +334,11 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_live_mask>;
@@ -405,6 +414,7 @@ def : SourceOfDivergence<int_amdgcn_wmma_f16_16x16x16_f16>;
def : SourceOfDivergence<int_amdgcn_wmma_bf16_16x16x16_bf16>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu8>;
def : SourceOfDivergence<int_amdgcn_wmma_i32_16x16x16_iu4>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr>;
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index f19c57668564..bcc7dedf3229 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -571,7 +571,7 @@ unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const {
// Assume all implicit inputs are used by default
const Module *M = F.getParent();
unsigned NBytes =
- AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
+ AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56;
return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes",
NBytes);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0f3bb3e7b0d8..b8a7a5e20802 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -382,6 +382,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeSILowerI1CopiesPass(*PR);
initializeAMDGPUGlobalISelDivergenceLoweringPass(*PR);
initializeSILowerWWMCopiesPass(*PR);
+ initializeAMDGPUMarkLastScratchLoadPass(*PR);
initializeSILowerSGPRSpillsPass(*PR);
initializeSIFixSGPRCopiesPass(*PR);
initializeSIFixVGPRCopiesPass(*PR);
@@ -1424,6 +1425,8 @@ bool GCNPassConfig::addRegAssignAndRewriteOptimized() {
addPreRewrite();
addPass(&VirtRegRewriterID);
+ addPass(&AMDGPUMarkLastScratchLoadID);
+
return true;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index ba79affe683d..489cf85693ed 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -346,7 +346,7 @@ public:
}
bool isVRegWithInputMods() const;
- bool isT16VRegWithInputMods() const;
+ template <bool IsFake16> bool isT16VRegWithInputMods() const;
bool isSDWAOperand(MVT type) const;
bool isSDWAFP16Operand() const;
@@ -1303,10 +1303,8 @@ private:
unsigned NextFreeSGPR, SMRange SGPRRange,
unsigned &VGPRBlocks, unsigned &SGPRBlocks);
bool ParseDirectiveAMDGCNTarget();
+ bool ParseDirectiveAMDHSACodeObjectVersion();
bool ParseDirectiveAMDHSAKernel();
- bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
- bool ParseDirectiveHSACodeObjectVersion();
- bool ParseDirectiveHSACodeObjectISA();
bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
bool ParseDirectiveAMDKernelCodeT();
// TODO: Possibly make subtargetHasRegister const.
@@ -1688,6 +1686,7 @@ private:
bool validateMIMGD16(const MCInst &Inst);
bool validateMIMGMSAA(const MCInst &Inst);
bool validateOpSel(const MCInst &Inst);
+ bool validateNeg(const MCInst &Inst, int OpName);
bool validateDPP(const MCInst &Inst, const OperandVector &Operands);
bool validateVccOperand(unsigned Reg) const;
bool validateVOPLiteral(const MCInst &Inst, const OperandVector &Operands);
@@ -2055,8 +2054,9 @@ bool AMDGPUOperand::isVRegWithInputMods() const {
AsmParser->getFeatureBits()[AMDGPU::FeatureDPALU_DPP]);
}
-bool AMDGPUOperand::isT16VRegWithInputMods() const {
- return isRegClass(AMDGPU::VGPR_32_Lo128RegClassID);
+template <bool IsFake16> bool AMDGPUOperand::isT16VRegWithInputMods() const {
+ return isRegClass(IsFake16 ? AMDGPU::VGPR_32_Lo128RegClassID
+ : AMDGPU::VGPR_16_Lo128RegClassID);
}
bool AMDGPUOperand::isSDWAOperand(MVT type) const {
@@ -4357,6 +4357,41 @@ bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
return true;
}
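+
+// Check that neg_lo/neg_hi bits are set only for source operands that accept
+// the neg modifier on this instruction.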
+bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) {
+ assert(OpName == AMDGPU::OpName::neg_lo || OpName == AMDGPU::OpName::neg_hi);
+
+ const unsigned Opc = Inst.getOpcode();
+ uint64_t TSFlags = MII.get(Opc).TSFlags;
+
+ // v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2)
+ if (!(TSFlags & SIInstrFlags::IsDOT))
+ return true;
+
+ int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+ if (NegIdx == -1)
+ return true;
+
+ unsigned Neg = Inst.getOperand(NegIdx).getImm();
+
+  // Some instructions have a neg_lo or neg_hi operand, but the neg modifier is
+  // allowed only on some of their src operands. Conveniently, such instructions
+  // do not have a src_modifiers operand for the src operands that disallow neg,
+  // because those operands also disallow opsel.
+
+ int SrcMods[3] = {AMDGPU::OpName::src0_modifiers,
+ AMDGPU::OpName::src1_modifiers,
+ AMDGPU::OpName::src2_modifiers};
+
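+  // Bit i of neg_lo/neg_hi corresponds to src<i>. A source without a
+  // src_modifiers operand cannot take neg, so any such bit is invalid.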
+ for (unsigned i = 0; i < 3; ++i) {
+ if (!AMDGPU::hasNamedOperand(Opc, SrcMods[i])) {
+ if (Neg & (1 << i))
+ return false;
+ }
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::validateDPP(const MCInst &Inst,
const OperandVector &Operands) {
const unsigned Opc = Inst.getOpcode();
@@ -4834,6 +4869,16 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
"invalid op_sel operand");
return false;
}
+ if (!validateNeg(Inst, AMDGPU::OpName::neg_lo)) {
+ Error(getImmLoc(AMDGPUOperand::ImmTyNegLo, Operands),
+ "invalid neg_lo operand");
+ return false;
+ }
+ if (!validateNeg(Inst, AMDGPU::OpName::neg_hi)) {
+ Error(getImmLoc(AMDGPUOperand::ImmTyNegHi, Operands),
+ "invalid neg_hi operand");
+ return false;
+ }
if (!validateDPP(Inst, Operands)) {
return false;
}
@@ -5087,20 +5132,6 @@ bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) {
return false;
}
-bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major,
- uint32_t &Minor) {
- if (ParseAsAbsoluteExpression(Major))
- return TokError("invalid major version");
-
- if (!trySkipToken(AsmToken::Comma))
- return TokError("minor version number required, comma expected");
-
- if (ParseAsAbsoluteExpression(Minor))
- return TokError("invalid minor version");
-
- return false;
-}
-
bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() {
if (getSTI().getTargetTriple().getArch() != Triple::amdgcn)
return TokError("directive only supported for amdgcn architecture");
@@ -5566,63 +5597,18 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
}
}
- getTargetStreamer().EmitAmdhsaKernelDescriptor(
- getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC,
- ReserveFlatScr, AMDGPU::getAmdhsaCodeObjectVersion());
+ getTargetStreamer().EmitAmdhsaKernelDescriptor(getSTI(), KernelName, KD,
+ NextFreeVGPR, NextFreeSGPR,
+ ReserveVCC, ReserveFlatScr);
return false;
}
-bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() {
- uint32_t Major;
- uint32_t Minor;
-
- if (ParseDirectiveMajorMinor(Major, Minor))
+bool AMDGPUAsmParser::ParseDirectiveAMDHSACodeObjectVersion() {
+ uint32_t Version;
+ if (ParseAsAbsoluteExpression(Version))
return true;
- getTargetStreamer().EmitDirectiveHSACodeObjectVersion(Major, Minor);
- return false;
-}
-
-bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
- uint32_t Major;
- uint32_t Minor;
- uint32_t Stepping;
- StringRef VendorName;
- StringRef ArchName;
-
- // If this directive has no arguments, then use the ISA version for the
- // targeted GPU.
- if (isToken(AsmToken::EndOfStatement)) {
- AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU());
- getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(ISA.Major, ISA.Minor,
- ISA.Stepping,
- "AMD", "AMDGPU");
- return false;
- }
-
- if (ParseDirectiveMajorMinor(Major, Minor))
- return true;
-
- if (!trySkipToken(AsmToken::Comma))
- return TokError("stepping version number required, comma expected");
-
- if (ParseAsAbsoluteExpression(Stepping))
- return TokError("invalid stepping version");
-
- if (!trySkipToken(AsmToken::Comma))
- return TokError("vendor name required, comma expected");
-
- if (!parseString(VendorName, "invalid vendor name"))
- return true;
-
- if (!trySkipToken(AsmToken::Comma))
- return TokError("arch name required, comma expected");
-
- if (!parseString(ArchName, "invalid arch name"))
- return true;
-
- getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(Major, Minor, Stepping,
- VendorName, ArchName);
+ getTargetStreamer().EmitDirectiveAMDHSACodeObjectVersion(Version);
return false;
}
@@ -5909,16 +5895,13 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".amdhsa_kernel")
return ParseDirectiveAMDHSAKernel();
+ if (IDVal == ".amdhsa_code_object_version")
+ return ParseDirectiveAMDHSACodeObjectVersion();
+
// TODO: Restructure/combine with PAL metadata directive.
if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin)
return ParseDirectiveHSAMetadata();
} else {
- if (IDVal == ".hsa_code_object_version")
- return ParseDirectiveHSACodeObjectVersion();
-
- if (IDVal == ".hsa_code_object_isa")
- return ParseDirectiveHSACodeObjectISA();
-
if (IDVal == ".amd_kernel_code_t")
return ParseDirectiveAMDKernelCodeT();
@@ -8091,9 +8074,8 @@ void AMDGPUAsmParser::onBeginOfFile() {
return;
if (!getTargetStreamer().getTargetID())
- getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString(),
- // TODO: Should try to check code object version from directive???
- AMDGPU::getAmdhsaCodeObjectVersion());
+ getTargetStreamer().initializeTargetID(getSTI(),
+ getSTI().getFeatureString());
if (isHsaAbi(getSTI()))
getTargetStreamer().EmitDirectiveAMDGCNTarget();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 9e99d382ed9b..ae0955f0cf6a 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -485,7 +485,7 @@ class MUBUF_Load_Pseudo <string opName,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind,
- RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret,
+ RegisterClass vdata_rc = getVregSrcForVT<vdata_vt>.ret.RegClass,
RegisterOperand vdata_op = getLdStVDataRegisterOperand<vdata_rc, isTFE>.ret>
: MUBUF_Pseudo<opName,
!if(!or(isLds, isLdsOpc), (outs), (outs vdata_op:$vdata)),
@@ -601,7 +601,7 @@ class MUBUF_Store_Pseudo <string opName,
int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
(outs),
- getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret], isTFE, hasGFX12Enc>.ret,
+ getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret.RegClass], isTFE, hasGFX12Enc>.ret,
getMUBUFAsmOps<addrKindCopy, 0, 0, isTFE>.ret,
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
@@ -780,9 +780,8 @@ class MUBUF_AtomicRet_Pseudo<string opName, int addrKind,
multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
RegisterClass vdataClass,
- ValueType vdataType,
- bit isFP = isFloatType<vdataType>.ret> {
- let FPAtomic = isFP in {
+ ValueType vdataType> {
+ let FPAtomic = vdataType.isFP in {
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>,
MUBUFAddr64Table <0, NAME>;
def _ADDR64 : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>,
@@ -804,9 +803,8 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
- SDPatternOperator atomic,
- bit isFP = isFloatType<vdataType>.ret> {
- let FPAtomic = isFP in {
+ SDPatternOperator atomic> {
+ let FPAtomic = vdataType.isFP in {
def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0,
[(set vdataType:$vdata,
(atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
@@ -1243,6 +1241,17 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
"buffer_atomic_pk_add_f16", VGPR_32, v2f16, null_frag
>;
+let SubtargetPredicate = isGFX12Plus in {
+defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_cond_sub_u32", VGPR_32, i32
+>;
+
+let FPAtomic = 1 in
+defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_pk_add_bf16", VGPR_32, v2bf16
+>;
+}
+
//===----------------------------------------------------------------------===//
// MTBUF Instructions
//===----------------------------------------------------------------------===//
@@ -1560,27 +1569,28 @@ multiclass BufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, string
# !if(!eq(RtnMode, "ret"), "", "_noret")
# "_" # vt.Size);
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
+ defvar data_vt_RC = getVregSrcForVT<data_vt>.ret.RegClass;
let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
defvar OffsetResDag = (!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
- getVregSrcForVT<data_vt>.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset,
+ data_vt_RC:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset,
offset:$offset);
def : GCNPat<
(vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), data_vt:$vdata_in)),
!if(!eq(RtnMode, "ret"),
- (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, getVregSrcForVT<data_vt>.ret)),
+ (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, data_vt_RC)),
!if(!eq(vt, i32), sub0, sub0_sub1)),
OffsetResDag)
>;
defvar Addr64ResDag = (!cast<MUBUF_Pseudo>(Inst # "_ADDR64" # InstSuffix)
- getVregSrcForVT<data_vt>.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
+ data_vt_RC:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset);
def : GCNPat<
(vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
data_vt:$vdata_in)),
!if(!eq(RtnMode, "ret"),
- (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, getVregSrcForVT<data_vt>.ret)),
+ (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, data_vt_RC)),
!if(!eq(vt, i32), sub0, sub0_sub1)),
Addr64ResDag)
>;
@@ -1628,12 +1638,12 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
defvar CachePolicy = !if(!eq(RtnMode, "ret"),
- (set_glc $cachepolicy), (timm:$cachepolicy));
+ (extract_cpol_set_glc $auxiliary), (extract_cpol $auxiliary));
let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in {
def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
- timm:$offset, timm:$cachepolicy, 0)),
+ timm:$offset, timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFSET" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
timm:$offset, CachePolicy)
@@ -1641,7 +1651,7 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, (BUFSOffset i32:$soffset),
- timm:$offset, timm:$cachepolicy, timm)),
+ timm:$offset, timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(Inst # "_IDXEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc,
SCSrc_b32:$soffset, timm:$offset, CachePolicy)
@@ -1649,7 +1659,7 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
- (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, 0)),
+ (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(Inst # "_OFFEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc,
SCSrc_b32:$soffset, timm:$offset, CachePolicy)
@@ -1657,7 +1667,7 @@ multiclass SIBufferAtomicPat_Common<string OpPrefix, ValueType vt, string Inst,
def : GCNPat<
(vt (Op vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
- (BUFSOffset i32:$soffset), timm:$offset, timm:$cachepolicy, timm)),
+ (BUFSOffset i32:$soffset), timm:$offset, timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(Inst # "_BOTHEN" # InstSuffix)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
@@ -1703,9 +1713,17 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">;
-let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
+let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
+let SubtargetPredicate = isGFX12Plus in {
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>;
+
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["noret"]>;
+}
+
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmin", f32, "BUFFER_ATOMIC_FMIN">;
defm : SIBufferAtomicPat<"SIbuffer_atomic_fmax", f32, "BUFFER_ATOMIC_FMAX">;
@@ -1726,35 +1744,35 @@ multiclass BufferAtomicPatterns_NO_RTN_Common<SDPatternOperator name, ValueType
def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
0, (BUFSOffset i32:$soffset), timm:$offset,
- timm:$cachepolicy, 0),
+ timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
- timm:$offset, timm:$cachepolicy)
+ timm:$offset, (extract_cpol $auxiliary))
>;
def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
0, (BUFSOffset i32:$soffset), timm:$offset,
- timm:$cachepolicy, timm),
+ timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
- timm:$offset, timm:$cachepolicy)
+ timm:$offset, (extract_cpol $auxiliary))
>;
def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, 0,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
- timm:$cachepolicy, 0),
+ timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) getVregSrcForVT<vt>.ret:$vdata_in, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
- timm:$offset, timm:$cachepolicy)
+ timm:$offset, (extract_cpol $auxiliary))
>;
def : GCNPat<
(NoUseBufferAtomic<name, vt> vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
- timm:$cachepolicy, timm),
+ timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
getVregSrcForVT<vt>.ret:$vdata_in,
(REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
- SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, timm:$cachepolicy)
+ SReg_128:$rsrc, SCSrc_b32:$soffset, timm:$offset, (extract_cpol $auxiliary))
>;
}
@@ -1770,14 +1788,22 @@ let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["noret"]>;
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
+ let SubtargetPredicate = isGFX9Only in
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["noret"]>;
+
+ let SubtargetPredicate = isGFX12Plus in
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["noret"]>;
} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts]
let OtherPredicates = [HasAtomicFaddRtnInsts] in
defm : SIBufferAtomicPat<"SIbuffer_atomic_fadd", f32, "BUFFER_ATOMIC_ADD_F32", ["ret"]>;
let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
+ let SubtargetPredicate = isGFX9Only in
defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16", ["ret"]>;
+
+ let SubtargetPredicate = isGFX12Plus in
+ defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2f16, "BUFFER_ATOMIC_PK_ADD_F16_VBUFFER", ["ret"]>;
} // End OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts]
let OtherPredicates = [isGFX90APlus] in {
@@ -1791,10 +1817,11 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
defvar Op = !cast<SDPatternOperator>(SIbuffer_atomic_cmpswap
# !if(!eq(RtnMode, "ret"), "", "_noret"));
defvar InstSuffix = !if(!eq(RtnMode, "ret"), "_RTN", "");
- defvar CachePolicy = !if(!eq(RtnMode, "ret"), (set_glc $cachepolicy),
- (timm:$cachepolicy));
+ defvar CachePolicy = !if(!eq(RtnMode, "ret"),
+ (extract_cpol_set_glc $auxiliary),
+ (extract_cpol $auxiliary));
defvar SrcRC = getVregSrcForVT<vt>.ret;
- defvar DataRC = getVregSrcForVT<data_vt>.ret;
+ defvar DataRC = getVregSrcForVT<data_vt>.ret.RegClass;
defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1);
defvar SubHi = !if(!eq(vt, i32), sub1, sub2_sub3);
@@ -1804,7 +1831,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
def : GCNPat<
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, 0, 0, (BUFSOffset i32:$soffset),
- timm:$offset, timm:$cachepolicy, 0)),
+ timm:$offset, timm:$auxiliary, 0)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG OffsetResDag, SubLo),
OffsetResDag)
@@ -1818,7 +1845,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex,
0, (BUFSOffset i32:$soffset), timm:$offset,
- timm:$cachepolicy, timm)),
+ timm:$auxiliary, timm)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG IdxenResDag, SubLo),
IdxenResDag)
@@ -1832,7 +1859,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, 0,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
- timm:$cachepolicy, 0)),
+ timm:$auxiliary, 0)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG OffenResDag, SubLo),
OffenResDag)
@@ -1846,7 +1873,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common<ValueType vt, ValueType data_vt, stri
(vt (Op
vt:$data, vt:$cmp, v4i32:$rsrc, i32:$vindex,
i32:$voffset, (BUFSOffset i32:$soffset), timm:$offset,
- timm:$cachepolicy, timm)),
+ timm:$auxiliary, timm)),
!if(!eq(RtnMode, "ret"),
(EXTRACT_SUBREG BothenResDag, SubLo),
BothenResDag)
@@ -2608,6 +2635,7 @@ defm BUFFER_ATOMIC_AND_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x049,
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x034, "buffer_atomic_cmpswap_b32">;
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x042, "buffer_atomic_cmpswap_b64">;
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomic_gfx11_Renamed<0x050, "buffer_atomic_cmpswap_f32">;
+defm BUFFER_ATOMIC_COND_SUB_U32 : MUBUF_Real_Atomic_gfx12<0x050>;
defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomic_gfx11_gfx12_Renamed_gfx12_Renamed<0x037, "buffer_atomic_sub_clamp_u32", "buffer_atomic_csub_u32">;
def : Mnem_gfx11_gfx12<"buffer_atomic_csub", "buffer_atomic_csub_u32">;
defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x040, "buffer_atomic_dec_u32">;
@@ -2632,6 +2660,8 @@ defm BUFFER_ATOMIC_SWAP : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x033,
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x041, "buffer_atomic_swap_b64">;
defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x03E, "buffer_atomic_xor_b32">;
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomic_gfx11_gfx12_Renamed<0x04B, "buffer_atomic_xor_b64">;
+defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Real_Atomic_gfx12<0x059>;
+defm BUFFER_ATOMIC_PK_ADD_BF16 : MUBUF_Real_Atomic_gfx12<0x05a>;
//===----------------------------------------------------------------------===//
// MUBUF - GFX10.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
index 3cccd8c50e66..d09e1ef3bcb2 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -437,6 +437,12 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag,
let has_gds = 0;
}
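+// Match a DS atomic of the form (frag (ptr + offset), value) and select the
+// corresponding DS instruction.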
+class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
+ bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
+ (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
+ let AddedComplexity = complexity;
+}
+
defm DS_ADD_U32 : DS_1A1D_NORET_mc<"ds_add_u32">;
defm DS_SUB_U32 : DS_1A1D_NORET_mc<"ds_sub_u32">;
defm DS_RSUB_U32 : DS_1A1D_NORET_mc<"ds_rsub_u32">;
@@ -486,10 +492,10 @@ let SubtargetPredicate = isGFX90APlus in {
} // End SubtargetPredicate = isGFX90APlus
let SubtargetPredicate = HasAtomicDsPkAdd16Insts in {
- defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_f16">;
- defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">;
- defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc_gfx9<"ds_pk_add_bf16">;
- defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc_gfx9<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">;
+ defm DS_PK_ADD_F16 : DS_1A1D_NORET_mc<"ds_pk_add_f16">;
+ defm DS_PK_ADD_RTN_F16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_f16", VGPR_32, "ds_pk_add_f16">;
+ defm DS_PK_ADD_BF16 : DS_1A1D_NORET_mc<"ds_pk_add_bf16">;
+ defm DS_PK_ADD_RTN_BF16 : DS_1A1D_RET_mc<"ds_pk_add_rtn_bf16", VGPR_32, "ds_pk_add_bf16">;
} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
defm DS_CMPSTORE_B32 : DS_1A2D_NORET_mc<"ds_cmpstore_b32">;
@@ -732,9 +738,22 @@ def DS_BVH_STACK_RTN_B32 : DS_BVH_STACK<"ds_bvh_stack_rtn_b32">;
let SubtargetPredicate = isGFX12Plus in {
+defm DS_COND_SUB_U32 : DS_1A1D_NORET_mc<"ds_cond_sub_u32">;
+defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32, "ds_cond_sub_u32">;
defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32, "ds_sub_clamp_u32">;
+multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+ ValueType vt, string frag> {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_local_addrspace")>;
+
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>;
+}
+
+defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
} // let SubtargetPredicate = isGFX12Plus
//===----------------------------------------------------------------------===//
@@ -954,12 +973,6 @@ defm : DSWritePat_mc <DS_WRITE_B128, vt, "store_align_less_than_4_local">;
} // End AddedComplexity = 100
-class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, int complexity = 0,
- bit gds=0> : GCNPat <(frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))> {
- let AddedComplexity = complexity;
-}
-
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
@@ -1237,8 +1250,14 @@ defm DS_MIN_NUM_F64 : DS_Real_Renamed_gfx12<0x052, DS_MIN_F64, "ds_min_num
defm DS_MAX_NUM_F64 : DS_Real_Renamed_gfx12<0x053, DS_MAX_F64, "ds_max_num_f64">;
defm DS_MIN_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x072, DS_MIN_RTN_F64, "ds_min_num_rtn_f64">;
defm DS_MAX_NUM_RTN_F64 : DS_Real_Renamed_gfx12<0x073, DS_MAX_RTN_F64, "ds_max_num_rtn_f64">;
+defm DS_COND_SUB_U32 : DS_Real_gfx12<0x098>;
defm DS_SUB_CLAMP_U32 : DS_Real_gfx12<0x099>;
+defm DS_COND_SUB_RTN_U32 : DS_Real_gfx12<0x0a8>;
defm DS_SUB_CLAMP_RTN_U32 : DS_Real_gfx12<0x0a9>;
+defm DS_PK_ADD_F16 : DS_Real_gfx12<0x09a>;
+defm DS_PK_ADD_RTN_F16 : DS_Real_gfx12<0x0aa>;
+defm DS_PK_ADD_BF16 : DS_Real_gfx12<0x09b>;
+defm DS_PK_ADD_RTN_BF16 : DS_Real_gfx12<0x0ab>;
//===----------------------------------------------------------------------===//
// GFX11.
@@ -1248,7 +1267,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in {
multiclass DS_Real_gfx11<bits<8> op> {
def _gfx11 :
Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, !cast<DS_Pseudo>(NAME),
- SIEncodingFamily.GFX11>;
+ SIEncodingFamily.GFX11>;
}
multiclass DS_Real_Renamed_gfx11<bits<8> op, DS_Pseudo backing_pseudo, string real_name> {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 9dff3f6c2efd..86096b0d80b4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -544,6 +544,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX1296, MI, DecW, Address, CS);
if (Res)
break;
+
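+      // GFX12 encodings that differ between wave sizes are kept in a separate
+      // wave64 decoder table, so try that 96-bit table as well.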
+ Res = tryDecodeInst(DecoderTableGFX12W6496, MI, DecW, Address, CS);
+ if (Res)
+ break;
}
// Reinitialize Bytes
Bytes = Bytes_.slice(0, MaxInstBytesNum);
@@ -2180,7 +2184,8 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective(
KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32);
}
- if (AMDGPU::getAmdhsaCodeObjectVersion() >= AMDGPU::AMDHSA_COV5)
+ // FIXME: We should be looking at the ELF header ABI version for this.
+ if (AMDGPU::getDefaultAMDHSACodeObjectVersion() >= AMDGPU::AMDHSA_COV5)
PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack",
KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 16a8b770e057..cb830b128df8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -535,7 +535,6 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN<
ValueType vt,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret,
RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
@@ -544,7 +543,7 @@ multiclass FLAT_Atomic_Pseudo_NO_RTN<
GlobalSaddrTable<0, opName>,
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
- let FPAtomic = isFP;
+ let FPAtomic = data_vt.isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
}
@@ -555,7 +554,6 @@ multiclass FLAT_Atomic_Pseudo_RTN<
ValueType vt,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret,
RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs getLdStRegisterOperand<vdst_rc>.ret:$vdst),
@@ -563,7 +561,7 @@ multiclass FLAT_Atomic_Pseudo_RTN<
" $vdst, $vaddr, $vdata$offset$cpol">,
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1> {
- let FPAtomic = isFP;
+ let FPAtomic = data_vt.isFP;
let AddedComplexity = -1; // Prefer global atomics if available
}
}
@@ -574,10 +572,9 @@ multiclass FLAT_Atomic_Pseudo<
ValueType vt,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret,
RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
- defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc, isFP, data_op>;
- defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc, isFP, data_op>;
+ defm "" : FLAT_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, data_vt, data_rc, data_op>;
+ defm "" : FLAT_Atomic_Pseudo_RTN<opName, vdst_rc, vt, data_vt, data_rc, data_op>;
}
multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
@@ -586,7 +583,6 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
ValueType vt,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret,
RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
@@ -597,7 +593,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
AtomicNoRet <opName, 0> {
let has_saddr = 1;
let PseudoInstr = NAME;
- let FPAtomic = isFP;
+ let FPAtomic = data_vt.isFP;
}
def _SADDR : FLAT_AtomicNoRet_Pseudo <opName,
@@ -609,7 +605,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
let has_saddr = 1;
let enabled_saddr = 1;
let PseudoInstr = NAME#"_SADDR";
- let FPAtomic = isFP;
+ let FPAtomic = data_vt.isFP;
}
}
@@ -619,7 +615,6 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
ValueType vt,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = isFloatType<data_vt>.ret,
RegisterOperand data_op = getLdStRegisterOperand<data_rc>.ret,
RegisterOperand vdst_op = getLdStRegisterOperand<vdst_rc>.ret> {
@@ -630,7 +625,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1> {
let has_saddr = 1;
- let FPAtomic = isFP;
+ let FPAtomic = data_vt.isFP;
}
def _SADDR_RTN : FLAT_AtomicRet_Pseudo <opName,
@@ -642,7 +637,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
let has_saddr = 1;
let enabled_saddr = 1;
let PseudoInstr = NAME#"_SADDR_RTN";
- let FPAtomic = isFP;
+ let FPAtomic = data_vt.isFP;
}
}
@@ -823,6 +818,7 @@ let SubtargetPredicate = HasFlatAtomicFaddF32Inst in {
let SubtargetPredicate = isGFX12Plus in {
defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPR_32, i32>;
+ defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_cond_sub_u32", VGPR_32, i32>;
} // End SubtargetPredicate = isGFX12Plus
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
@@ -949,6 +945,7 @@ defm GLOBAL_LOAD_LDS_SSHORT : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_ssho
defm GLOBAL_LOAD_LDS_DWORD : FLAT_Global_Load_LDS_Pseudo <"global_load_lds_dword">;
let SubtargetPredicate = isGFX12Plus in {
+ defm GLOBAL_ATOMIC_COND_SUB_U32 : FLAT_Global_Atomic_Pseudo <"global_atomic_cond_sub_u32", VGPR_32, i32>;
defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : FLAT_Global_Atomic_Pseudo <"global_atomic_ordered_add_b64", VReg_64, i64>;
def GLOBAL_INV : FLAT_Global_Invalidate_Writeback<"global_inv">;
@@ -995,6 +992,17 @@ defm SCRATCH_LOAD_LDS_DWORD : FLAT_Scratch_Load_LDS_Pseudo <"scratch_load_lds_d
} // End SubtargetPredicate = HasFlatScratchInsts
+let SubtargetPredicate = isGFX12Plus in {
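+  // The wave64 variants use half-width register classes because each lane
+  // holds half as many elements of the result as in wave32.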
+ let WaveSizePredicate = isWave32 in {
+ defm GLOBAL_LOAD_TR_B128_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w32", VReg_128>;
+ defm GLOBAL_LOAD_TR_B64_w32 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w32", VReg_64>;
+ }
+ let WaveSizePredicate = isWave64 in {
+ defm GLOBAL_LOAD_TR_B128_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b128_w64", VReg_64>;
+ defm GLOBAL_LOAD_TR_B64_w64 : FLAT_Global_Load_Pseudo <"global_load_tr_b64_w64", VGPR_32>;
+ }
+} // End SubtargetPredicate = isGFX12Plus
+
let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
defm GLOBAL_ATOMIC_FCMPSWAP :
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap", VGPR_32, f32, v2f32, VReg_64>;
@@ -1100,23 +1108,43 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
(inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
-multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
- ValueType data_vt = vt, bit isIntr = 0> {
- defvar noRtnNode = !cast<PatFrags>(node # "_noret" # !if(isIntr, "", "_"#vt.Size));
+multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+
+ defvar noRtnNode = !cast<PatFrags>(node);
let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
-multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
- ValueType data_vt = vt, bit isIntr = 0> {
- defvar rtnNode = !cast<SDPatternOperator>(node # !if(isIntr, "", "_"#vt.Size));
+multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix,
+ ValueType vt> :
+ FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>;
+
+multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> :
+ FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt.Size), vt, data_vt>;
+
+
+multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt,
+ ValueType data_vt = vt> {
+
+ defvar rtnNode = !cast<SDPatternOperator>(node);
def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
+multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix,
+ ValueType vt> :
+ FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>;
+
+multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
+ ValueType data_vt = vt, bit isIntr = 0> :
+ FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt.Size), vt, data_vt>;
+
+
multiclass FlatAtomicPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
FlatAtomicRtnPat<inst, node, vt, data_vt, isIntr>,
@@ -1296,6 +1324,13 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_"#as, i64
defm : FlatAtomicPat <"FLAT_ATOMIC_XOR_X2", "atomic_load_xor_"#as, i64>;
} // end foreach as
+let SubtargetPredicate = isGFX12Plus in {
+ defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >;
+
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
+}
+
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
@@ -1557,8 +1592,28 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP_X2", "atomic_swap_global", i64>
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_global", i64, v2i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
+let SubtargetPredicate = isGFX12Plus in {
+ defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+}
+
let OtherPredicates = [isGFX12Plus] in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
+
+ let WaveSizePredicate = isWave32 in {
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8bf16>;
+ }
+ let WaveSizePredicate = isWave64 in {
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4bf16>;
+ }
}
let OtherPredicates = [isGFX10Plus] in {
@@ -2523,7 +2578,8 @@ multiclass VFLAT_Aliases_gfx12<string ps, string opName, int renamed, string ali
def _alias_gfx12 : MnemonicAlias<alias, opName>, Requires<[isGFX12Plus]>;
}
-multiclass VFLAT_Real_Base_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> :
+multiclass VFLAT_Real_Base_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
+ int renamed = false, string alias = ""> :
VFLAT_Aliases_gfx12<ps, opName, renamed, alias> {
def _gfx12 : VFLAT_Real_gfx12<op, !cast<FLAT_Pseudo>(ps), opName> {
let Inst{6-0} = !cast<int>(SGPR_NULL_gfx11plus.HWEncoding);
@@ -2557,20 +2613,24 @@ multiclass VFLAT_Real_SVS_gfx12<bits<8> op, string ps, string opName> {
}
}
-multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> :
+multiclass VFLAT_Real_Atomics_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
+ int renamed = false, string alias = ""> :
VFLAT_Real_Base_gfx12<op, ps, opName, renamed, alias>,
VFLAT_Real_RTN_gfx12<op, ps, opName>;
-multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> :
+multiclass VGLOBAL_Real_AllAddr_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
+ int renamed = false, string alias = ""> :
VFLAT_Real_Base_gfx12<op, ps, opName, renamed, alias>,
VFLAT_Real_SADDR_gfx12<op, ps, opName>;
-multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, string ps, string opName, int renamed = false, string alias = ""> :
+multiclass VGLOBAL_Real_Atomics_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
+ int renamed = false, string alias = ""> :
VGLOBAL_Real_AllAddr_gfx12<op, ps, opName, renamed, alias>,
VFLAT_Real_RTN_gfx12<op, ps, opName>,
VFLAT_Real_SADDR_RTN_gfx12<op, ps, opName>;
-multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, string ps, string opName, int renamed = false> :
+multiclass VSCRATCH_Real_AllAddr_gfx12<bits<8> op, string ps = NAME, string opName = !tolower(NAME),
+ int renamed = false> :
VFLAT_Real_Base_gfx12<op, ps, opName, renamed>,
VFLAT_Real_SADDR_gfx12<op, ps, opName>,
VFLAT_Real_ST_gfx12<op, ps, opName>,
@@ -2591,14 +2651,14 @@ defm FLAT_STORE_B32 : VFLAT_Real_Base_gfx12<0x01a, "FLAT_STORE_DW
defm FLAT_STORE_B64 : VFLAT_Real_Base_gfx12<0x01b, "FLAT_STORE_DWORDX2", "flat_store_b64", true>;
defm FLAT_STORE_B96 : VFLAT_Real_Base_gfx12<0x01c, "FLAT_STORE_DWORDX3", "flat_store_b96", true>;
defm FLAT_STORE_B128 : VFLAT_Real_Base_gfx12<0x01d, "FLAT_STORE_DWORDX4", "flat_store_b128", true>;
-defm FLAT_LOAD_D16_U8 : VFLAT_Real_Base_gfx12<0x01e, "FLAT_LOAD_UBYTE_D16", "flat_load_d16_u8">;
-defm FLAT_LOAD_D16_I8 : VFLAT_Real_Base_gfx12<0x01f, "FLAT_LOAD_SBYTE_D16", "flat_load_d16_i8">;
-defm FLAT_LOAD_D16_B16 : VFLAT_Real_Base_gfx12<0x020, "FLAT_LOAD_SHORT_D16", "flat_load_d16_b16">;
-defm FLAT_LOAD_D16_HI_U8 : VFLAT_Real_Base_gfx12<0x021, "FLAT_LOAD_UBYTE_D16_HI", "flat_load_d16_hi_u8">;
-defm FLAT_LOAD_D16_HI_I8 : VFLAT_Real_Base_gfx12<0x022, "FLAT_LOAD_SBYTE_D16_HI", "flat_load_d16_hi_i8">;
-defm FLAT_LOAD_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x023, "FLAT_LOAD_SHORT_D16_HI", "flat_load_d16_hi_b16">;
-defm FLAT_STORE_D16_HI_B8 : VFLAT_Real_Base_gfx12<0x024, "FLAT_STORE_BYTE_D16_HI", "flat_store_d16_hi_b8">;
-defm FLAT_STORE_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x025, "FLAT_STORE_SHORT_D16_HI", "flat_store_d16_hi_b16">;
+defm FLAT_LOAD_D16_U8 : VFLAT_Real_Base_gfx12<0x01e, "FLAT_LOAD_UBYTE_D16">;
+defm FLAT_LOAD_D16_I8 : VFLAT_Real_Base_gfx12<0x01f, "FLAT_LOAD_SBYTE_D16">;
+defm FLAT_LOAD_D16_B16 : VFLAT_Real_Base_gfx12<0x020, "FLAT_LOAD_SHORT_D16">;
+defm FLAT_LOAD_D16_HI_U8 : VFLAT_Real_Base_gfx12<0x021, "FLAT_LOAD_UBYTE_D16_HI">;
+defm FLAT_LOAD_D16_HI_I8 : VFLAT_Real_Base_gfx12<0x022, "FLAT_LOAD_SBYTE_D16_HI">;
+defm FLAT_LOAD_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x023, "FLAT_LOAD_SHORT_D16_HI">;
+defm FLAT_STORE_D16_HI_B8 : VFLAT_Real_Base_gfx12<0x024, "FLAT_STORE_BYTE_D16_HI">;
+defm FLAT_STORE_D16_HI_B16 : VFLAT_Real_Base_gfx12<0x025, "FLAT_STORE_SHORT_D16_HI">;
defm FLAT_ATOMIC_SWAP_B32 : VFLAT_Real_Atomics_gfx12<0x033, "FLAT_ATOMIC_SWAP", "flat_atomic_swap_b32", true>;
defm FLAT_ATOMIC_CMPSWAP_B32 : VFLAT_Real_Atomics_gfx12<0x034, "FLAT_ATOMIC_CMPSWAP", "flat_atomic_cmpswap_b32", true>;
defm FLAT_ATOMIC_ADD_U32 : VFLAT_Real_Atomics_gfx12<0x035, "FLAT_ATOMIC_ADD", "flat_atomic_add_u32", true>;
@@ -2626,9 +2686,12 @@ defm FLAT_ATOMIC_OR_B64 : VFLAT_Real_Atomics_gfx12<0x04a, "FLAT_ATOMI
defm FLAT_ATOMIC_XOR_B64 : VFLAT_Real_Atomics_gfx12<0x04b, "FLAT_ATOMIC_XOR_X2", "flat_atomic_xor_b64", true>;
defm FLAT_ATOMIC_INC_U64 : VFLAT_Real_Atomics_gfx12<0x04c, "FLAT_ATOMIC_INC_X2", "flat_atomic_inc_u64", true>;
defm FLAT_ATOMIC_DEC_U64 : VFLAT_Real_Atomics_gfx12<0x04d, "FLAT_ATOMIC_DEC_X2", "flat_atomic_dec_u64", true>;
+defm FLAT_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050, "FLAT_ATOMIC_COND_SUB_U32", "flat_atomic_cond_sub_u32">;
defm FLAT_ATOMIC_MIN_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_num_f32", true, "flat_atomic_min_f32">;
defm FLAT_ATOMIC_MAX_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_num_f32", true, "flat_atomic_max_f32">;
-defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056, "FLAT_ATOMIC_ADD_F32", "flat_atomic_add_f32">;
+defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>;
+defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>;
+defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>;
// ENC_VGLOBAL.
defm GLOBAL_LOAD_U8 : VGLOBAL_Real_AllAddr_gfx12<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>;
@@ -2645,16 +2708,16 @@ defm GLOBAL_STORE_B32 : VGLOBAL_Real_AllAddr_gfx12<0x01a, "GLOBAL_S
defm GLOBAL_STORE_B64 : VGLOBAL_Real_AllAddr_gfx12<0x01b, "GLOBAL_STORE_DWORDX2", "global_store_b64", true>;
defm GLOBAL_STORE_B96 : VGLOBAL_Real_AllAddr_gfx12<0x01c, "GLOBAL_STORE_DWORDX3", "global_store_b96", true>;
defm GLOBAL_STORE_B128 : VGLOBAL_Real_AllAddr_gfx12<0x01d, "GLOBAL_STORE_DWORDX4", "global_store_b128", true>;
-defm GLOBAL_LOAD_D16_U8 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "GLOBAL_LOAD_UBYTE_D16", "global_load_d16_u8">;
-defm GLOBAL_LOAD_D16_I8 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "GLOBAL_LOAD_SBYTE_D16", "global_load_d16_i8">;
-defm GLOBAL_LOAD_D16_B16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "GLOBAL_LOAD_SHORT_D16", "global_load_d16_b16">;
-defm GLOBAL_LOAD_D16_HI_U8 : VGLOBAL_Real_AllAddr_gfx12<0x021, "GLOBAL_LOAD_UBYTE_D16_HI", "global_load_d16_hi_u8">;
-defm GLOBAL_LOAD_D16_HI_I8 : VGLOBAL_Real_AllAddr_gfx12<0x022, "GLOBAL_LOAD_SBYTE_D16_HI", "global_load_d16_hi_i8">;
-defm GLOBAL_LOAD_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x023, "GLOBAL_LOAD_SHORT_D16_HI", "global_load_d16_hi_b16">;
-defm GLOBAL_STORE_D16_HI_B8 : VGLOBAL_Real_AllAddr_gfx12<0x024, "GLOBAL_STORE_BYTE_D16_HI", "global_store_d16_hi_b8">;
-defm GLOBAL_STORE_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x025, "GLOBAL_STORE_SHORT_D16_HI", "global_store_d16_hi_b16">;
-defm GLOBAL_LOAD_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x028, "GLOBAL_LOAD_DWORD_ADDTID", "global_load_addtid_b32">;
-defm GLOBAL_STORE_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x029, "GLOBAL_STORE_DWORD_ADDTID", "global_store_addtid_b32">;
+defm GLOBAL_LOAD_D16_U8 : VGLOBAL_Real_AllAddr_gfx12<0x01e, "GLOBAL_LOAD_UBYTE_D16">;
+defm GLOBAL_LOAD_D16_I8 : VGLOBAL_Real_AllAddr_gfx12<0x01f, "GLOBAL_LOAD_SBYTE_D16">;
+defm GLOBAL_LOAD_D16_B16 : VGLOBAL_Real_AllAddr_gfx12<0x020, "GLOBAL_LOAD_SHORT_D16">;
+defm GLOBAL_LOAD_D16_HI_U8 : VGLOBAL_Real_AllAddr_gfx12<0x021, "GLOBAL_LOAD_UBYTE_D16_HI">;
+defm GLOBAL_LOAD_D16_HI_I8 : VGLOBAL_Real_AllAddr_gfx12<0x022, "GLOBAL_LOAD_SBYTE_D16_HI">;
+defm GLOBAL_LOAD_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x023, "GLOBAL_LOAD_SHORT_D16_HI">;
+defm GLOBAL_STORE_D16_HI_B8 : VGLOBAL_Real_AllAddr_gfx12<0x024, "GLOBAL_STORE_BYTE_D16_HI">;
+defm GLOBAL_STORE_D16_HI_B16 : VGLOBAL_Real_AllAddr_gfx12<0x025, "GLOBAL_STORE_SHORT_D16_HI">;
+defm GLOBAL_LOAD_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x028, "GLOBAL_LOAD_DWORD_ADDTID">;
+defm GLOBAL_STORE_ADDTID_B32 : VGLOBAL_Real_AllAddr_gfx12<0x029, "GLOBAL_STORE_DWORD_ADDTID">;
defm GLOBAL_ATOMIC_SWAP_B32 : VGLOBAL_Real_Atomics_gfx12<0x033, "GLOBAL_ATOMIC_SWAP", "global_atomic_swap_b32", true>;
defm GLOBAL_ATOMIC_CMPSWAP_B32 : VGLOBAL_Real_Atomics_gfx12<0x034, "GLOBAL_ATOMIC_CMPSWAP", "global_atomic_cmpswap_b32", true>;
@@ -2683,14 +2746,28 @@ defm GLOBAL_ATOMIC_OR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04a, "GLOBAL_A
defm GLOBAL_ATOMIC_XOR_B64 : VGLOBAL_Real_Atomics_gfx12<0x04b, "GLOBAL_ATOMIC_XOR_X2", "global_atomic_xor_b64", true>;
defm GLOBAL_ATOMIC_INC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04c, "GLOBAL_ATOMIC_INC_X2", "global_atomic_inc_u64", true>;
defm GLOBAL_ATOMIC_DEC_U64 : VGLOBAL_Real_Atomics_gfx12<0x04d, "GLOBAL_ATOMIC_DEC_X2", "global_atomic_dec_u64", true>;
+defm GLOBAL_ATOMIC_COND_SUB_U32 : VGLOBAL_Real_Atomics_gfx12<0x050, "GLOBAL_ATOMIC_COND_SUB_U32", "global_atomic_cond_sub_u32">;
defm GLOBAL_ATOMIC_MIN_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x051, "GLOBAL_ATOMIC_FMIN", "global_atomic_min_num_f32", true, "global_atomic_min_f32">;
defm GLOBAL_ATOMIC_MAX_NUM_F32 : VGLOBAL_Real_Atomics_gfx12<0x052, "GLOBAL_ATOMIC_FMAX", "global_atomic_max_num_f32", true, "global_atomic_max_f32">;
-defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056, "GLOBAL_ATOMIC_ADD_F32", "global_atomic_add_f32">;
-defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073, "GLOBAL_ATOMIC_ORDERED_ADD_B64", "global_atomic_ordered_add_b64">;
+defm GLOBAL_ATOMIC_ADD_F32 : VGLOBAL_Real_Atomics_gfx12<0x056>;
+
+let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
+ defm GLOBAL_LOAD_TR_B128_w32 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w32", "global_load_tr_b128">;
+ defm GLOBAL_LOAD_TR_B64_w32 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w32", "global_load_tr_b64">;
+}
+
+let WaveSizePredicate = isWave64, DecoderNamespace = "GFX12W64" in {
+ defm GLOBAL_LOAD_TR_B128_w64 : VGLOBAL_Real_AllAddr_gfx12<0x057, "GLOBAL_LOAD_TR_B128_w64", "global_load_tr_b128">;
+ defm GLOBAL_LOAD_TR_B64_w64 : VGLOBAL_Real_AllAddr_gfx12<0x058, "GLOBAL_LOAD_TR_B64_w64", "global_load_tr_b64">;
+}
+
+defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>;
+defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>;
+defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>;
-defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b, "GLOBAL_INV", "global_inv">;
-defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c, "GLOBAL_WB", "global_wb">;
-defm GLOBAL_WBINV : VFLAT_Real_Base_gfx12<0x04f, "GLOBAL_WBINV", "global_wbinv">;
+defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>;
+defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>;
+defm GLOBAL_WBINV : VFLAT_Real_Base_gfx12<0x04f>;
// ENC_VSCRATCH.
defm SCRATCH_LOAD_U8 : VSCRATCH_Real_AllAddr_gfx12<0x10, "SCRATCH_LOAD_UBYTE", "scratch_load_u8", true>;
@@ -2707,11 +2784,11 @@ defm SCRATCH_STORE_B32 : VSCRATCH_Real_AllAddr_gfx12<0x1a, "SCRATCH_
defm SCRATCH_STORE_B64 : VSCRATCH_Real_AllAddr_gfx12<0x1b, "SCRATCH_STORE_DWORDX2", "scratch_store_b64", true>;
defm SCRATCH_STORE_B96 : VSCRATCH_Real_AllAddr_gfx12<0x1c, "SCRATCH_STORE_DWORDX3", "scratch_store_b96", true>;
defm SCRATCH_STORE_B128 : VSCRATCH_Real_AllAddr_gfx12<0x1d, "SCRATCH_STORE_DWORDX4", "scratch_store_b128", true>;
-defm SCRATCH_LOAD_D16_U8 : VSCRATCH_Real_AllAddr_gfx12<0x1e, "SCRATCH_LOAD_UBYTE_D16", "scratch_load_d16_u8">;
-defm SCRATCH_LOAD_D16_I8 : VSCRATCH_Real_AllAddr_gfx12<0x1f, "SCRATCH_LOAD_SBYTE_D16", "scratch_load_d16_i8">;
-defm SCRATCH_LOAD_D16_B16 : VSCRATCH_Real_AllAddr_gfx12<0x20, "SCRATCH_LOAD_SHORT_D16", "scratch_load_d16_b16">;
-defm SCRATCH_LOAD_D16_HI_U8 : VSCRATCH_Real_AllAddr_gfx12<0x21, "SCRATCH_LOAD_UBYTE_D16_HI", "scratch_load_d16_hi_u8">;
-defm SCRATCH_LOAD_D16_HI_I8 : VSCRATCH_Real_AllAddr_gfx12<0x22, "SCRATCH_LOAD_SBYTE_D16_HI", "scratch_load_d16_hi_i8">;
-defm SCRATCH_LOAD_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x23, "SCRATCH_LOAD_SHORT_D16_HI", "scratch_load_d16_hi_b16">;
-defm SCRATCH_STORE_D16_HI_B8 : VSCRATCH_Real_AllAddr_gfx12<0x24, "SCRATCH_STORE_BYTE_D16_HI", "scratch_store_d16_hi_b8">;
-defm SCRATCH_STORE_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x25, "SCRATCH_STORE_SHORT_D16_HI", "scratch_store_d16_hi_b16">;
+defm SCRATCH_LOAD_D16_U8 : VSCRATCH_Real_AllAddr_gfx12<0x1e, "SCRATCH_LOAD_UBYTE_D16">;
+defm SCRATCH_LOAD_D16_I8 : VSCRATCH_Real_AllAddr_gfx12<0x1f, "SCRATCH_LOAD_SBYTE_D16">;
+defm SCRATCH_LOAD_D16_B16 : VSCRATCH_Real_AllAddr_gfx12<0x20, "SCRATCH_LOAD_SHORT_D16">;
+defm SCRATCH_LOAD_D16_HI_U8 : VSCRATCH_Real_AllAddr_gfx12<0x21, "SCRATCH_LOAD_UBYTE_D16_HI">;
+defm SCRATCH_LOAD_D16_HI_I8 : VSCRATCH_Real_AllAddr_gfx12<0x22, "SCRATCH_LOAD_SBYTE_D16_HI">;
+defm SCRATCH_LOAD_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x23, "SCRATCH_LOAD_SHORT_D16_HI">;
+defm SCRATCH_STORE_D16_HI_B8 : VSCRATCH_Real_AllAddr_gfx12<0x24, "SCRATCH_STORE_BYTE_D16_HI">;
+defm SCRATCH_STORE_D16_HI_B16 : VSCRATCH_Real_AllAddr_gfx12<0x25, "SCRATCH_STORE_SHORT_D16_HI">;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index a75082268c77..94d28dc0a2c7 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -274,8 +274,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
break;
}
- if (auto *Mod0 = TII->getNamedOperand(OrigMI,
- AMDGPU::OpName::src0_modifiers)) {
+ auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers);
+ if (Mod0) {
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
AMDGPU::OpName::src0_modifiers));
assert(HasVOP3DPP ||
@@ -298,8 +298,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst->getOperand(NumOperands).setIsKill(false);
++NumOperands;
- if (auto *Mod1 = TII->getNamedOperand(OrigMI,
- AMDGPU::OpName::src1_modifiers)) {
+ auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers);
+ if (Mod1) {
assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp,
AMDGPU::OpName::src1_modifiers));
assert(HasVOP3DPP ||
@@ -330,8 +330,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.add(*Src1);
++NumOperands;
}
- if (auto *Mod2 =
- TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) {
+
+ auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers);
+ if (Mod2) {
assert(NumOperands ==
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers));
assert(HasVOP3DPP ||
@@ -350,6 +351,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
DPPInst.add(*Src2);
++NumOperands;
}
+
if (HasVOP3DPP) {
auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp);
if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) {
@@ -368,7 +370,13 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
// all 1.
if (auto *OpSelOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) {
- auto OpSel = OpSelOpr->getImm();
+ int64_t OpSel = 0;
+ OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0);
+ OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0);
+ OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0);
+ if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI))
+ OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3;
+
if (OpSel != 0) {
LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n");
Fail = true;
@@ -379,7 +387,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
if (auto *OpSelHiOpr =
TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) {
- auto OpSelHi = OpSelHiOpr->getImm();
+ int64_t OpSelHi = 0;
+ OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0);
+ OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0);
+ OpSelHi |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0);
+
// Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check
// the bitmask for 3 op_sel_hi bits set
assert(Src2 && "Expected vop3p with 3 operands");
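For reference, the GCNDPPCombine hunks above rebuild the logical op_sel mask from each source's src_modifiers rather than trusting the op_sel operand itself. The stand-alone C++ sketch below (not part of the upstream change; the constants and names are stand-ins, not the LLVM API) illustrates that bit-packing:

#include <cstdint>

namespace sketch {
// Assumed stand-ins for the SISrcMods bits the hunk consults; the real
// encodings live in SIDefines.h.
constexpr int64_t OP_SEL_0 = 1 << 2;
constexpr int64_t DST_OP_SEL = 1 << 3;

// Mods[i] < 0 models "modifier operand not present" (a null named operand).
// Bit i of the result comes from source i; unpacked VOP3 also folds the
// destination op_sel bit into bit 3. The combine bails out unless this is 0.
int64_t rebuildOpSel(const int64_t Mods[3], bool IsUnpackedVOP3) {
  int64_t OpSel = 0;
  for (int I = 0; I < 3; ++I)
    if (Mods[I] >= 0 && (Mods[I] & OP_SEL_0))
      OpSel |= int64_t(1) << I;
  if (IsUnpackedVOP3 && Mods[0] >= 0 && (Mods[0] & DST_OP_SEL))
    OpSel |= int64_t(1) << 3;
  return OpSel;
}
} // namespace sketch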
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index bcd93e30d6c2..b6e4e65ff5b0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -163,6 +163,7 @@ static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII,
static bool isPermlane(const MachineInstr &MI) {
unsigned Opcode = MI.getOpcode();
return Opcode == AMDGPU::V_PERMLANE16_B32_e64 ||
+ Opcode == AMDGPU::V_PERMLANE64_B32 ||
Opcode == AMDGPU::V_PERMLANEX16_B32_e64 ||
Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 ||
Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64;
@@ -1143,6 +1144,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
if (!ST.hasVMEMtoScalarWriteHazard())
return false;
+ assert(!ST.hasExtendedWaitCounts());
if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI))
return false;
@@ -1189,6 +1191,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) {
bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
if (!ST.hasSMEMtoVectorWriteHazard())
return false;
+ assert(!ST.hasExtendedWaitCounts());
if (!SIInstrInfo::isVALU(*MI))
return false;
@@ -1242,7 +1245,8 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
case AMDGPU::S_WAITCNT: {
const int64_t Imm = MI.getOperand(0).getImm();
AMDGPU::Waitcnt Decoded = AMDGPU::decodeWaitcnt(IV, Imm);
- return (Decoded.LgkmCnt == 0);
+ // DsCnt corresponds to LGKMCnt here.
+ return (Decoded.DsCnt == 0);
}
default:
// SOPP instructions cannot mitigate the hazard.
@@ -1272,7 +1276,11 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
}
bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) {
- if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI))
+ if (!ST.hasVcmpxExecWARHazard())
+ return false;
+ assert(!ST.hasExtendedWaitCounts());
+
+ if (!SIInstrInfo::isVALU(*MI))
return false;
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -1342,6 +1350,7 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) {
return false;
assert(ST.hasLdsBranchVmemWARHazard());
+ assert(!ST.hasExtendedWaitCounts());
auto IsHazardInst = [](const MachineInstr &MI) {
if (SIInstrInfo::isDS(MI))
@@ -1451,6 +1460,8 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, &TRI);
};
bool LdsdirCanWait = ST.hasLdsWaitVMSRC();
+ // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT
+ // according to the type of VMEM instruction.
auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) {
return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) ||
(I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) ||
@@ -1476,11 +1487,11 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) {
}
bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
- if (!ST.isWave64())
- return false;
if (!ST.hasVALUPartialForwardingHazard())
return false;
- if (!SIInstrInfo::isVALU(*MI))
+ assert(!ST.hasExtendedWaitCounts());
+
+ if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI))
return false;
SmallSetVector<Register, 4> SrcVGPRs;
@@ -1627,6 +1638,8 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) {
bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) {
if (!ST.hasVALUTransUseHazard())
return false;
+ assert(!ST.hasExtendedWaitCounts());
+
if (!SIInstrInfo::isVALU(*MI))
return false;
@@ -1766,6 +1779,7 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
if (!ST.hasShift64HighRegBug())
return false;
+ assert(!ST.hasExtendedWaitCounts());
switch (MI->getOpcode()) {
default:
@@ -1895,6 +1909,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) {
if (!ST.hasFPAtomicToDenormModeHazard())
return 0;
+ assert(!ST.hasExtendedWaitCounts());
if (MI->getOpcode() != AMDGPU::S_DENORM_MODE)
return 0;
@@ -2720,11 +2735,11 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) {
}
bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) {
- if (!ST.isWave64())
- return false;
if (!ST.hasVALUMaskWriteHazard())
return false;
- if (!SIInstrInfo::isSALU(*MI))
+ assert(!ST.hasExtendedWaitCounts());
+
+ if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI))
return false;
// The hazard sequence is three instructions:
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 4c9ad9b5bcf7..272cc7fa6bc6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -237,7 +237,7 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
- if (!ST->hasNSAEncoding())
+ if (!ST->hasNSAEncoding() || !ST->hasNonNSAEncoding())
return false;
MRI = &MF.getRegInfo();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 85d062a9a6f5..8019b98b1c68 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -155,6 +155,7 @@ protected:
bool HasDot10Insts = false;
bool HasMAIInsts = false;
bool HasFP8Insts = false;
+ bool HasFP8ConversionInsts = false;
bool HasPkFmacF16Inst = false;
bool HasAtomicDsPkAdd16Insts = false;
bool HasAtomicFlatPkAdd16Insts = false;
@@ -165,6 +166,8 @@ protected:
bool HasAtomicCSubNoRtnInsts = false;
bool HasAtomicGlobalPkAddBF16Inst = false;
bool HasFlatAtomicFaddF32Inst = false;
+ bool HasDefaultComponentZero = false;
+ bool HasDefaultComponentBroadcast = false;
bool SupportsSRAMECC = false;
// This should not be used directly. 'TargetID' tracks the dynamic settings
@@ -295,12 +298,16 @@ public:
unsigned getMaxWaveScratchSize() const {
// See COMPUTE_TMPRING_SIZE.WAVESIZE.
- if (getGeneration() < GFX11) {
- // 13-bit field in units of 256-dword.
- return (256 * 4) * ((1 << 13) - 1);
+ if (getGeneration() >= GFX12) {
+ // 18-bit field in units of 64-dword.
+ return (64 * 4) * ((1 << 18) - 1);
}
- // 15-bit field in units of 64-dword.
- return (64 * 4) * ((1 << 15) - 1);
+ if (getGeneration() == GFX11) {
+ // 15-bit field in units of 64-dword.
+ return (64 * 4) * ((1 << 15) - 1);
+ }
+ // 13-bit field in units of 256-dword.
+ return (256 * 4) * ((1 << 13) - 1);
}
/// Return the number of high bits known to be zero for a frame index.
@@ -423,6 +430,8 @@ public:
return GFX9Insts;
}
+ bool hasScalarSubwordLoads() const { return getGeneration() >= GFX12; }
+
TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbi::AMDHSA : TrapHandlerAbi::NONE;
}
@@ -772,6 +781,8 @@ public:
return HasFP8Insts;
}
+ bool hasFP8ConversionInsts() const { return HasFP8ConversionInsts; }
+
bool hasPkFmacF16Inst() const {
return HasPkFmacF16Inst;
}
@@ -802,6 +813,12 @@ public:
bool hasFlatAtomicFaddF32Inst() const { return HasFlatAtomicFaddF32Inst; }
+ bool hasDefaultComponentZero() const { return HasDefaultComponentZero; }
+
+ bool hasDefaultComponentBroadcast() const {
+ return HasDefaultComponentBroadcast;
+ }
+
bool hasNoSdstCMPX() const {
return HasNoSdstCMPX;
}
@@ -838,7 +855,9 @@ public:
return getGeneration() < SEA_ISLANDS;
}
- bool hasInstPrefetch() const { return getGeneration() >= GFX10; }
+ bool hasInstPrefetch() const {
+ return getGeneration() == GFX10 || getGeneration() == GFX11;
+ }
bool hasPrefetch() const { return GFX12Insts; }
@@ -984,6 +1003,8 @@ public:
bool hasNSAEncoding() const { return HasNSAEncoding; }
+ bool hasNonNSAEncoding() const { return getGeneration() < GFX12; }
+
bool hasPartialNSAEncoding() const { return HasPartialNSAEncoding; }
unsigned getNSAMaxSize(bool HasSampler = false) const {
@@ -1131,14 +1152,14 @@ public:
bool hasLdsWaitVMSRC() const { return getGeneration() >= GFX12; }
bool hasVALUPartialForwardingHazard() const {
- return getGeneration() >= GFX11;
+ return getGeneration() == GFX11;
}
bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
- bool hasVALUMaskWriteHazard() const { return getGeneration() >= GFX11; }
+ bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; }
/// Return if operations acting on VGPR tuples require even alignment.
bool needsAlignedVGPRs() const { return GFX90AInsts; }
@@ -1177,6 +1198,10 @@ public:
bool hasRestrictedSOffset() const { return HasRestrictedSOffset; }
+ /// \returns true if the target uses LOADcnt/SAMPLEcnt/BVHcnt, DScnt/KMcnt
+ /// and STOREcnt rather than VMcnt, LGKMcnt and VScnt respectively.
+ bool hasExtendedWaitCounts() const { return getGeneration() >= GFX12; }
+
/// Return the maximum number of waves per SIMD for kernels using \p SGPRs
/// SGPRs
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -1251,6 +1276,14 @@ public:
// \returns true if the target has WG_RR_MODE kernel descriptor mode bit
bool hasRrWGMode() const { return getGeneration() >= GFX12; }
+ /// \returns true if VADDR and SADDR fields in VSCRATCH can use negative
+ /// values.
+ bool hasSignedScratchOffsets() const { return getGeneration() >= GFX12; }
+
+ // \returns true if S_GETPC_B64 zero-extends the result from 48 bits instead
+ // of sign-extending.
+ bool hasGetPCZeroExtension() const { return GFX12Insts; }
+
/// \returns SGPR allocation granularity supported by the subtarget.
unsigned getSGPRAllocGranule() const {
return AMDGPU::IsaInfo::getSGPRAllocGranule(this);
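For reference, the getMaxWaveScratchSize() hunk above now distinguishes three scratch-size encodings. The stand-alone sketch below (not upstream code; the generation enum is a simplified stand-in for GCNSubtarget::getGeneration()) spells out the resulting per-wave byte limits:

#include <cstdint>
#include <cstdio>

enum class Gen { PreGFX11, GFX11, GFX12Plus };

// Maximum encodable WAVESIZE value times its granule, mirroring the
// COMPUTE_TMPRING_SIZE.WAVESIZE comments in the hunk above.
uint64_t maxWaveScratchSize(Gen G) {
  switch (G) {
  case Gen::GFX12Plus: // 18-bit field in units of 64 dwords (256 bytes)
    return (64 * 4ull) * ((1ull << 18) - 1);
  case Gen::GFX11:     // 15-bit field in units of 64 dwords
    return (64 * 4ull) * ((1ull << 15) - 1);
  case Gen::PreGFX11:  // 13-bit field in units of 256 dwords (1024 bytes)
    return (256 * 4ull) * ((1ull << 13) - 1);
  }
  return 0;
}

int main() {
  std::printf("pre-GFX11: %llu bytes\n",
              (unsigned long long)maxWaveScratchSize(Gen::PreGFX11));
  std::printf("GFX11:     %llu bytes\n",
              (unsigned long long)maxWaveScratchSize(Gen::GFX11));
  std::printf("GFX12+:    %llu bytes\n",
              (unsigned long long)maxWaveScratchSize(Gen::GFX12Plus));
}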
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index f91f36ed851b..8eb246ef57c9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -232,13 +232,11 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
bool Is64Bit;
bool HasRelocationAddend;
uint8_t OSABI = ELF::ELFOSABI_NONE;
- uint8_t ABIVersion = 0;
public:
- ELFAMDGPUAsmBackend(const Target &T, const Triple &TT, uint8_t ABIVersion) :
- AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn),
- HasRelocationAddend(TT.getOS() == Triple::AMDHSA),
- ABIVersion(ABIVersion) {
+ ELFAMDGPUAsmBackend(const Target &T, const Triple &TT)
+ : AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn),
+ HasRelocationAddend(TT.getOS() == Triple::AMDHSA) {
switch (TT.getOS()) {
case Triple::AMDHSA:
OSABI = ELF::ELFOSABI_AMDGPU_HSA;
@@ -256,8 +254,7 @@ public:
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
- return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend,
- ABIVersion);
+ return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend);
}
};
@@ -267,6 +264,5 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
- return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(),
- getHsaAbiVersion(&STI).value_or(0));
+ return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple());
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 58eed81e0755..2d960a32339f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -18,8 +18,7 @@ namespace {
class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
public:
- AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend,
- uint8_t ABIVersion);
+ AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend);
protected:
unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
@@ -29,12 +28,10 @@ protected:
} // end anonymous namespace
-AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
- uint8_t OSABI,
- bool HasRelocationAddend,
- uint8_t ABIVersion)
- : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU,
- HasRelocationAddend, ABIVersion) {}
+AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
+ bool HasRelocationAddend)
+ : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU,
+ HasRelocationAddend) {}
unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
@@ -100,9 +97,7 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
std::unique_ptr<MCObjectTargetWriter>
llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend,
- uint8_t ABIVersion) {
+ bool HasRelocationAddend) {
return std::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
- HasRelocationAddend,
- ABIVersion);
+ HasRelocationAddend);
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 6c7977e22599..e73e53aa270f 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1260,14 +1260,19 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI,
int NumOps = 0;
int Ops[3];
- for (int OpName : { AMDGPU::OpName::src0_modifiers,
- AMDGPU::OpName::src1_modifiers,
- AMDGPU::OpName::src2_modifiers }) {
- int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
- if (Idx == -1)
+ std::pair<int, int> MOps[] = {
+ {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src0},
+ {AMDGPU::OpName::src1_modifiers, AMDGPU::OpName::src1},
+ {AMDGPU::OpName::src2_modifiers, AMDGPU::OpName::src2}};
+ int DefaultValue = (Mod == SISrcMods::OP_SEL_1);
+
+ for (auto [SrcMod, Src] : MOps) {
+ if (!AMDGPU::hasNamedOperand(Opc, Src))
break;
- Ops[NumOps++] = MI->getOperand(Idx).getImm();
+ int ModIdx = AMDGPU::getNamedOperandIdx(Opc, SrcMod);
+ Ops[NumOps++] =
+ (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue;
}
const bool HasDstSel =
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
index de1abaf29c56..c3e87244c0c8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp
@@ -562,7 +562,48 @@ void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI,
void AMDGPUMCCodeEmitter::getMachineOpValueT16(
const MCInst &MI, unsigned OpNo, APInt &Op,
SmallVectorImpl<MCFixup> &Fixups, const MCSubtargetInfo &STI) const {
- llvm_unreachable("TODO: Implement getMachineOpValueT16().");
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg()) {
+ unsigned Enc = MRI.getEncodingValue(MO.getReg());
+ unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK;
+ bool IsVGPR = Enc & AMDGPU::HWEncoding::IS_VGPR_OR_AGPR;
+ Op = Idx | (IsVGPR << 8);
+ return;
+ }
+ getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI);
+ // VGPRs include the suffix/op_sel bit in the register encoding, but
+ // immediates and SGPRs include it in src_modifiers. Therefore, copy the
+ // op_sel bit from the src operands into src_modifier operands if Op is
+  // src_modifiers and the corresponding src is a VGPR.
+ int SrcMOIdx = -1;
+ assert(OpNo < INT_MAX);
+ if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::src0_modifiers)) {
+ SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ int VDstMOIdx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst);
+ if (VDstMOIdx != -1) {
+ auto DstReg = MI.getOperand(VDstMOIdx).getReg();
+ if (AMDGPU::isHi(DstReg, MRI))
+ Op |= SISrcMods::DST_OP_SEL;
+ }
+ } else if ((int)OpNo == AMDGPU::getNamedOperandIdx(
+ MI.getOpcode(), AMDGPU::OpName::src1_modifiers))
+ SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+ else if ((int)OpNo == AMDGPU::getNamedOperandIdx(
+ MI.getOpcode(), AMDGPU::OpName::src2_modifiers))
+ SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src2);
+ if (SrcMOIdx == -1)
+ return;
+
+ const MCOperand &SrcMO = MI.getOperand(SrcMOIdx);
+ if (!SrcMO.isReg())
+ return;
+ auto SrcReg = SrcMO.getReg();
+ if (AMDGPU::isSGPR(SrcReg, &MRI))
+ return;
+ if (AMDGPU::isHi(SrcReg, MRI))
+ Op |= SISrcMods::OP_SEL_0;
}
void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128(
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 006115ba14fc..3ef00f75735b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -42,8 +42,8 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T,
std::unique_ptr<MCObjectTargetWriter>
createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
- bool HasRelocationAddend, uint8_t ABIVersion);
-} // End llvm namespace
+ bool HasRelocationAddend);
+} // namespace llvm
#define GET_REGINFO_ENUM
#include "AMDGPUGenRegisterInfo.inc"
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index e135a4e25dd1..d7e8ab76d5ff 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -20,6 +20,7 @@
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/AMDGPUMetadata.h"
@@ -35,27 +36,6 @@ using namespace llvm::AMDGPU;
// AMDGPUTargetStreamer
//===----------------------------------------------------------------------===//
-static void convertIsaVersionV2(uint32_t &Major, uint32_t &Minor,
- uint32_t &Stepping, bool Sramecc, bool Xnack) {
- if (Major == 9 && Minor == 0) {
- switch (Stepping) {
- case 0:
- case 2:
- case 4:
- case 6:
- if (Xnack)
- Stepping++;
- }
- }
-}
-
-bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
- HSAMD::Metadata HSAMetadata;
- if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
- return false;
- return EmitHSAMetadata(HSAMetadata);
-}
-
bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) {
msgpack::Document HSAMetadataDoc;
if (!HSAMetadataDoc.fromYAML(HSAMetadataString))
@@ -238,21 +218,10 @@ void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget() {
OS << "\t.amdgcn_target \"" << getTargetID()->toString() << "\"\n";
}
-void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion(
- uint32_t Major, uint32_t Minor) {
- OS << "\t.hsa_code_object_version " <<
- Twine(Major) << "," << Twine(Minor) << '\n';
-}
-
-void
-AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major,
- uint32_t Minor,
- uint32_t Stepping,
- StringRef VendorName,
- StringRef ArchName) {
- convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny());
- OS << "\t.hsa_code_object_isa " << Twine(Major) << "," << Twine(Minor) << ","
- << Twine(Stepping) << ",\"" << VendorName << "\",\"" << ArchName << "\"\n";
+void AMDGPUTargetAsmStreamer::EmitDirectiveAMDHSACodeObjectVersion(
+ unsigned COV) {
+ AMDGPUTargetStreamer::EmitDirectiveAMDHSACodeObjectVersion(COV);
+ OS << "\t.amdhsa_code_object_version " << COV << '\n';
}
void
@@ -284,18 +253,6 @@ bool AMDGPUTargetAsmStreamer::EmitISAVersion() {
}
bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
- const AMDGPU::HSAMD::Metadata &HSAMetadata) {
- std::string HSAMetadataString;
- if (HSAMD::toString(HSAMetadata, HSAMetadataString))
- return false;
-
- OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n';
- OS << HSAMetadataString << '\n';
- OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n';
- return true;
-}
-
-bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
msgpack::Document &HSAMetadataDoc, bool Strict) {
HSAMD::V3::MetadataVerifier Verifier(Strict);
if (!Verifier.verify(HSAMetadataDoc.getRoot()))
@@ -336,7 +293,7 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR,
- bool ReserveVCC, bool ReserveFlatScr, unsigned CodeObjectVersion) {
+ bool ReserveVCC, bool ReserveFlatScr) {
IsaVersion IVersion = getIsaVersion(STI.getCPU());
OS << "\t.amdhsa_kernel " << KernelName << '\n';
@@ -529,6 +486,8 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
void AMDGPUTargetELFStreamer::finish() {
MCAssembler &MCA = getStreamer().getAssembler();
MCA.setELFHeaderEFlags(getEFlags());
+ MCA.getWriter().setOverrideABIVersion(
+ getELFABIVersion(STI.getTargetTriple(), CodeObjectVersion));
std::string Blob;
const char *Vendor = getPALMetadata()->getVendor();
@@ -616,17 +575,7 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsUnknownOS() {
unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() {
assert(isHsaAbi(STI));
- if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(&STI)) {
- switch (*HsaAbiVer) {
- case ELF::ELFABIVERSION_AMDGPU_HSA_V3:
- return getEFlagsV3();
- case ELF::ELFABIVERSION_AMDGPU_HSA_V4:
- case ELF::ELFABIVERSION_AMDGPU_HSA_V5:
- return getEFlagsV4();
- }
- }
-
- llvm_unreachable("HSA OS ABI Version identification must be defined");
+ return getEFlagsV4();
}
unsigned AMDGPUTargetELFStreamer::getEFlagsAMDPAL() {
@@ -699,44 +648,6 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsV4() {
void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget() {}
-void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
- uint32_t Major, uint32_t Minor) {
-
- EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()),
- ELF::NT_AMD_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
- OS.emitInt32(Major);
- OS.emitInt32(Minor);
- });
-}
-
-void
-AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major,
- uint32_t Minor,
- uint32_t Stepping,
- StringRef VendorName,
- StringRef ArchName) {
- uint16_t VendorNameSize = VendorName.size() + 1;
- uint16_t ArchNameSize = ArchName.size() + 1;
-
- unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) +
- sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
- VendorNameSize + ArchNameSize;
-
- convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny());
- EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()),
- ELF::NT_AMD_HSA_ISA_VERSION, [&](MCELFStreamer &OS) {
- OS.emitInt16(VendorNameSize);
- OS.emitInt16(ArchNameSize);
- OS.emitInt32(Major);
- OS.emitInt32(Minor);
- OS.emitInt32(Stepping);
- OS.emitBytes(VendorName);
- OS.emitInt8(0); // NULL terminate VendorName
- OS.emitBytes(ArchName);
- OS.emitInt8(0); // NULL terminate ArchName
- });
-}
-
void
AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
@@ -818,30 +729,6 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc,
return true;
}
-bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
- const AMDGPU::HSAMD::Metadata &HSAMetadata) {
- std::string HSAMetadataString;
- if (HSAMD::toString(HSAMetadata, HSAMetadataString))
- return false;
-
- // Create two labels to mark the beginning and end of the desc field
- // and a MCExpr to calculate the size of the desc field.
- auto &Context = getContext();
- auto *DescBegin = Context.createTempSymbol();
- auto *DescEnd = Context.createTempSymbol();
- auto *DescSZ = MCBinaryExpr::createSub(
- MCSymbolRefExpr::create(DescEnd, Context),
- MCSymbolRefExpr::create(DescBegin, Context), Context);
-
- EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_METADATA,
- [&](MCELFStreamer &OS) {
- OS.emitLabel(DescBegin);
- OS.emitBytes(HSAMetadataString);
- OS.emitLabel(DescEnd);
- });
- return true;
-}
-
bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader(
const MCSubtargetInfo &STI) {
for (int i = 0; i < 64; ++i) {
@@ -889,8 +776,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) {
void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- unsigned CodeObjectVersion) {
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {
auto &Streamer = getStreamer();
auto &Context = Streamer.getContext();
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 55b5246c9210..7f8ddc42b2ee 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -37,23 +37,24 @@ class AMDGPUTargetStreamer : public MCTargetStreamer {
protected:
// TODO: Move HSAMetadataStream to AMDGPUTargetStreamer.
std::optional<AMDGPU::IsaInfo::AMDGPUTargetID> TargetID;
+ unsigned CodeObjectVersion;
MCContext &getContext() const { return Streamer.getContext(); }
public:
- AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+ AMDGPUTargetStreamer(MCStreamer &S)
+ : MCTargetStreamer(S),
+      // Assume the default COV for now; EmitDirectiveAMDHSACodeObjectVersion
+      // will update this if the directive is encountered.
+ CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) {}
AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; }
virtual void EmitDirectiveAMDGCNTarget(){};
- virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
- uint32_t Minor){};
-
- virtual void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
- uint32_t Stepping,
- StringRef VendorName,
- StringRef ArchName){};
+ virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV) {
+ CodeObjectVersion = COV;
+ }
virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header){};
@@ -66,9 +67,6 @@ public:
virtual bool EmitISAVersion() { return true; }
/// \returns True on success, false on failure.
- virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString);
-
- /// \returns True on success, false on failure.
virtual bool EmitHSAMetadataV3(StringRef HSAMetadataString);
/// Emit HSA Metadata
@@ -98,8 +96,7 @@ public:
virtual void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- unsigned CodeObjectVersion){};
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {}
static StringRef getArchNameFromElfMach(unsigned ElfMach);
static unsigned getElfMach(StringRef GPU);
@@ -110,15 +107,12 @@ public:
std::optional<AMDGPU::IsaInfo::AMDGPUTargetID> &getTargetID() {
return TargetID;
}
- void initializeTargetID(const MCSubtargetInfo &STI,
- unsigned CodeObjectVersion) {
+ void initializeTargetID(const MCSubtargetInfo &STI) {
assert(TargetID == std::nullopt && "TargetID can only be initialized once");
TargetID.emplace(STI);
- getTargetID()->setCodeObjectVersion(CodeObjectVersion);
}
- void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString,
- unsigned CodeObjectVersion) {
- initializeTargetID(STI, CodeObjectVersion);
+ void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) {
+ initializeTargetID(STI);
assert(getTargetID() != std::nullopt && "TargetID is None");
getTargetID()->setTargetIDFromFeaturesString(FeatureString);
@@ -134,12 +128,7 @@ public:
void EmitDirectiveAMDGCNTarget() override;
- void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
- uint32_t Minor) override;
-
- void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
- uint32_t Stepping, StringRef VendorName,
- StringRef ArchName) override;
+ void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV) override;
void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
@@ -154,9 +143,6 @@ public:
bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
/// \returns True on success, false on failure.
- bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
-
- /// \returns True on success, false on failure.
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
/// \returns True on success, false on failure.
@@ -165,8 +151,7 @@ public:
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- unsigned CodeObjectVersion) override;
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
};
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
@@ -198,13 +183,6 @@ public:
void EmitDirectiveAMDGCNTarget() override;
- void EmitDirectiveHSACodeObjectVersion(uint32_t Major,
- uint32_t Minor) override;
-
- void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor,
- uint32_t Stepping, StringRef VendorName,
- StringRef ArchName) override;
-
void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override;
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
@@ -218,9 +196,6 @@ public:
bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override;
/// \returns True on success, false on failure.
- bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override;
-
- /// \returns True on success, false on failure.
bool EmitCodeEnd(const MCSubtargetInfo &STI) override;
/// \returns True on success, false on failure.
@@ -229,9 +204,7 @@ public:
void EmitAmdhsaKernelDescriptor(
const MCSubtargetInfo &STI, StringRef KernelName,
const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR,
- uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr,
- unsigned CodeObjectVersion) override;
+ uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override;
};
-
}
#endif
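For reference, the streamer changes above cache the code object version on the target streamer, seeded with a default and overridden by the .amdhsa_code_object_version directive, instead of threading it through every call. A minimal stand-alone sketch of that pattern (class and member names are stand-ins, and the default of 5 is assumed; the real code queries AMDGPU::getDefaultAMDHSACodeObjectVersion()):

#include <cstdio>

class TargetStreamerSketch {
  unsigned CodeObjectVersion;

public:
  // Seed with an assumed default code object version.
  TargetStreamerSketch() : CodeObjectVersion(5) {}

  // Mirrors EmitDirectiveAMDHSACodeObjectVersion: record the version, then
  // print the assembly directive.
  void emitCodeObjectVersionDirective(unsigned COV) {
    CodeObjectVersion = COV;
    std::printf("\t.amdhsa_code_object_version %u\n", COV);
  }

  unsigned getCodeObjectVersion() const { return CodeObjectVersion; }
};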
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 240366c8e7da..3c7cd61444fa 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1553,6 +1553,11 @@ defm IMAGE_ATOMIC_DEC : MIMG_Atomic_Renamed <mimgopc<0x16, 0x16, 0x1c>
defm IMAGE_ATOMIC_FCMPSWAP : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1d, MIMG.NOP>, "image_atomic_fcmpswap", 1, 1>;
defm IMAGE_ATOMIC_FMIN : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1e, MIMG.NOP>, "image_atomic_fmin", 0, 1>;
defm IMAGE_ATOMIC_FMAX : MIMG_Atomic <mimgopc<MIMG.NOP, MIMG.NOP, 0x1f, MIMG.NOP>, "image_atomic_fmax", 0, 1>;
+defm IMAGE_ATOMIC_PK_ADD_F16 : MIMG_Atomic <mimgopc<0x86, MIMG.NOP, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_pk_add_f16", 0, 1>;
+defm IMAGE_ATOMIC_PK_ADD_BF16 : MIMG_Atomic <mimgopc<0x87, MIMG.NOP, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_pk_add_bf16", 0, 1>;
+defm IMAGE_ATOMIC_ADD_FLT : MIMG_Atomic <mimgopc<0x83, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_add_flt", 0, 1>;
+defm IMAGE_ATOMIC_MIN_FLT : MIMG_Atomic <mimgopc<0x84, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_min_num_flt", 0, 1, "image_atomic_min_flt">;
+defm IMAGE_ATOMIC_MAX_FLT : MIMG_Atomic <mimgopc<0x85, MIMG.NOP, MIMG.NOP, MIMG.NOP>, "image_atomic_max_num_flt", 0, 1, "image_atomic_max_flt">;
defm IMAGE_SAMPLE : MIMG_Sampler_WQM <mimgopc<0x1b, 0x1b, 0x20>, AMDGPUSample>;
let OtherPredicates = [HasExtendedImageInsts] in {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 674fd04f2fc1..159b2d440b31 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1619,8 +1619,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
for (unsigned i = 0; i < 4; i++) {
RemapSwizzle[i] = i;
if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
- unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
- ->getZExtValue();
+ unsigned Idx = NewBldVec[i].getConstantOperandVal(1);
if (i == Idx)
isUnmovable[Idx] = true;
}
@@ -1628,8 +1627,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
for (unsigned i = 0; i < 4; i++) {
if (NewBldVec[i].getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
- unsigned Idx = cast<ConstantSDNode>(NewBldVec[i].getOperand(1))
- ->getZExtValue();
+ unsigned Idx = NewBldVec[i].getConstantOperandVal(1);
if (isUnmovable[Idx])
continue;
// Swap i and Idx
@@ -2002,9 +2000,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
if (RegisterSDNode *Reg =
dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
if (Reg->getReg() == R600::ALU_CONST) {
- ConstantSDNode *Cst
- = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
- Consts.push_back(Cst->getZExtValue());
+ Consts.push_back(ParentNode->getConstantOperandVal(OtherSelIdx));
}
}
}
@@ -2044,8 +2040,7 @@ bool R600TargetLowering::FoldOperand(SDNode *ParentNode, unsigned SrcIdx,
ImmValue = FPC->getValueAPF().bitcastToAPInt().getZExtValue();
}
} else {
- ConstantSDNode *C = cast<ConstantSDNode>(Src.getOperand(0));
- uint64_t Value = C->getZExtValue();
+ uint64_t Value = Src.getConstantOperandVal(0);
if (Value == 0) {
ImmReg = R600::ZERO;
} else if (Value == 1) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 932c0d6216ce..c921e5a35d2d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -329,15 +329,16 @@ bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
}
Value *Exec = popSaved();
- Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt();
+ BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt();
if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt)) {
Instruction *ExecDef = cast<Instruction>(Exec);
BasicBlock *DefBB = ExecDef->getParent();
if (!DT->dominates(DefBB, BB)) {
// Split edge to make Def dominate Use
- FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
+ FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt();
}
- IRBuilder<>(FirstInsertionPt).CreateCall(EndCf, {Exec});
+ IRBuilder<>(FirstInsertionPt->getParent(), FirstInsertionPt)
+ .CreateCall(EndCf, {Exec});
}
return true;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
index b291400a947c..8ab66d4fd5b8 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -400,6 +400,10 @@ enum CPol {
TH_TYPE_STORE = 1 << 8, // TH_STORE policy
TH_TYPE_ATOMIC = 1 << 9, // TH_ATOMIC policy
TH_REAL_BYPASS = 1 << 10, // is TH=3 bypass policy or not
+
+  // Volatile (used to preserve/signal operation volatility for buffer
+  // operations; not a real instruction bit).
+ VOLATILE = 1 << 31,
};
} // namespace CPol
@@ -1172,11 +1176,13 @@ enum Type { TRAP = -2, WORKGROUP = -1 };
#define R_00B860_COMPUTE_TMPRING_SIZE 0x00B860
#define S_00B860_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
-#define S_00B860_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
+#define S_00B860_WAVESIZE_GFX11(x) (((x) & 0x7FFF) << 12)
+#define S_00B860_WAVESIZE_GFX12Plus(x) (((x) & 0x3FFFF) << 12)
#define R_0286E8_SPI_TMPRING_SIZE 0x0286E8
#define S_0286E8_WAVESIZE_PreGFX11(x) (((x) & 0x1FFF) << 12)
-#define S_0286E8_WAVESIZE_GFX11Plus(x) (((x) & 0x7FFF) << 12)
+#define S_0286E8_WAVESIZE_GFX11(x) (((x) & 0x7FFF) << 12)
+#define S_0286E8_WAVESIZE_GFX12Plus(x) (((x) & 0x3FFFF) << 12)
#define R_028B54_VGT_SHADER_STAGES_EN 0x028B54
#define S_028B54_HS_W32_EN(x) (((x) & 0x1) << 21)
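For reference, the hunk above splits the old GFX11Plus WAVESIZE macro into a 15-bit GFX11 field and an 18-bit GFX12+ field, both starting at bit 12. A stand-alone sketch of that packing (function names are illustrative, not the real macros):

#include <cassert>
#include <cstdint>

// Pack a wave scratch size into the COMPUTE_TMPRING_SIZE / SPI_TMPRING_SIZE
// WAVESIZE field, following the widths in the macros above.
uint32_t packWaveSizeGFX11(uint32_t WaveSize) {
  assert(WaveSize <= 0x7FFFu && "GFX11 WAVESIZE is a 15-bit field");
  return (WaveSize & 0x7FFFu) << 12;
}

uint32_t packWaveSizeGFX12Plus(uint32_t WaveSize) {
  assert(WaveSize <= 0x3FFFFu && "GFX12+ WAVESIZE is an 18-bit field");
  return (WaveSize & 0x3FFFFu) << 12;
}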
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index aa7639a0f186..2862a7787e75 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1498,6 +1498,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
case AMDGPU::V_MAX_F16_t16_e64:
case AMDGPU::V_MAX_F16_fake16_e64:
case AMDGPU::V_MAX_F64_e64:
+ case AMDGPU::V_MAX_NUM_F64_e64:
case AMDGPU::V_PK_MAX_F16: {
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
return nullptr;
@@ -1567,7 +1568,8 @@ bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
static int getOModValue(unsigned Opc, int64_t Val) {
switch (Opc) {
- case AMDGPU::V_MUL_F64_e64: {
+ case AMDGPU::V_MUL_F64_e64:
+ case AMDGPU::V_MUL_F64_pseudo_e64: {
switch (Val) {
case 0x3fe0000000000000: // 0.5
return SIOutMods::DIV2;
@@ -1618,6 +1620,7 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
unsigned Op = MI.getOpcode();
switch (Op) {
case AMDGPU::V_MUL_F64_e64:
+ case AMDGPU::V_MUL_F64_pseudo_e64:
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_t16_e64:
case AMDGPU::V_MUL_F16_fake16_e64:
@@ -1625,8 +1628,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_MUL_F32_e64 &&
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
- ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
- Op == AMDGPU::V_MUL_F16_t16_e64 ||
+ ((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F64_pseudo_e64 ||
+ Op == AMDGPU::V_MUL_F16_e64 || Op == AMDGPU::V_MUL_F16_t16_e64 ||
Op == AMDGPU::V_MUL_F16_fake16_e64) &&
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
return std::pair(nullptr, SIOutMods::NONE);
@@ -1655,6 +1658,7 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
return std::pair(RegOp, OMod);
}
case AMDGPU::V_ADD_F64_e64:
+ case AMDGPU::V_ADD_F64_pseudo_e64:
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64:
case AMDGPU::V_ADD_F16_t16_e64:
@@ -1662,8 +1666,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_ADD_F32_e64 &&
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
- ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
- Op == AMDGPU::V_ADD_F16_t16_e64 ||
+ ((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F64_pseudo_e64 ||
+ Op == AMDGPU::V_ADD_F16_e64 || Op == AMDGPU::V_ADD_F16_t16_e64 ||
Op == AMDGPU::V_ADD_F16_fake16_e64) &&
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
return std::pair(nullptr, SIOutMods::NONE);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0f89df144486..9d062eb156d5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -95,7 +95,8 @@ static void getVGPRSpillLaneOrTempRegister(
TargetStackID::SGPRSpill);
if (TRI->spillSGPRToVGPR() &&
- MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) {
+ MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
+ /*IsPrologEpilog=*/true)) {
// 2: There's no free lane to spill, and no free register to save the
// SGPR, so we're forced to take another VGPR to use for the spill.
MFI->addToPrologEpilogSGPRSpills(
@@ -188,7 +189,7 @@ static void buildGitPtr(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
.addImm(MFI->getGITPtrHigh())
.addReg(TargetReg, RegState::ImplicitDefine);
} else {
- const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
+ const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64_pseudo);
BuildMI(MBB, I, DL, GetPC64, TargetReg);
}
Register GitPtrLo = MFI->getGITPtrLoReg(*MF);
@@ -1560,6 +1561,8 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
return;
+ MFI->shiftSpillPhysVGPRsToLowestRange(MF);
+
TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
if (MFI->isEntryFunction())
return;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 5a9222e91588..cf947dccafac 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -855,7 +855,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
- MVT::v2i16, MVT::v2f16, MVT::i128},
+ MVT::v2i16, MVT::v2f16, MVT::i128, MVT::i8},
Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN,
@@ -1183,6 +1183,9 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = RsrcArg;
}
+ auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
+ if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
+ Info.flags |= MachineMemOperand::MOVolatile;
Info.flags |= MachineMemOperand::MODereferenceable;
if (ME.onlyReadsMemory()) {
unsigned MaxNumLanes = 4;
@@ -1333,6 +1336,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
+ case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
@@ -1344,6 +1348,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOVolatile;
return true;
}
+ case Intrinsic::amdgcn_global_load_tr: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ Info.flags |= MachineMemOperand::MOLoad;
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1403,6 +1415,7 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
SmallVectorImpl<Value*> &Ops,
Type *&AccessTy) const {
switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_global_load_tr:
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
case Intrinsic::amdgcn_ds_append:
@@ -1525,6 +1538,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// will use a MUBUF load.
// FIXME?: We also need to do this if unaligned, but we don't know the
// alignment here.
+ // TODO: Update this for GFX12 which does have scalar sub-dword loads.
if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
return isLegalGlobalAddressingMode(AM);
@@ -2297,7 +2311,7 @@ void SITargetLowering::allocateSpecialInputSGPRs(
const Module *M = MF.getFunction().getParent();
if (UserSGPRInfo.hasQueuePtr() &&
- AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5)
+ AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5)
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
@@ -2350,7 +2364,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
const Module *M = MF.getFunction().getParent();
if (UserSGPRInfo.hasQueuePtr() &&
- AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
+ AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
@@ -2779,15 +2793,16 @@ SDValue SITargetLowering::LowerFormalArguments(
} else if (!IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
+
+ // FIXME: Sink this into allocateSpecialInputSGPRs
+ if (!Subtarget->enableFlatScratch())
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+
+ allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
if (!IsKernel) {
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
- if (!IsGraphics && !Subtarget->enableFlatScratch()) {
- CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
- AMDGPU::SGPR2, AMDGPU::SGPR3},
- 4);
- }
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
}
@@ -2987,13 +3002,8 @@ SDValue SITargetLowering::LowerFormalArguments(
}
// Start adding system SGPRs.
- if (IsEntryFunc) {
+ if (IsEntryFunc)
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
- } else {
- CCInfo.AllocateReg(Info->getScratchRSrcReg());
- if (!IsGraphics)
- allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
- }
auto &ArgUsageInfo =
DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
@@ -5720,7 +5730,7 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
// Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand());
if (isTypeLegal(LoadVT)) {
return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
@@ -5739,8 +5749,7 @@ SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
- unsigned CondCode = CD->getZExtValue();
+ unsigned CondCode = N->getConstantOperandVal(3);
if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
return DAG.getUNDEF(VT);
@@ -5774,9 +5783,8 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
- const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
- unsigned CondCode = CD->getZExtValue();
+ unsigned CondCode = N->getConstantOperandVal(3);
if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
return DAG.getUNDEF(VT);
@@ -5894,6 +5902,55 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case Intrinsic::amdgcn_s_buffer_load: {
+    // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
+    // s_buffer_load_u8 for both the signed and unsigned load instructions.
+    // Next, the DAG combiner tries to merge the s_buffer_load_u8 with a sext
+    // instruction (performSignExtendInRegCombine()) and replaces the
+    // s_buffer_load_u8 with s_buffer_load_i8.
+ if (!Subtarget->hasScalarSubwordLoads())
+ return;
+ SDValue Op = SDValue(N, 0);
+ SDValue Rsrc = Op.getOperand(1);
+ SDValue Offset = Op.getOperand(2);
+ SDValue CachePolicy = Op.getOperand(3);
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
+ SDLoc DL(Op);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const DataLayout &DataLayout = DAG.getDataLayout();
+ Align Alignment =
+ DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ VT.getStoreSize(), Alignment);
+ SDValue LoadVal;
+ if (!Offset->isDivergent()) {
+ SDValue Ops[] = {Rsrc, // source register
+ Offset, CachePolicy};
+ SDValue BufferLoad =
+ DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
+ DAG.getVTList(MVT::i32), Ops, VT, MMO);
+ LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
+ } else {
+ SDValue Ops[] = {
+ DAG.getEntryNode(), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ {}, // voffset
+ {}, // soffset
+ {}, // offset
+ CachePolicy, // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ };
+ setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
+ LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
+ }
+ Results.push_back(LoadVal);
+ return;
+ }
}
break;
}
@@ -6390,7 +6447,7 @@ SDValue SITargetLowering::lowerTrapHsaQueuePtr(
SDValue QueuePtr;
// For code object version 5, QueuePtr is passed through implicit kernarg.
const Module *M = DAG.getMachineFunction().getFunction().getParent();
- if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
+ if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
QueuePtr =
loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
} else {
@@ -6494,7 +6551,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// For code object version 5, private_base and shared_base are passed through
// implicit kernargs.
const Module *M = DAG.getMachineFunction().getFunction().getParent();
- if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
+ if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) {
ImplicitParameter Param =
(AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE;
return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
@@ -7248,17 +7305,17 @@ static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
// Re-construct the required return value for an image load intrinsic.
// This is more complicated due to the optional use of TexFailCtrl, which means the required
// return type is an aggregate.
-static SDValue constructRetValue(SelectionDAG &DAG,
- MachineSDNode *Result,
- ArrayRef<EVT> ResultTypes,
- bool IsTexFail, bool Unpacked, bool IsD16,
- int DMaskPop, int NumVDataDwords,
+static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result,
+ ArrayRef<EVT> ResultTypes, bool IsTexFail,
+ bool Unpacked, bool IsD16, int DMaskPop,
+ int NumVDataDwords, bool IsAtomicPacked16Bit,
const SDLoc &DL) {
// Determine the required return type. This is the same regardless of the IsTexFail flag.
EVT ReqRetVT = ResultTypes[0];
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
- int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
- ReqRetNumElts : (ReqRetNumElts + 1) / 2;
+ int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
+ ? (ReqRetNumElts + 1) / 2
+ : ReqRetNumElts;
int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
DMaskPop : (DMaskPop + 1) / 2;
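A quick check of the dword arithmetic above (my own illustration, not part of the patch):

    // Packed D16 (or IsAtomicPacked16Bit): a v3f16 result has ReqRetNumElts = 3,
    // so NumDataDwords = (3 + 1) / 2 = 2, since two halves share a dword.
    // Unpacked D16: each half occupies a full dword, so NumDataDwords stays 3.
    // MaskPopDwords is rounded the same way from DMaskPop.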
@@ -7283,7 +7340,7 @@ static SDValue constructRetValue(SelectionDAG &DAG,
}
}
- if (DataDwordVT.isVector())
+ if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
NumDataDwords - MaskPopDwords);
@@ -7390,6 +7447,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SDValue VData;
int NumVDataDwords;
bool AdjustRetType = false;
+ bool IsAtomicPacked16Bit = false;
// Offset of intrinsic arguments
const unsigned ArgOffset = WithChain ? 2 : 1;
@@ -7400,6 +7458,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (BaseOpcode->Atomic) {
VData = Op.getOperand(2);
+ IsAtomicPacked16Bit =
+ (Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
+ Intr->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
+
bool Is64Bit = VData.getValueSizeInBits() == 64;
if (BaseOpcode->AtomicX2) {
SDValue VData2 = Op.getOperand(3);
@@ -7416,9 +7478,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
NumVDataDwords = Is64Bit ? 2 : 1;
}
} else {
- auto *DMaskConst =
- cast<ConstantSDNode>(Op.getOperand(ArgOffset + Intr->DMaskIndex));
- DMask = DMaskConst->getZExtValue();
+ DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
if (BaseOpcode->Store) {
@@ -7639,7 +7699,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
if (BaseOpcode->Atomic)
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
- if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
+ if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
+ AMDGPU::CPol::VOLATILE))
return Op;
SmallVector<SDValue, 26> Ops;
@@ -7729,10 +7790,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
if (BaseOpcode->Store)
return SDValue(NewNode, 0);
- return constructRetValue(DAG, NewNode,
- OrigResultTypes, IsTexFail,
- Subtarget->hasUnpackedD16VMem(), IsD16,
- DMaskLanes, NumVDataDwords, DL);
+ return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
+ Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
+ NumVDataDwords, IsAtomicPacked16Bit, DL);
}
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
@@ -7751,11 +7811,18 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
VT.getStoreSize(), Alignment);
if (!Offset->isDivergent()) {
- SDValue Ops[] = {
- Rsrc,
- Offset, // Offset
- CachePolicy
- };
+ SDValue Ops[] = {Rsrc, Offset, CachePolicy};
+
+ // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
+ // s_buffer_load_u16 instruction is emitted for both signed and unsigned
+    // loads. Later, the DAG combiner tries to combine s_buffer_load_u16 with sext
+ // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
+ if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
+ SDValue BufferLoad =
+ DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
+ DAG.getVTList(MVT::i32), Ops, VT, MMO);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
+ }
// Widen vec3 load to vec4.
if (VT.isVector() && VT.getVectorNumElements() == 3 &&
@@ -7776,6 +7843,21 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
// We have a divergent offset. Emit a MUBUF buffer load instead. We can
// assume that the buffer is unswizzled.
+ SDValue Ops[] = {
+ DAG.getEntryNode(), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ {}, // voffset
+ {}, // soffset
+ {}, // offset
+ CachePolicy, // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
+ };
+ if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
+ setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
+ return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
+ }
+
SmallVector<SDValue, 4> Loads;
unsigned NumLoads = 1;
MVT LoadVT = VT.getSimpleVT();
@@ -7789,16 +7871,6 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
}
SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
- SDValue Ops[] = {
- DAG.getEntryNode(), // Chain
- Rsrc, // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- {}, // voffset
- {}, // soffset
- {}, // offset
- CachePolicy, // cachepolicy
- DAG.getTargetConstant(0, DL, MVT::i1), // idxen
- };
// Use the alignment to ensure that the required offsets will fit into the
// immediate offsets.
@@ -8005,6 +8077,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
unsigned CPol = Op.getConstantOperandVal(3);
+ // s_buffer_load, because of how it's optimized, can't be volatile
+ // so reject ones with the volatile bit set.
if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
? AMDGPU::CPol::ALL
: AMDGPU::CPol::ALL_pregfx12))
@@ -8374,9 +8448,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M, DAG, Ops);
// Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 ||
- LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+ if (LoadVT.getScalarType() == MVT::i8 || LoadVT.getScalarType() == MVT::i16)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops,
+ M->getMemOperand());
return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
M->getMemOperand(), DAG);
@@ -8592,9 +8666,15 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
+ return lowerRawBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
@@ -8643,6 +8723,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
+ case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ return lowerRawBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
return lowerStructBufferAtomicIntrin(Op, DAG,
@@ -8684,6 +8767,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
+ case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_buffer_atomic_cmpswap: {
unsigned Slc = Op.getConstantOperandVal(7);
@@ -9376,6 +9462,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
case Intrinsic::amdgcn_struct_buffer_load_lds:
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
+ assert(!AMDGPU::isGFX12Plus(*Subtarget));
unsigned Opc;
bool HasVIndex =
IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
@@ -9428,8 +9515,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
Ops.push_back(
DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol
- Ops.push_back(
- DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz
+ Ops.push_back(DAG.getTargetConstant(
+ Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz
Ops.push_back(M0Val.getValue(0)); // Chain
Ops.push_back(M0Val.getValue(1)); // Glue
@@ -9766,18 +9853,17 @@ SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
}
// Handle 8 bit and 16 bit buffer loads
-SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
- EVT LoadVT, SDLoc DL,
- ArrayRef<SDValue> Ops,
- MemSDNode *M) const {
+SDValue
+SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT,
+ SDLoc DL, ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO) const {
EVT IntVT = LoadVT.changeTypeToInteger();
unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ?
AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT;
SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
- SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
- Ops, IntVT,
- M->getMemOperand());
+ SDValue BufferLoad =
+ DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
@@ -9821,6 +9907,8 @@ static SDValue getLoadExtOrTrunc(SelectionDAG &DAG,
llvm_unreachable("invalid ext type");
}
+// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
+// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
SDValue SITargetLowering::widenLoad(LoadSDNode *Ld, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
if (Ld->getAlign() < Align(4) || Ld->isDivergent())
@@ -12058,17 +12146,42 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
return SDValue();
}
-SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N,
- DAGCombinerInfo &DCI)
- const {
+SDValue
+SITargetLowering::performSignExtendInRegCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
SDValue Src = N->getOperand(0);
auto *VTSign = cast<VTSDNode>(N->getOperand(1));
- if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
- VTSign->getVT() == MVT::i8) ||
- (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
- VTSign->getVT() == MVT::i16)) &&
- Src.hasOneUse()) {
+ // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
+ // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
+ if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
+ VTSign->getVT() == MVT::i8) ||
+ (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
+ VTSign->getVT() == MVT::i16))) {
+ assert(Subtarget->hasScalarSubwordLoads() &&
+ "s_buffer_load_{u8, i8} are supported "
+ "in GFX12 (or newer) architectures.");
+ EVT VT = Src.getValueType();
+ unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
+ ? AMDGPUISD::SBUFFER_LOAD_BYTE
+ : AMDGPUISD::SBUFFER_LOAD_SHORT;
+ SDLoc DL(N);
+ SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
+ SDValue Ops[] = {
+ Src.getOperand(0), // source register
+ Src.getOperand(1), // offset
+ Src.getOperand(2) // cachePolicy
+ };
+ auto *M = cast<MemSDNode>(Src);
+ SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
+ Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
+ SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
+ return LoadVal;
+ } else if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
+ VTSign->getVT() == MVT::i8) ||
+ (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
+ VTSign->getVT() == MVT::i16)) &&
+ Src.hasOneUse()) {
auto *M = cast<MemSDNode>(Src);
SDValue Ops[] = {
Src.getOperand(0), // Chain
@@ -14283,8 +14396,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FP_ROUND:
return performFPRoundCombine(N, DCI);
case ISD::LOAD: {
- if (SDValue Widended = widenLoad(cast<LoadSDNode>(N), DCI))
- return Widended;
+ if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
+ return Widened;
[[fallthrough]];
}
default: {
@@ -15483,6 +15596,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
case AMDGPUISD::BUFFER_ATOMIC_CSUB:
case AMDGPUISD::BUFFER_ATOMIC_FADD:
+ case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16:
case AMDGPUISD::BUFFER_ATOMIC_FMIN:
case AMDGPUISD::BUFFER_ATOMIC_FMAX:
// Target-specific read-modify-write atomics are sources of divergence.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
index 92b38ebade62..d66ba0b59ba9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -273,7 +273,8 @@ private:
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
- ArrayRef<SDValue> Ops, MemSDNode *M) const;
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO) const;
// Handle 8 bit and 16 bit buffer stores
SDValue handleByteShortBufferStores(SelectionDAG &DAG, EVT VDataType,
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 1f480c248154..6ecb1c8bf6e1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/Sequence.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/InitializePasses.h"
@@ -57,7 +58,18 @@ namespace {
// associated with the operand. Used for determining whether
// an s_waitcnt instruction needs to be emitted.
-enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, VS_CNT, NUM_INST_CNTS };
+enum InstCounterType {
+ LOAD_CNT = 0, // VMcnt prior to gfx12.
+  DS_CNT,     // LGKMcnt prior to gfx12.
+ EXP_CNT, //
+ STORE_CNT, // VScnt in gfx10/gfx11.
+ NUM_NORMAL_INST_CNTS,
+ SAMPLE_CNT = NUM_NORMAL_INST_CNTS, // gfx12+ only.
+ BVH_CNT, // gfx12+ only.
+ KM_CNT, // gfx12+ only.
+ NUM_EXTENDED_INST_CNTS,
+ NUM_INST_CNTS = NUM_EXTENDED_INST_CNTS
+};
} // namespace
namespace llvm {
@@ -67,15 +79,23 @@ template <> struct enum_iteration_traits<InstCounterType> {
} // namespace llvm
namespace {
-auto inst_counter_types() { return enum_seq(VM_CNT, NUM_INST_CNTS); }
+// Return an iterator over all counters between LOAD_CNT (the first counter)
+// and \c MaxCounter (exclusive; the default value yields an enumeration over
+// all counters).
+auto inst_counter_types(InstCounterType MaxCounter = NUM_INST_CNTS) {
+ return enum_seq(LOAD_CNT, MaxCounter);
+}
using RegInterval = std::pair<int, int>;
struct HardwareLimits {
- unsigned VmcntMax;
+ unsigned LoadcntMax; // Corresponds to VMcnt prior to gfx12.
unsigned ExpcntMax;
- unsigned LgkmcntMax;
- unsigned VscntMax;
+ unsigned DscntMax; // Corresponds to LGKMcnt prior to gfx12.
+ unsigned StorecntMax; // Corresponds to VScnt in gfx10/gfx11.
+ unsigned SamplecntMax; // gfx12+ only.
+ unsigned BvhcntMax; // gfx12+ only.
+ unsigned KmcntMax; // gfx12+ only.
};
struct RegisterEncoding {
@@ -86,31 +106,25 @@ struct RegisterEncoding {
};
enum WaitEventType {
- VMEM_ACCESS, // vector-memory read & write
- VMEM_READ_ACCESS, // vector-memory read
- VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
- SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
- LDS_ACCESS, // lds read & write
- GDS_ACCESS, // gds read & write
- SQ_MESSAGE, // send message
- SMEM_ACCESS, // scalar-memory read & write
- EXP_GPR_LOCK, // export holding on its data src
- GDS_GPR_LOCK, // GDS holding on its data and addr src
- EXP_POS_ACCESS, // write to export position
- EXP_PARAM_ACCESS, // write to export parameter
- VMW_GPR_LOCK, // vector-memory write holding on its data src
- EXP_LDS_ACCESS, // read by ldsdir counting as export
+ VMEM_ACCESS, // vector-memory read & write
+ VMEM_READ_ACCESS, // vector-memory read
+ VMEM_SAMPLER_READ_ACCESS, // vector-memory SAMPLER read (gfx12+ only)
+ VMEM_BVH_READ_ACCESS, // vector-memory BVH read (gfx12+ only)
+ VMEM_WRITE_ACCESS, // vector-memory write that is not scratch
+ SCRATCH_WRITE_ACCESS, // vector-memory write that may be scratch
+ LDS_ACCESS, // lds read & write
+ GDS_ACCESS, // gds read & write
+ SQ_MESSAGE, // send message
+ SMEM_ACCESS, // scalar-memory read & write
+ EXP_GPR_LOCK, // export holding on its data src
+ GDS_GPR_LOCK, // GDS holding on its data and addr src
+ EXP_POS_ACCESS, // write to export position
+ EXP_PARAM_ACCESS, // write to export parameter
+ VMW_GPR_LOCK, // vector-memory write holding on its data src
+ EXP_LDS_ACCESS, // read by ldsdir counting as export
NUM_WAIT_EVENTS,
};
-static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
- (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
- (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
- (1 << SQ_MESSAGE),
- (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
- (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) | (1 << EXP_LDS_ACCESS),
- (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS)};
-
// The mapping is:
// 0 .. SQ_MAX_PGM_VGPRS-1 real VGPRs
// SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1 extra VGPR-like slots
@@ -121,8 +135,13 @@ enum RegisterMapping {
SQ_MAX_PGM_VGPRS = 512, // Maximum programmable VGPRs across all targets.
AGPR_OFFSET = 256, // Maximum programmable ArchVGPRs across all targets.
SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
- NUM_EXTRA_VGPRS = 1, // A reserved slot for DS.
- EXTRA_VGPR_LDS = 0, // An artificial register to track LDS writes.
+ NUM_EXTRA_VGPRS = 9, // Reserved slots for DS.
+ // Artificial register slots to track LDS writes into specific LDS locations
+  // if a location is known. When slots are exhausted or the location is
+  // unknown, use the first slot. The first slot is also always updated in
+  // addition to the known location's slot so that waits are still generated
+  // when a dependent instruction's location is unknown.
+ EXTRA_VGPR_LDS = 0,
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
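As I read this hunk together with the updateByEvent() changes later in the patch, the extra slots end up laid out roughly like this (illustrative sketch, not part of the diff):

    // SQ_MAX_PGM_VGPRS + 0   catch-all LDS slot; bumped for every LDS DMA store
    // SQ_MAX_PGM_VGPRS + 1   first LDS DMA store with a distinct alias scope
    //   ...
    // SQ_MAX_PGM_VGPRS + 8   eighth such store; further stores fall back to slot 0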
@@ -137,17 +156,33 @@ enum VmemType {
// MIMG instructions with a sampler.
VMEM_SAMPLER,
// BVH instructions
- VMEM_BVH
+ VMEM_BVH,
+ NUM_VMEM_TYPES
};
+// Maps values of InstCounterType to the instruction that waits on that
+// counter. Only used if GCNSubtarget::hasExtendedWaitCounts()
+// returns true.
+static const unsigned instrsForExtendedCounterTypes[NUM_EXTENDED_INST_CNTS] = {
+ AMDGPU::S_WAIT_LOADCNT, AMDGPU::S_WAIT_DSCNT, AMDGPU::S_WAIT_EXPCNT,
+ AMDGPU::S_WAIT_STORECNT, AMDGPU::S_WAIT_SAMPLECNT, AMDGPU::S_WAIT_BVHCNT,
+ AMDGPU::S_WAIT_KMCNT};
+
static bool updateVMCntOnly(const MachineInstr &Inst) {
return SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLATGlobal(Inst) ||
SIInstrInfo::isFLATScratch(Inst);
}
+#ifndef NDEBUG
+static bool isNormalMode(InstCounterType MaxCounter) {
+ return MaxCounter == NUM_NORMAL_INST_CNTS;
+}
+#endif // NDEBUG
+
VmemType getVmemType(const MachineInstr &Inst) {
assert(updateVMCntOnly(Inst));
- if (!SIInstrInfo::isMIMG(Inst))
+ if (!SIInstrInfo::isMIMG(Inst) && !SIInstrInfo::isVIMAGE(Inst) &&
+ !SIInstrInfo::isVSAMPLE(Inst))
return VMEM_NOSAMPLER;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
@@ -156,25 +191,49 @@ VmemType getVmemType(const MachineInstr &Inst) {
: BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
}
-void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {
switch (T) {
- case VM_CNT:
- Wait.VmCnt = std::min(Wait.VmCnt, Count);
- break;
+ case LOAD_CNT:
+ return Wait.LoadCnt;
case EXP_CNT:
- Wait.ExpCnt = std::min(Wait.ExpCnt, Count);
- break;
- case LGKM_CNT:
- Wait.LgkmCnt = std::min(Wait.LgkmCnt, Count);
- break;
- case VS_CNT:
- Wait.VsCnt = std::min(Wait.VsCnt, Count);
- break;
+ return Wait.ExpCnt;
+ case DS_CNT:
+ return Wait.DsCnt;
+ case STORE_CNT:
+ return Wait.StoreCnt;
+ case SAMPLE_CNT:
+ return Wait.SampleCnt;
+ case BVH_CNT:
+ return Wait.BvhCnt;
+ case KM_CNT:
+ return Wait.KmCnt;
default:
llvm_unreachable("bad InstCounterType");
}
}
+void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
+ unsigned &WC = getCounterRef(Wait, T);
+ WC = std::min(WC, Count);
+}
+
+void setNoWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+ getCounterRef(Wait, T) = ~0u;
+}
+
+unsigned getWait(AMDGPU::Waitcnt &Wait, InstCounterType T) {
+ return getCounterRef(Wait, T);
+}
+
+// Mapping from event to counter according to the table masks.
+InstCounterType eventCounter(const unsigned *masks, WaitEventType E) {
+ for (auto T : inst_counter_types()) {
+ if (masks[T] & (1 << E))
+ return T;
+ }
+ llvm_unreachable("event type has no associated counter");
+}
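For intuition, here is how the lookup behaves with the two mask tables introduced further down in this patch (my reading of the tables, not asserted by the diff):

    // eventCounter(WCG->getWaitEventMask(), SMEM_ACCESS)
    //   pre-gfx12 table:  returns DS_CNT (i.e. LGKMcnt)
    //   gfx12+ table:     returns KM_CNT (the dedicated scalar-memory counter)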
+
// This object maintains the current score brackets of each wait counter, and
// a per-register scoreboard for each wait counter.
//
@@ -185,20 +244,30 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
- WaitcntBrackets(const GCNSubtarget *SubTarget, HardwareLimits Limits,
- RegisterEncoding Encoding)
- : ST(SubTarget), Limits(Limits), Encoding(Encoding) {}
+ WaitcntBrackets(const GCNSubtarget *SubTarget, InstCounterType MaxCounter,
+ HardwareLimits Limits, RegisterEncoding Encoding,
+ const unsigned *WaitEventMaskForInst,
+ InstCounterType SmemAccessCounter)
+ : ST(SubTarget), MaxCounter(MaxCounter), Limits(Limits),
+ Encoding(Encoding), WaitEventMaskForInst(WaitEventMaskForInst),
+ SmemAccessCounter(SmemAccessCounter) {}
unsigned getWaitCountMax(InstCounterType T) const {
switch (T) {
- case VM_CNT:
- return Limits.VmcntMax;
- case LGKM_CNT:
- return Limits.LgkmcntMax;
+ case LOAD_CNT:
+ return Limits.LoadcntMax;
+ case DS_CNT:
+ return Limits.DscntMax;
case EXP_CNT:
return Limits.ExpcntMax;
- case VS_CNT:
- return Limits.VscntMax;
+ case STORE_CNT:
+ return Limits.StorecntMax;
+ case SAMPLE_CNT:
+ return Limits.SamplecntMax;
+ case BVH_CNT:
+ return Limits.BvhcntMax;
+ case KM_CNT:
+ return Limits.KmcntMax;
default:
break;
}
@@ -219,20 +288,11 @@ public:
return getScoreUB(T) - getScoreLB(T);
}
- // Mapping from event to counter.
- InstCounterType eventCounter(WaitEventType E) const {
- for (auto T : inst_counter_types()) {
- if (WaitEventMaskForInst[T] & (1 << E))
- return T;
- }
- llvm_unreachable("event type has no associated counter");
- }
-
unsigned getRegScore(int GprNo, InstCounterType T) const {
if (GprNo < NUM_ALL_VGPRS) {
return VgprScores[T][GprNo];
}
- assert(T == LGKM_CNT);
+ assert(T == SmemAccessCounter);
return SgprScores[GprNo - NUM_ALL_VGPRS];
}
@@ -269,15 +329,15 @@ public:
}
bool hasPendingFlat() const {
- return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
- LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
- (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
- LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
+ return ((LastFlat[DS_CNT] > ScoreLBs[DS_CNT] &&
+ LastFlat[DS_CNT] <= ScoreUBs[DS_CNT]) ||
+ (LastFlat[LOAD_CNT] > ScoreLBs[LOAD_CNT] &&
+ LastFlat[LOAD_CNT] <= ScoreUBs[LOAD_CNT]));
}
void setPendingFlat() {
- LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
- LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
+ LastFlat[LOAD_CNT] = ScoreUBs[LOAD_CNT];
+ LastFlat[DS_CNT] = ScoreUBs[DS_CNT];
}
// Return true if there might be pending writes to the specified vgpr by VMEM
@@ -293,8 +353,12 @@ public:
}
void setStateOnFunctionEntryOrReturn() {
- setScoreUB(VS_CNT, getWaitCountMax(VS_CNT));
- PendingEvents |= WaitEventMaskForInst[VS_CNT];
+ setScoreUB(STORE_CNT, getScoreUB(STORE_CNT) + getWaitCountMax(STORE_CNT));
+ PendingEvents |= WaitEventMaskForInst[STORE_CNT];
+ }
+
+ ArrayRef<const MachineInstr *> getLDSDMAStores() const {
+ return LDSDMAStores;
}
void print(raw_ostream &);
@@ -331,7 +395,7 @@ private:
VgprUB = std::max(VgprUB, GprNo);
VgprScores[T][GprNo] = Val;
} else {
- assert(T == LGKM_CNT);
+ assert(T == SmemAccessCounter);
SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
}
@@ -342,8 +406,11 @@ private:
unsigned OpNo, unsigned Val);
const GCNSubtarget *ST = nullptr;
+ InstCounterType MaxCounter = NUM_EXTENDED_INST_CNTS;
HardwareLimits Limits = {};
RegisterEncoding Encoding = {};
+ const unsigned *WaitEventMaskForInst;
+ InstCounterType SmemAccessCounter;
unsigned ScoreLBs[NUM_INST_CNTS] = {0};
unsigned ScoreUBs[NUM_INST_CNTS] = {0};
unsigned PendingEvents = 0;
@@ -354,11 +421,134 @@ private:
int VgprUB = -1;
int SgprUB = -1;
unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
- // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
+ // Wait cnt scores for every sgpr, only DS_CNT (corresponding to LGKMcnt
+ // pre-gfx12) or KM_CNT (gfx12+ only) are relevant.
unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
// Bitmask of the VmemTypes of VMEM instructions that might have a pending
// write to each vgpr.
unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
+ // Store representative LDS DMA operations. The only useful info here is
+ // alias info. One store is kept per unique AAInfo.
+ SmallVector<const MachineInstr *, NUM_EXTRA_VGPRS - 1> LDSDMAStores;
+};
+
+// This abstracts the logic for generating and updating S_WAIT* instructions
+// away from the analysis that determines where they are needed. This was
+// done because the set of counters and instructions for waiting on them
+// underwent a major shift with gfx12; the abstraction keeps the main
+// analysis logic simpler than it would otherwise have to be.
+class WaitcntGenerator {
+protected:
+ const GCNSubtarget *ST = nullptr;
+ const SIInstrInfo *TII = nullptr;
+ AMDGPU::IsaVersion IV;
+ InstCounterType MaxCounter;
+
+public:
+ WaitcntGenerator() {}
+ WaitcntGenerator(const GCNSubtarget *ST, InstCounterType MaxCounter)
+ : ST(ST), TII(ST->getInstrInfo()),
+ IV(AMDGPU::getIsaVersion(ST->getCPU())), MaxCounter(MaxCounter) {}
+
+ // Edits an existing sequence of wait count instructions according
+ // to an incoming Waitcnt value, which is itself updated to reflect
+ // any new wait count instructions which may need to be generated by
+ // WaitcntGenerator::createNewWaitcnt(). It will return true if any edits
+ // were made.
+ //
+ // This editing will usually be merely updated operands, but it may also
+ // delete instructions if the incoming Wait value indicates they are not
+ // needed. It may also remove existing instructions for which a wait
+ // is needed if it can be determined that it is better to generate new
+ // instructions later, as can happen on gfx12.
+ virtual bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const = 0;
+
+ // Transform a soft waitcnt into a normal one.
+ bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
+
+ // Generates new wait count instructions according to the value of
+ // Wait, returning true if any new instructions were created.
+ virtual bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) = 0;
+
+ // Returns an array of bit masks which can be used to map values in
+ // WaitEventType to corresponding counter values in InstCounterType.
+ virtual const unsigned *getWaitEventMask() const = 0;
+
+ virtual ~WaitcntGenerator() = default;
+};
+
+class WaitcntGeneratorPreGFX12 : public WaitcntGenerator {
+public:
+ WaitcntGeneratorPreGFX12() {}
+ WaitcntGeneratorPreGFX12(const GCNSubtarget *ST)
+ : WaitcntGenerator(ST, NUM_NORMAL_INST_CNTS) {}
+
+ bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const override;
+
+ bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) override;
+
+ const unsigned *getWaitEventMask() const override {
+ assert(ST);
+
+ static const unsigned WaitEventMaskForInstPreGFX12[NUM_INST_CNTS] = {
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS) |
+ (1 << VMEM_SAMPLER_READ_ACCESS) | (1 << VMEM_BVH_READ_ACCESS),
+ (1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
+ (1 << SQ_MESSAGE),
+ (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+ (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
+ (1 << EXP_LDS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
+ 0,
+ 0,
+ 0};
+
+ return WaitEventMaskForInstPreGFX12;
+ }
+};
+
+class WaitcntGeneratorGFX12Plus : public WaitcntGenerator {
+public:
+ WaitcntGeneratorGFX12Plus() {}
+ WaitcntGeneratorGFX12Plus(const GCNSubtarget *ST, InstCounterType MaxCounter)
+ : WaitcntGenerator(ST, MaxCounter) {}
+
+ bool
+ applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
+ MachineInstr &OldWaitcntInstr, AMDGPU::Waitcnt &Wait,
+ MachineBasicBlock::instr_iterator It) const override;
+
+ bool createNewWaitcnt(MachineBasicBlock &Block,
+ MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) override;
+
+ const unsigned *getWaitEventMask() const override {
+ assert(ST);
+
+ static const unsigned WaitEventMaskForInstGFX12Plus[NUM_INST_CNTS] = {
+ (1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
+ (1 << LDS_ACCESS) | (1 << GDS_ACCESS),
+ (1 << EXP_GPR_LOCK) | (1 << GDS_GPR_LOCK) | (1 << VMW_GPR_LOCK) |
+ (1 << EXP_PARAM_ACCESS) | (1 << EXP_POS_ACCESS) |
+ (1 << EXP_LDS_ACCESS),
+ (1 << VMEM_WRITE_ACCESS) | (1 << SCRATCH_WRITE_ACCESS),
+ (1 << VMEM_SAMPLER_READ_ACCESS),
+ (1 << VMEM_BVH_READ_ACCESS),
+ (1 << SMEM_ACCESS) | (1 << SQ_MESSAGE)};
+
+ return WaitEventMaskForInstGFX12Plus;
+ }
};
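For orientation, a minimal sketch of how the pass presumably picks a generator; the actual wiring lives in later hunks that are not shown here, so the exact placement is an assumption:

    // Hypothetical setup at the start of SIInsertWaitcnts::runOnMachineFunction().
    if (ST->hasExtendedWaitCounts()) {
      MaxCounter = NUM_EXTENDED_INST_CNTS;
      WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
      WCG = &WCGGFX12Plus;
    } else {
      MaxCounter = NUM_NORMAL_INST_CNTS;
      WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
      WCG = &WCGPreGFX12;
    }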
class SIInsertWaitcnts : public MachineFunctionPass {
@@ -367,18 +557,20 @@ private:
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
- AMDGPU::IsaVersion IV;
DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
MachineLoopInfo *MLI;
MachinePostDominatorTree *PDT;
+ AliasAnalysis *AA = nullptr;
struct BlockInfo {
std::unique_ptr<WaitcntBrackets> Incoming;
bool Dirty = true;
};
+ InstCounterType SmemAccessCounter;
+
MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
@@ -388,10 +580,20 @@ private:
bool OptNone;
+ // In any given run of this pass, WCG will point to one of these two
+  // generator objects, which must be re-initialised before use from a value
+  // built with the subtarget-taking constructor.
+ WaitcntGeneratorPreGFX12 WCGPreGFX12;
+ WaitcntGeneratorGFX12Plus WCGGFX12Plus;
+
+ WaitcntGenerator *WCG = nullptr;
+
// S_ENDPGM instructions before which we should insert a DEALLOC_VGPRS
// message.
DenseSet<MachineInstr *> ReleaseVGPRInsts;
+ InstCounterType MaxCounter = NUM_NORMAL_INST_CNTS;
+
public:
static char ID;
@@ -415,6 +617,8 @@ public:
AU.setPreservesCFG();
AU.addRequired<MachineLoopInfo>();
AU.addRequired<MachinePostDominatorTree>();
+ AU.addUsedIfAvailable<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -438,16 +642,22 @@ public:
if (DebugCounter::isCounterSet(ForceLgkmCounter) &&
DebugCounter::shouldExecute(ForceLgkmCounter)) {
- ForceEmitWaitcnt[LGKM_CNT] = true;
+ ForceEmitWaitcnt[DS_CNT] = true;
+ ForceEmitWaitcnt[KM_CNT] = true;
} else {
- ForceEmitWaitcnt[LGKM_CNT] = false;
+ ForceEmitWaitcnt[DS_CNT] = false;
+ ForceEmitWaitcnt[KM_CNT] = false;
}
if (DebugCounter::isCounterSet(ForceVMCounter) &&
DebugCounter::shouldExecute(ForceVMCounter)) {
- ForceEmitWaitcnt[VM_CNT] = true;
+ ForceEmitWaitcnt[LOAD_CNT] = true;
+ ForceEmitWaitcnt[SAMPLE_CNT] = true;
+ ForceEmitWaitcnt[BVH_CNT] = true;
} else {
- ForceEmitWaitcnt[VM_CNT] = false;
+ ForceEmitWaitcnt[LOAD_CNT] = false;
+ ForceEmitWaitcnt[SAMPLE_CNT] = false;
+ ForceEmitWaitcnt[BVH_CNT] = false;
}
#endif // NDEBUG
}
@@ -455,6 +665,10 @@ public:
// Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
// FLAT instruction.
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
+ // Maps VMEM access types to their corresponding WaitEventType.
+ static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
+ VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
+
assert(SIInstrInfo::isVMEM(Inst) || SIInstrInfo::isFLAT(Inst));
// LDS DMA loads are also stores, but on the LDS side. On the VMEM side
// these should use VM_CNT.
@@ -467,7 +681,9 @@ public:
return SCRATCH_WRITE_ACCESS;
return VMEM_WRITE_ACCESS;
}
- return VMEM_READ_ACCESS;
+ if (!ST->hasExtendedWaitCounts() || SIInstrInfo::isFLAT(Inst))
+ return VMEM_READ_ACCESS;
+ return VmemReadMapping[getVmemType(Inst)];
}
bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
@@ -488,13 +704,6 @@ public:
WaitcntBrackets *ScoreBrackets);
bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets);
- bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
- MachineInstr &OldWaitcntInstr,
- AMDGPU::Waitcnt &Wait,
- MachineBasicBlock::instr_iterator It) const;
-
- // Transform a soft waitcnt into a normal one.
- bool promoteSoftWaitCnt(MachineInstr *Waitcnt) const;
};
} // end anonymous namespace
@@ -556,8 +765,10 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
WaitEventType E, MachineInstr &Inst) {
- InstCounterType T = eventCounter(E);
- unsigned CurrScore = getScoreUB(T) + 1;
+ InstCounterType T = eventCounter(WaitEventMaskForInst, E);
+
+ unsigned UB = getScoreUB(T);
+ unsigned CurrScore = UB + 1;
if (CurrScore == 0)
report_fatal_error("InsertWaitcnt score wraparound");
// PendingEvents and ScoreUB need to be updated regardless of whether this event
@@ -686,7 +897,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
if (!Op.isReg() || !Op.isDef())
continue;
RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
- if (T == VM_CNT) {
+ if (T == LOAD_CNT || T == SAMPLE_CNT || T == BVH_CNT) {
if (Interval.first >= NUM_ALL_VGPRS)
continue;
if (updateVMCntOnly(Inst)) {
@@ -707,28 +918,73 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
(TII->isDS(Inst) || TII->mayWriteLDSThroughDMA(Inst))) {
// MUBUF and FLAT LDS DMA operations need a wait on vmcnt before LDS
// written can be accessed. A load from LDS to VMEM does not need a wait.
- setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+ unsigned Slot = 0;
+ for (const auto *MemOp : Inst.memoperands()) {
+ if (!MemOp->isStore() ||
+ MemOp->getAddrSpace() != AMDGPUAS::LOCAL_ADDRESS)
+ continue;
+ // Comparing just AA info does not guarantee memoperands are equal
+ // in general, but this is so for LDS DMA in practice.
+ auto AAI = MemOp->getAAInfo();
+      // Alias scope information gives a way to definitively identify the
+      // original memory object; in practice it is produced by the module LDS
+      // lowering pass. If no scope is available we will not be able to
+      // disambiguate LDS aliasing, because after module lowering all LDS is
+      // squashed into a single big object. Do not attempt to use one of
+ // the limited LDSDMAStores for something we will not be able to use
+ // anyway.
+ if (!AAI || !AAI.Scope)
+ break;
+ for (unsigned I = 0, E = LDSDMAStores.size(); I != E && !Slot; ++I) {
+ for (const auto *MemOp : LDSDMAStores[I]->memoperands()) {
+ if (MemOp->isStore() && AAI == MemOp->getAAInfo()) {
+ Slot = I + 1;
+ break;
+ }
+ }
+ }
+ if (Slot || LDSDMAStores.size() == NUM_EXTRA_VGPRS - 1)
+ break;
+ LDSDMAStores.push_back(&Inst);
+ Slot = LDSDMAStores.size();
+ break;
+ }
+ setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS + Slot, T, CurrScore);
+ if (Slot)
+ setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
}
}
}
void WaitcntBrackets::print(raw_ostream &OS) {
OS << '\n';
- for (auto T : inst_counter_types()) {
+ for (auto T : inst_counter_types(MaxCounter)) {
unsigned SR = getScoreRange(T);
switch (T) {
- case VM_CNT:
- OS << " VM_CNT(" << SR << "): ";
+ case LOAD_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "LOAD" : "VM") << "_CNT("
+ << SR << "): ";
break;
- case LGKM_CNT:
- OS << " LGKM_CNT(" << SR << "): ";
+ case DS_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "DS" : "LGKM") << "_CNT("
+ << SR << "): ";
break;
case EXP_CNT:
OS << " EXP_CNT(" << SR << "): ";
break;
- case VS_CNT:
- OS << " VS_CNT(" << SR << "): ";
+ case STORE_CNT:
+ OS << " " << (ST->hasExtendedWaitCounts() ? "STORE" : "VS") << "_CNT("
+ << SR << "): ";
+ break;
+ case SAMPLE_CNT:
+ OS << " SAMPLE_CNT(" << SR << "): ";
+ break;
+ case BVH_CNT:
+ OS << " BVH_CNT(" << SR << "): ";
+ break;
+ case KM_CNT:
+ OS << " KM_CNT(" << SR << "): ";
break;
default:
OS << " UNKNOWN(" << SR << "): ";
@@ -751,9 +1007,9 @@ void WaitcntBrackets::print(raw_ostream &OS) {
}
}
// Also need to print sgpr scores for lgkm_cnt.
- if (T == LGKM_CNT) {
+ if (T == SmemAccessCounter) {
for (int J = 0; J <= SgprUB; J++) {
- unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, T);
if (RegScore <= LB)
continue;
unsigned RelScore = RegScore - LB - 1;
@@ -769,10 +1025,13 @@ void WaitcntBrackets::print(raw_ostream &OS) {
/// Simplify the waitcnt, in the sense of removing redundant counts, and return
/// whether a waitcnt instruction is needed at all.
void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
- simplifyWaitcnt(VM_CNT, Wait.VmCnt);
+ simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt);
simplifyWaitcnt(EXP_CNT, Wait.ExpCnt);
- simplifyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
- simplifyWaitcnt(VS_CNT, Wait.VsCnt);
+ simplifyWaitcnt(DS_CNT, Wait.DsCnt);
+ simplifyWaitcnt(STORE_CNT, Wait.StoreCnt);
+ simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
+ simplifyWaitcnt(BVH_CNT, Wait.BvhCnt);
+ simplifyWaitcnt(KM_CNT, Wait.KmCnt);
}
void WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
@@ -793,8 +1052,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
const unsigned LB = getScoreLB(T);
const unsigned UB = getScoreUB(T);
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
- if ((T == VM_CNT || T == LGKM_CNT) &&
- hasPendingFlat() &&
+ if ((T == LOAD_CNT || T == DS_CNT) && hasPendingFlat() &&
!ST->hasFlatLgkmVMemCountInOrder()) {
// If there is a pending FLAT operation, and this is a VMem or LGKM
// waitcnt and the target can report early completion, then we need
@@ -815,10 +1073,13 @@ void WaitcntBrackets::determineWait(InstCounterType T, int RegNo,
}
void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
- applyWaitcnt(VM_CNT, Wait.VmCnt);
+ applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
applyWaitcnt(EXP_CNT, Wait.ExpCnt);
- applyWaitcnt(LGKM_CNT, Wait.LgkmCnt);
- applyWaitcnt(VS_CNT, Wait.VsCnt);
+ applyWaitcnt(DS_CNT, Wait.DsCnt);
+ applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+ applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt);
+ applyWaitcnt(BVH_CNT, Wait.BvhCnt);
+ applyWaitcnt(KM_CNT, Wait.KmCnt);
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
@@ -839,7 +1100,7 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
// the decrement may go out of order.
bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory read always can go out of order.
- if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
+ if (T == SmemAccessCounter && hasPendingEvent(SMEM_ACCESS))
return true;
return hasMixedPendingEvents(T);
}
@@ -873,22 +1134,49 @@ static bool updateOperandIfDifferent(MachineInstr &MI, uint16_t OpName,
return true;
}
-bool SIInsertWaitcnts::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
- unsigned Opcode = Waitcnt->getOpcode();
- if (!SIInstrInfo::isSoftWaitcnt(Opcode))
+/// Determine if \p MI is a gfx12+ single-counter S_WAIT_*CNT instruction,
+/// and if so, which counter it is waiting on.
+static std::optional<InstCounterType> counterTypeForInstr(unsigned Opcode) {
+ switch (Opcode) {
+ case AMDGPU::S_WAIT_LOADCNT:
+ return LOAD_CNT;
+ case AMDGPU::S_WAIT_EXPCNT:
+ return EXP_CNT;
+ case AMDGPU::S_WAIT_STORECNT:
+ return STORE_CNT;
+ case AMDGPU::S_WAIT_SAMPLECNT:
+ return SAMPLE_CNT;
+ case AMDGPU::S_WAIT_BVHCNT:
+ return BVH_CNT;
+ case AMDGPU::S_WAIT_DSCNT:
+ return DS_CNT;
+ case AMDGPU::S_WAIT_KMCNT:
+ return KM_CNT;
+ default:
+ return {};
+ }
+}
+
+bool WaitcntGenerator::promoteSoftWaitCnt(MachineInstr *Waitcnt) const {
+ unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Waitcnt->getOpcode());
+ if (Opcode == Waitcnt->getOpcode())
return false;
- Waitcnt->setDesc(TII->get(SIInstrInfo::getNonSoftWaitcntOpcode(Opcode)));
+ Waitcnt->setDesc(TII->get(Opcode));
return true;
}
-/// Combine consecutive waitcnt instructions that precede \p It and follow
-/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
-/// by previous passes. Currently this pass conservatively assumes that these
-/// preexisting waitcnt are required for correctness.
-bool SIInsertWaitcnts::applyPreexistingWaitcnt(
+/// Combine consecutive S_WAITCNT and S_WAITCNT_VSCNT instructions that
+/// precede \p It and follow \p OldWaitcntInstr and apply any extra waits
+/// from \p Wait that were added by previous passes. Currently this pass
+/// conservatively assumes that these preexisting waits are required for
+/// correctness.
+bool WaitcntGeneratorPreGFX12::applyPreexistingWaitcnt(
WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
+ assert(ST);
+ assert(isNormalMode(MaxCounter));
+
bool Modified = false;
MachineInstr *WaitcntInstr = nullptr;
MachineInstr *WaitcntVsCntInstr = nullptr;
@@ -898,12 +1186,12 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
if (II.isMetaInstruction())
continue;
- unsigned Opcode = II.getOpcode();
- bool IsSoft = SIInstrInfo::isSoftWaitcnt(Opcode);
+ unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
+ bool IsSoft = Opcode != II.getOpcode();
- if (SIInstrInfo::isWaitcnt(Opcode)) {
- // Update required wait count. If this is a soft waitcnt (= it was added
- // by an earlier pass), it may be entirely removed.
+ // Update required wait count. If this is a soft waitcnt (= it was added
+ // by an earlier pass), it may be entirely removed.
+ if (Opcode == AMDGPU::S_WAITCNT) {
unsigned IEnc = II.getOperand(0).getImm();
AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
if (IsSoft)
@@ -911,23 +1199,22 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
Wait = Wait.combined(OldWait);
// Merge consecutive waitcnt of the same type by erasing multiples.
- if (WaitcntInstr || (!Wait.hasWaitExceptVsCnt() && IsSoft)) {
+ if (WaitcntInstr || (!Wait.hasWaitExceptStoreCnt() && IsSoft)) {
II.eraseFromParent();
Modified = true;
} else
WaitcntInstr = &II;
-
} else {
- assert(SIInstrInfo::isWaitcntVsCnt(Opcode));
+ assert(Opcode == AMDGPU::S_WAITCNT_VSCNT);
assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
unsigned OldVSCnt =
TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
if (IsSoft)
- ScoreBrackets.simplifyWaitcnt(InstCounterType::VS_CNT, OldVSCnt);
- Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
+ ScoreBrackets.simplifyWaitcnt(InstCounterType::STORE_CNT, OldVSCnt);
+ Wait.StoreCnt = std::min(Wait.StoreCnt, OldVSCnt);
- if (WaitcntVsCntInstr || (!Wait.hasWaitVsCnt() && IsSoft)) {
+ if (WaitcntVsCntInstr || (!Wait.hasWaitStoreCnt() && IsSoft)) {
II.eraseFromParent();
Modified = true;
} else
@@ -935,18 +1222,19 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
}
}
- // Updated encoding of merged waitcnt with the required wait.
if (WaitcntInstr) {
Modified |= updateOperandIfDifferent(*WaitcntInstr, AMDGPU::OpName::simm16,
AMDGPU::encodeWaitcnt(IV, Wait));
Modified |= promoteSoftWaitCnt(WaitcntInstr);
- ScoreBrackets.applyWaitcnt(Wait);
- Wait.VmCnt = ~0u;
- Wait.LgkmCnt = ~0u;
+ ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+ ScoreBrackets.applyWaitcnt(EXP_CNT, Wait.ExpCnt);
+ ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+ Wait.LoadCnt = ~0u;
Wait.ExpCnt = ~0u;
+ Wait.DsCnt = ~0u;
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ LLVM_DEBUG(It == WaitcntInstr->getParent()->end()
? dbgs()
<< "applyPreexistingWaitcnt\n"
<< "New Instr at block end: " << *WaitcntInstr << '\n'
@@ -957,12 +1245,13 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
if (WaitcntVsCntInstr) {
Modified |= updateOperandIfDifferent(*WaitcntVsCntInstr,
- AMDGPU::OpName::simm16, Wait.VsCnt);
+ AMDGPU::OpName::simm16, Wait.StoreCnt);
Modified |= promoteSoftWaitCnt(WaitcntVsCntInstr);
- ScoreBrackets.applyWaitcnt(Wait);
- Wait.VsCnt = ~0u;
- LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+ Wait.StoreCnt = ~0u;
+
+ LLVM_DEBUG(It == WaitcntVsCntInstr->getParent()->end()
? dbgs() << "applyPreexistingWaitcnt\n"
<< "New Instr at block end: " << *WaitcntVsCntInstr
<< '\n'
@@ -974,6 +1263,293 @@ bool SIInsertWaitcnts::applyPreexistingWaitcnt(
return Modified;
}
+/// Generate S_WAITCNT and/or S_WAITCNT_VSCNT instructions for any
+/// required counters in \p Wait
+bool WaitcntGeneratorPreGFX12::createNewWaitcnt(
+ MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) {
+ assert(ST);
+ assert(isNormalMode(MaxCounter));
+
+ bool Modified = false;
+ const DebugLoc &DL = Block.findDebugLoc(It);
+
+  // Waits for VMcnt, LGKMcnt and/or EXPcnt are encoded together into a
+ // single instruction while VScnt has its own instruction.
+ if (Wait.hasWaitExceptStoreCnt()) {
+ unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
+
+ if (Wait.hasWaitStoreCnt()) {
+ assert(ST->hasVscnt());
+
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(Wait.StoreCnt);
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
+
+ return Modified;
+}
+
+/// Combine consecutive S_WAIT_*CNT instructions that precede \p It and
+/// follow \p OldWaitcntInstr and apply any extra waits from \p Wait that
+/// were added by previous passes. Currently this pass conservatively
+/// assumes that these preexisting waits are required for correctness.
+bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt(
+ WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
+ AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) const {
+ assert(ST);
+ assert(!isNormalMode(MaxCounter));
+
+ bool Modified = false;
+ MachineInstr *CombinedLoadDsCntInstr = nullptr;
+ MachineInstr *CombinedStoreDsCntInstr = nullptr;
+ MachineInstr *WaitInstrs[NUM_EXTENDED_INST_CNTS] = {};
+
+ for (auto &II :
+ make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
+ if (II.isMetaInstruction())
+ continue;
+
+ MachineInstr **UpdatableInstr;
+
+ // Update required wait count. If this is a soft waitcnt (= it was added
+ // by an earlier pass), it may be entirely removed.
+
+ unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(II.getOpcode());
+ bool IsSoft = Opcode != II.getOpcode();
+
+ if (Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT) {
+ unsigned OldEnc =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeLoadcntDscnt(IV, OldEnc);
+ if (IsSoft)
+ ScoreBrackets.simplifyWaitcnt(OldWait);
+ Wait = Wait.combined(OldWait);
+ UpdatableInstr = &CombinedLoadDsCntInstr;
+ } else if (Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT) {
+ unsigned OldEnc =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ AMDGPU::Waitcnt OldWait = AMDGPU::decodeStorecntDscnt(IV, OldEnc);
+ if (IsSoft)
+ ScoreBrackets.simplifyWaitcnt(OldWait);
+ Wait = Wait.combined(OldWait);
+ UpdatableInstr = &CombinedStoreDsCntInstr;
+ } else {
+ std::optional<InstCounterType> CT = counterTypeForInstr(Opcode);
+ assert(CT.has_value());
+ unsigned OldCnt =
+ TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
+ if (IsSoft)
+ ScoreBrackets.simplifyWaitcnt(CT.value(), OldCnt);
+ addWait(Wait, CT.value(), OldCnt);
+ UpdatableInstr = &WaitInstrs[CT.value()];
+ }
+
+ // Merge consecutive waitcnt of the same type by erasing multiples.
+ if (!*UpdatableInstr) {
+ *UpdatableInstr = &II;
+ } else {
+ II.eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ if (CombinedLoadDsCntInstr) {
+ // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need
+ // to be waited for. Otherwise, let the instruction be deleted so
+ // the appropriate single counter wait instruction can be inserted
+ // instead, when new S_WAIT_*CNT instructions are inserted by
+ // createNewWaitcnt(). As a side effect, resetting the wait counts will
+ // cause any redundant S_WAIT_LOADCNT or S_WAIT_DSCNT to be removed by
+ // the loop below that deals with single counter instructions.
+ if (Wait.LoadCnt != ~0u && Wait.DsCnt != ~0u) {
+ unsigned NewEnc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
+ Modified |= updateOperandIfDifferent(*CombinedLoadDsCntInstr,
+ AMDGPU::OpName::simm16, NewEnc);
+ Modified |= promoteSoftWaitCnt(CombinedLoadDsCntInstr);
+ ScoreBrackets.applyWaitcnt(LOAD_CNT, Wait.LoadCnt);
+ ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+ Wait.LoadCnt = ~0u;
+ Wait.DsCnt = ~0u;
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: "
+ << *CombinedLoadDsCntInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It << "New Instr: "
+ << *CombinedLoadDsCntInstr << '\n');
+ } else {
+ CombinedLoadDsCntInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ if (CombinedStoreDsCntInstr) {
+ // Similarly for S_WAIT_STORECNT_DSCNT.
+ if (Wait.StoreCnt != ~0u && Wait.DsCnt != ~0u) {
+ unsigned NewEnc = AMDGPU::encodeStorecntDscnt(IV, Wait);
+ Modified |= updateOperandIfDifferent(*CombinedStoreDsCntInstr,
+ AMDGPU::OpName::simm16, NewEnc);
+ Modified |= promoteSoftWaitCnt(CombinedStoreDsCntInstr);
+ ScoreBrackets.applyWaitcnt(STORE_CNT, Wait.StoreCnt);
+ ScoreBrackets.applyWaitcnt(DS_CNT, Wait.DsCnt);
+ Wait.StoreCnt = ~0u;
+ Wait.DsCnt = ~0u;
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: "
+ << *CombinedStoreDsCntInstr << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It << "New Instr: "
+ << *CombinedStoreDsCntInstr << '\n');
+ } else {
+ CombinedStoreDsCntInstr->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ // Look for an opportunity to convert existing S_WAIT_LOADCNT,
+ // S_WAIT_STORECNT and S_WAIT_DSCNT into new S_WAIT_LOADCNT_DSCNT
+ // or S_WAIT_STORECNT_DSCNT. This is achieved by selectively removing
+ // instructions so that createNewWaitcnt() will create new combined
+ // instructions to replace them.
+
+ if (Wait.DsCnt != ~0u) {
+ // This is a vector of addresses in WaitInstrs pointing to instructions
+ // that should be removed if they are present.
+ SmallVector<MachineInstr **, 2> WaitsToErase;
+
+ // If it's known that both DScnt and either LOADcnt or STOREcnt (but not
+ // both) need to be waited for, ensure that there are no existing
+ // individual wait count instructions for these.
+
+ if (Wait.LoadCnt != ~0u) {
+ WaitsToErase.push_back(&WaitInstrs[LOAD_CNT]);
+ WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
+ } else if (Wait.StoreCnt != ~0u) {
+ WaitsToErase.push_back(&WaitInstrs[STORE_CNT]);
+ WaitsToErase.push_back(&WaitInstrs[DS_CNT]);
+ }
+
+ for (MachineInstr **WI : WaitsToErase) {
+ if (!*WI)
+ continue;
+
+ (*WI)->eraseFromParent();
+ *WI = nullptr;
+ Modified = true;
+ }
+ }
+
+ for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ if (!WaitInstrs[CT])
+ continue;
+
+ unsigned NewCnt = getWait(Wait, CT);
+ if (NewCnt != ~0u) {
+ Modified |= updateOperandIfDifferent(*WaitInstrs[CT],
+ AMDGPU::OpName::simm16, NewCnt);
+ Modified |= promoteSoftWaitCnt(WaitInstrs[CT]);
+
+ ScoreBrackets.applyWaitcnt(CT, NewCnt);
+ setNoWait(Wait, CT);
+
+ LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+ ? dbgs() << "applyPreexistingWaitcnt\n"
+ << "New Instr at block end: " << *WaitInstrs[CT]
+ << '\n'
+ : dbgs() << "applyPreexistingWaitcnt\n"
+ << "Old Instr: " << *It
+ << "New Instr: " << *WaitInstrs[CT] << '\n');
+ } else {
+ WaitInstrs[CT]->eraseFromParent();
+ Modified = true;
+ }
+ }
+
+ return Modified;
+}
+
+/// Generate S_WAIT_*CNT instructions for any required counters in \p Wait
+bool WaitcntGeneratorGFX12Plus::createNewWaitcnt(
+ MachineBasicBlock &Block, MachineBasicBlock::instr_iterator It,
+ AMDGPU::Waitcnt Wait) {
+ assert(ST);
+ assert(!isNormalMode(MaxCounter));
+
+ bool Modified = false;
+ const DebugLoc &DL = Block.findDebugLoc(It);
+
+ // Check for opportunities to use combined wait instructions.
+ if (Wait.DsCnt != ~0u) {
+ MachineInstr *SWaitInst = nullptr;
+
+ if (Wait.LoadCnt != ~0u) {
+ unsigned Enc = AMDGPU::encodeLoadcntDscnt(IV, Wait);
+
+ SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
+ .addImm(Enc);
+
+ Wait.LoadCnt = ~0u;
+ Wait.DsCnt = ~0u;
+ } else if (Wait.StoreCnt != ~0u) {
+ unsigned Enc = AMDGPU::encodeStorecntDscnt(IV, Wait);
+
+ SWaitInst =
+ BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAIT_STORECNT_DSCNT))
+ .addImm(Enc);
+
+ Wait.StoreCnt = ~0u;
+ Wait.DsCnt = ~0u;
+ }
+
+ if (SWaitInst) {
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
+ }
+
+  // Generate an instruction for any remaining counter that still needs to
+  // be waited on.
+
+ for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ unsigned Count = getWait(Wait, CT);
+ if (Count == ~0u)
+ continue;
+
+ [[maybe_unused]] auto SWaitInst =
+ BuildMI(Block, It, DL, TII->get(instrsForExtendedCounterTypes[CT]))
+ .addImm(Count);
+
+ Modified = true;
+
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+ if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+ dbgs() << "New Instr: " << *SWaitInst << '\n');
+ }
+
+ return Modified;
+}
+
static bool readsVCCZ(const MachineInstr &MI) {
unsigned Opc = MI.getOpcode();
return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
@@ -1027,7 +1603,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL ||
MI.getOpcode() == AMDGPU::BUFFER_GL0_INV ||
MI.getOpcode() == AMDGPU::BUFFER_GL1_INV) {
- Wait.VmCnt = 0;
+ Wait.LoadCnt = 0;
}
// All waits must be resolved at call return.
@@ -1037,16 +1613,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
MI.getOpcode() == AMDGPU::SI_RETURN ||
MI.getOpcode() == AMDGPU::S_SETPC_B64_return ||
(MI.isReturn() && MI.isCall() && !callWaitsOnFunctionEntry(MI))) {
- Wait = Wait.combined(AMDGPU::Waitcnt::allZeroExceptVsCnt());
+ Wait = Wait.combined(
+ AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
}
// Identify S_ENDPGM instructions which may have to wait for outstanding VMEM
// stores. In this case it can be useful to send a message to explicitly
// release all VGPRs before the stores have completed, but it is only safe to
- // do this if there are no outstanding scratch stores.
+ // do this if:
+ // * there are no outstanding scratch stores
+ // * we are not in Dynamic VGPR mode
else if (MI.getOpcode() == AMDGPU::S_ENDPGM ||
MI.getOpcode() == AMDGPU::S_ENDPGM_SAVED) {
if (ST->getGeneration() >= AMDGPUSubtarget::GFX11 && !OptNone &&
- ScoreBrackets.getScoreRange(VS_CNT) != 0 &&
+ ScoreBrackets.getScoreRange(STORE_CNT) != 0 &&
!ScoreBrackets.hasPendingEvent(SCRATCH_WRITE_ACCESS))
ReleaseVGPRInsts.insert(&MI);
}
@@ -1056,7 +1635,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ST->hasLegacyGeometry() &&
((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_PreGFX11_) ==
AMDGPU::SendMsg::ID_GS_DONE_PreGFX11)) {
- Wait.VmCnt = 0;
+ Wait.LoadCnt = 0;
}
#if 0 // TODO: the following blocks of logic when we have fence.
else if (MI.getOpcode() == SC_FENCE) {
@@ -1073,12 +1652,12 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
case SCMEM_LDS:
if (group_is_multi_wave ||
context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
- EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
- ScoreBrackets->getScoreUB(LGKM_CNT));
- // LDS may have to wait for VM_CNT after buffer load to LDS
+ EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
+ ScoreBrackets->getScoreUB(DS_CNT));
+ // LDS may have to wait for VMcnt after buffer load to LDS
if (target_info->HasBufferLoadToLDS()) {
- EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
- ScoreBrackets->getScoreUB(VM_CNT));
+ EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
+ ScoreBrackets->getScoreUB(LOAD_CNT));
}
}
break;
@@ -1087,8 +1666,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (group_is_multi_wave || fence_is_global) {
EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
ScoreBrackets->getScoreUB(EXP_CNT));
- EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
- ScoreBrackets->getScoreUB(LGKM_CNT));
+ EmitWaitcnt |= ScoreBrackets->updateByWait(DS_CNT,
+ ScoreBrackets->getScoreUB(DS_CNT));
}
break;
@@ -1099,8 +1678,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (group_is_multi_wave || fence_is_global) {
EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
ScoreBrackets->getScoreUB(EXP_CNT));
- EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
- ScoreBrackets->getScoreUB(VM_CNT));
+ EmitWaitcnt |= ScoreBrackets->updateByWait(LOAD_CNT,
+ ScoreBrackets->getScoreUB(LOAD_CNT));
}
break;
@@ -1143,7 +1722,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
for (int RegNo = CallAddrOpInterval.first;
RegNo < CallAddrOpInterval.second; ++RegNo)
- ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
+ ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
int RtnAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
@@ -1153,7 +1732,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
for (int RegNo = RtnAddrOpInterval.first;
RegNo < RtnAddrOpInterval.second; ++RegNo)
- ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
+ ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
}
}
} else {
@@ -1170,10 +1749,11 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// instruction to guarantee the right WAW order.
    // 2) If a destination operand was used by a recent export/store
    // instruction, add s_waitcnt on exp_cnt to guarantee the WAR order.
+
for (const MachineMemOperand *Memop : MI.memoperands()) {
const Value *Ptr = Memop->getValue();
if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
- addWait(Wait, LGKM_CNT, 0);
+ addWait(Wait, SmemAccessCounter, 0);
if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
SLoadAddresses.erase(Ptr);
}
@@ -1183,9 +1763,27 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// No need to wait before load from VMEM to LDS.
if (TII->mayWriteLDSThroughDMA(MI))
continue;
+
+ // LOAD_CNT is only relevant to vgpr or LDS.
unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- // VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
+ bool FoundAliasingStore = false;
+ // Only objects with alias scope info were added to LDSDMAScopes array.
+      // In the absence of the scope info we will not be able to disambiguate
+ // aliasing here. There is no need to try searching for a corresponding
+ // store slot. This is conservatively correct because in that case we
+ // will produce a wait using the first (general) LDS DMA wait slot which
+ // will wait on all of them anyway.
+ if (Ptr && Memop->getAAInfo() && Memop->getAAInfo().Scope) {
+ const auto &LDSDMAStores = ScoreBrackets.getLDSDMAStores();
+ for (unsigned I = 0, E = LDSDMAStores.size(); I != E; ++I) {
+ if (MI.mayAlias(AA, *LDSDMAStores[I], true)) {
+ FoundAliasingStore = true;
+ ScoreBrackets.determineWait(LOAD_CNT, RegNo + I + 1, Wait);
+ }
+ }
+ }
+ if (!FoundAliasingStore)
+ ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
if (Memop->isStore()) {
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
}
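
A rough, illustrative-only model of the wait-slot scheme used above (the general slot at RegNo and the per-store slots at RegNo + I + 1): the container and alias callback below are hypothetical stand-ins for WaitcntBrackets state, not the pass's data structures.

    #include <vector>

    // Slot 0 is the general slot covering every LDS DMA store; slots 1..N
    // track individual stores that carried alias-scope metadata.
    struct LdsDmaSlots {
      std::vector<const void *> TrackedStores; // stores with alias scope info

      std::vector<unsigned>
      slotsToWaitOn(const void *Load,
                    bool (*MayAlias)(const void *, const void *)) const {
        std::vector<unsigned> Slots;
        for (size_t I = 0, E = TrackedStores.size(); I != E; ++I)
          if (MayAlias(Load, TrackedStores[I]))
            Slots.push_back(static_cast<unsigned>(I + 1)); // only aliasing stores
        if (Slots.empty())
          Slots.push_back(0); // conservative: wait on the general slot
        return Slots;
      }
    };
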
@@ -1213,14 +1811,18 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (Op.isUse() || !updateVMCntOnly(MI) ||
ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
getVmemType(MI))) {
- ScoreBrackets.determineWait(VM_CNT, RegNo, Wait);
+ ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
+ ScoreBrackets.determineWait(SAMPLE_CNT, RegNo, Wait);
+ ScoreBrackets.determineWait(BVH_CNT, RegNo, Wait);
ScoreBrackets.clearVgprVmemTypes(RegNo);
}
if (Op.isDef() || ScoreBrackets.hasPendingEvent(EXP_LDS_ACCESS)) {
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
}
+ ScoreBrackets.determineWait(DS_CNT, RegNo, Wait);
+ } else {
+ ScoreBrackets.determineWait(SmemAccessCounter, RegNo, Wait);
}
- ScoreBrackets.determineWait(LGKM_CNT, RegNo, Wait);
}
}
}
@@ -1232,7 +1834,8 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// cause an exception. Otherwise, insert an explicit S_WAITCNT 0 here.
if (MI.getOpcode() == AMDGPU::S_BARRIER &&
!ST->hasAutoWaitcntBeforeBarrier() && !ST->supportsBackOffBarrier()) {
- Wait = Wait.combined(AMDGPU::Waitcnt::allZero(ST->hasVscnt()));
+ Wait = Wait.combined(
+ AMDGPU::Waitcnt::allZero(ST->hasExtendedWaitCounts(), ST->hasVscnt()));
}
// TODO: Remove this work-around, enable the assert for Bug 457939
@@ -1240,7 +1843,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
// independent of target.
if (readsVCCZ(MI) && ST->hasReadVCCZBug()) {
if (ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
- Wait.LgkmCnt = 0;
+ Wait.DsCnt = 0;
}
}
@@ -1248,35 +1851,54 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
ScoreBrackets.simplifyWaitcnt(Wait);
if (ForceEmitZeroWaitcnts)
- Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt();
+ Wait = AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts());
- if (ForceEmitWaitcnt[VM_CNT])
- Wait.VmCnt = 0;
+ if (ForceEmitWaitcnt[LOAD_CNT])
+ Wait.LoadCnt = 0;
if (ForceEmitWaitcnt[EXP_CNT])
Wait.ExpCnt = 0;
- if (ForceEmitWaitcnt[LGKM_CNT])
- Wait.LgkmCnt = 0;
+ if (ForceEmitWaitcnt[DS_CNT])
+ Wait.DsCnt = 0;
+ if (ForceEmitWaitcnt[SAMPLE_CNT])
+ Wait.SampleCnt = 0;
+ if (ForceEmitWaitcnt[BVH_CNT])
+ Wait.BvhCnt = 0;
+ if (ForceEmitWaitcnt[KM_CNT])
+ Wait.KmCnt = 0;
if (FlushVmCnt) {
- if (ScoreBrackets.hasPendingEvent(VM_CNT))
- Wait.VmCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
+ Wait.LoadCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
+ Wait.SampleCnt = 0;
+ if (ScoreBrackets.hasPendingEvent(BVH_CNT))
+ Wait.BvhCnt = 0;
}
return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
OldWaitcntInstr);
}
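
Where the pre-GFX12 path had a single vmcnt to flush, the FlushVmCnt request above (and again in generateWaitcntBlockEnd below) fans out over LOADcnt, SAMPLEcnt and BVHcnt. A minimal sketch of that fan-out, using a simplified stand-in struct in which ~0u means no wait is needed:

    struct SketchWait {
      unsigned LoadCnt = ~0u, SampleCnt = ~0u, BvhCnt = ~0u;
    };

    // Only counters with pending events are forced to zero.
    static void applyVmemFlush(bool LoadPending, bool SamplePending,
                               bool BvhPending, SketchWait &W) {
      if (LoadPending)
        W.LoadCnt = 0;   // wait until all outstanding VMEM loads retire
      if (SamplePending)
        W.SampleCnt = 0; // ...and all image-sample operations
      if (BvhPending)
        W.BvhCnt = 0;    // ...and all BVH (ray tracing) operations
    }
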
-// Add a waitcnt to flush the vmcnt counter at the end of the given block if
-// needed.
+// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
+// end of the given block if needed.
bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr) {
AMDGPU::Waitcnt Wait;
- if (!ScoreBrackets.hasPendingEvent(VM_CNT))
+ unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
+ unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
+ unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
+
+ if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
return false;
- Wait.VmCnt = 0;
+ if (LoadCntPending != 0)
+ Wait.LoadCnt = 0;
+ if (SampleCntPending != 0)
+ Wait.SampleCnt = 0;
+ if (BvhCntPending != 0)
+ Wait.BvhCnt = 0;
return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
OldWaitcntInstr);
@@ -1288,15 +1910,16 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
WaitcntBrackets &ScoreBrackets,
MachineInstr *OldWaitcntInstr) {
bool Modified = false;
- const DebugLoc &DL = Block.findDebugLoc(It);
if (OldWaitcntInstr)
// Try to merge the required wait with preexisting waitcnt instructions.
// Also erase redundant waitcnt.
Modified =
- applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
- else
- ScoreBrackets.applyWaitcnt(Wait);
+ WCG->applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
+
+  // Any counts that could have been applied to existing waitcnt
+  // instructions have already been applied; now deal with any that remain.
+ ScoreBrackets.applyWaitcnt(Wait);
// ExpCnt can be merged into VINTERP.
if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
@@ -1309,35 +1932,13 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
}
Wait.ExpCnt = ~0u;
- LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+ LLVM_DEBUG(dbgs() << "generateWaitcnt\n"
<< "Update Instr: " << *It);
}
- // Build new waitcnt instructions unless no wait is needed or the old waitcnt
- // instruction was modified to handle the required wait.
- if (Wait.hasWaitExceptVsCnt()) {
- unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
- [[maybe_unused]] auto SWaitInst =
- BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
+ if (WCG->createNewWaitcnt(Block, It, Wait))
Modified = true;
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
- }
-
- if (Wait.hasWaitVsCnt()) {
- assert(ST->hasVscnt());
-
- [[maybe_unused]] auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(Wait.VsCnt);
- Modified = true;
-
- LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
- if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
- dbgs() << "New Instr: " << *SWaitInst << '\n');
- }
return Modified;
}
@@ -1435,7 +2036,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
// Now look at the instruction opcode. If it is a memory access
// instruction, update the upper-bound of the appropriate counter's
// bracket and the destination operand scores.
- // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
+ // TODO: Use the (TSFlags & SIInstrFlags::DS_CNT) property everywhere.
+
if (TII->isDS(Inst) && TII->usesLGKM_CNT(Inst)) {
if (TII->isAlwaysGDS(Inst.getOpcode()) ||
TII->hasModifiersSet(Inst, AMDGPU::OpName::gds)) {
@@ -1486,7 +2088,8 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
} else if (Inst.isCall()) {
if (callWaitsOnFunctionReturn(Inst)) {
// Act as a wait on everything
- ScoreBrackets->applyWaitcnt(AMDGPU::Waitcnt::allZeroExceptVsCnt());
+ ScoreBrackets->applyWaitcnt(
+ AMDGPU::Waitcnt::allZeroExceptVsCnt(ST->hasExtendedWaitCounts()));
ScoreBrackets->setStateOnFunctionEntryOrReturn();
} else {
      // May need to wait for anything.
@@ -1546,7 +2149,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
VgprUB = std::max(VgprUB, Other.VgprUB);
SgprUB = std::max(SgprUB, Other.SgprUB);
- for (auto T : inst_counter_types()) {
+ for (auto T : inst_counter_types(MaxCounter)) {
// Merge event flags for this counter
const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
@@ -1574,7 +2177,7 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
for (int J = 0; J <= VgprUB; J++)
StrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
- if (T == LGKM_CNT) {
+ if (T == SmemAccessCounter) {
for (int J = 0; J <= SgprUB; J++)
StrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
}
@@ -1590,10 +2193,13 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
}
static bool isWaitInstr(MachineInstr &Inst) {
- auto Opcode = Inst.getOpcode();
- return SIInstrInfo::isWaitcnt(Opcode) ||
- (SIInstrInfo::isWaitcntVsCnt(Opcode) && Inst.getOperand(0).isReg() &&
- Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+ unsigned Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Inst.getOpcode());
+ return Opcode == AMDGPU::S_WAITCNT ||
+ (Opcode == AMDGPU::S_WAITCNT_VSCNT && Inst.getOperand(0).isReg() &&
+ Inst.getOperand(0).getReg() == AMDGPU::SGPR_NULL) ||
+ Opcode == AMDGPU::S_WAIT_LOADCNT_DSCNT ||
+ Opcode == AMDGPU::S_WAIT_STORECNT_DSCNT ||
+ counterTypeForInstr(Opcode).has_value();
}
// Generate s_waitcnt instructions where needed.
@@ -1699,8 +2305,8 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// an S_WAITCNT vmcnt(0)
if (RequireCheckResourceType(Inst, context)) {
// Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
- ScoreBrackets->setScoreLB(VM_CNT,
- ScoreBrackets->getScoreUB(VM_CNT));
+ ScoreBrackets->setScoreLB(LOAD_CNT,
+ ScoreBrackets->getScoreUB(LOAD_CNT));
}
#endif
@@ -1802,7 +2408,12 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
VgprUse.insert(RegNo);
// If at least one of Op's registers is in the score brackets, the
// value is likely loaded outside of the loop.
- if (Brackets.getRegScore(RegNo, VM_CNT) > Brackets.getScoreLB(VM_CNT)) {
+ if (Brackets.getRegScore(RegNo, LOAD_CNT) >
+ Brackets.getScoreLB(LOAD_CNT) ||
+ Brackets.getRegScore(RegNo, SAMPLE_CNT) >
+ Brackets.getScoreLB(SAMPLE_CNT) ||
+ Brackets.getRegScore(RegNo, BVH_CNT) >
+ Brackets.getScoreLB(BVH_CNT)) {
UsesVgprLoadedOutside = true;
break;
}
@@ -1830,23 +2441,48 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();
- IV = AMDGPU::getIsaVersion(ST->getCPU());
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MLI = &getAnalysis<MachineLoopInfo>();
PDT = &getAnalysis<MachinePostDominatorTree>();
+ if (auto AAR = getAnalysisIfAvailable<AAResultsWrapperPass>())
+ AA = &AAR->getAAResults();
+
+ AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion(ST->getCPU());
+
+ if (ST->hasExtendedWaitCounts()) {
+ MaxCounter = NUM_EXTENDED_INST_CNTS;
+ WCGGFX12Plus = WaitcntGeneratorGFX12Plus(ST, MaxCounter);
+ WCG = &WCGGFX12Plus;
+ } else {
+ MaxCounter = NUM_NORMAL_INST_CNTS;
+ WCGPreGFX12 = WaitcntGeneratorPreGFX12(ST);
+ WCG = &WCGPreGFX12;
+ }
ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
for (auto T : inst_counter_types())
ForceEmitWaitcnt[T] = false;
+ const unsigned *WaitEventMaskForInst = WCG->getWaitEventMask();
+
+ SmemAccessCounter = eventCounter(WaitEventMaskForInst, SMEM_ACCESS);
+
OptNone = MF.getFunction().hasOptNone() ||
MF.getTarget().getOptLevel() == CodeGenOptLevel::None;
HardwareLimits Limits = {};
- Limits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
+ if (ST->hasExtendedWaitCounts()) {
+ Limits.LoadcntMax = AMDGPU::getLoadcntBitMask(IV);
+ Limits.DscntMax = AMDGPU::getDscntBitMask(IV);
+ } else {
+ Limits.LoadcntMax = AMDGPU::getVmcntBitMask(IV);
+ Limits.DscntMax = AMDGPU::getLgkmcntBitMask(IV);
+ }
Limits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
- Limits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
- Limits.VscntMax = ST->hasVscnt() ? 63 : 0;
+ Limits.StorecntMax = AMDGPU::getStorecntBitMask(IV);
+ Limits.SamplecntMax = AMDGPU::getSamplecntBitMask(IV);
+ Limits.BvhcntMax = AMDGPU::getBvhcntBitMask(IV);
+ Limits.KmcntMax = AMDGPU::getKmcntBitMask(IV);
unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
@@ -1864,6 +2500,9 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
BlockInfos.clear();
bool Modified = false;
+ MachineBasicBlock &EntryBB = MF.front();
+ MachineBasicBlock::iterator I = EntryBB.begin();
+
if (!MFI->isEntryFunction()) {
// Wait for any outstanding memory operations that the input registers may
// depend on. We can't track them and it's better to do the wait after the
@@ -1871,15 +2510,28 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.
- MachineBasicBlock &EntryBB = MF.front();
- MachineBasicBlock::iterator I = EntryBB.begin();
for (MachineBasicBlock::iterator E = EntryBB.end();
I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
;
- BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
- auto NonKernelInitialState =
- std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
+ if (ST->hasExtendedWaitCounts()) {
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAIT_LOADCNT_DSCNT))
+ .addImm(0);
+ for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) {
+ if (CT == LOAD_CNT || CT == DS_CNT || CT == STORE_CNT)
+ continue;
+
+ BuildMI(EntryBB, I, DebugLoc(),
+ TII->get(instrsForExtendedCounterTypes[CT]))
+ .addImm(0);
+ }
+ } else {
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
+ }
+
+ auto NonKernelInitialState = std::make_unique<WaitcntBrackets>(
+ ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
+ SmemAccessCounter);
NonKernelInitialState->setStateOnFunctionEntryOrReturn();
BlockInfos[&EntryBB].Incoming = std::move(NonKernelInitialState);
@@ -1910,9 +2562,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
*Brackets = *BI.Incoming;
} else {
if (!Brackets)
- Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
+ Brackets = std::make_unique<WaitcntBrackets>(
+ ST, MaxCounter, Limits, Encoding, WaitEventMaskForInst,
+ SmemAccessCounter);
else
- *Brackets = WaitcntBrackets(ST, Limits, Encoding);
+ *Brackets = WaitcntBrackets(ST, MaxCounter, Limits, Encoding,
+ WaitEventMaskForInst, SmemAccessCounter);
}
Modified |= insertWaitcntInBlock(MF, *MBB, *Brackets);
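
A sketch of the generator-selection pattern set up in runOnMachineFunction above: one abstract interface, two concrete strategies, chosen once per function from the subtarget. The class and method names below are simplified stand-ins for WaitcntGeneratorPreGFX12 / WaitcntGeneratorGFX12Plus, not the pass's real types.

    struct WaitcntGen {
      virtual ~WaitcntGen() = default;
      virtual bool createWaits() = 0; // emit whatever wait insts the target needs
    };

    struct PreGfx12Gen final : WaitcntGen {
      bool createWaits() override {
        // would build S_WAITCNT / S_WAITCNT_VSCNT here
        return false;
      }
    };

    struct Gfx12PlusGen final : WaitcntGen {
      bool createWaits() override {
        // would build the separate S_WAIT_*CNT family here
        return false;
      }
    };

    static WaitcntGen &pickGenerator(bool HasExtendedWaitCounts,
                                     PreGfx12Gen &Pre, Gfx12PlusGen &Gfx12) {
      if (HasExtendedWaitCounts)
        return Gfx12; // GFX12+: extended, per-event counters
      return Pre;     // older targets: legacy combined s_waitcnt
    }
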
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index e50f5f28e030..f4ca27808a30 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2410,13 +2410,22 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
// the encoding of $symbol starts 12 bytes after the start of the s_add_u32
// instruction.
+ int64_t Adjust = 0;
+ if (ST.hasGetPCZeroExtension()) {
+ // Fix up hardware that does not sign-extend the 48-bit PC value by
+ // inserting: s_sext_i32_i16 reghi, reghi
+ Bundler.append(
+ BuildMI(MF, DL, get(AMDGPU::S_SEXT_I32_I16), RegHi).addReg(RegHi));
+ Adjust += 4;
+ }
+
if (OpLo.isGlobal())
- OpLo.setOffset(OpLo.getOffset() + 4);
+ OpLo.setOffset(OpLo.getOffset() + Adjust + 4);
Bundler.append(
BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo));
if (OpHi.isGlobal())
- OpHi.setOffset(OpHi.getOffset() + 12);
+ OpHi.setOffset(OpHi.getOffset() + Adjust + 12);
Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
.addReg(RegHi)
.add(OpHi));
@@ -2480,6 +2489,19 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::S_MUL_I64_I32_PSEUDO:
MI.setDesc(get(AMDGPU::S_MUL_U64));
break;
+
+ case AMDGPU::S_GETPC_B64_pseudo:
+ MI.setDesc(get(AMDGPU::S_GETPC_B64));
+ if (ST.hasGetPCZeroExtension()) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+ // Fix up hardware that does not sign-extend the 48-bit PC value by
+ // inserting: s_sext_i32_i16 dsthi, dsthi
+ BuildMI(MBB, std::next(MI.getIterator()), DL, get(AMDGPU::S_SEXT_I32_I16),
+ DstHi)
+ .addReg(DstHi);
+ }
+ break;
}
return true;
}
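
The Adjust bookkeeping above exists because the extra s_sext_i32_i16 shifts the encoded offsets of the following instructions by 4 bytes. As for what the fix-up itself computes, a hedged host-side sketch (illustrative arithmetic only, not target code):

    #include <cstdint>

    // The PC is a 48-bit value split across a 32-bit low half and the low
    // 16 bits of the high half. Hardware that zero-extends leaves the upper
    // 16 bits of the high half clear; sign-extending the high half's low 16
    // bits (what s_sext_i32_i16 does) restores the canonical 64-bit address.
    static uint64_t fixupGetPC(uint32_t Lo, uint32_t HiZeroExtended) {
      int32_t HiFixed = static_cast<int16_t>(HiZeroExtended & 0xffffu);
      return (static_cast<uint64_t>(static_cast<uint32_t>(HiFixed)) << 32) | Lo;
    }
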
@@ -5280,7 +5302,8 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
return ST.useRealTrue16Insts() ? AMDGPU::V_CEIL_F16_t16_e64
: AMDGPU::V_CEIL_F16_fake16_e64;
case AMDGPU::S_FLOOR_F16:
- return AMDGPU::V_FLOOR_F16_fake16_e64;
+ return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64
+ : AMDGPU::V_FLOOR_F16_fake16_e64;
case AMDGPU::S_TRUNC_F16:
return AMDGPU::V_TRUNC_F16_fake16_e64;
case AMDGPU::S_RNDNE_F16:
@@ -8756,6 +8779,7 @@ SIInstrInfo::getSerializableMachineMemOperandTargetFlags() const {
static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
{
{MONoClobber, "amdgpu-noclobber"},
+ {MOLastUse, "amdgpu-last-use"},
};
return ArrayRef(TargetFlags);
@@ -8944,8 +8968,9 @@ bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset,
// Depending on the used address space and instructions, some immediate offsets
// are allowed and some are not.
-// In general, flat instruction offsets can only be non-negative, global and
-// scratch instruction offsets can also be negative.
+// Pre-GFX12, flat instruction offsets can only be non-negative; global and
+// scratch instruction offsets can also be negative. On GFX12, offsets can be
+// negative for all variants.
//
// There are several bugs related to these offsets:
// On gfx10.1, flat instructions that go into the global address space cannot
@@ -9076,8 +9101,7 @@ bool SIInstrInfo::isAsmOnlyOpcode(int MCOp) const {
}
int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
- if (SIInstrInfo::isSoftWaitcnt(Opcode))
- Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
+ Opcode = SIInstrInfo::getNonSoftWaitcntOpcode(Opcode);
unsigned Gen = subtargetEncodingFamily(ST);
@@ -9113,12 +9137,6 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const {
int MCOp = AMDGPU::getMCOpcode(Opcode, Gen);
- // TODO-GFX12: Remove this.
- // Hack to allow some GFX12 codegen tests to run before all the encodings are
- // implemented.
- if (MCOp == (uint16_t)-1 && Gen == SIEncodingFamily::GFX12)
- MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX11);
-
// -1 means that Opcode is already a native instruction.
if (MCOp == -1)
return Opcode;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index 37ee159362a2..fc85b089aa47 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -41,6 +41,10 @@ class ScheduleHazardRecognizer;
static const MachineMemOperand::Flags MONoClobber =
MachineMemOperand::MOTargetFlag1;
+/// Mark the MMO of a load as the last use.
+static const MachineMemOperand::Flags MOLastUse =
+ MachineMemOperand::MOTargetFlag2;
+
/// Utility to store machine instructions worklist.
struct SIInstrWorklist {
SIInstrWorklist() = default;
@@ -905,29 +909,24 @@ public:
}
static unsigned getNonSoftWaitcntOpcode(unsigned Opcode) {
- if (isWaitcnt(Opcode))
+ switch (Opcode) {
+ case AMDGPU::S_WAITCNT_soft:
return AMDGPU::S_WAITCNT;
-
- if (isWaitcntVsCnt(Opcode))
+ case AMDGPU::S_WAITCNT_VSCNT_soft:
return AMDGPU::S_WAITCNT_VSCNT;
-
- llvm_unreachable("Expected opcode S_WAITCNT/S_WAITCNT_VSCNT");
- }
-
- static bool isWaitcnt(unsigned Opcode) {
- return Opcode == AMDGPU::S_WAITCNT || Opcode == AMDGPU::S_WAITCNT_soft;
- }
-
- static bool isWaitcntVsCnt(unsigned Opcode) {
- return Opcode == AMDGPU::S_WAITCNT_VSCNT ||
- Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
- }
-
- // "Soft" waitcnt instructions can be relaxed/optimized out by
- // SIInsertWaitcnts.
- static bool isSoftWaitcnt(unsigned Opcode) {
- return Opcode == AMDGPU::S_WAITCNT_soft ||
- Opcode == AMDGPU::S_WAITCNT_VSCNT_soft;
+ case AMDGPU::S_WAIT_LOADCNT_soft:
+ return AMDGPU::S_WAIT_LOADCNT;
+ case AMDGPU::S_WAIT_STORECNT_soft:
+ return AMDGPU::S_WAIT_STORECNT;
+ case AMDGPU::S_WAIT_SAMPLECNT_soft:
+ return AMDGPU::S_WAIT_SAMPLECNT;
+ case AMDGPU::S_WAIT_BVHCNT_soft:
+ return AMDGPU::S_WAIT_BVHCNT;
+ case AMDGPU::S_WAIT_DSCNT_soft:
+ return AMDGPU::S_WAIT_DSCNT;
+ default:
+ return Opcode;
+ }
}
bool isVGPRCopy(const MachineInstr &MI) const {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 04c92155f5aa..a6820544f4b4 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -41,10 +41,29 @@ def SIEncodingFamily {
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
-def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
- SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
- [SDNPMayLoad, SDNPMemOperand]
->;
+def SDTSBufferLoad : SDTypeProfile<1, 3,
+ [ // vdata
+ SDTCisVT<1, v4i32>, // rsrc
+ SDTCisVT<2, i32>, // offset(imm)
+ SDTCisVT<3, i32>]>; // cachepolicy
+
+def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD", SDTSBufferLoad,
+ [SDNPMayLoad, SDNPMemOperand]>;
+
+def SIsbuffer_load_byte : SDNode<"AMDGPUISD::SBUFFER_LOAD_BYTE", SDTSBufferLoad,
+ [SDNPMayLoad, SDNPMemOperand]>;
+
+def SIsbuffer_load_ubyte
+ : SDNode<"AMDGPUISD::SBUFFER_LOAD_UBYTE", SDTSBufferLoad,
+ [SDNPMayLoad, SDNPMemOperand]>;
+
+def SIsbuffer_load_short
+ : SDNode<"AMDGPUISD::SBUFFER_LOAD_SHORT", SDTSBufferLoad,
+ [SDNPMayLoad, SDNPMemOperand]>;
+
+def SIsbuffer_load_ushort
+ : SDNode<"AMDGPUISD::SBUFFER_LOAD_USHORT", SDTSBufferLoad,
+ [SDNPMayLoad, SDNPMemOperand]>;
def SIds_ordered_count : SDNode<"AMDGPUISD::DS_ORDERED_COUNT",
SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, i16>]>,
@@ -195,8 +214,10 @@ defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
+defm SIbuffer_atomic_fadd_bf16 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD_BF16">;
defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
+defm SIbuffer_atomic_cond_sub_u32 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32">;
def SIbuffer_atomic_cmpswap : SDNode <"AMDGPUISD::BUFFER_ATOMIC_CMPSWAP",
SDTypeProfile<1, 9,
@@ -281,49 +302,10 @@ def SIfptrunc_round_downward : SDNode<"AMDGPUISD::FPTRUNC_ROUND_DOWNWARD",
// ValueType helpers
//===----------------------------------------------------------------------===//
-// Returns 1 if the source arguments have modifiers, 0 if they do not.
-class isFloatType<ValueType SrcVT> {
- bit ret = !or(!eq(SrcVT.Value, f16.Value),
- !eq(SrcVT.Value, f32.Value),
- !eq(SrcVT.Value, f64.Value),
- !eq(SrcVT.Value, v2f16.Value),
- !eq(SrcVT.Value, v4f16.Value),
- !eq(SrcVT.Value, v8f16.Value),
- !eq(SrcVT.Value, v16f16.Value),
- !eq(SrcVT.Value, v2f32.Value),
- !eq(SrcVT.Value, v4f32.Value),
- !eq(SrcVT.Value, v8f32.Value),
- !eq(SrcVT.Value, v2f64.Value),
- !eq(SrcVT.Value, v4f64.Value));
-}
-
-// XXX - do v2i16 instructions?
class isIntType<ValueType SrcVT> {
- bit ret = !or(!eq(SrcVT.Value, i8.Value),
- !eq(SrcVT.Value, i16.Value),
- !eq(SrcVT.Value, i32.Value),
- !eq(SrcVT.Value, i64.Value),
- !eq(SrcVT.Value, v4i16.Value),
- !eq(SrcVT.Value, v8i16.Value),
- !eq(SrcVT.Value, v16i16.Value),
- !eq(SrcVT.Value, v2i32.Value),
- !eq(SrcVT.Value, v4i32.Value),
- !eq(SrcVT.Value, v8i32.Value));
+ bit ret = !and(SrcVT.isInteger, !ne(SrcVT.Value, i1.Value));
}
-class isPackedType<ValueType SrcVT> {
- bit ret = !or(!eq(SrcVT.Value, v2i16.Value),
- !eq(SrcVT.Value, v2f16.Value),
- !eq(SrcVT.Value, v4f16.Value),
- !eq(SrcVT.Value, v2i32.Value),
- !eq(SrcVT.Value, v2f32.Value),
- !eq(SrcVT.Value, v4i32.Value),
- !eq(SrcVT.Value, v4f32.Value),
- !eq(SrcVT.Value, v8i32.Value),
- !eq(SrcVT.Value, v8f32.Value));
-}
-
-
//===----------------------------------------------------------------------===//
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
@@ -806,12 +788,9 @@ class bitextract_imm<int bitnum> : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(Bit, SDLoc(N), MVT::i1);
}]>;
-def SIMM16bit : ImmLeaf <i32,
- [{return isInt<16>(Imm);}]
->;
-
-def UIMM16bit : ImmLeaf <i32,
- [{return isUInt<16>(Imm);}]
+def SIMM16bit : TImmLeaf <i32,
+ [{return isInt<16>(Imm) || isUInt<16>(Imm);}],
+ as_i16timm
>;
def i64imm_32bit : ImmLeaf<i64, [{
@@ -885,8 +864,11 @@ def extract_swz : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(Swizzle, SDLoc(N), MVT::i8);
}]>;
-def set_glc : SDNodeXForm<timm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
+def extract_cpol_set_glc : SDNodeXForm<timm, [{
+ const uint32_t cpol = N->getZExtValue() & (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12
+ ? AMDGPU::CPol::ALL
+ : AMDGPU::CPol::ALL_pregfx12);
+ return CurDAG->getTargetConstant(cpol | AMDGPU::CPol::GLC, SDLoc(N), MVT::i8);
}]>;
//===----------------------------------------------------------------------===//
@@ -993,7 +975,7 @@ def ExpSrc3 : RegisterOperand<VGPR_32> {
class SDWASrc<ValueType vt> : RegisterOperand<VS_32> {
let OperandNamespace = "AMDGPU";
- string Type = !if(isFloatType<vt>.ret, "FP", "INT");
+ string Type = !if(vt.isFP, "FP", "INT");
let OperandType = "OPERAND_REG_INLINE_C_"#Type#vt.Size;
let DecoderMethod = "decodeSDWASrc"#vt.Size;
let EncoderMethod = "getSDWASrcEncoding";
@@ -1241,17 +1223,20 @@ def FPVRegInputModsMatchClass : AsmOperandClass {
let PredicateMethod = "isVRegWithInputMods";
}
-def FPT16VRegInputModsMatchClass : AsmOperandClass {
- let Name = "T16VRegWithFPInputMods";
+class FPT16VRegInputModsMatchClass<bit IsFake16> : AsmOperandClass {
+ let Name = !if(IsFake16, "Fake16VRegWithFPInputMods",
+ "T16VRegWithFPInputMods");
let ParserMethod = "parseRegWithFPInputMods";
- let PredicateMethod = "isT16VRegWithInputMods";
+ let PredicateMethod = "isT16VRegWithInputMods<" #
+ !if(IsFake16, "true", "false") # ">";
}
def FPVRegInputMods : InputMods <FPVRegInputModsMatchClass> {
let PrintMethod = "printOperandAndFPInputMods";
}
-def FPT16VRegInputMods : InputMods <FPT16VRegInputModsMatchClass> {
+class FPT16VRegInputMods<bit IsFake16>
+ : InputMods <FPT16VRegInputModsMatchClass<IsFake16>> {
let PrintMethod = "printOperandAndFPInputMods";
}
@@ -1283,13 +1268,16 @@ def IntVRegInputModsMatchClass : AsmOperandClass {
let PredicateMethod = "isVRegWithInputMods";
}
-def IntT16VRegInputModsMatchClass : AsmOperandClass {
- let Name = "T16VRegWithIntInputMods";
+class IntT16VRegInputModsMatchClass<bit IsFake16> : AsmOperandClass {
+ let Name = !if(IsFake16, "Fake16VRegWithIntInputMods",
+ "T16VRegWithIntInputMods");
let ParserMethod = "parseRegWithIntInputMods";
- let PredicateMethod = "isT16VRegWithInputMods";
+ let PredicateMethod = "isT16VRegWithInputMods<" #
+ !if(IsFake16, "true", "false") # ">";
}
-def IntT16VRegInputMods : InputMods <IntT16VRegInputModsMatchClass> {
+class IntT16VRegInputMods<bit IsFake16>
+ : InputMods <IntT16VRegInputModsMatchClass<IsFake16>> {
let PrintMethod = "printOperandAndIntInputMods";
}
@@ -1353,7 +1341,7 @@ def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
def VOP3PModsDOT : ComplexPattern<untyped, 2, "SelectVOP3PModsDOT">;
-def DotIUVOP3PMods : ComplexPattern<untyped, 1, "SelectDotIUVOP3PMods">;
+def VOP3PModsNeg : ComplexPattern<untyped, 1, "SelectVOP3PModsNeg">;
def WMMAOpSelVOP3PMods : ComplexPattern<untyped, 1, "SelectWMMAOpSelVOP3PMods">;
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
@@ -1489,20 +1477,18 @@ class getSDWADstForVT<ValueType VT> {
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT, bit IsTrue16, bit IsFake16 = 1> {
- bit isFP = isFloatType<VT>.ret;
-
RegisterOperand ret =
- !if(isFP,
+ !if(VT.isFP,
!if(!eq(VT.Size, 64),
VSrc_f64,
- !if(!eq(VT.Value, f16.Value),
+ !if(!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
!if(IsTrue16,
!if(IsFake16, VSrcFake16_f16_Lo128, VSrcT_f16_Lo128),
VSrc_f16
),
- !if(!eq(VT.Value, v2f16.Value),
+ !if(!or(!eq(VT.Value, v2f16.Value), !eq(VT.Value, v2bf16.Value)),
VSrc_v2f16,
- !if(!eq(VT.Value, v4f16.Value),
+ !if(!or(!eq(VT.Value, v4f16.Value), !eq(VT.Value, v4bf16.Value)),
AVSrc_64,
VSrc_f32
)
@@ -1530,43 +1516,33 @@ class getSOPSrcForVT<ValueType VT> {
}
// Returns the vreg register class to use for source operand given VT
-class getVregSrcForVT<ValueType VT> {
- RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
- !if(!eq(VT.Size, 96), VReg_96,
- !if(!eq(VT.Size, 64), VReg_64,
- !if(!eq(VT.Size, 48), VReg_64,
- VGPR_32))));
-}
-
-class getVregSrcForVT_t16<ValueType VT, bit IsFake16 = 1> {
- RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
- !if(!eq(VT.Size, 96), VReg_96,
- !if(!eq(VT.Size, 64), VReg_64,
- !if(!eq(VT.Size, 48), VReg_64,
- !if(!eq(VT.Size, 16),
- !if(IsFake16, VGPR_32_Lo128, VGPR_16_Lo128),
- VGPR_32)))));
-
- RegisterOperand op = !if (!and(!eq(VT.Size, 16), !not(IsFake16)),
- VGPRSrc_16_Lo128, RegisterOperand<ret>);
+class getVregSrcForVT<ValueType VT, bit IsTrue16 = 0, bit IsFake16 = 0> {
+ RegisterOperand ret =
+ !if (!eq(VT.Size, 128), RegisterOperand<VReg_128>,
+ !if (!eq(VT.Size, 96), RegisterOperand<VReg_96>,
+ !if (!eq(VT.Size, 64), RegisterOperand<VReg_64>,
+ !if (!eq(VT.Size, 48), RegisterOperand<VReg_64>,
+ !if (!eq(VT.Size, 16),
+ !if (IsTrue16,
+ !if (IsFake16, VGPRSrc_32_Lo128, VGPRSrc_16_Lo128),
+ RegisterOperand<VGPR_32>),
+ RegisterOperand<VGPR_32>)))));
}
class getSDWASrcForVT <ValueType VT> {
- bit isFP = isFloatType<VT>.ret;
RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32);
RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32);
- RegisterOperand ret = !if(isFP, retFlt, retInt);
+ RegisterOperand ret = !if(VT.isFP, retFlt, retInt);
}
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
- bit isFP = isFloatType<VT>.ret;
RegisterOperand ret =
!if(!eq(VT.Size, 128),
VRegSrc_128,
!if(!eq(VT.Size, 64),
- !if(isFP,
+ !if(VT.isFP,
!if(!eq(VT.Value, v2f32.Value),
VSrc_v2f32,
VSrc_f64),
@@ -1575,12 +1551,12 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
VSrc_b64)),
!if(!eq(VT.Value, i1.Value),
SSrc_i1,
- !if(isFP,
- !if(!eq(VT.Value, f16.Value),
+ !if(VT.isFP,
+ !if(!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
!if(IsTrue16, VSrcT_f16, VSrc_f16),
- !if(!eq(VT.Value, v2f16.Value),
+ !if(!or(!eq(VT.Value, v2f16.Value), !eq(VT.Value, v2bf16.Value)),
VSrc_v2f16,
- !if(!eq(VT.Value, v4f16.Value),
+ !if(!or(!eq(VT.Value, v4f16.Value), !eq(VT.Value, v4bf16.Value)),
AVSrc_64,
VSrc_f32
)
@@ -1601,12 +1577,11 @@ class getVOP3SrcForVT<ValueType VT, bit IsTrue16 = 0> {
// Src2 of VOP3 DPP instructions cannot be a literal
class getVOP3DPPSrcForVT<ValueType VT> {
- bit isFP = isFloatType<VT>.ret;
RegisterOperand ret =
!if (!eq(VT.Value, i1.Value), SSrc_i1,
- !if (isFP,
- !if (!eq(VT.Value, f16.Value), VCSrc_f16,
- !if (!eq(VT.Value, v2f16.Value), VCSrc_v2f16, VCSrc_f32)),
+ !if (VT.isFP,
+ !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), VCSrc_f16,
+ !if (!or(!eq(VT.Value, v2f16.Value), !eq(VT.Value, v2bf16.Value)), VCSrc_v2f16, VCSrc_f32)),
!if (!eq(VT.Value, i16.Value), VCSrc_b16,
!if (!eq(VT.Value, v2i16.Value), VCSrc_v2b16,
VCSrc_b32))));
@@ -1615,64 +1590,64 @@ class getVOP3DPPSrcForVT<ValueType VT> {
// Float or packed int
class isModifierType<ValueType SrcVT> {
bit ret = !or(!eq(SrcVT.Value, f16.Value),
+ !eq(SrcVT.Value, bf16.Value),
!eq(SrcVT.Value, f32.Value),
!eq(SrcVT.Value, f64.Value),
!eq(SrcVT.Value, v2f16.Value),
!eq(SrcVT.Value, v2i16.Value),
+ !eq(SrcVT.Value, v2bf16.Value),
!eq(SrcVT.Value, v2f32.Value),
!eq(SrcVT.Value, v2i32.Value),
!eq(SrcVT.Value, v4f16.Value),
!eq(SrcVT.Value, v4i16.Value),
+ !eq(SrcVT.Value, v4bf16.Value),
!eq(SrcVT.Value, v4f32.Value),
!eq(SrcVT.Value, v4i32.Value),
!eq(SrcVT.Value, v8f16.Value),
!eq(SrcVT.Value, v8i16.Value),
+ !eq(SrcVT.Value, v8bf16.Value),
!eq(SrcVT.Value, v8f32.Value),
!eq(SrcVT.Value, v8i32.Value),
!eq(SrcVT.Value, v16f16.Value),
- !eq(SrcVT.Value, v16i16.Value));
+ !eq(SrcVT.Value, v16i16.Value),
+ !eq(SrcVT.Value, v16bf16.Value));
}
// Return type of input modifiers operand for specified input operand
class getSrcMod <ValueType VT, bit IsTrue16 = 0> {
- bit isFP = isFloatType<VT>.ret;
- bit isPacked = isPackedType<VT>.ret;
Operand ret = !if(!eq(VT.Size, 64),
- !if(isFP, FP64InputMods, Int64InputMods),
+ !if(VT.isFP, FP64InputMods, Int64InputMods),
!if(!eq(VT.Size, 16),
- !if(isFP, !if(IsTrue16, FPT16InputMods, FP16InputMods),
- !if(IsTrue16, IntT16InputMods, IntOpSelMods)),
- !if(isFP, FP32InputMods, Int32InputMods)));
+ !if(VT.isFP, !if(IsTrue16, FPT16InputMods, FP16InputMods),
+ !if(IsTrue16, IntT16InputMods, IntOpSelMods)),
+ !if(VT.isFP, FP32InputMods, Int32InputMods)));
}
class getOpSelMod <ValueType VT> {
- Operand ret = !if(!eq(VT.Value, f16.Value), FP16InputMods, IntOpSelMods);
+ Operand ret = !if(!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ FP16InputMods, IntOpSelMods);
}
// Return type of input modifiers operand specified input operand for DPP
class getSrcModDPP <ValueType VT> {
- bit isFP = isFloatType<VT>.ret;
- Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
+ Operand ret = !if(VT.isFP, FPVRegInputMods, IntVRegInputMods);
}
-class getSrcModDPP_t16 <ValueType VT> {
- bit isFP = isFloatType<VT>.ret;
+class getSrcModDPP_t16 <ValueType VT, bit IsFake16 = 1> {
Operand ret =
- !if (isFP,
- !if (!eq(VT.Value, f16.Value), FPT16VRegInputMods,
- FPVRegInputMods),
- !if (!eq(VT.Value, i16.Value), IntT16VRegInputMods,
- IntVRegInputMods));
+ !if (VT.isFP,
+ !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ FPT16VRegInputMods<IsFake16>, FPVRegInputMods),
+ !if (!eq(VT.Value, i16.Value),
+ IntT16VRegInputMods<IsFake16>, IntVRegInputMods));
}
// Return type of input modifiers operand for specified input operand for DPP
class getSrcModVOP3DPP <ValueType VT> {
- bit isFP = isFloatType<VT>.ret;
- bit isPacked = isPackedType<VT>.ret;
Operand ret =
- !if (isFP,
- !if (!eq(VT.Value, f16.Value), FP16VCSrcInputMods,
- FP32VCSrcInputMods),
+ !if (VT.isFP,
+ !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)),
+ FP16VCSrcInputMods, FP32VCSrcInputMods),
Int32VCSrcInputMods);
}
@@ -1681,7 +1656,8 @@ class getSrcModSDWA <ValueType VT> {
Operand ret = !if(!eq(VT.Value, f16.Value), FP16SDWAInputMods,
!if(!eq(VT.Value, f32.Value), FP32SDWAInputMods,
!if(!eq(VT.Value, i16.Value), Int16SDWAInputMods,
- Int32SDWAInputMods)));
+ !if(!eq(VT.Value, bf16.Value), FP16SDWAInputMods,
+ Int32SDWAInputMods))));
}
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
@@ -1806,10 +1782,9 @@ class getInsVOP3OpSel <RegisterOperand Src0RC, RegisterOperand Src1RC,
Src0Mod, Src1Mod, Src2Mod, /*HasOpSel=*/1>.ret;
}
-class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
- RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld> {
-
+class getInsDPPBase <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld> {
dag ret = !if(!eq(NumSrcArgs, 0),
// VOP1 without input operands (V_NOP)
(ins ),
@@ -1849,8 +1824,8 @@ class getInsDPPBase <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass
);
}
-class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
- RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
+class getInsDPP <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs, bit HasModifiers,
Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
@@ -1858,17 +1833,17 @@ class getInsDPP <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl));
}
-class getInsDPP16 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
- RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
+class getInsDPP16 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPP<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
(ins FI:$fi));
}
-class getInsDPP8 <RegisterOperand OldRC, RegisterClass Src0RC, RegisterClass Src1RC,
- RegisterClass Src2RC, int NumSrcArgs, bit HasModifiers,
- Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
+class getInsDPP8 <RegisterOperand OldRC, RegisterOperand Src0RC, RegisterOperand Src1RC,
+ RegisterOperand Src2RC, int NumSrcArgs, bit HasModifiers,
+ Operand Src0Mod, Operand Src1Mod, Operand Src2Mod, bit HasOld = 1> {
dag ret = !con(getInsDPPBase<OldRC, Src0RC, Src1RC, Src2RC, NumSrcArgs,
HasModifiers, Src0Mod, Src1Mod, Src2Mod, HasOld>.ret,
(ins dpp8:$dpp8, FI:$fi));
@@ -2273,13 +2248,13 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field RegisterOperand DstRCVOP3DPP = DstRC64;
field RegisterOperand DstRCSDWA = getSDWADstForVT<DstVT>.ret;
field RegisterOperand Src0RC32 = getVOPSrc0ForVT<Src0VT, IsTrue16>.ret;
- field RegisterOperand Src1RC32 = RegisterOperand<getVregSrcForVT<Src1VT>.ret>;
+ field RegisterOperand Src1RC32 = getVregSrcForVT<Src1VT>.ret;
field RegisterOperand Src0RC64 = getVOP3SrcForVT<Src0VT>.ret;
field RegisterOperand Src1RC64 = getVOP3SrcForVT<Src1VT>.ret;
field RegisterOperand Src2RC64 = getVOP3SrcForVT<Src2VT>.ret;
- field RegisterClass Src0DPP = getVregSrcForVT<Src0VT>.ret;
- field RegisterClass Src1DPP = getVregSrcForVT<Src1VT>.ret;
- field RegisterClass Src2DPP = getVregSrcForVT<Src2VT>.ret;
+ field RegisterOperand Src0DPP = getVregSrcForVT<Src0VT>.ret;
+ field RegisterOperand Src1DPP = getVregSrcForVT<Src1VT>.ret;
+ field RegisterOperand Src2DPP = getVregSrcForVT<Src2VT>.ret;
field RegisterOperand Src0VOP3DPP = VGPRSrc_32;
field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT>.ret;
field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT>.ret;
@@ -2313,9 +2288,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit HasSrc1 = !ne(Src1VT.Value, untyped.Value);
field bit HasSrc2 = !ne(Src2VT.Value, untyped.Value);
- field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret;
- field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret;
- field bit HasSrc2FloatMods = isFloatType<Src2VT>.ret;
+ field bit HasSrc0FloatMods = Src0VT.isFP;
+ field bit HasSrc1FloatMods = Src1VT.isFP;
+ field bit HasSrc2FloatMods = Src2VT.isFP;
field bit HasSrc0IntMods = isIntType<Src0VT>.ret;
field bit HasSrc1IntMods = isIntType<Src1VT>.ret;
@@ -2323,16 +2298,16 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit HasClamp = !or(isModifierType<Src0VT>.ret, EnableClamp);
field bit HasSDWAClamp = EmitDst;
- field bit HasFPClamp = !and(isFloatType<DstVT>.ret, HasClamp);
- field bit HasIntClamp = !if(isFloatType<DstVT>.ret, 0, HasClamp);
+ field bit HasFPClamp = !and(DstVT.isFP, HasClamp);
+ field bit HasIntClamp = !if(DstVT.isFP, 0, HasClamp);
field bit HasClampLo = HasClamp;
- field bit HasClampHi = !and(isPackedType<DstVT>.ret, HasClamp);
+ field bit HasClampHi = !and(DstVT.isVector, HasClamp);
field bit HasHigh = 0;
- field bit IsPacked = isPackedType<Src0VT>.ret;
+ field bit IsPacked = Src0VT.isVector;
field bit HasOpSel = IsPacked;
- field bit HasOMod = !if(IsVOP3P, 0, isFloatType<DstVT>.ret);
- field bit HasSDWAOMod = isFloatType<DstVT>.ret;
+ field bit HasOMod = !if(IsVOP3P, 0, DstVT.isFP);
+ field bit HasSDWAOMod = DstVT.isFP;
field bit HasModifiers = !or(isModifierType<Src0VT>.ret,
isModifierType<Src1VT>.ret,
@@ -2465,13 +2440,13 @@ class VOPProfile_True16<VOPProfile P> : VOPProfile<P.ArgVT> {
let DstRC = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 0 /*IsVOP3Encoding*/>.ret;
let DstRC64 = getVALUDstForVT<DstVT>.ret;
let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret;
- let Src1RC32 = getVregSrcForVT_t16<Src1VT, 0 /*IsFake16*/>.op;
- let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
- let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret;
- let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret;
- let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
- let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
- let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
+ let Src1RC32 = getVregSrcForVT<Src1VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret;
+ let Src0DPP = getVregSrcForVT<Src0VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret;
+ let Src1DPP = getVregSrcForVT<Src1VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret;
+ let Src2DPP = getVregSrcForVT<Src2VT, 1 /*IsTrue16*/, 0 /*IsFake16*/>.ret;
+ let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0 /*IsFake16*/>.ret;
+ let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0 /*IsFake16*/>.ret;
+ let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0 /*IsFake16*/>.ret;
let DstRC64 = getVALUDstForVT<DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
let Src0RC64 = getVOP3SrcForVT<Src0VT, 1 /*IsTrue16*/>.ret;
@@ -2487,10 +2462,10 @@ class VOPProfile_Fake16<VOPProfile P> : VOPProfile<P.ArgVT> {
// Most DstVT are 16-bit, but not all
let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
let DstRC64 = getVALUDstForVT<DstVT>.ret;
- let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>;
- let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
- let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret;
- let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret;
+ let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
index b4bd46d33c1f..788e3162fb37 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1862,7 +1862,10 @@ class ClampPat<Instruction inst, ValueType vt> : GCNPat <
>;
def : ClampPat<V_MAX_F32_e64, f32>;
+let SubtargetPredicate = isNotGFX12Plus in
def : ClampPat<V_MAX_F64_e64, f64>;
+let SubtargetPredicate = isGFX12Plus in
+def : ClampPat<V_MAX_NUM_F64_e64, f64>;
let SubtargetPredicate = NotHasTrue16BitInsts in
def : ClampPat<V_MAX_F16_e64, f16>;
let SubtargetPredicate = UseRealTrue16Insts in
@@ -2990,10 +2993,12 @@ def : GCNPat<
}
// TODO: Handle fneg like other types.
+let SubtargetPredicate = isNotGFX12Plus in {
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
(V_MUL_F64_e64 0, CONST.FP64_ONE, $src_mods, $src)
>;
+}
} // End AddedComplexity = -5
multiclass SelectCanonicalizeAsMax<
@@ -3009,7 +3014,13 @@ multiclass SelectCanonicalizeAsMax<
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
(V_MAX_F64_e64 $src_mods, $src, $src_mods, $src)> {
- let OtherPredicates = f64_preds;
+ let OtherPredicates = !listconcat(f64_preds, [isNotGFX12Plus]);
+ }
+
+ def : GCNPat<
+ (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+ (V_MAX_NUM_F64_e64 $src_mods, $src, $src_mods, $src)> {
+ let OtherPredicates = !listconcat(f64_preds, [isGFX12Plus]);
}
def : GCNPat<
@@ -3856,11 +3867,13 @@ def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_FADD_BF16 : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;
@@ -3877,7 +3890,8 @@ def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
// a workaround for the intrinsic being defined as readnone, but
// really needs a memory operand.
-def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
+
+class SBufferLoadInstruction : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
let hasSideEffects = 0;
@@ -3885,6 +3899,12 @@ def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
let mayStore = 0;
}
+def G_AMDGPU_S_BUFFER_LOAD : SBufferLoadInstruction;
+def G_AMDGPU_S_BUFFER_LOAD_SBYTE : SBufferLoadInstruction;
+def G_AMDGPU_S_BUFFER_LOAD_UBYTE : SBufferLoadInstruction;
+def G_AMDGPU_S_BUFFER_LOAD_SSHORT : SBufferLoadInstruction;
+def G_AMDGPU_S_BUFFER_LOAD_USHORT : SBufferLoadInstruction;
+
def G_AMDGPU_S_MUL_U64_U32 : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$dst);
let InOperandList = (ins type0:$src0, type0:$src1);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 0ba7792ac436..4b13825040eb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -332,7 +332,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
}
bool MadeChange = false;
- bool NewReservedRegs = false;
bool SpilledToVirtVGPRLanes = false;
// TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
@@ -369,8 +368,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
// regalloc aware CFI generation to insert new CFIs along with the
      // intermediate spills is implemented. No such support currently
      // exists in the LLVM compiler.
- if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
- NewReservedRegs = true;
+ if (FuncInfo->allocateSGPRSpillToVGPRLane(
+ MF, FI, /*SpillToPhysVGPRLane=*/true)) {
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
MI, FI, nullptr, Indexes, LIS, true);
if (!Spilled)
@@ -442,12 +441,5 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
SaveBlocks.clear();
RestoreBlocks.clear();
- // Updated the reserved registers with any physical VGPRs added for SGPR
- // spills.
- if (NewReservedRegs) {
- for (Register Reg : FuncInfo->getWWMReservedRegs())
- MRI.reserveReg(Reg, TRI);
- }
-
return MadeChange;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index e8142244b7db..b94d143a75e5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -312,6 +312,33 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
return false;
}
+void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange(
+ MachineFunction &MF) {
+ const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned I = 0, E = SpillPhysVGPRs.size(); I < E; ++I) {
+ Register Reg = SpillPhysVGPRs[I];
+ Register NewReg =
+ TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+ if (!NewReg || NewReg >= Reg)
+ break;
+
+ MRI.replaceRegWith(Reg, NewReg);
+
+ // Update various tables with the new VGPR.
+ SpillPhysVGPRs[I] = NewReg;
+ WWMReservedRegs.remove(Reg);
+ WWMReservedRegs.insert(NewReg);
+ WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg]));
+ WWMSpills.erase(Reg);
+
+ for (MachineBasicBlock &MBB : MF) {
+ MBB.removeLiveIn(Reg);
+ MBB.sortUniqueLiveIns();
+ }
+ }
+}
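
An illustrative-only model of the shifting done above: the spill VGPRs were deliberately taken from the top of the register file before regalloc (see allocatePhysicalVGPRForSGPRSpills below), so afterwards each one is remapped to the lowest register that is still unused. "Registers" here are plain indices, not llvm::Register, and the liveness bookkeeping is reduced to a bit vector.

    #include <algorithm>
    #include <vector>

    static void shiftToLowestFree(std::vector<unsigned> &SpillRegs,
                                  std::vector<bool> &Used) {
      for (unsigned &Reg : SpillRegs) {
        auto It = std::find(Used.begin(), Used.end(), false);
        if (It == Used.end())
          break;                                 // nothing free at all
        unsigned Lowest = static_cast<unsigned>(It - Used.begin());
        if (Lowest >= Reg)
          break;                                 // already as low as possible
        Used[Lowest] = true;                     // claim the new slot
        Used[Reg] = false;                       // release the old one
        Reg = Lowest;
      }
    }
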
+
bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
MachineFunction &MF, int FI, unsigned LaneIndex) {
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -329,13 +356,17 @@ bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
}
bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
- MachineFunction &MF, int FI, unsigned LaneIndex) {
+ MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
Register LaneVGPR;
if (!LaneIndex) {
- LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+ // Find the highest available register if called before RA to ensure the
+ // lowest registers are available for allocation. The LaneVGPR, in that
+ // case, will be shifted back to the lowest range after VGPR allocation.
+ LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
+ !IsPrologEpilog);
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
// partially spill the SGPR to VGPRs.
@@ -359,12 +390,12 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
return true;
}
-bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
- int FI,
- bool IsPrologEpilog) {
+bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
+ MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
+ bool IsPrologEpilog) {
std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
- IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
- : SGPRSpillsToVirtualVGPRLanes[FI];
+ SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
+ : SGPRSpillsToVirtualVGPRLanes[FI];
// This has already been allocated.
if (!SpillLanes.empty())
@@ -384,14 +415,15 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
"not spilling SGPRs to VGPRs");
- unsigned &NumSpillLanes =
- IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
+ unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
+ : NumVirtualVGPRSpillLanes;
for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
unsigned LaneIndex = (NumSpillLanes % WaveSize);
- bool Allocated = IsPrologEpilog
- ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
+ bool Allocated = SpillToPhysVGPRLane
+ ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
+ IsPrologEpilog)
: allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
if (!Allocated) {
NumSpillLanes -= I;
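
A sketch of the lane bookkeeping in the loop above: SGPRs are spilled one per VGPR lane, so a fresh spill VGPR is only needed every WaveSize lanes. The struct and values below are simplified stand-ins for the real accounting.

    #include <utility>

    struct LaneAlloc {
      unsigned NumSpillLanes = 0;
      unsigned WaveSize = 32; // 32 or 64 depending on the wavefront size

      // Returns the lane index within the current spill VGPR and whether a
      // new VGPR must be allocated before using it.
      std::pair<unsigned, bool> nextLane() {
        unsigned LaneIndex = NumSpillLanes % WaveSize;
        ++NumSpillLanes;
        return {LaneIndex, LaneIndex == 0};
      }
    };
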
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index dc63ae44c528..9ff66a094f99 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -548,7 +548,8 @@ private:
bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
unsigned LaneIndex);
bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI,
- unsigned LaneIndex);
+ unsigned LaneIndex,
+ bool IsPrologEpilog);
public:
Register getVGPRForAGPRCopy() const {
@@ -588,6 +589,7 @@ public:
}
ArrayRef<Register> getSGPRSpillVGPRs() const { return SpillVGPRs; }
+
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
@@ -702,7 +704,12 @@ public:
I->second.IsDead = true;
}
+  // Shift the physical VGPRs allocated in the highest range for CSR SGPR
+  // spilling into the lowest available range.
+ void shiftSpillPhysVGPRsToLowestRange(MachineFunction &MF);
+
bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI,
+ bool SpillToPhysVGPRLane = false,
bool IsPrologEpilog = false);
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
@@ -1041,22 +1048,6 @@ public:
return WavesPerEU.second;
}
- /// \returns SGPR used for \p Dim's work group ID.
- Register getWorkGroupIDSGPR(unsigned Dim) const {
- switch (Dim) {
- case 0:
- assert(hasWorkGroupIDX());
- return ArgInfo.WorkGroupIDX.getRegister();
- case 1:
- assert(hasWorkGroupIDY());
- return ArgInfo.WorkGroupIDY.getRegister();
- case 2:
- assert(hasWorkGroupIDZ());
- return ArgInfo.WorkGroupIDZ.getRegister();
- }
- llvm_unreachable("unexpected dimension");
- }
-
const AMDGPUGWSResourcePseudoSourceValue *
getGWSPSV(const AMDGPUTargetMachine &TM) {
return &GWSResourcePSV;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 6d749ad1ad24..84b9330ef963 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -579,11 +579,30 @@ public:
};
class SIGfx12CacheControl : public SIGfx11CacheControl {
+protected:
+ // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
+  // \returns True if \p MI is modified, false otherwise.
+ bool setTH(const MachineBasicBlock::iterator MI,
+ AMDGPU::CPol::CPol Value) const;
+  // Sets Scope policy to \p Value if CPol operand is present in instruction
+  // \p MI. \returns True if \p MI is modified, false otherwise.
+ bool setScope(const MachineBasicBlock::iterator MI,
+ AMDGPU::CPol::CPol Value) const;
+
public:
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
+ bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering, Position Pos) const override;
+
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
+
+ bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile,
+ bool IsNonTemporal) const override;
};
class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2142,6 +2161,132 @@ bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
return Changed;
}
+bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
+ AMDGPU::CPol::CPol Value) const {
+ MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+ if (!CPol)
+ return false;
+
+ uint64_t NewTH = Value & AMDGPU::CPol::TH;
+ if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
+ CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
+ return true;
+ }
+
+ return false;
+}
+
+bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
+ AMDGPU::CPol::CPol Value) const {
+ MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
+ if (!CPol)
+ return false;
+
+ uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
+ if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
+ CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
+ return true;
+ }
+
+ return false;
+}
+
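Both helpers above are read-modify-write updates of a bit-field packed into the cpol immediate: clear the field, OR in the new bits, and report whether anything changed. A small self-contained sketch of that update, with made-up mask values standing in for AMDGPU::CPol::TH and AMDGPU::CPol::SCOPE (the real masks live elsewhere in the backend):

    #include <cassert>
    #include <cstdint>

    // Illustrative only: field masks standing in for AMDGPU::CPol::TH / ::SCOPE.
    constexpr uint64_t THMask    = 0x7;   // assumed low bits hold the TH policy
    constexpr uint64_t ScopeMask = 0x18;  // assumed next bits hold the scope

    // Updates one field of the packed cpol value, mirroring setTH/setScope.
    static bool setField(uint64_t &CPol, uint64_t FieldMask, uint64_t Value) {
      uint64_t NewBits = Value & FieldMask;
      if ((CPol & FieldMask) == NewBits)
        return false;                     // nothing to do, instruction unmodified
      CPol = (CPol & ~FieldMask) | NewBits;
      return true;
    }

    int main() {
      uint64_t CPol = 0;
      bool Changed = setField(CPol, THMask, /*some TH value*/ 0x2);
      assert(Changed && (CPol & THMask) == 0x2);
      Changed = setField(CPol, THMask, 0x2); // same value again: no change
      assert(!Changed);
      (void)ScopeMask;
      (void)Changed;
    }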
+bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
+ SIAtomicScope Scope,
+ SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsCrossAddrSpaceOrdering,
+ Position Pos) const {
+ bool Changed = false;
+
+ MachineBasicBlock &MBB = *MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+
+ bool LOADCnt = false;
+ bool DSCnt = false;
+ bool STORECnt = false;
+
+ if (Pos == Position::AFTER)
+ ++MI;
+
+ if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
+ SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+ LOADCnt |= true;
+ if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+ STORECnt |= true;
+ break;
+ case SIAtomicScope::WORKGROUP:
+ // In WGP mode the waves of a work-group can be executing on either CU of
+      // the WGP. Therefore we need to wait for operations to complete to
+      // ensure they are visible to waves in the other CU, as the L0 is per CU.
+      // Otherwise, in CU mode, all waves of a work-group are on the same CU,
+      // which shares the same L0.
+ if (!ST.isCuModeEnabled()) {
+ if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
+ LOADCnt |= true;
+ if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
+ STORECnt |= true;
+ }
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The L0 cache keeps all memory operations in order for
+ // work-items in the same wavefront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
+ switch (Scope) {
+ case SIAtomicScope::SYSTEM:
+ case SIAtomicScope::AGENT:
+ case SIAtomicScope::WORKGROUP:
+ // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
+ // not needed as LDS operations for all waves are executed in a total
+ // global ordering as observed by all waves. Required if also
+ // synchronizing with global/GDS memory as LDS operations could be
+ // reordered with respect to later global/GDS memory operations of the
+ // same wave.
+ DSCnt |= IsCrossAddrSpaceOrdering;
+ break;
+ case SIAtomicScope::WAVEFRONT:
+ case SIAtomicScope::SINGLETHREAD:
+ // The LDS keeps all memory operations in order for
+ // the same wavefront.
+ break;
+ default:
+ llvm_unreachable("Unsupported synchronization scope");
+ }
+ }
+
+ if (LOADCnt) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
+ Changed = true;
+ }
+
+ if (STORECnt) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
+ Changed = true;
+ }
+
+ if (DSCnt) {
+ BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
+ Changed = true;
+ }
+
+ if (Pos == Position::AFTER)
+ --MI;
+
+ return Changed;
+}
+
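The gfx12 insertWait above splits the legacy s_waitcnt into separate soft waits: LOADCNT (together with SAMPLECNT and BVHCNT) and STORECNT cover global/scratch accesses at agent/system scope, or at workgroup scope only in WGP (non-CU) mode, while DSCNT covers LDS and is only required when cross-address-space ordering is requested. A compact sketch of that decision logic with plain enums and bools in place of the SIMemoryLegalizer types (illustrative, not the pass's API):

    #include <cstdio>

    enum class Scope { SingleThread, Wavefront, Workgroup, Agent, System };

    struct Waits { bool LoadCnt = false, StoreCnt = false, DsCnt = false; };

    // Which soft waits the pass would emit for a fence of the given shape.
    static Waits neededWaits(Scope S, bool GlobalOrScratch, bool LDS, bool IsLoad,
                             bool IsStore, bool CuMode, bool CrossAS) {
      Waits W;
      if (GlobalOrScratch) {
        bool WaitVMEM = S == Scope::System || S == Scope::Agent ||
                        (S == Scope::Workgroup && !CuMode);
        W.LoadCnt = WaitVMEM && IsLoad;
        W.StoreCnt = WaitVMEM && IsStore;
      }
      if (LDS && S >= Scope::Workgroup)
        W.DsCnt = CrossAS;                 // only when ordering crosses spaces
      return W;
    }

    int main() {
      // A workgroup-scope load fence in WGP (non-CU) mode over global + LDS:
      Waits W = neededWaits(Scope::Workgroup, /*GlobalOrScratch=*/true,
                            /*LDS=*/true, /*IsLoad=*/true, /*IsStore=*/false,
                            /*CuMode=*/false, /*CrossAS=*/true);
      std::printf("loadcnt=%d storecnt=%d dscnt=%d\n", W.LoadCnt, W.StoreCnt,
                  W.DsCnt);                // loadcnt=1 storecnt=0 dscnt=1
    }

For the workgroup-scope example in main, the code above would emit S_WAIT_BVHCNT, S_WAIT_SAMPLECNT and S_WAIT_LOADCNT soft waits plus an S_WAIT_DSCNT soft wait, all with immediate 0.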
bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
@@ -2198,6 +2343,41 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
return true;
}
+bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
+ MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
+ bool IsVolatile, bool IsNonTemporal) const {
+
+ // Only handle load and store, not atomic read-modify-write instructions.
+ assert(MI->mayLoad() ^ MI->mayStore());
+
+ // Only update load and store, not LLVM IR atomic read-modify-write
+  // instructions. The latter are always marked as volatile, so we cannot
+  // sensibly handle them here without pessimizing all atomics. They also do
+  // not support the nontemporal attribute.
+ assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
+
+ bool Changed = false;
+
+ if (IsVolatile) {
+ Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
+
+ // Ensure operation has completed at system scope to cause all volatile
+ // operations to be visible outside the program in a global order. Do not
+ // request cross address space as only the global address space can be
+ // observable outside the program, so no need to cause a waitcnt for LDS
+ // address space operations.
+ Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
+ Position::AFTER);
+ }
+
+ if (IsNonTemporal) {
+ // Set non-temporal hint for all cache levels.
+ Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
+ }
+
+ return Changed;
+}
+
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
if (AtomicPseudoMIs.empty())
return false;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index be395d53c34e..e62ad026dc5c 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -285,7 +285,7 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
1;
unsigned Offset =
(Dst & AMDGPU::Hwreg::OFFSET_MASK_) >> AMDGPU::Hwreg::OFFSET_SHIFT_;
- unsigned Mask = ((1 << Width) - 1) << Offset;
+ unsigned Mask = maskTrailingOnes<unsigned>(Width) << Offset;
// If an InsertionPoint is set we will insert a setreg there.
if (InsertionPoint) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index a93cf5cad411..a2cacb5cbaa3 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -1657,8 +1657,12 @@ void SIRegisterInfo::buildSpillLoadStore(
} else {
MIB.addReg(SOffset, SOffsetRegState);
}
- MIB.addImm(Offset + RegOffset)
- .addImm(0); // cpol
+
+ MIB.addImm(Offset + RegOffset);
+
+ bool LastUse = MMO->getFlags() & MOLastUse;
+ MIB.addImm(LastUse ? AMDGPU::CPol::TH_LU : 0); // cpol
+
if (!IsFlat)
MIB.addImm(0); // swz
MIB.addMemOperand(NewMMO);
@@ -2241,6 +2245,7 @@ bool SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
TII->insertScratchExecCopy(*MF, *MBB, MI, DL, MFI->getSGPRForEXECCopy(),
RS->isRegUsed(AMDGPU::SCC));
}
+
buildSpillLoadStore(
*MBB, MI, DL, Opc, Index, VData->getReg(), VData->isKill(), FrameReg,
TII->getNamedOperand(*MI, AMDGPU::OpName::offset)->getImm(),
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
index fc29ce8d71f2..9a27d22d585e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -810,7 +810,7 @@ def S_DCACHE_INV_VOL_ci : SMRD_Real_ci <0x1d, S_DCACHE_INV_VOL>;
// Scalar Memory Patterns
//===----------------------------------------------------------------------===//
-def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformLoad(N);}]> {
+class SMRDLoadPat<PatFrag Op> : PatFrag <(ops node:$ptr), (Op node:$ptr), [{ return isUniformLoad(N);}]> {
let GISelPredicateCode = [{
if (!MI.hasOneMemOperand())
return false;
@@ -827,6 +827,14 @@ def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ return isUniformL
}];
}
+def smrd_load : SMRDLoadPat<load>;
+def smrd_extloadi8 : SMRDLoadPat<extloadi8>;
+def smrd_zextloadi8 : SMRDLoadPat<zextloadi8>;
+def smrd_sextloadi8 : SMRDLoadPat<sextloadi8>;
+def smrd_extloadi16 : SMRDLoadPat<extloadi16>;
+def smrd_zextloadi16 : SMRDLoadPat<zextloadi16>;
+def smrd_sextloadi16 : SMRDLoadPat<sextloadi16>;
+
def smrd_prefetch : PatFrag <(ops node:$ptr, node:$rw, node:$loc, node:$type),
(prefetch node:$ptr, node:$rw, node:$loc, node:$type),
[{ return !N->getOperand(1)->isDivergent();}]> {
@@ -923,11 +931,78 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt, bit immci = true> {
}
}
+multiclass ScalarLoadWithExtensionPat <string Instr, SDPatternOperator node, ValueType vt> {
+ // 1. IMM offset
+ def : GCNPat <
+ (node (SMRDImm i64:$sbase, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, 0))>{
+ let OtherPredicates = [isGFX12Plus];
+ }
+
+ // 2. SGPR offset
+ def : GCNPat <
+ (node (SMRDSgpr i64:$sbase, i32:$soffset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, 0, 0))>{
+ let OtherPredicates = [isGFX12Plus];
+ }
+
+ // 3. SGPR+IMM offset
+ def : GCNPat <
+ (node (SMRDSgprImm i64:$sbase, i32:$soffset, i32:$offset)),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") $sbase, $soffset, $offset, 0))>{
+ let OtherPredicates = [isGFX12Plus];
+ }
+
+ // 4. No offset
+ def : GCNPat <
+ (vt (node (i64 SReg_64:$sbase))),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") i64:$sbase, 0, 0))>{
+ let OtherPredicates = [isGFX12Plus];
+ }
+}
+
+multiclass ScalarBufferLoadIntrinsicPat <SDPatternOperator name, string Instr> {
+
+ // 1. Offset as an immediate
+ def : GCNPat <
+ (name v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
+ (i32 (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_cpol $cachepolicy)))> {
+ let OtherPredicates = [isGFX12Plus];
+ }
+
+  // 2. Offset as a 32-bit SGPR
+ def : GCNPat <
+ (name v4i32:$sbase, i32:$soffset, timm:$cachepolicy),
+ (i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, 0, (extract_cpol $cachepolicy)))> {
+ let OtherPredicates = [isGFX12Plus];
+ }
+
+  // 3. Offset as a 32-bit SGPR + immediate
+ def : GCNPat <
+ (name v4i32:$sbase, (SMRDBufferSgprImm i32:$soffset, i32:$offset),
+ timm:$cachepolicy),
+ (i32 (!cast<SM_Pseudo>(Instr#"_SGPR_IMM") SReg_128:$sbase, SReg_32:$soffset, i32imm:$offset,
+ (extract_cpol $cachepolicy)))> {
+ let OtherPredicates = [isGFX12Plus];
+ }
+}
+
// Global and constant loads can be selected to either MUBUF or SMRD
// instructions, but SMRD instructions are faster so we want the instruction
// selector to prefer those.
let AddedComplexity = 100 in {
+defm : ScalarLoadWithExtensionPat <"S_LOAD_U8", smrd_extloadi8, i32>;
+defm : ScalarLoadWithExtensionPat <"S_LOAD_U8", smrd_zextloadi8, i32>;
+defm : ScalarLoadWithExtensionPat <"S_LOAD_I8", smrd_sextloadi8, i32>;
+defm : ScalarLoadWithExtensionPat <"S_LOAD_U16", smrd_extloadi16, i32>;
+defm : ScalarLoadWithExtensionPat <"S_LOAD_U16", smrd_zextloadi16, i32>;
+defm : ScalarLoadWithExtensionPat <"S_LOAD_I16", smrd_sextloadi16, i32>;
+defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_byte, "S_BUFFER_LOAD_I8">;
+defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ubyte, "S_BUFFER_LOAD_U8">;
+defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_short, "S_BUFFER_LOAD_I16">;
+defm : ScalarBufferLoadIntrinsicPat <SIsbuffer_load_ushort, "S_BUFFER_LOAD_U16">;
+
foreach vt = Reg32Types.types in {
defm : SMRD_Pattern <"S_LOAD_DWORD", vt>;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 46fa3d57a21c..ae5ef0541929 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -292,8 +292,11 @@ def S_BITSET0_B64 : SOP1_64_32 <"s_bitset0_b64", [], 1>;
def S_BITSET1_B32 : SOP1_32 <"s_bitset1_b32", [], 1>;
def S_BITSET1_B64 : SOP1_64_32 <"s_bitset1_b64", [], 1>;
+def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64">;
+// PSEUDO includes a workaround for a hardware anomaly where some ASICs
+// zero-extend the result from 48 bits instead of sign-extending.
let isReMaterializable = 1 in
-def S_GETPC_B64 : SOP1_64_0 <"s_getpc_b64",
+def S_GETPC_B64_pseudo : SOP1_64_0 <"s_getpc_b64",
[(set i64:$sdst, (int_amdgcn_s_getpc))]
>;
@@ -502,8 +505,6 @@ def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs),
(ins SplitBarrier:$src0), "$src0", []>{
let SchedRW = [WriteBarrier];
let isConvergent = 1;
-
-
}
} // End has_sdst = 0
@@ -1124,7 +1125,7 @@ class S_SETREG_B32_Pseudo <list<dag> pattern=[]> : SOPK_Pseudo <
pattern>;
def S_SETREG_B32 : S_SETREG_B32_Pseudo <
- [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> {
+ [(int_amdgcn_s_setreg (i32 SIMM16bit:$simm16), i32:$sdst)]> {
// Use custom inserter to optimize some cases to
// S_DENORM_MODE/S_ROUND_MODE/S_SETREG_B32_mode.
let usesCustomInserter = 1;
@@ -1597,6 +1598,13 @@ def S_WAITCNT : SOPP_Pseudo <"s_waitcnt" , (ins SWaitCnt:$simm16), "$simm16",
// that doesn't access memory.
def S_WAITCNT_soft : SOPP_Pseudo <"s_soft_waitcnt" , (ins SWaitCnt:$simm16), "$simm16">;
def S_WAITCNT_VSCNT_soft : SOPK_WAITCNT<"s_soft_waitcnt_vscnt">;
+let SubtargetPredicate = isGFX12Plus in {
+ def S_WAIT_LOADCNT_soft : SOPP_Pseudo <"s_soft_wait_loadcnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_STORECNT_soft : SOPP_Pseudo <"s_soft_wait_storecnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">;
+ def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
+}
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
[(int_amdgcn_s_sethalt timm:$simm16)]>;
@@ -1712,23 +1720,30 @@ let SubtargetPredicate = HasVGPRSingleUseHintInsts in {
let SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 in {
def S_WAIT_LOADCNT :
- SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16">;
+ SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_wait_loadcnt timm:$simm16)]>;
def S_WAIT_LOADCNT_DSCNT :
SOPP_Pseudo<"s_wait_loadcnt_dscnt", (ins s16imm:$simm16), "$simm16">;
def S_WAIT_STORECNT :
- SOPP_Pseudo<"s_wait_storecnt", (ins s16imm:$simm16), "$simm16">;
+ SOPP_Pseudo<"s_wait_storecnt", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_wait_storecnt timm:$simm16)]>;
def S_WAIT_STORECNT_DSCNT :
SOPP_Pseudo<"s_wait_storecnt_dscnt", (ins s16imm:$simm16), "$simm16">;
def S_WAIT_SAMPLECNT :
- SOPP_Pseudo<"s_wait_samplecnt", (ins s16imm:$simm16), "$simm16">;
+ SOPP_Pseudo<"s_wait_samplecnt", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_wait_samplecnt timm:$simm16)]>;
def S_WAIT_BVHCNT :
- SOPP_Pseudo<"s_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">;
+ SOPP_Pseudo<"s_wait_bvhcnt", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_wait_bvhcnt timm:$simm16)]>;
def S_WAIT_EXPCNT :
- SOPP_Pseudo<"s_wait_expcnt", (ins s16imm:$simm16), "$simm16">;
+ SOPP_Pseudo<"s_wait_expcnt", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_wait_expcnt timm:$simm16)]>;
def S_WAIT_DSCNT :
- SOPP_Pseudo<"s_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
+ SOPP_Pseudo<"s_wait_dscnt", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_wait_dscnt timm:$simm16)]>;
def S_WAIT_KMCNT :
- SOPP_Pseudo<"s_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
+ SOPP_Pseudo<"s_wait_kmcnt", (ins s16imm:$simm16), "$simm16",
+ [(int_amdgcn_s_wait_kmcnt timm:$simm16)]>;
} // End SubtargetPredicate = isGFX12Plus, hasSideEffects = 1
//===----------------------------------------------------------------------===//
@@ -1768,10 +1783,10 @@ def : GCNPat<
(S_SEXT_I32_I16 $src)
>;
-def : GCNPat <
- (int_amdgcn_s_wait_event_export_ready),
- (S_WAIT_EVENT (i16 0))
->;
+let SubtargetPredicate = isNotGFX12Plus in
+ def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 0))>;
+let SubtargetPredicate = isGFX12Plus in
+ def : GCNPat <(int_amdgcn_s_wait_event_export_ready), (S_WAIT_EVENT (i16 1))>;
// The first 10 bits of the mode register are the core FP mode on all
// subtargets.
@@ -2610,7 +2625,7 @@ multiclass SOPP_Real_With_Relaxation_gfx11_gfx12<bits<7>op> :
defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>;
defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>;
defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>;
-defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11_gfx12<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">;
+defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">;
defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>;
defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>;
defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11<0x008>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 26ba2575ff34..0bf9452d822e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -31,10 +31,11 @@
#define GET_INSTRMAP_INFO
#include "AMDGPUGenInstrInfo.inc"
-static llvm::cl::opt<unsigned>
- AmdhsaCodeObjectVersion("amdhsa-code-object-version", llvm::cl::Hidden,
- llvm::cl::desc("AMDHSA Code Object Version"),
- llvm::cl::init(4));
+static llvm::cl::opt<unsigned> DefaultAMDHSACodeObjectVersion(
+ "amdhsa-code-object-version", llvm::cl::Hidden,
+ llvm::cl::init(llvm::AMDGPU::AMDHSA_COV5),
+ llvm::cl::desc("Set default AMDHSA Code Object Version (module flag "
+ "or asm directive still take priority if present)"));
namespace {
@@ -94,6 +95,44 @@ unsigned getVmcntBitWidthHi(unsigned VersionMajor) {
return (VersionMajor == 9 || VersionMajor == 10) ? 2 : 0;
}
+/// \returns Loadcnt bit width.
+unsigned getLoadcntBitWidth(unsigned VersionMajor) {
+ return VersionMajor >= 12 ? 6 : 0;
+}
+
+/// \returns Samplecnt bit width.
+unsigned getSamplecntBitWidth(unsigned VersionMajor) {
+ return VersionMajor >= 12 ? 6 : 0;
+}
+
+/// \returns Bvhcnt bit width.
+unsigned getBvhcntBitWidth(unsigned VersionMajor) {
+ return VersionMajor >= 12 ? 3 : 0;
+}
+
+/// \returns Dscnt bit width.
+unsigned getDscntBitWidth(unsigned VersionMajor) {
+ return VersionMajor >= 12 ? 6 : 0;
+}
+
+/// \returns Dscnt bit shift in combined S_WAIT instructions.
+unsigned getDscntBitShift(unsigned VersionMajor) { return 0; }
+
+/// \returns Storecnt or Vscnt bit width, depending on VersionMajor.
+unsigned getStorecntBitWidth(unsigned VersionMajor) {
+ return VersionMajor >= 10 ? 6 : 0;
+}
+
+/// \returns Kmcnt bit width.
+unsigned getKmcntBitWidth(unsigned VersionMajor) {
+ return VersionMajor >= 12 ? 5 : 0;
+}
+
+/// \returns Shift for Loadcnt/Storecnt in combined S_WAIT instructions.
+unsigned getLoadcntStorecntBitShift(unsigned VersionMajor) {
+ return VersionMajor >= 12 ? 8 : 0;
+}
+
/// \returns VmVsrc bit width
inline unsigned getVmVsrcBitWidth() { return 3; }
@@ -123,45 +162,32 @@ bool isHsaAbi(const MCSubtargetInfo &STI) {
return STI.getTargetTriple().getOS() == Triple::AMDHSA;
}
-std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI) {
- if (STI && STI->getTargetTriple().getOS() != Triple::AMDHSA)
- return std::nullopt;
-
- switch (AmdhsaCodeObjectVersion) {
- case 4:
- return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
- case 5:
- return ELF::ELFABIVERSION_AMDGPU_HSA_V5;
- default:
- report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") +
- Twine(AmdhsaCodeObjectVersion));
+unsigned getAMDHSACodeObjectVersion(const Module &M) {
+ if (auto Ver = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("amdgpu_code_object_version"))) {
+ return (unsigned)Ver->getZExtValue() / 100;
}
-}
-bool isHsaAbiVersion4(const MCSubtargetInfo *STI) {
- if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
- return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4;
- return false;
+ return getDefaultAMDHSACodeObjectVersion();
}
-bool isHsaAbiVersion5(const MCSubtargetInfo *STI) {
- if (std::optional<uint8_t> HsaAbiVer = getHsaAbiVersion(STI))
- return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V5;
- return false;
+unsigned getDefaultAMDHSACodeObjectVersion() {
+ return DefaultAMDHSACodeObjectVersion;
}
-unsigned getAmdhsaCodeObjectVersion() {
- return AmdhsaCodeObjectVersion;
-}
+uint8_t getELFABIVersion(const Triple &T, unsigned CodeObjectVersion) {
+ if (T.getOS() != Triple::AMDHSA)
+ return 0;
-unsigned getCodeObjectVersion(const Module &M) {
- if (auto Ver = mdconst::extract_or_null<ConstantInt>(
- M.getModuleFlag("amdgpu_code_object_version"))) {
- return (unsigned)Ver->getZExtValue() / 100;
+ switch (CodeObjectVersion) {
+ case 4:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V4;
+ case 5:
+ return ELF::ELFABIVERSION_AMDGPU_HSA_V5;
+ default:
+ report_fatal_error("Unsupported AMDHSA Code Object Version " +
+ Twine(CodeObjectVersion));
}
-
- // Default code object version.
- return AMDHSA_COV4;
}
unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) {
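Frontends record the code object version in the amdgpu_code_object_version module flag scaled by 100, so getAMDHSACodeObjectVersion divides it back down, and getELFABIVersion then maps the result onto the ELF e_ident ABI version byte. A usage sketch under the assumption that the LLVM headers are on the include path (the two helper declarations normally come from AMDGPUBaseInfo.h in this directory; llvm/ADT/Triple.h replaces llvm/TargetParser/Triple.h in older trees):

    #include <cstdint>
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/TargetParser/Triple.h"

    // Declarations added in this change, normally pulled in via AMDGPUBaseInfo.h.
    namespace llvm::AMDGPU {
    unsigned getAMDHSACodeObjectVersion(const Module &M);
    uint8_t getELFABIVersion(const Triple &T, unsigned CodeObjectVersion);
    } // namespace llvm::AMDGPU

    int main() {
      llvm::LLVMContext Ctx;
      llvm::Module M("m", Ctx);
      M.setTargetTriple("amdgcn-amd-amdhsa");
      // Frontends store COV scaled by 100; 500 therefore means code object v5.
      M.addModuleFlag(llvm::Module::Error, "amdgpu_code_object_version", 500);

      unsigned COV = llvm::AMDGPU::getAMDHSACodeObjectVersion(M); // 500/100 == 5
      (void)llvm::AMDGPU::getELFABIVersion(llvm::Triple(M.getTargetTriple()), COV);
      return COV == 5 ? 0 : 1;
    }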
@@ -667,7 +693,7 @@ namespace IsaInfo {
AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
: STI(STI), XnackSetting(TargetIDSetting::Any),
- SramEccSetting(TargetIDSetting::Any), CodeObjectVersion(0) {
+ SramEccSetting(TargetIDSetting::Any) {
if (!STI.getFeatureBits().test(FeatureSupportsXNACK))
XnackSetting = TargetIDSetting::Unsupported;
if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC))
@@ -779,23 +805,16 @@ std::string AMDGPUTargetID::toString() const {
std::string Features;
if (STI.getTargetTriple().getOS() == Triple::AMDHSA) {
- switch (CodeObjectVersion) {
- case AMDGPU::AMDHSA_COV4:
- case AMDGPU::AMDHSA_COV5:
- // sramecc.
- if (getSramEccSetting() == TargetIDSetting::Off)
- Features += ":sramecc-";
- else if (getSramEccSetting() == TargetIDSetting::On)
- Features += ":sramecc+";
- // xnack.
- if (getXnackSetting() == TargetIDSetting::Off)
- Features += ":xnack-";
- else if (getXnackSetting() == TargetIDSetting::On)
- Features += ":xnack+";
- break;
- default:
- break;
- }
+ // sramecc.
+ if (getSramEccSetting() == TargetIDSetting::Off)
+ Features += ":sramecc-";
+ else if (getSramEccSetting() == TargetIDSetting::On)
+ Features += ":sramecc+";
+ // xnack.
+ if (getXnackSetting() == TargetIDSetting::Off)
+ Features += ":xnack-";
+ else if (getXnackSetting() == TargetIDSetting::On)
+ Features += ":xnack+";
}
StreamRep << Processor << Features;
@@ -1229,6 +1248,18 @@ unsigned getVmcntBitMask(const IsaVersion &Version) {
1;
}
+unsigned getLoadcntBitMask(const IsaVersion &Version) {
+ return (1 << getLoadcntBitWidth(Version.Major)) - 1;
+}
+
+unsigned getSamplecntBitMask(const IsaVersion &Version) {
+ return (1 << getSamplecntBitWidth(Version.Major)) - 1;
+}
+
+unsigned getBvhcntBitMask(const IsaVersion &Version) {
+ return (1 << getBvhcntBitWidth(Version.Major)) - 1;
+}
+
unsigned getExpcntBitMask(const IsaVersion &Version) {
return (1 << getExpcntBitWidth(Version.Major)) - 1;
}
@@ -1237,6 +1268,18 @@ unsigned getLgkmcntBitMask(const IsaVersion &Version) {
return (1 << getLgkmcntBitWidth(Version.Major)) - 1;
}
+unsigned getDscntBitMask(const IsaVersion &Version) {
+ return (1 << getDscntBitWidth(Version.Major)) - 1;
+}
+
+unsigned getKmcntBitMask(const IsaVersion &Version) {
+ return (1 << getKmcntBitWidth(Version.Major)) - 1;
+}
+
+unsigned getStorecntBitMask(const IsaVersion &Version) {
+ return (1 << getStorecntBitWidth(Version.Major)) - 1;
+}
+
unsigned getWaitcntBitMask(const IsaVersion &Version) {
unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(Version.Major),
getVmcntBitWidthLo(Version.Major));
@@ -1276,9 +1319,9 @@ void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
Waitcnt decodeWaitcnt(const IsaVersion &Version, unsigned Encoded) {
Waitcnt Decoded;
- Decoded.VmCnt = decodeVmcnt(Version, Encoded);
+ Decoded.LoadCnt = decodeVmcnt(Version, Encoded);
Decoded.ExpCnt = decodeExpcnt(Version, Encoded);
- Decoded.LgkmCnt = decodeLgkmcnt(Version, Encoded);
+ Decoded.DsCnt = decodeLgkmcnt(Version, Encoded);
return Decoded;
}
@@ -1313,7 +1356,85 @@ unsigned encodeWaitcnt(const IsaVersion &Version,
}
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded) {
- return encodeWaitcnt(Version, Decoded.VmCnt, Decoded.ExpCnt, Decoded.LgkmCnt);
+ return encodeWaitcnt(Version, Decoded.LoadCnt, Decoded.ExpCnt, Decoded.DsCnt);
+}
+
+static unsigned getCombinedCountBitMask(const IsaVersion &Version,
+ bool IsStore) {
+ unsigned Dscnt = getBitMask(getDscntBitShift(Version.Major),
+ getDscntBitWidth(Version.Major));
+ if (IsStore) {
+ unsigned Storecnt = getBitMask(getLoadcntStorecntBitShift(Version.Major),
+ getStorecntBitWidth(Version.Major));
+ return Dscnt | Storecnt;
+ } else {
+ unsigned Loadcnt = getBitMask(getLoadcntStorecntBitShift(Version.Major),
+ getLoadcntBitWidth(Version.Major));
+ return Dscnt | Loadcnt;
+ }
+}
+
+Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt) {
+ Waitcnt Decoded;
+ Decoded.LoadCnt =
+ unpackBits(LoadcntDscnt, getLoadcntStorecntBitShift(Version.Major),
+ getLoadcntBitWidth(Version.Major));
+ Decoded.DsCnt = unpackBits(LoadcntDscnt, getDscntBitShift(Version.Major),
+ getDscntBitWidth(Version.Major));
+ return Decoded;
+}
+
+Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt) {
+ Waitcnt Decoded;
+ Decoded.StoreCnt =
+ unpackBits(StorecntDscnt, getLoadcntStorecntBitShift(Version.Major),
+ getStorecntBitWidth(Version.Major));
+ Decoded.DsCnt = unpackBits(StorecntDscnt, getDscntBitShift(Version.Major),
+ getDscntBitWidth(Version.Major));
+ return Decoded;
+}
+
+static unsigned encodeLoadcnt(const IsaVersion &Version, unsigned Waitcnt,
+ unsigned Loadcnt) {
+ return packBits(Loadcnt, Waitcnt, getLoadcntStorecntBitShift(Version.Major),
+ getLoadcntBitWidth(Version.Major));
+}
+
+static unsigned encodeStorecnt(const IsaVersion &Version, unsigned Waitcnt,
+ unsigned Storecnt) {
+ return packBits(Storecnt, Waitcnt, getLoadcntStorecntBitShift(Version.Major),
+ getStorecntBitWidth(Version.Major));
+}
+
+static unsigned encodeDscnt(const IsaVersion &Version, unsigned Waitcnt,
+ unsigned Dscnt) {
+ return packBits(Dscnt, Waitcnt, getDscntBitShift(Version.Major),
+ getDscntBitWidth(Version.Major));
+}
+
+static unsigned encodeLoadcntDscnt(const IsaVersion &Version, unsigned Loadcnt,
+ unsigned Dscnt) {
+ unsigned Waitcnt = getCombinedCountBitMask(Version, false);
+ Waitcnt = encodeLoadcnt(Version, Waitcnt, Loadcnt);
+ Waitcnt = encodeDscnt(Version, Waitcnt, Dscnt);
+ return Waitcnt;
+}
+
+unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded) {
+ return encodeLoadcntDscnt(Version, Decoded.LoadCnt, Decoded.DsCnt);
+}
+
+static unsigned encodeStorecntDscnt(const IsaVersion &Version,
+ unsigned Storecnt, unsigned Dscnt) {
+ unsigned Waitcnt = getCombinedCountBitMask(Version, true);
+ Waitcnt = encodeStorecnt(Version, Waitcnt, Storecnt);
+ Waitcnt = encodeDscnt(Version, Waitcnt, Dscnt);
+ return Waitcnt;
+}
+
+unsigned encodeStorecntDscnt(const IsaVersion &Version,
+ const Waitcnt &Decoded) {
+ return encodeStorecntDscnt(Version, Decoded.StoreCnt, Decoded.DsCnt);
}
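With the gfx12 widths and shifts defined earlier (dscnt: 6 bits at bit 0, loadcnt/storecnt: 6 bits at bit 8), the combined encodings are straightforward bit packing; the unused-field bits start out saturated, so they read back as "don't care". A worked sketch of the same arithmetic with the constants inlined (illustrative only, not calling the LLVM helpers):

    #include <cassert>

    // gfx12 field layout taken from the helpers above: dscnt occupies bits
    // [5:0], loadcnt (or storecnt) occupies bits [13:8] of the immediate.
    constexpr unsigned DscntShift = 0, DscntWidth = 6;
    constexpr unsigned LoadcntShift = 8, LoadcntWidth = 6;

    constexpr unsigned fieldMask(unsigned Shift, unsigned Width) {
      return ((1u << Width) - 1u) << Shift;
    }
    constexpr unsigned packField(unsigned Value, unsigned Encoded,
                                 unsigned Shift, unsigned Width) {
      unsigned Mask = fieldMask(Shift, Width);
      return (Encoded & ~Mask) | ((Value << Shift) & Mask);
    }

    int main() {
      // Start from the combined mask (all used fields saturated), then pack.
      unsigned Enc = fieldMask(DscntShift, DscntWidth) |
                     fieldMask(LoadcntShift, LoadcntWidth);
      Enc = packField(/*Loadcnt=*/3, Enc, LoadcntShift, LoadcntWidth);
      Enc = packField(/*Dscnt=*/1, Enc, DscntShift, DscntWidth);
      assert(Enc == 0x301); // S_WAIT_LOADCNT_DSCNT immediate: loadcnt 3, dscnt 1
    }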
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 50c741760d71..d3f55c792017 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -46,14 +46,18 @@ enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5 };
/// \returns True if \p STI is AMDHSA.
bool isHsaAbi(const MCSubtargetInfo &STI);
-/// \returns HSA OS ABI Version identification.
-std::optional<uint8_t> getHsaAbiVersion(const MCSubtargetInfo *STI);
-/// \returns True if HSA OS ABI Version identification is 4,
-/// false otherwise.
-bool isHsaAbiVersion4(const MCSubtargetInfo *STI);
-/// \returns True if HSA OS ABI Version identification is 5,
-/// false otherwise.
-bool isHsaAbiVersion5(const MCSubtargetInfo *STI);
+
+/// \returns Code object version from the IR module flag.
+unsigned getAMDHSACodeObjectVersion(const Module &M);
+
+/// \returns The default HSA code object version. This should only be used when
+/// we lack a more accurate CodeObjectVersion value (e.g. from the IR module
+/// flag or a .amdhsa_code_object_version directive).
+unsigned getDefaultAMDHSACodeObjectVersion();
+
+/// \returns ABIVersion suitable for use in ELF's e_ident[ABIVERSION]. \param
+/// CodeObjectVersion is a value returned by getAMDHSACodeObjectVersion().
+uint8_t getELFABIVersion(const Triple &OS, unsigned CodeObjectVersion);
/// \returns The offset of the multigrid_sync_arg argument from implicitarg_ptr
unsigned getMultigridSyncArgImplicitArgPosition(unsigned COV);
@@ -64,12 +68,6 @@ unsigned getHostcallImplicitArgPosition(unsigned COV);
unsigned getDefaultQueueImplicitArgPosition(unsigned COV);
unsigned getCompletionActionImplicitArgPosition(unsigned COV);
-/// \returns Code object version.
-unsigned getAmdhsaCodeObjectVersion();
-
-/// \returns Code object version.
-unsigned getCodeObjectVersion(const Module &M);
-
struct GcnBufferFormatInfo {
unsigned Format;
unsigned BitsPerComp;
@@ -114,7 +112,6 @@ private:
const MCSubtargetInfo &STI;
TargetIDSetting XnackSetting;
TargetIDSetting SramEccSetting;
- unsigned CodeObjectVersion;
public:
explicit AMDGPUTargetID(const MCSubtargetInfo &STI);
@@ -144,10 +141,6 @@ public:
return XnackSetting;
}
- void setCodeObjectVersion(unsigned COV) {
- CodeObjectVersion = COV;
- }
-
/// Sets xnack setting to \p NewXnackSetting.
void setXnackSetting(TargetIDSetting NewXnackSetting) {
XnackSetting = NewXnackSetting;
@@ -837,39 +830,58 @@ getIntegerPairAttribute(const Function &F, StringRef Name,
/// Large values (including the maximum possible integer) can be used to
/// represent "don't care" waits.
struct Waitcnt {
- unsigned VmCnt = ~0u;
+ unsigned LoadCnt = ~0u; // Corresponds to Vmcnt prior to gfx12.
unsigned ExpCnt = ~0u;
- unsigned LgkmCnt = ~0u;
- unsigned VsCnt = ~0u;
+ unsigned DsCnt = ~0u; // Corresponds to LGKMcnt prior to gfx12.
+ unsigned StoreCnt = ~0u; // Corresponds to VScnt on gfx10/gfx11.
+ unsigned SampleCnt = ~0u; // gfx12+ only.
+ unsigned BvhCnt = ~0u; // gfx12+ only.
+ unsigned KmCnt = ~0u; // gfx12+ only.
Waitcnt() = default;
+ // Pre-gfx12 constructor.
Waitcnt(unsigned VmCnt, unsigned ExpCnt, unsigned LgkmCnt, unsigned VsCnt)
- : VmCnt(VmCnt), ExpCnt(ExpCnt), LgkmCnt(LgkmCnt), VsCnt(VsCnt) {}
-
- static Waitcnt allZero(bool HasVscnt) {
- return Waitcnt(0, 0, 0, HasVscnt ? 0 : ~0u);
+ : LoadCnt(VmCnt), ExpCnt(ExpCnt), DsCnt(LgkmCnt), StoreCnt(VsCnt),
+ SampleCnt(~0u), BvhCnt(~0u), KmCnt(~0u) {}
+
+ // gfx12+ constructor.
+ Waitcnt(unsigned LoadCnt, unsigned ExpCnt, unsigned DsCnt, unsigned StoreCnt,
+ unsigned SampleCnt, unsigned BvhCnt, unsigned KmCnt)
+ : LoadCnt(LoadCnt), ExpCnt(ExpCnt), DsCnt(DsCnt), StoreCnt(StoreCnt),
+ SampleCnt(SampleCnt), BvhCnt(BvhCnt), KmCnt(KmCnt) {}
+
+ static Waitcnt allZero(bool Extended, bool HasStorecnt) {
+ return Extended ? Waitcnt(0, 0, 0, 0, 0, 0, 0)
+ : Waitcnt(0, 0, 0, HasStorecnt ? 0 : ~0u);
}
- static Waitcnt allZeroExceptVsCnt() { return Waitcnt(0, 0, 0, ~0u); }
- bool hasWait() const {
- return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u || VsCnt != ~0u;
+ static Waitcnt allZeroExceptVsCnt(bool Extended) {
+ return Extended ? Waitcnt(0, 0, 0, ~0u, 0, 0, 0) : Waitcnt(0, 0, 0, ~0u);
}
- bool hasWaitExceptVsCnt() const {
- return VmCnt != ~0u || ExpCnt != ~0u || LgkmCnt != ~0u;
- }
+ bool hasWait() const { return StoreCnt != ~0u || hasWaitExceptStoreCnt(); }
- bool hasWaitVsCnt() const {
- return VsCnt != ~0u;
+ bool hasWaitExceptStoreCnt() const {
+ return LoadCnt != ~0u || ExpCnt != ~0u || DsCnt != ~0u ||
+ SampleCnt != ~0u || BvhCnt != ~0u || KmCnt != ~0u;
}
+ bool hasWaitStoreCnt() const { return StoreCnt != ~0u; }
+
Waitcnt combined(const Waitcnt &Other) const {
- return Waitcnt(std::min(VmCnt, Other.VmCnt), std::min(ExpCnt, Other.ExpCnt),
- std::min(LgkmCnt, Other.LgkmCnt),
- std::min(VsCnt, Other.VsCnt));
+ // Does the right thing provided self and Other are either both pre-gfx12
+ // or both gfx12+.
+ return Waitcnt(
+ std::min(LoadCnt, Other.LoadCnt), std::min(ExpCnt, Other.ExpCnt),
+ std::min(DsCnt, Other.DsCnt), std::min(StoreCnt, Other.StoreCnt),
+ std::min(SampleCnt, Other.SampleCnt), std::min(BvhCnt, Other.BvhCnt),
+ std::min(KmCnt, Other.KmCnt));
}
};
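Waitcnt::combined above is a field-wise minimum, and ~0u is the "don't care" value, so an unspecified counter never tightens the other operand's requirement. A tiny sketch of that merge rule on two counters (plain unsigned values, not the struct itself):

    #include <algorithm>
    #include <cassert>

    int main() {
      // Merging is a per-field minimum; ~0u ("don't care") never wins.
      unsigned LoadCntA = 1, LoadCntB = ~0u; // A waits for 1 load, B doesn't care
      unsigned DsCntA = ~0u, DsCntB = 0;     // B wants all LDS ops complete
      assert(std::min(LoadCntA, LoadCntB) == 1);
      assert(std::min(DsCntA, DsCntB) == 0);
    }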
+// The following methods are only meaningful on targets that support
+// S_WAITCNT.
+
/// \returns Vmcnt bit mask for given isa \p Version.
unsigned getVmcntBitMask(const IsaVersion &Version);
@@ -893,17 +905,19 @@ unsigned decodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt);
/// Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
/// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
-/// \p Lgkmcnt respectively.
+/// \p Lgkmcnt respectively. Should not be used on gfx12+; the instruction
+/// that needs it is deprecated.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9)
/// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10)
-/// \p Vmcnt = \p Waitcnt[15:10] (gfx11+)
+/// \p Vmcnt = \p Waitcnt[15:10] (gfx11)
/// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11)
-/// \p Expcnt = \p Waitcnt[2:0] (gfx11+)
+/// \p Expcnt = \p Waitcnt[2:0] (gfx11)
/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10)
/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10)
-/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11+)
+/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11)
+///
void decodeWaitcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
@@ -922,26 +936,78 @@ unsigned encodeLgkmcnt(const IsaVersion &Version, unsigned Waitcnt,
unsigned Lgkmcnt);
/// Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
-/// \p Version.
+/// \p Version. Should not be used on gfx12+; the instruction that needs it
+/// is deprecated.
///
/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
/// Waitcnt[2:0] = \p Expcnt (gfx11+)
/// Waitcnt[3:0] = \p Vmcnt (pre-gfx9)
/// Waitcnt[3:0] = \p Vmcnt[3:0] (gfx9,10)
/// Waitcnt[6:4] = \p Expcnt (pre-gfx11)
-/// Waitcnt[9:4] = \p Lgkmcnt (gfx11+)
+/// Waitcnt[9:4] = \p Lgkmcnt (gfx11)
/// Waitcnt[11:8] = \p Lgkmcnt (pre-gfx10)
/// Waitcnt[13:8] = \p Lgkmcnt (gfx10)
-/// Waitcnt[15:10] = \p Vmcnt (gfx11+)
+/// Waitcnt[15:10] = \p Vmcnt (gfx11)
/// Waitcnt[15:14] = \p Vmcnt[5:4] (gfx9,10)
///
/// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
/// isa \p Version.
+///
unsigned encodeWaitcnt(const IsaVersion &Version,
unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
unsigned encodeWaitcnt(const IsaVersion &Version, const Waitcnt &Decoded);
+// The following methods are only meaningful on targets that support
+// S_WAIT_*CNT, introduced with gfx12.
+
+/// \returns Loadcnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support LOADcnt.
+unsigned getLoadcntBitMask(const IsaVersion &Version);
+
+/// \returns Samplecnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support SAMPLEcnt.
+unsigned getSamplecntBitMask(const IsaVersion &Version);
+
+/// \returns Bvhcnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support BVHcnt.
+unsigned getBvhcntBitMask(const IsaVersion &Version);
+
+/// \returns Dscnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support DScnt.
+unsigned getDscntBitMask(const IsaVersion &Version);
+
+/// \returns Kmcnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support KMcnt.
+unsigned getKmcntBitMask(const IsaVersion &Version);
+
+/// \returns STOREcnt or VScnt bit mask for given isa \p Version.
+/// Returns 0 for versions that do not support STOREcnt or VScnt.
+/// STOREcnt and VScnt are the same counter; the name used
+/// depends on the ISA version.
+unsigned getStorecntBitMask(const IsaVersion &Version);
+
+// The following are only meaningful on targets that support
+// S_WAIT_LOADCNT_DSCNT and S_WAIT_STORECNT_DSCNT.
+
+/// \returns Decoded Waitcnt structure from given \p LoadcntDscnt for given
+/// isa \p Version.
+Waitcnt decodeLoadcntDscnt(const IsaVersion &Version, unsigned LoadcntDscnt);
+
+/// \returns Decoded Waitcnt structure from given \p StorecntDscnt for given
+/// isa \p Version.
+Waitcnt decodeStorecntDscnt(const IsaVersion &Version, unsigned StorecntDscnt);
+
+/// \returns \p Loadcnt and \p Dscnt components of \p Decoded encoded as an
+/// immediate that can be used with S_WAIT_LOADCNT_DSCNT for given isa
+/// \p Version.
+unsigned encodeLoadcntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
+
+/// \returns \p Storecnt and \p Dscnt components of \p Decoded encoded as an
+/// immediate that can be used with S_WAIT_STORECNT_DSCNT for given isa
+/// \p Version.
+unsigned encodeStorecntDscnt(const IsaVersion &Version, const Waitcnt &Decoded);
+
namespace Hwreg {
LLVM_READONLY
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 99960c94e598..95a1d8696347 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -48,7 +48,7 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
let mayStore = 0;
let hasSideEffects = 0;
- let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+ let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP);
let mayRaiseFPException = ReadsModeReg;
@@ -585,7 +585,7 @@ class VOPProfile_Base_CVT_F32_F8<ValueType vt> : VOPProfileI2F <vt, i32> {
def VOPProfileCVT_F32_F8 : VOPProfile_Base_CVT_F32_F8 <f32>;
def VOPProfileCVT_PK_F32_F8 : VOPProfile_Base_CVT_F32_F8 <v2f32>;
-let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
+let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0,
SchedRW = [WriteFloatCvt] in {
defm V_CVT_F32_FP8 : VOP1Inst<"v_cvt_f32_fp8", VOPProfileCVT_F32_F8>;
defm V_CVT_F32_BF8 : VOP1Inst<"v_cvt_f32_bf8", VOPProfileCVT_F32_F8>;
@@ -705,7 +705,6 @@ class VOP1_DPP16_Gen<bits<8> op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p =
let DecoderNamespace = "DPP"#Gen.DecoderNamespace;
}
-
class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
VOP_DPP8<ps.OpName, p> {
let hasSideEffects = ps.hasSideEffects;
@@ -881,6 +880,7 @@ defm V_LOG_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x057, "v_log_f16"
defm V_EXP_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x058, "v_exp_f16">;
defm V_FREXP_MANT_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x059, "v_frexp_mant_f16">;
defm V_FREXP_EXP_I16_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05a, "v_frexp_exp_i16_f16">;
+defm V_FLOOR_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
defm V_FLOOR_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05b, "v_floor_f16">;
defm V_CEIL_F16_t16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
defm V_CEIL_F16_fake16 : VOP1_Real_FULL_t16_gfx11_gfx12<0x05c, "v_ceil_f16">;
@@ -1357,7 +1357,7 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
let AssemblerPredicate = isGFX940Plus, DecoderNamespace = "GFX9" in
defm V_MOV_B64 : VOP1_Real_gfx9 <0x38>;
-let OtherPredicates = [HasFP8Insts] in {
+let OtherPredicates = [HasFP8ConversionInsts] in {
defm V_CVT_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x54>;
defm V_CVT_F32_BF8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x55>;
defm V_CVT_PK_F32_FP8 : VOP1_Real_NoDstSel_SDWA_gfx9<0x56>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 48d4e259bc1c..27eec64f59a6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -69,7 +69,7 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
let mayStore = 0;
let hasSideEffects = 0;
- let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+ let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP);
let mayRaiseFPException = ReadsModeReg;
@@ -418,15 +418,11 @@ def VOP_MADMK_F16_t16 : VOP_MADMK <f16> {
}
def VOP_MADMK_F32 : VOP_MADMK <f32>;
-class getRegisterOperandForVT<ValueType VT> {
- RegisterOperand ret = RegisterOperand<getVregSrcForVT<VT>.ret>;
-}
-
// FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
// and processing time but it makes it easier to convert to mad.
class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, vt0]> {
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT>.ret:$src2);
- let Ins64 = getIns64<Src0RC64, Src1RC64, getRegisterOperandForVT<Src2VT>.ret, 3,
+ let Ins64 = getIns64<Src0RC64, Src1RC64, getVregSrcForVT<Src2VT>.ret, 3,
0, HasModifiers, HasModifiers, HasOMod,
Src0Mod, Src1Mod, Src2Mod>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
@@ -489,21 +485,21 @@ def VOP_MAC_F16_t16 : VOP_MAC <f16> {
let DstRC = VOPDstOperand<VGPR_32_Lo128>;
let DstRC64 = VOPDstOperand<VGPR_32>;
let Src1RC32 = VGPRSrc_32_Lo128;
- let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT_t16<Src2VT>.ret:$src2);
- let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
- let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret;
- let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret;
+ let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2);
+ let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
- getVregSrcForVT_t16<Src2VT>.ret:$src2, // stub argument
+ getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
- getVregSrcForVT_t16<Src2VT>.ret:$src2, // stub argument
+ getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret:$src2, // stub argument
dpp8:$dpp8, FI:$fi);
let Src2Mod = FP32InputMods; // dummy unused modifiers
let Src2RC64 = VGPRSrc_32; // stub argument
@@ -535,7 +531,7 @@ def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC<i32, i32> {
let Src0Mod = Int32InputMods;
let Src1Mod = Int32InputMods;
- let Ins64 = getIns64<Src0RC64, Src1RC64, getRegisterOperandForVT<Src2VT>.ret,
+ let Ins64 = getIns64<Src0RC64, Src1RC64, getVregSrcForVT<Src2VT>.ret,
3 /*NumSrcArgs*/, HasClamp, 1 /*HasModifiers*/,
1 /*HasSrc2Mods*/, HasOMod,
Src0Mod, Src1Mod, Src2Mod>.ret;
@@ -898,8 +894,8 @@ def LDEXP_F16_VOPProfile : VOPProfile <[f16, f16, f16, untyped]> {
}
def LDEXP_F16_VOPProfile_True16 : VOPProfile_Fake16<VOP_F16_F16_F16> {
let Src1RC32 = RegisterOperand<VGPR_32_Lo128>;
- let Src1DPP = VGPR_32_Lo128;
- let Src1ModDPP = IntT16VRegInputMods;
+ let Src1DPP = RegisterOperand<VGPR_32_Lo128>;
+ let Src1ModDPP = IntT16VRegInputMods</* IsFake16= */ 1>;
}
let isReMaterializable = 1 in {
@@ -2512,6 +2508,7 @@ defm V_FMAAK_F32 : VOP2_Real_MADK_gfx940 <0x18>;
}
multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : Base_VOP2_Real_e32e64_vi<op> {
+ let SubtargetPredicate = isGFX9Only in
def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
@@ -2520,22 +2517,28 @@ multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> :
VOP2_Real_dpp_gfx10<op>,
VOP2_Real_dpp8_gfx10<op>;
-let SubtargetPredicate = HasDot5Insts in {
+multiclass VOP2Only_Real_DOT_ACC_gfx10<bits<6> op> : VOP2_Real_dpp_gfx10<op>,
+ VOP2_Real_dpp8_gfx10<op> {
+ let IsSingle = 1 in
+ defm NAME : VOP2_Real_e32_gfx10<op>;
+}
+
+let OtherPredicates = [HasDot5Insts] in {
defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx9<0x37>;
// NB: Opcode conflicts with V_DOT8C_I32_I4
// This opcode exists in gfx 10.1* only
- defm V_DOT2C_F32_F16 : VOP2_Real_DOT_ACC_gfx10<0x02>;
+ defm V_DOT2C_F32_F16 : VOP2Only_Real_DOT_ACC_gfx10<0x02>;
}
-let SubtargetPredicate = HasDot6Insts in {
+let OtherPredicates = [HasDot6Insts] in {
defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx9<0x39>;
- defm V_DOT4C_I32_I8 : VOP2_Real_DOT_ACC_gfx10<0x0d>;
+ defm V_DOT4C_I32_I8 : VOP2Only_Real_DOT_ACC_gfx10<0x0d>;
}
-let SubtargetPredicate = HasDot4Insts in {
+let OtherPredicates = [HasDot4Insts] in {
defm V_DOT2C_I32_I16 : VOP2_Real_DOT_ACC_gfx9<0x38>;
}
-let SubtargetPredicate = HasDot3Insts in {
+let OtherPredicates = [HasDot3Insts] in {
defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx9<0x3a>;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index eebd323210f9..713b4712d563 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -600,7 +600,7 @@ defm V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32
let SubtargetPredicate = isGFX940Plus in
defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I64>>;
-let SubtargetPredicate = HasFP8Insts, mayRaiseFPException = 0,
+let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0,
SchedRW = [WriteFloatCvt] in {
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
@@ -1611,7 +1611,7 @@ defm V_CVT_PKNORM_U16_F16 : VOP3OpSel_Real_gfx9 <0x29a>;
defm V_LSHL_ADD_U64 : VOP3_Real_vi <0x208>;
-let OtherPredicates = [HasFP8Insts] in {
+let OtherPredicates = [HasFP8ConversionInsts] in {
defm V_CVT_PK_FP8_F32 : VOP3OpSel_Real_gfx9 <0x2a2>;
defm V_CVT_PK_BF8_F32 : VOP3OpSel_Real_gfx9 <0x2a3>;
defm V_CVT_SR_FP8_F32 : VOP3OpSel_Real_gfx9_forced_opsel2 <0x2a4>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index e9d6f67aee16..0c7a08cd4bc9 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -415,8 +415,8 @@ multiclass VOP3PDOTIUInst <string OpName, SDPatternOperator intrinsic_node> {
null_frag, 1>;
// Dot-iu instructions consider input as signed if imod neg bits are set. Thus
// Dot-iu Intrinsics have extra operands and require separate codegen pattern.
- def : GCNPat < (intrinsic_node (DotIUVOP3PMods i32:$src0_mods), i32:$src0,
- (DotIUVOP3PMods i32:$src1_mods), i32:$src1,
+ def : GCNPat < (intrinsic_node (VOP3PModsNeg i32:$src0_mods), i32:$src0,
+ (VOP3PModsNeg i32:$src1_mods), i32:$src1,
i32:$src2, (i1 timm:$clamp)),
(!cast<Instruction>(NAME) $src0_mods, i32:$src0,
$src1_mods, i32:$src1,
@@ -443,6 +443,48 @@ def : GCNPat < (int_amdgcn_sdot4 i32:$src0,
>;
} // End SubtargetPredicate = HasDot8Insts
+// Does not use opsel, no src_modifiers on src0 and src1.
+// src_modifiers on src2(f32) are f32 fneg(neg_lo[2]) and f32 fabs(neg_hi[2]).
+def VOP3P_DOTF8_Profile : VOP3P_Profile<VOPProfile <[f32, i32, i32, f32]>,
+ VOP3_PACKED, 1> {
+ let HasClamp = 0;
+ let HasOpSel = 0;
+ let HasOMod = 0;
+ let IsDOT = 1;
+ let HasSrc0Mods = 0;
+ let HasSrc1Mods = 0;
+ let HasSrc2Mods = 1;
+
+ let InsVOP3P = (ins VSrc_b32:$src0, VSrc_b32:$src1,
+ PackedF16InputMods:$src2_modifiers, VSrc_f32:$src2,
+ neg_lo0:$neg_lo, neg_hi0:$neg_hi);
+
+ let InsVOP3DPP8 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
+ PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
+ neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp8:$dpp8, FI:$fi);
+
+ let InsVOP3DPP16 = (ins DstRC:$old, VGPR_32:$src0, VRegSrc_32:$src1,
+ PackedF16InputMods:$src2_modifiers, VRegSrc_32:$src2,
+ neg_lo0:$neg_lo, neg_hi0:$neg_hi, dpp_ctrl:$dpp_ctrl,
+ row_mask:$row_mask, bank_mask:$bank_mask,
+ bound_ctrl:$bound_ctrl, FI:$fi);
+}
+
+multiclass VOP3PDOTF8Inst <string OpName, SDPatternOperator intrinsic_node> {
+ defm NAME : VOP3PInst<OpName, VOP3P_DOTF8_Profile, null_frag, 1>;
+
+ let SubtargetPredicate = isGFX12Plus in
+ def : GCNPat <(intrinsic_node i32:$src0, i32:$src1,
+ (VOP3Mods f32:$src2, i32:$src2_modifiers)),
+ (!cast<Instruction>(NAME) i32:$src0, i32:$src1,
+ i32:$src2_modifiers, f32:$src2)>;
+}
+
+defm V_DOT4_F32_FP8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_bf8", int_amdgcn_dot4_f32_fp8_bf8>;
+defm V_DOT4_F32_BF8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_fp8", int_amdgcn_dot4_f32_bf8_fp8>;
+defm V_DOT4_F32_FP8_FP8 : VOP3PDOTF8Inst<"v_dot4_f32_fp8_fp8", int_amdgcn_dot4_f32_fp8_fp8>;
+defm V_DOT4_F32_BF8_BF8 : VOP3PDOTF8Inst<"v_dot4_f32_bf8_bf8", int_amdgcn_dot4_f32_bf8_bf8>;
+
def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
@@ -593,13 +635,29 @@ class MAIFrag<SDPatternOperator Op, code pred> : PatFrag <
pred
>;
-let GISelPredicateCode = [{ return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }] in
-class AgprMAIFrag<SDPatternOperator Op> :
- MAIFrag<Op, [{ return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }]>;
+defvar MayNeedAGPRs = [{
+ return MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
+}];
+
+defvar MayNeedAGPRs_gisel = [{
+ return MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
+}];
+
+defvar MayNotNeedAGPRs = [{
+ return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
+}];
+
+defvar MayNotNeedAGPRs_gisel = [{
+ return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs();
+}];
-let GISelPredicateCode = [{ return !MF.getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }] in
-class VgprMAIFrag<SDPatternOperator Op> :
- MAIFrag<Op, [{ return !MF->getInfo<SIMachineFunctionInfo>()->mayNeedAGPRs(); }]>;
+class AgprMAIFrag<SDPatternOperator Op> : MAIFrag<Op, MayNeedAGPRs> {
+ let GISelPredicateCode = MayNeedAGPRs_gisel;
+}
+
+class VgprMAIFrag<SDPatternOperator Op> : MAIFrag<Op, MayNotNeedAGPRs> {
+ let GISelPredicateCode = MayNotNeedAGPRs_gisel;
+}
let SubtargetPredicate = HasMAIInsts in {
@@ -812,8 +870,8 @@ class WMMAOpSelPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
class WMMAUIClampPat<Instruction Inst, SDPatternOperator node, VOPProfile P> :
GCNPat < (P.DstVT (node
- (DotIUVOP3PMods i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0),
- (DotIUVOP3PMods i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1),
+ (VOP3PModsNeg i32:$src0_modifiers), (P.Src0VT P.Src0VT:$src0),
+ (VOP3PModsNeg i32:$src1_modifiers), (P.Src1VT P.Src1VT:$src1),
(P.Src2VT P.Src2VT:$src2), (i1 timm:$clamp)
)),
(P.DstVT (Inst i32:$src0_modifiers, P.Src0VT:$src0, i32:$src1_modifiers, P.Src1VT:$src1, (i32 8), P.Src2VT:$src2, i1:$clamp))
@@ -1003,6 +1061,11 @@ defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_m
defm V_PK_MINIMUM_F16 : VOP3P_Real_gfx12<0x1d>;
defm V_PK_MAXIMUM_F16 : VOP3P_Real_gfx12<0x1e>;
+defm V_DOT4_F32_FP8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x24>;
+defm V_DOT4_F32_BF8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x25>;
+defm V_DOT4_F32_FP8_FP8 : VOP3P_Realtriple<GFX12Gen, 0x26>;
+defm V_DOT4_F32_BF8_BF8 : VOP3P_Realtriple<GFX12Gen, 0x27>;
+
//===----------------------------------------------------------------------===//
// GFX11
//===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index e5b801048e6d..3ca97f0291e0 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -61,13 +61,13 @@ class VOPC_Profile<list<SchedReadWrite> sched, ValueType vt0, ValueType vt1 = vt
let AsmDPP16 = AsmDPP#"$fi";
// VOPC DPP Instructions do not need an old operand
let TieRegDPP = "";
- let InsDPP = getInsDPP<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
+ let InsDPP = getInsDPP<VOPDstOperand<Src0DPP.RegClass>, Src0DPP, Src1DPP, Src2DPP,
NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
Src2ModDPP, 0/*HasOld*/>.ret;
- let InsDPP16 = getInsDPP16<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
+ let InsDPP16 = getInsDPP16<VOPDstOperand<Src0DPP.RegClass>, Src0DPP, Src1DPP, Src2DPP,
NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
Src2ModDPP, 0/*HasOld*/>.ret;
- let InsDPP8 = getInsDPP8<VOPDstOperand<Src0DPP>, Src0DPP, Src1DPP, Src2DPP,
+ let InsDPP8 = getInsDPP8<VOPDstOperand<Src0DPP.RegClass>, Src0DPP, Src1DPP, Src2DPP,
NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP,
Src2ModDPP, 0/*HasOld*/>.ret;
@@ -88,10 +88,10 @@ multiclass VOPC_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, ValueType
def NAME : VOPC_Profile<sched, vt0, vt1>;
def _t16 : VOPC_Profile<sched, vt0, vt1> {
let IsTrue16 = 1;
- let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>;
- let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
- let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret;
- let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret;
+ let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
@@ -108,8 +108,8 @@ class VOPC_NoSdst_Profile<list<SchedReadWrite> sched, ValueType vt0,
let InsSDWA = (ins Src0ModSDWA:$src0_modifiers, Src0SDWA:$src0,
Src1ModSDWA:$src1_modifiers, Src1SDWA:$src1,
src0_sel:$src0_sel, src1_sel:$src1_sel);
- let AsmVOP3Base = !if(isFloatType<Src0VT>.ret, "$src0_modifiers, $src1_modifiers$clamp",
- "$src0, $src1");
+ let AsmVOP3Base = !if(Src0VT.isFP, "$src0_modifiers, $src1_modifiers$clamp",
+ "$src0, $src1");
let AsmSDWA9 = "$src0_modifiers, $src1_modifiers $src0_sel $src1_sel";
let EmitDst = 0;
}
@@ -118,10 +118,10 @@ multiclass VOPC_NoSdst_Profile_t16<list<SchedReadWrite> sched, ValueType vt0, Va
def NAME : VOPC_NoSdst_Profile<sched, vt0, vt1>;
def _t16 : VOPC_NoSdst_Profile<sched, vt0, vt1> {
let IsTrue16 = 1;
- let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>;
- let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
- let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret;
- let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret;
+ let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
@@ -146,7 +146,7 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[],
let mayStore = 0;
let hasSideEffects = 0;
- let ReadsModeReg = isFloatType<P.Src0VT>.ret;
+ let ReadsModeReg = P.Src0VT.isFP;
let VALU = 1;
let VOPC = 1;
@@ -789,11 +789,11 @@ multiclass VOPC_Class_Profile_t16<list<SchedReadWrite> sched> {
def NAME : VOPC_Class_Profile<sched, f16>;
def _t16 : VOPC_Class_Profile<sched, f16, i16> {
let IsTrue16 = 1;
- let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>;
+ let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1RC64 = VSrc_b32;
- let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
- let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret;
- let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret;
+ let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
@@ -816,11 +816,11 @@ multiclass VOPC_Class_NoSdst_Profile_t16<list<SchedReadWrite> sched> {
def NAME : VOPC_Class_NoSdst_Profile<sched, f16>;
def _t16 : VOPC_Class_NoSdst_Profile<sched, f16, i16> {
let IsTrue16 = 1;
- let Src1RC32 = RegisterOperand<getVregSrcForVT_t16<Src1VT>.ret>;
+ let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src1RC64 = VSrc_b32;
- let Src0DPP = getVregSrcForVT_t16<Src0VT>.ret;
- let Src1DPP = getVregSrcForVT_t16<Src1VT>.ret;
- let Src2DPP = getVregSrcForVT_t16<Src2VT>.ret;
+ let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
+ let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
let Src0ModDPP = getSrcModDPP_t16<Src0VT>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT>.ret;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
index c4b9e7063093..df505c3365cb 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -152,7 +152,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let ClampLo = P.HasClampLo;
let ClampHi = P.HasClampHi;
- let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+ let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP);
let mayRaiseFPException = ReadsModeReg;
let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
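
Context for the isFloatType<> → isFP rewrites in this hunk and in the SDWA/DPP hunks further down: ReadsModeReg, and with it mayRaiseFPException and the MODE implicit use, is derived from whether the destination or first source type is floating point. A minimal sketch of that derivation follows, written in plain C++ rather than TableGen, with hypothetical names (PseudoFlags, deriveFlags) that are not part of LLVM.

// Minimal sketch (hypothetical names, not LLVM code) of how ReadsModeReg and
// the implicit-use list are derived for FP VALU pseudos.
#include <string>
#include <vector>

struct PseudoFlags {
  bool ReadsModeReg;
  bool MayRaiseFPException;
  std::vector<std::string> Uses;
};

PseudoFlags deriveFlags(bool dstIsFP, bool src0IsFP) {
  PseudoFlags F;
  F.ReadsModeReg = dstIsFP || src0IsFP;       // !or(P.DstVT.isFP, P.Src0VT.isFP)
  F.MayRaiseFPException = F.ReadsModeReg;     // FP ops may raise FP exceptions
  F.Uses = F.ReadsModeReg ? std::vector<std::string>{"MODE", "EXEC"}
                          : std::vector<std::string>{"EXEC"};
  return F;
}
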
@@ -169,6 +169,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
VOP3_Pseudo<opName, P, pattern, 1> {
let VOP3P = 1;
+ let IsDOT = P.IsDOT;
}
class VOP_Real<VOP_Pseudo ps> {
@@ -387,7 +388,7 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
- let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2)
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2)
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
@@ -396,8 +397,8 @@ class VOP3Pe <bits<7> op, VOPProfile P> : Enc64 {
let Inst{40-32} = !if(P.HasSrc0, src0, 0);
let Inst{49-41} = !if(P.HasSrc1, src1, 0);
let Inst{58-50} = !if(P.HasSrc2, src2, 0);
- let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0)
- let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1)
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1)
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
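
The Inst{14}/{59}/{60} changes in this hunk (and the matching VOP3P_DPPe_Common_Base hunk below) give DOT instructions that lack op_sel operands a defined op_sel_hi encoding of 1 instead of leaving those bits unset. A minimal C++ sketch of that decision chain follows; opSelHiBit and its parameters are hypothetical stand-ins for the TableGen !if operands.

// Minimal sketch (hypothetical names) of the per-bit op_sel_hi selection:
//   !if(!and(HasSrcN, HasOpSel), srcN_modifiers{3}, !if(IsDOT, 1, ?))
#include <cstdint>
#include <optional>

std::optional<uint8_t> opSelHiBit(bool hasSrc, bool hasOpSel, bool isDOT,
                                  uint8_t srcModifiers) {
  if (hasSrc && hasOpSel)
    return (srcModifiers >> 3) & 1;  // take op_sel_hi from the modifier operand
  if (isDOT)
    return 1;                        // DOT ops now default op_sel_hi to 1
  return std::nullopt;               // otherwise the bit remains unspecified ('?')
}
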
@@ -599,7 +600,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
let VALU = 1;
let SDWA = 1;
- let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+ let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP);
let mayRaiseFPException = ReadsModeReg;
let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
@@ -772,12 +773,12 @@ class VOP3P_DPPe_Common_Base<bits<7> op, VOPProfile P> : Enc96 {
let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0)
let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1)
let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2)
- let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, ?); // op_sel_hi(2)
+ let Inst{14} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(2)
let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
let Inst{22-16} = op;
let Inst{31-23} = 0x198; // encoding
- let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, ?); // op_sel_hi(0)
- let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, ?); // op_sel_hi(1)
+ let Inst{59} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(0)
+ let Inst{60} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{3}, !if(P.IsDOT, 1, ?)); // op_sel_hi(1)
let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
@@ -811,7 +812,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[],
let DPP = 1;
let Size = 8;
- let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+ let ReadsModeReg = !or(P.DstVT.isFP, P.Src0VT.isFP);
let mayRaiseFPException = ReadsModeReg;
let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);