Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.h | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPU.td | 16
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 21
-rw-r--r--  lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 78
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 10
-rw-r--r--  lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 345
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 701
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallLowering.h | 29
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCallingConv.td | 27
-rw-r--r--  lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 12
-rw-r--r--  lib/Target/AMDGPU/AMDGPUFrameLowering.cpp | 6
-rw-r--r--  lib/Target/AMDGPU/AMDGPUFrameLowering.h | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUGISel.td | 78
-rw-r--r--  lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def | 80
-rw-r--r--  lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 9
-rw-r--r--  lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 272
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 250
-rw-r--r--  lib/Target/AMDGPU/AMDGPUISelLowering.h | 18
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInline.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstrInfo.td | 126
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 1144
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 48
-rw-r--r--  lib/Target/AMDGPU/AMDGPUInstructions.td | 216
-rw-r--r--  lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1132
-rw-r--r--  lib/Target/AMDGPU/AMDGPULegalizerInfo.h | 52
-rw-r--r--  lib/Target/AMDGPU/AMDGPULibCalls.cpp | 37
-rw-r--r--  lib/Target/AMDGPU/AMDGPULibFunc.cpp | 14
-rw-r--r--  lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 20
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMCInstLower.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 38
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 1
-rw-r--r--  lib/Target/AMDGPU/AMDGPUMachineFunction.h | 6
-rw-r--r--  lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp | 592
-rw-r--r--  lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 1234
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 56
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterBanks.td | 6
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterInfo.cpp | 66
-rw-r--r--  lib/Target/AMDGPU/AMDGPURegisterInfo.h | 2
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSearchableTables.td | 4
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 64
-rw-r--r--  lib/Target/AMDGPU/AMDGPUSubtarget.h | 67
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 29
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 90
-rw-r--r--  lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 20
-rw-r--r--  lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 8
-rw-r--r--  lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 595
-rw-r--r--  lib/Target/AMDGPU/BUFInstructions.td | 694
-rw-r--r--  lib/Target/AMDGPU/DSInstructions.td | 92
-rw-r--r--  lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/EvergreenInstructions.td | 60
-rw-r--r--  lib/Target/AMDGPU/FLATInstructions.td | 196
-rw-r--r--  lib/Target/AMDGPU/GCNDPPCombine.cpp | 88
-rw-r--r--  lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 21
-rw-r--r--  lib/Target/AMDGPU/GCNILPSched.cpp | 1
-rw-r--r--  lib/Target/AMDGPU/GCNIterativeScheduler.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/GCNNSAReassign.cpp | 8
-rw-r--r--  lib/Target/AMDGPU/GCNRegBankReassign.cpp | 14
-rw-r--r--  lib/Target/AMDGPU/GCNRegPressure.cpp | 26
-rw-r--r--  lib/Target/AMDGPU/GCNRegPressure.h | 2
-rw-r--r--  lib/Target/AMDGPU/GCNSchedStrategy.cpp | 31
-rw-r--r--  lib/Target/AMDGPU/GCNSchedStrategy.h | 3
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 37
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 6
-rw-r--r--  lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/MIMGInstructions.td | 4
-rw-r--r--  lib/Target/AMDGPU/R600AsmPrinter.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp | 22
-rw-r--r--  lib/Target/AMDGPU/R600FrameLowering.h | 6
-rw-r--r--  lib/Target/AMDGPU/R600ISelLowering.cpp | 7
-rw-r--r--  lib/Target/AMDGPU/R600InstrInfo.cpp | 22
-rw-r--r--  lib/Target/AMDGPU/R600MachineScheduler.cpp | 8
-rw-r--r--  lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp | 12
-rw-r--r--  lib/Target/AMDGPU/R600Packetizer.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/R600RegisterInfo.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/SIAddIMGInit.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/SIDefines.h | 6
-rw-r--r--  lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 394
-rw-r--r--  lib/Target/AMDGPU/SIFixupVectorISel.cpp | 3
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp | 114
-rw-r--r--  lib/Target/AMDGPU/SIFormMemoryClauses.cpp | 22
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.cpp | 34
-rw-r--r--  lib/Target/AMDGPU/SIFrameLowering.h | 6
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.cpp | 1052
-rw-r--r--  lib/Target/AMDGPU/SIISelLowering.h | 54
-rw-r--r--  lib/Target/AMDGPU/SIInsertWaitcnts.cpp | 6
-rw-r--r--  lib/Target/AMDGPU/SIInstrFormats.td | 5
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.cpp | 558
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.h | 44
-rw-r--r--  lib/Target/AMDGPU/SIInstrInfo.td | 320
-rw-r--r--  lib/Target/AMDGPU/SIInstructions.td | 297
-rw-r--r--  lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 1117
-rw-r--r--  lib/Target/AMDGPU/SILowerControlFlow.cpp | 60
-rw-r--r--  lib/Target/AMDGPU/SILowerI1Copies.cpp | 49
-rw-r--r--  lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 4
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.cpp | 9
-rw-r--r--  lib/Target/AMDGPU/SIMachineFunctionInfo.h | 11
-rw-r--r--  lib/Target/AMDGPU/SIMachineScheduler.cpp | 16
-rw-r--r--  lib/Target/AMDGPU/SIMemoryLegalizer.cpp | 6
-rw-r--r--  lib/Target/AMDGPU/SIModeRegister.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/SIOptimizeExecMasking.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp | 32
-rw-r--r--  lib/Target/AMDGPU/SIPeepholeSDWA.cpp | 32
-rw-r--r--  lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp | 12
-rw-r--r--  lib/Target/AMDGPU/SIProgramInfo.h | 5
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.cpp | 445
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.h | 15
-rw-r--r--  lib/Target/AMDGPU/SIRegisterInfo.td | 464
-rw-r--r--  lib/Target/AMDGPU/SIShrinkInstructions.cpp | 42
-rw-r--r--  lib/Target/AMDGPU/SIWholeQuadMode.cpp | 35
-rw-r--r--  lib/Target/AMDGPU/SMInstructions.td | 15
-rw-r--r--  lib/Target/AMDGPU/SOPInstructions.td | 42
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 71
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 24
-rw-r--r--  lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp | 2
-rw-r--r--  lib/Target/AMDGPU/VOP1Instructions.td | 65
-rw-r--r--  lib/Target/AMDGPU/VOP2Instructions.td | 346
-rw-r--r--  lib/Target/AMDGPU/VOP3Instructions.td | 69
-rw-r--r--  lib/Target/AMDGPU/VOP3PInstructions.td | 2
-rw-r--r--  lib/Target/AMDGPU/VOPCInstructions.td | 30
-rw-r--r--  lib/Target/AMDGPU/VOPInstructions.td | 12
126 files changed, 9840 insertions(+), 5142 deletions(-)
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 19a8bd901629..b64422ae5427 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -188,6 +188,10 @@ ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
ModulePass *createR600OpenCLImageTypeLoweringPass();
FunctionPass *createAMDGPUAnnotateUniformValues();
+ModulePass *createAMDGPUPrintfRuntimeBinding();
+void initializeAMDGPUPrintfRuntimeBindingPass(PassRegistry&);
+extern char &AMDGPUPrintfRuntimeBindingID;
+
ModulePass* createAMDGPUUnifyMetadataPass();
void initializeAMDGPUUnifyMetadataPass(PassRegistry&);
extern char &AMDGPUUnifyMetadataID;
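The three declarations added above (factory function, pass initializer, and pass ID) follow the usual LLVM legacy-pass-manager pattern. The sketch below is illustrative only — the class body, pass-name string, and description are assumptions, not taken from this commit — but it shows how such a trio is typically wired up in the pass's .cpp:

// Illustrative sketch of the legacy-PM registration behind a
// createX()/initializeXPass()/XID declaration trio. Not the commit's code.
#include "AMDGPU.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
using namespace llvm;

namespace {
class AMDGPUPrintfRuntimeBinding : public ModulePass {
public:
  static char ID;
  AMDGPUPrintfRuntimeBinding() : ModulePass(ID) {}
  // Placeholder body: the real pass rewrites printf calls; omitted here.
  bool runOnModule(Module &M) override { return false; }
};
} // end anonymous namespace

char AMDGPUPrintfRuntimeBinding::ID = 0;
char &llvm::AMDGPUPrintfRuntimeBindingID = AMDGPUPrintfRuntimeBinding::ID;

// Registers the pass and defines initializeAMDGPUPrintfRuntimeBindingPass().
// The command-line name and description strings here are made up.
INITIALIZE_PASS(AMDGPUPrintfRuntimeBinding, "amdgpu-printf-runtime-binding",
                "AMDGPU printf lowering (illustrative)", false, false)

ModulePass *llvm::createAMDGPUPrintfRuntimeBinding() {
  return new AMDGPUPrintfRuntimeBinding();
}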
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index baeba534012c..42b477e07b3b 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -10,6 +10,15 @@ include "llvm/TableGen/SearchableTable.td"
include "llvm/Target/Target.td"
include "AMDGPUFeatures.td"
+def p0 : PtrValueType<i64, 0>;
+def p1 : PtrValueType<i64, 1>;
+def p2 : PtrValueType<i32, 2>;
+def p3 : PtrValueType<i32, 3>;
+def p4 : PtrValueType<i64, 4>;
+def p5 : PtrValueType<i32, 5>;
+def p6 : PtrValueType<i32, 6>;
+
+
class BoolToList<bit Value> {
list<int> ret = !if(Value, [1]<int>, []<int>);
}
@@ -145,6 +154,12 @@ def FeatureLdsMisalignedBug : SubtargetFeature<"lds-misaligned-bug",
"Some GFX10 bug with misaligned multi-dword LDS access in WGP mode"
>;
+def FeatureMFMAInlineLiteralBug : SubtargetFeature<"mfma-inline-literal-bug",
+ "HasMFMAInlineLiteralBug",
+ "true",
+ "MFMA cannot use inline literal as SrcC"
+>;
+
def FeatureVcmpxPermlaneHazard : SubtargetFeature<"vcmpx-permlane-hazard",
"HasVcmpxPermlaneHazard",
"true",
@@ -802,6 +817,7 @@ def FeatureISAVersion9_0_8 : FeatureSet<
FeaturePkFmacF16Inst,
FeatureAtomicFaddInsts,
FeatureSRAMECC,
+ FeatureMFMAInlineLiteralBug,
FeatureCodeObjectV3]>;
def FeatureISAVersion9_0_9 : FeatureSet<
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 419ebb2240ad..e72b3f4fde63 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -173,6 +173,9 @@ static StringRef intrinsicToAttrName(Intrinsic::ID ID,
case Intrinsic::amdgcn_implicitarg_ptr:
return "amdgpu-implicitarg-ptr";
case Intrinsic::amdgcn_queue_ptr:
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private:
+ // TODO: Does not require queue ptr on gfx9+
case Intrinsic::trap:
case Intrinsic::debugtrap:
IsQueuePtr = true;
@@ -194,18 +197,12 @@ static bool handleAttr(Function &Parent, const Function &Callee,
static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
bool &NeedQueuePtr) {
// X ids unnecessarily propagated to kernels.
- static const StringRef AttrNames[] = {
- { "amdgpu-work-item-id-x" },
- { "amdgpu-work-item-id-y" },
- { "amdgpu-work-item-id-z" },
- { "amdgpu-work-group-id-x" },
- { "amdgpu-work-group-id-y" },
- { "amdgpu-work-group-id-z" },
- { "amdgpu-dispatch-ptr" },
- { "amdgpu-dispatch-id" },
- { "amdgpu-kernarg-segment-ptr" },
- { "amdgpu-implicitarg-ptr" }
- };
+ static constexpr StringLiteral AttrNames[] = {
+ "amdgpu-work-item-id-x", "amdgpu-work-item-id-y",
+ "amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
+ "amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
+ "amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
+ "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
diff --git a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index 097730441ed8..f0e7ee910f95 100644
--- a/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -48,8 +48,8 @@ public:
return ArgDescriptor(Reg, Mask, false, true);
}
- static ArgDescriptor createStack(Register Reg, unsigned Mask = ~0u) {
- return ArgDescriptor(Reg, Mask, true, true);
+ static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
+ return ArgDescriptor(Offset, Mask, true, true);
}
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 743ac64b8f10..f2d903c8e7b1 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -229,7 +229,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
// alignment.
Streamer.EmitValueToAlignment(64, 0, 1, 0);
if (ReadOnlySection.getAlignment() < 64)
- ReadOnlySection.setAlignment(64);
+ ReadOnlySection.setAlignment(Align(64));
const MCSubtargetInfo &STI = MF->getSubtarget();
@@ -273,7 +273,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
AsmPrinter::EmitFunctionEntryLabel();
}
-void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
// Write a line for the basic block label if it is not only fallthrough.
DisasmLines.push_back(
@@ -342,6 +342,8 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
uint32_t NumVGPR,
+ Optional<uint32_t> NumAGPR,
+ uint32_t TotalNumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
uint64_t CodeSize,
@@ -349,6 +351,11 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments(
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
+ if (NumAGPR) {
+ OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
+ OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
+ false);
+ }
OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
false);
@@ -417,7 +424,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// The starting address of all shader programs must be 256 bytes aligned.
// Regular functions just need the basic required instruction alignment.
- MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);
+ MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
SetupMachineFunction(MF);
@@ -474,6 +481,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
emitCommonFunctionComments(
Info.NumVGPR,
+ STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
+ Info.getTotalNumVGPRs(STM),
Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
Info.PrivateSegmentSize,
getFunctionCodeSize(MF), MFI);
@@ -481,7 +490,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
OutStreamer->emitRawComment(" Kernel info:", false);
- emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+ emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR,
+ STM.hasMAIInsts()
+ ? CurrentProgramInfo.NumAccVGPR
+ : Optional<uint32_t>(),
+ CurrentProgramInfo.NumVGPR,
CurrentProgramInfo.NumSGPR,
CurrentProgramInfo.ScratchSize,
getFunctionCodeSize(MF), MFI);
@@ -507,6 +520,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
OutStreamer->emitRawComment(
+ " Occupancy: " +
+ Twine(CurrentProgramInfo.Occupancy), false);
+
+ OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
OutStreamer->emitRawComment(
@@ -588,6 +605,11 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
UsesVCC, UsesFlatScratch);
}
+int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
+ const GCNSubtarget &ST) const {
+ return std::max(NumVGPR, NumAGPR);
+}
+
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
const MachineFunction &MF) const {
SIFunctionResourceInfo Info;
@@ -634,11 +656,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
HighestVGPRReg = Reg;
break;
}
- MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg);
- if (MRI.isPhysRegUsed(AReg)) {
- HighestVGPRReg = AReg;
- break;
+ }
+
+ if (ST.hasMAIInsts()) {
+ MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestAGPRReg = Reg;
+ break;
+ }
}
+ Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestAGPRReg) + 1;
}
MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
@@ -660,6 +689,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
}
int32_t MaxVGPR = -1;
+ int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
uint64_t CalleeFrameSize = 0;
@@ -669,11 +699,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
+ bool IsAGPR = false;
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
switch (Reg) {
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
@@ -744,6 +775,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
@@ -755,6 +787,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
@@ -771,6 +804,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 4;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
@@ -790,6 +824,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
@@ -799,6 +834,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 32;
} else {
llvm_unreachable("Unknown register class");
@@ -807,6 +843,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+ } else if (IsAGPR) {
+ MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
@@ -828,6 +866,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
+ MaxAGPR = std::max(MaxAGPR, 23);
CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
Info.UsesVCC = true;
@@ -852,6 +891,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+ MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
CalleeFrameSize
= std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
Info.UsesVCC |= I->second.UsesVCC;
@@ -868,6 +908,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
+ Info.NumAGPR = MaxAGPR + 1;
Info.PrivateSegmentSize += CalleeFrameSize;
return Info;
@@ -876,8 +917,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- ProgInfo.NumVGPR = Info.NumVGPR;
+ ProgInfo.NumArchVGPR = Info.NumVGPR;
+ ProgInfo.NumAccVGPR = Info.NumAGPR;
+ ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
ProgInfo.NumSGPR = Info.NumExplicitSGPR;
ProgInfo.ScratchSize = Info.PrivateSegmentSize;
ProgInfo.VCCUsed = Info.UsesVCC;
@@ -890,7 +934,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
MF.getFunction().getContext().diagnose(DiagStackSize);
}
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
@@ -1057,6 +1100,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
S_00B84C_EXCP_EN(0);
+
+ ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize,
+ ProgInfo.NumSGPRsForWavesPerEU,
+ ProgInfo.NumVGPRsForWavesPerEU);
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
@@ -1214,17 +1261,16 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (STM.isXNACKEnabled())
Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
- unsigned MaxKernArgAlign;
+ Align MaxKernArgAlign;
Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
- // These alignment values are specified in powers of two, so alignment =
- // 2^n. The minimum alignment is 2^4 = 16.
- Out.kernarg_segment_alignment = std::max<size_t>(4,
- countTrailingZeros(MaxKernArgAlign));
+ // kernarg_segment_alignment is specified as log of the alignment.
+ // The minimum alignment is 16.
+ Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
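The last hunk above replaces the countTrailingZeros-based computation with Log2(std::max(Align(16), MaxKernArgAlign)); both encode the kernarg segment alignment as a base-2 logarithm with a 16-byte floor. A minimal standalone check of that arithmetic, using plain integers instead of llvm::Align (the helper name is made up for illustration):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Encode a kernarg segment alignment the way the hunk above does:
// clamp to at least 16 bytes, then store log2 of the (power-of-two) result.
static uint32_t encodeKernargAlign(uint64_t maxKernArgAlign) {
  uint64_t align = std::max<uint64_t>(16, maxKernArgAlign);
  uint32_t log2 = 0;
  while ((uint64_t(1) << log2) < align)
    ++log2;
  return log2;
}

int main() {
  assert(encodeKernargAlign(0) == 4);  // floor of 16 bytes -> 2^4
  assert(encodeKernargAlign(16) == 4);
  assert(encodeKernargAlign(64) == 6); // 64-byte alignment -> 2^6
  return 0;
}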
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index cf77034329ef..c50c19a4609c 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -43,6 +43,7 @@ private:
// Track the number of explicitly used VGPRs. Special registers reserved at
// the end are tracked separately.
int32_t NumVGPR = 0;
+ int32_t NumAGPR = 0;
int32_t NumExplicitSGPR = 0;
uint64_t PrivateSegmentSize = 0;
bool UsesVCC = false;
@@ -51,6 +52,7 @@ private:
bool HasRecursion = false;
int32_t getTotalNumSGPRs(const GCNSubtarget &ST) const;
+ int32_t getTotalNumVGPRs(const GCNSubtarget &ST) const;
};
SIProgramInfo CurrentProgramInfo;
@@ -77,6 +79,8 @@ private:
void EmitPALMetadata(const MachineFunction &MF,
const SIProgramInfo &KernelInfo);
void emitCommonFunctionComments(uint32_t NumVGPR,
+ Optional<uint32_t> NumAGPR,
+ uint32_t TotalNumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
uint64_t CodeSize,
@@ -125,7 +129,7 @@ public:
void EmitFunctionEntryLabel() override;
- void EmitBasicBlockStart(const MachineBasicBlock &MBB) const override;
+ void EmitBasicBlockStart(const MachineBasicBlock &MBB) override;
void EmitGlobalVariable(const GlobalVariable *GV) override;
@@ -140,8 +144,8 @@ public:
const char *ExtraCode, raw_ostream &O) override;
protected:
- mutable std::vector<std::string> DisasmLines, HexLines;
- mutable size_t DisasmLineMaxLen;
+ std::vector<std::string> DisasmLines, HexLines;
+ size_t DisasmLineMaxLen;
};
} // end namespace llvm

diff --git a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 8a92e7d923fb..ba8343142c63 100644
--- a/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
@@ -24,20 +25,10 @@
#define DEBUG_TYPE "amdgpu-atomic-optimizer"
using namespace llvm;
+using namespace llvm::AMDGPU;
namespace {
-enum DPP_CTRL {
- DPP_ROW_SR1 = 0x111,
- DPP_ROW_SR2 = 0x112,
- DPP_ROW_SR3 = 0x113,
- DPP_ROW_SR4 = 0x114,
- DPP_ROW_SR8 = 0x118,
- DPP_WF_SR1 = 0x138,
- DPP_ROW_BCAST15 = 0x142,
- DPP_ROW_BCAST31 = 0x143
-};
-
struct ReplacementInfo {
Instruction *I;
AtomicRMWInst::BinOp Op;
@@ -52,9 +43,12 @@ private:
const LegacyDivergenceAnalysis *DA;
const DataLayout *DL;
DominatorTree *DT;
- bool HasDPP;
+ const GCNSubtarget *ST;
bool IsPixelShader;
+ Value *buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op, Value *V,
+ Value *const Identity) const;
+ Value *buildShiftRight(IRBuilder<> &B, Value *V, Value *const Identity) const;
void optimizeAtomic(Instruction &I, AtomicRMWInst::BinOp Op, unsigned ValIdx,
bool ValDivergent) const;
@@ -93,8 +87,7 @@ bool AMDGPUAtomicOptimizer::runOnFunction(Function &F) {
DT = DTW ? &DTW->getDomTree() : nullptr;
const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
- const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
- HasDPP = ST.hasDPP();
+ ST = &TM.getSubtarget<GCNSubtarget>(F);
IsPixelShader = F.getCallingConv() == CallingConv::AMDGPU_PS;
visit(F);
@@ -142,17 +135,18 @@ void AMDGPUAtomicOptimizer::visitAtomicRMWInst(AtomicRMWInst &I) {
// If the pointer operand is divergent, then each lane is doing an atomic
// operation on a different address, and we cannot optimize that.
- if (DA->isDivergent(I.getOperand(PtrIdx))) {
+ if (DA->isDivergentUse(&I.getOperandUse(PtrIdx))) {
return;
}
- const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+ const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32
// bits.
- if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+ if (ValDivergent &&
+ (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
return;
}
@@ -219,20 +213,21 @@ void AMDGPUAtomicOptimizer::visitIntrinsicInst(IntrinsicInst &I) {
const unsigned ValIdx = 0;
- const bool ValDivergent = DA->isDivergent(I.getOperand(ValIdx));
+ const bool ValDivergent = DA->isDivergentUse(&I.getOperandUse(ValIdx));
// If the value operand is divergent, each lane is contributing a different
// value to the atomic calculation. We can only optimize divergent values if
// we have DPP available on our subtarget, and the atomic operation is 32
// bits.
- if (ValDivergent && (!HasDPP || (DL->getTypeSizeInBits(I.getType()) != 32))) {
+ if (ValDivergent &&
+ (!ST->hasDPP() || DL->getTypeSizeInBits(I.getType()) != 32)) {
return;
}
// If any of the other arguments to the intrinsic are divergent, we can't
// optimize the operation.
for (unsigned Idx = 1; Idx < I.getNumOperands(); Idx++) {
- if (DA->isDivergent(I.getOperand(Idx))) {
+ if (DA->isDivergentUse(&I.getOperandUse(Idx))) {
return;
}
}
@@ -282,6 +277,111 @@ static Value *buildNonAtomicBinOp(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
return B.CreateSelect(Cond, LHS, RHS);
}
+// Use the builder to create an inclusive scan of V across the wavefront, with
+// all lanes active.
+Value *AMDGPUAtomicOptimizer::buildScan(IRBuilder<> &B, AtomicRMWInst::BinOp Op,
+ Value *V, Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+ Function *PermLaneX16 =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_permlanex16, {});
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+
+ for (unsigned Idx = 0; Idx < 4; Idx++) {
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_SHR0 | 1 << Idx),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+ }
+ if (ST->hasDPPBroadcasts()) {
+ // GFX9 has DPP row broadcast operations.
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::BCAST15), B.getInt32(0xa),
+ B.getInt32(0xf), B.getFalse()}));
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::BCAST31), B.getInt32(0xc),
+ B.getInt32(0xf), B.getFalse()}));
+ } else {
+ // On GFX10 all DPP operations are confined to a single row. To get cross-
+ // row operations we have to use permlane or readlane.
+
+ // Combine lane 15 into lanes 16..31 (and, for wave 64, lane 47 into lanes
+ // 48..63).
+ Value *const PermX =
+ B.CreateCall(PermLaneX16, {V, V, B.getInt32(-1), B.getInt32(-1),
+ B.getFalse(), B.getFalse()});
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, PermX, B.getInt32(DPP::QUAD_PERM_ID),
+ B.getInt32(0xa), B.getInt32(0xf), B.getFalse()}));
+ if (!ST->isWave32()) {
+ // Combine lane 31 into lanes 32..63.
+ Value *const Lane31 = B.CreateCall(ReadLane, {V, B.getInt32(31)});
+ V = buildNonAtomicBinOp(
+ B, Op, V,
+ B.CreateCall(UpdateDPP,
+ {Identity, Lane31, B.getInt32(DPP::QUAD_PERM_ID),
+ B.getInt32(0xc), B.getInt32(0xf), B.getFalse()}));
+ }
+ }
+ return V;
+}
+
+// Use the builder to create a shift right of V across the wavefront, with all
+// lanes active, to turn an inclusive scan into an exclusive scan.
+Value *AMDGPUAtomicOptimizer::buildShiftRight(IRBuilder<> &B, Value *V,
+ Value *const Identity) const {
+ Type *const Ty = V->getType();
+ Module *M = B.GetInsertBlock()->getModule();
+ Function *UpdateDPP =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_update_dpp, Ty);
+ Function *ReadLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_readlane, {});
+ Function *WriteLane =
+ Intrinsic::getDeclaration(M, Intrinsic::amdgcn_writelane, {});
+
+ if (ST->hasDPPWavefrontShifts()) {
+ // GFX9 has DPP wavefront shift operations.
+ V = B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::WAVE_SHR1), B.getInt32(0xf),
+ B.getInt32(0xf), B.getFalse()});
+ } else {
+ // On GFX10 all DPP operations are confined to a single row. To get cross-
+ // row operations we have to use permlane or readlane.
+ Value *Old = V;
+ V = B.CreateCall(UpdateDPP,
+ {Identity, V, B.getInt32(DPP::ROW_SHR0 + 1),
+ B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
+
+ // Copy the old lane 15 to the new lane 16.
+ V = B.CreateCall(WriteLane, {B.CreateCall(ReadLane, {Old, B.getInt32(15)}),
+ B.getInt32(16), V});
+
+ if (!ST->isWave32()) {
+ // Copy the old lane 31 to the new lane 32.
+ V = B.CreateCall(
+ WriteLane,
+ {B.CreateCall(ReadLane, {Old, B.getInt32(31)}), B.getInt32(32), V});
+
+ // Copy the old lane 47 to the new lane 48.
+ V = B.CreateCall(
+ WriteLane,
+ {B.CreateCall(ReadLane, {Old, B.getInt32(47)}), B.getInt32(48), V});
+ }
+ }
+
+ return V;
+}
+
static APInt getIdentityValueForAtomicOp(AtomicRMWInst::BinOp Op,
unsigned BitWidth) {
switch (Op) {
@@ -345,23 +445,29 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
+ Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
CallInst *const Ballot = B.CreateIntrinsic(
- Intrinsic::amdgcn_icmp, {B.getInt64Ty(), B.getInt32Ty()},
+ Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()},
{B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});
// We need to know how many lanes are active within the wavefront that are
// below us. If we counted each lane linearly starting from 0, a lane is
// below us only if its associated index was less than ours. We do this by
// using the mbcnt intrinsic.
- Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
- Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
- Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
- CallInst *const PartialMbcnt = B.CreateIntrinsic(
- Intrinsic::amdgcn_mbcnt_lo, {}, {ExtractLo, B.getInt32(0)});
- Value *const Mbcnt =
- B.CreateIntCast(B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {},
- {ExtractHi, PartialMbcnt}),
- Ty, false);
+ Value *Mbcnt;
+ if (ST->isWave32()) {
+ Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ {Ballot, B.getInt32(0)});
+ } else {
+ Value *const BitCast = B.CreateBitCast(Ballot, VecTy);
+ Value *const ExtractLo = B.CreateExtractElement(BitCast, B.getInt32(0));
+ Value *const ExtractHi = B.CreateExtractElement(BitCast, B.getInt32(1));
+ Mbcnt = B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {},
+ {ExtractLo, B.getInt32(0)});
+ Mbcnt =
+ B.CreateIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {}, {ExtractHi, Mbcnt});
+ }
+ Mbcnt = B.CreateIntCast(Mbcnt, Ty, false);
Value *const Identity = B.getInt(getIdentityValueForAtomicOp(Op, TyBitWidth));
@@ -373,47 +479,25 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
if (ValDivergent) {
// First we need to set all inactive invocations to the identity value, so
// that they can correctly contribute to the final result.
- CallInst *const SetInactive =
- B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
-
- CallInst *const FirstDPP =
- B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, Ty,
- {Identity, SetInactive, B.getInt32(DPP_WF_SR1),
- B.getInt32(0xf), B.getInt32(0xf), B.getFalse()});
- ExclScan = FirstDPP;
-
- const unsigned Iters = 7;
- const unsigned DPPCtrl[Iters] = {
- DPP_ROW_SR1, DPP_ROW_SR2, DPP_ROW_SR3, DPP_ROW_SR4,
- DPP_ROW_SR8, DPP_ROW_BCAST15, DPP_ROW_BCAST31};
- const unsigned RowMask[Iters] = {0xf, 0xf, 0xf, 0xf, 0xf, 0xa, 0xc};
- const unsigned BankMask[Iters] = {0xf, 0xf, 0xf, 0xe, 0xc, 0xf, 0xf};
-
- // This loop performs an exclusive scan across the wavefront, with all lanes
- // active (by using the WWM intrinsic).
- for (unsigned Idx = 0; Idx < Iters; Idx++) {
- Value *const UpdateValue = Idx < 3 ? FirstDPP : ExclScan;
- CallInst *const DPP = B.CreateIntrinsic(
- Intrinsic::amdgcn_update_dpp, Ty,
- {Identity, UpdateValue, B.getInt32(DPPCtrl[Idx]),
- B.getInt32(RowMask[Idx]), B.getInt32(BankMask[Idx]), B.getFalse()});
-
- ExclScan = buildNonAtomicBinOp(B, Op, ExclScan, DPP);
- }
+ NewV = B.CreateIntrinsic(Intrinsic::amdgcn_set_inactive, Ty, {V, Identity});
- NewV = buildNonAtomicBinOp(B, Op, SetInactive, ExclScan);
+ const AtomicRMWInst::BinOp ScanOp =
+ Op == AtomicRMWInst::Sub ? AtomicRMWInst::Add : Op;
+ NewV = buildScan(B, ScanOp, NewV, Identity);
+ ExclScan = buildShiftRight(B, NewV, Identity);
// Read the value from the last lane, which has accumulated the values of
// each active lane in the wavefront. This will be our new value which we
// will provide to the atomic operation.
+ Value *const LastLaneIdx = B.getInt32(ST->getWavefrontSize() - 1);
if (TyBitWidth == 64) {
Value *const ExtractLo = B.CreateTrunc(NewV, B.getInt32Ty());
Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(NewV, B.getInt64(32)), B.getInt32Ty());
+ B.CreateTrunc(B.CreateLShr(NewV, 32), B.getInt32Ty());
CallInst *const ReadLaneLo = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractLo, B.getInt32(63)});
+ Intrinsic::amdgcn_readlane, {}, {ExtractLo, LastLaneIdx});
CallInst *const ReadLaneHi = B.CreateIntrinsic(
- Intrinsic::amdgcn_readlane, {}, {ExtractHi, B.getInt32(63)});
+ Intrinsic::amdgcn_readlane, {}, {ExtractHi, LastLaneIdx});
Value *const PartialInsert = B.CreateInsertElement(
UndefValue::get(VecTy), ReadLaneLo, B.getInt32(0));
Value *const Insert =
@@ -421,7 +505,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
NewV = B.CreateBitCast(Insert, Ty);
} else if (TyBitWidth == 32) {
NewV = B.CreateIntrinsic(Intrinsic::amdgcn_readlane, {},
- {NewV, B.getInt32(63)});
+ {NewV, LastLaneIdx});
} else {
llvm_unreachable("Unhandled atomic bit width");
}
@@ -493,77 +577,80 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// original instruction.
B.SetInsertPoint(&I);
- // Create a PHI node to get our new atomic result into the exit block.
- PHINode *const PHI = B.CreatePHI(Ty, 2);
- PHI->addIncoming(UndefValue::get(Ty), EntryBB);
- PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
-
- // We need to broadcast the value who was the lowest active lane (the first
- // lane) to all other lanes in the wavefront. We use an intrinsic for this,
- // but have to handle 64-bit broadcasts with two calls to this intrinsic.
- Value *BroadcastI = nullptr;
-
- if (TyBitWidth == 64) {
- Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
- Value *const ExtractHi =
- B.CreateTrunc(B.CreateLShr(PHI, B.getInt64(32)), B.getInt32Ty());
- CallInst *const ReadFirstLaneLo =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
- CallInst *const ReadFirstLaneHi =
- B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
- Value *const PartialInsert = B.CreateInsertElement(
- UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
- Value *const Insert =
- B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
- BroadcastI = B.CreateBitCast(Insert, Ty);
- } else if (TyBitWidth == 32) {
-
- BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
- } else {
- llvm_unreachable("Unhandled atomic bit width");
- }
+ const bool NeedResult = !I.use_empty();
+ if (NeedResult) {
+ // Create a PHI node to get our new atomic result into the exit block.
+ PHINode *const PHI = B.CreatePHI(Ty, 2);
+ PHI->addIncoming(UndefValue::get(Ty), EntryBB);
+ PHI->addIncoming(NewI, SingleLaneTerminator->getParent());
- // Now that we have the result of our single atomic operation, we need to
- // get our individual lane's slice into the result. We use the lane offset we
- // previously calculated combined with the atomic result value we got from the
- // first lane, to get our lane's index into the atomic result.
- Value *LaneOffset = nullptr;
- if (ValDivergent) {
- LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
- } else {
- switch (Op) {
- default:
- llvm_unreachable("Unhandled atomic op");
- case AtomicRMWInst::Add:
- case AtomicRMWInst::Sub:
- LaneOffset = B.CreateMul(V, Mbcnt);
- break;
- case AtomicRMWInst::And:
- case AtomicRMWInst::Or:
- case AtomicRMWInst::Max:
- case AtomicRMWInst::Min:
- case AtomicRMWInst::UMax:
- case AtomicRMWInst::UMin:
- LaneOffset = B.CreateSelect(Cond, Identity, V);
- break;
- case AtomicRMWInst::Xor:
- LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
- break;
+ // We need to broadcast the value who was the lowest active lane (the first
+ // lane) to all other lanes in the wavefront. We use an intrinsic for this,
+ // but have to handle 64-bit broadcasts with two calls to this intrinsic.
+ Value *BroadcastI = nullptr;
+
+ if (TyBitWidth == 64) {
+ Value *const ExtractLo = B.CreateTrunc(PHI, B.getInt32Ty());
+ Value *const ExtractHi =
+ B.CreateTrunc(B.CreateLShr(PHI, 32), B.getInt32Ty());
+ CallInst *const ReadFirstLaneLo =
+ B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractLo);
+ CallInst *const ReadFirstLaneHi =
+ B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, ExtractHi);
+ Value *const PartialInsert = B.CreateInsertElement(
+ UndefValue::get(VecTy), ReadFirstLaneLo, B.getInt32(0));
+ Value *const Insert =
+ B.CreateInsertElement(PartialInsert, ReadFirstLaneHi, B.getInt32(1));
+ BroadcastI = B.CreateBitCast(Insert, Ty);
+ } else if (TyBitWidth == 32) {
+
+ BroadcastI = B.CreateIntrinsic(Intrinsic::amdgcn_readfirstlane, {}, PHI);
+ } else {
+ llvm_unreachable("Unhandled atomic bit width");
}
- }
- Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
- if (IsPixelShader) {
- // Need a final PHI to reconverge to above the helper lane branch mask.
- B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+ // Now that we have the result of our single atomic operation, we need to
+ // get our individual lane's slice into the result. We use the lane offset
+ // we previously calculated combined with the atomic result value we got
+ // from the first lane, to get our lane's index into the atomic result.
+ Value *LaneOffset = nullptr;
+ if (ValDivergent) {
+ LaneOffset = B.CreateIntrinsic(Intrinsic::amdgcn_wwm, Ty, ExclScan);
+ } else {
+ switch (Op) {
+ default:
+ llvm_unreachable("Unhandled atomic op");
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ LaneOffset = B.CreateMul(V, Mbcnt);
+ break;
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ LaneOffset = B.CreateSelect(Cond, Identity, V);
+ break;
+ case AtomicRMWInst::Xor:
+ LaneOffset = B.CreateMul(V, B.CreateAnd(Mbcnt, 1));
+ break;
+ }
+ }
+ Value *const Result = buildNonAtomicBinOp(B, Op, BroadcastI, LaneOffset);
- PHINode *const PHI = B.CreatePHI(Ty, 2);
- PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
- PHI->addIncoming(Result, I.getParent());
- I.replaceAllUsesWith(PHI);
- } else {
- // Replace the original atomic instruction with the new one.
- I.replaceAllUsesWith(Result);
+ if (IsPixelShader) {
+ // Need a final PHI to reconverge to above the helper lane branch mask.
+ B.SetInsertPoint(PixelExitBB->getFirstNonPHI());
+
+ PHINode *const PHI = B.CreatePHI(Ty, 2);
+ PHI->addIncoming(UndefValue::get(Ty), PixelEntryBB);
+ PHI->addIncoming(Result, I.getParent());
+ I.replaceAllUsesWith(PHI);
+ } else {
+ // Replace the original atomic instruction with the new one.
+ I.replaceAllUsesWith(Result);
+ }
}
// And delete the original.
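The buildScan/buildShiftRight routines added above assemble a wavefront prefix scan out of shifted DPP steps, then shift the result right by one lane to make it exclusive; the last lane ends up holding the reduction of the whole wave, and each lane's exclusive prefix becomes its offset into the atomic result. A scalar model of that scheme in plain C++ (an array stands in for the lanes of a wave; this illustrates the scan shape only, not the DPP/permlane encodings):

#include <cassert>
#include <cstddef>
#include <vector>

// Scalar model of the DPP-based scan: lanes[i] plays the role of lane i.
// Inclusive scan via log-step "shift right by 2^k and combine", then a shift
// right by one lane (filled with the identity) turns it into an exclusive scan.
static void scanModel(std::vector<int> &lanes, int identity,
                      std::vector<int> &exclusive) {
  const size_t n = lanes.size();
  for (size_t step = 1; step < n; step *= 2) {
    std::vector<int> shifted(n, identity);
    for (size_t i = step; i < n; ++i)
      shifted[i] = lanes[i - step];   // analogue of the row_shr DPP step
    for (size_t i = 0; i < n; ++i)
      lanes[i] += shifted[i];         // analogue of buildNonAtomicBinOp (Add)
  }
  exclusive.assign(n, identity);
  for (size_t i = 1; i < n; ++i)      // analogue of buildShiftRight
    exclusive[i] = lanes[i - 1];
}

int main() {
  std::vector<int> lanes = {3, 1, 4, 1, 5, 9, 2, 6}, excl;
  scanModel(lanes, /*identity=*/0, excl);
  assert(lanes.back() == 31);         // last lane holds the full wave sum
  assert(excl[3] == 3 + 1 + 4);       // each lane's offset into the result
  return 0;
}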
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index b107c357196d..58c44acde1a7 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -30,13 +30,15 @@ using namespace llvm;
namespace {
-struct OutgoingArgHandler : public CallLowering::ValueHandler {
- OutgoingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder MIB, CCAssignFn *AssignFn)
- : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+struct OutgoingValueHandler : public CallLowering::ValueHandler {
+ OutgoingValueHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+ : ValueHandler(B, MRI, AssignFn), MIB(MIB) {}
MachineInstrBuilder MIB;
+ bool isIncomingArgumentHandler() const override { return false; }
+
Register getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
llvm_unreachable("not implemented");
@@ -49,15 +51,96 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
void assignValueToReg(Register ValVReg, Register PhysReg,
CCValAssign &VA) override {
- MIB.addUse(PhysReg);
- MIRBuilder.buildCopy(PhysReg, ValVReg);
+ Register ExtReg;
+ if (VA.getLocVT().getSizeInBits() < 32) {
+ // 16-bit types are reported as legal for 32-bit registers. We need to
+ // extend and do a 32-bit copy to avoid the verifier complaining about it.
+ ExtReg = MIRBuilder.buildAnyExt(LLT::scalar(32), ValVReg).getReg(0);
+ } else
+ ExtReg = extendRegister(ValVReg, VA);
+
+ MIRBuilder.buildCopy(PhysReg, ExtReg);
+ MIB.addUse(PhysReg, RegState::Implicit);
}
bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info,
+ ISD::ArgFlagsTy Flags,
CCState &State) override {
- return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ return AssignFn(ValNo, ValVT, LocVT, LocInfo, Flags, State);
+ }
+};
+
+struct IncomingArgHandler : public CallLowering::ValueHandler {
+ uint64_t StackUsed = 0;
+
+ IncomingArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : ValueHandler(B, MRI, AssignFn) {}
+
+ Register getStackAddress(uint64_t Size, int64_t Offset,
+ MachinePointerInfo &MPO) override {
+ auto &MFI = MIRBuilder.getMF().getFrameInfo();
+ int FI = MFI.CreateFixedObject(Size, Offset, true);
+ MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
+ Register AddrReg = MRI.createGenericVirtualRegister(
+ LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32));
+ MIRBuilder.buildFrameIndex(AddrReg, FI);
+ StackUsed = std::max(StackUsed, Size + Offset);
+ return AddrReg;
+ }
+
+ void assignValueToReg(Register ValVReg, Register PhysReg,
+ CCValAssign &VA) override {
+ markPhysRegUsed(PhysReg);
+
+ if (VA.getLocVT().getSizeInBits() < 32) {
+ // 16-bit types are reported as legal for 32-bit registers. We need to do
+ // a 32-bit copy, and truncate to avoid the verifier complaining about it.
+ auto Copy = MIRBuilder.buildCopy(LLT::scalar(32), PhysReg);
+ MIRBuilder.buildTrunc(ValVReg, Copy);
+ return;
+ }
+
+ switch (VA.getLocInfo()) {
+ case CCValAssign::LocInfo::SExt:
+ case CCValAssign::LocInfo::ZExt:
+ case CCValAssign::LocInfo::AExt: {
+ auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
+ MIRBuilder.buildTrunc(ValVReg, Copy);
+ break;
+ }
+ default:
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ break;
+ }
+ }
+
+ void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
+ MachinePointerInfo &MPO, CCValAssign &VA) override {
+ // FIXME: Get alignment
+ auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1);
+ MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
+ }
+
+ /// How the physical register gets marked varies between formal
+ /// parameters (it's a basic-block live-in), and a call instruction
+ /// (it's an implicit-def of the BL).
+ virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+
+ // FIXME: What is the point of this being a callback?
+ bool isIncomingArgumentHandler() const override { return true; }
+};
+
+struct FormalArgHandler : public IncomingArgHandler {
+ FormalArgHandler(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : IncomingArgHandler(B, MRI, AssignFn) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
}
};
@@ -67,55 +150,198 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
: CallLowering(&TLI) {
}
-bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
+void AMDGPUCallLowering::splitToValueTypes(
+ const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
+ SplitArgTy PerformArgSplit) const {
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ LLVMContext &Ctx = OrigArg.Ty->getContext();
+
+ if (OrigArg.Ty->isVoidTy())
+ return;
+
+ SmallVector<EVT, 4> SplitVTs;
+ ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);
+
+ assert(OrigArg.Regs.size() == SplitVTs.size());
+
+ int SplitIdx = 0;
+ for (EVT VT : SplitVTs) {
+ unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
+ Type *Ty = VT.getTypeForEVT(Ctx);
+
+
+
+ if (NumParts == 1) {
+ // No splitting to do, but we want to replace the original type (e.g. [1 x
+ // double] -> double).
+ SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty,
+ OrigArg.Flags, OrigArg.IsFixed);
+
+ ++SplitIdx;
+ continue;
+ }
+
+ LLT LLTy = getLLTForType(*Ty, DL);
+
+ SmallVector<Register, 8> SplitRegs;
+
+ EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
+ Type *PartTy = PartVT.getTypeForEVT(Ctx);
+ LLT PartLLT = getLLTForType(*PartTy, DL);
+
+ // FIXME: Should we be reporting all of the part registers for a single
+ // argument, and let handleAssignments take care of the repacking?
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Register PartReg = MRI.createGenericVirtualRegister(PartLLT);
+ SplitRegs.push_back(PartReg);
+ SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
+ }
+
+ PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx);
+
+ ++SplitIdx;
+ }
+}
+
+// Get the appropriate type to make \p OrigTy \p Factor times bigger.
+static LLT getMultipleType(LLT OrigTy, int Factor) {
+ if (OrigTy.isVector()) {
+ return LLT::vector(OrigTy.getNumElements() * Factor,
+ OrigTy.getElementType());
+ }
+
+ return LLT::scalar(OrigTy.getSizeInBits() * Factor);
+}
+
+// TODO: Move to generic code
+static void unpackRegsToOrigType(MachineIRBuilder &B,
+ ArrayRef<Register> DstRegs,
+ Register SrcReg,
+ LLT SrcTy,
+ LLT PartTy) {
+ assert(DstRegs.size() > 1 && "Nothing to unpack");
+
+ MachineFunction &MF = B.getMF();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ const unsigned SrcSize = SrcTy.getSizeInBits();
+ const unsigned PartSize = PartTy.getSizeInBits();
+
+ if (SrcTy.isVector() && !PartTy.isVector() &&
+ PartSize > SrcTy.getElementType().getSizeInBits()) {
+ // Vector was scalarized, and the elements extended.
+ auto UnmergeToEltTy = B.buildUnmerge(SrcTy.getElementType(),
+ SrcReg);
+ for (int i = 0, e = DstRegs.size(); i != e; ++i)
+ B.buildAnyExt(DstRegs[i], UnmergeToEltTy.getReg(i));
+ return;
+ }
+
+ if (SrcSize % PartSize == 0) {
+ B.buildUnmerge(DstRegs, SrcReg);
+ return;
+ }
+
+ const int NumRoundedParts = (SrcSize + PartSize - 1) / PartSize;
+
+ LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
+ auto ImpDef = B.buildUndef(BigTy);
+
+ Register BigReg = MRI.createGenericVirtualRegister(BigTy);
+ B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0);
+
+ int64_t Offset = 0;
+ for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
+ B.buildExtract(DstRegs[i], BigReg, Offset);
+}
+
+/// Lower the return value for the already existing \p Ret. This assumes that
+/// \p B's insertion point is correct.
+bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
+ const Value *Val, ArrayRef<Register> VRegs,
+ MachineInstrBuilder &Ret) const {
+ if (!Val)
+ return true;
+
+ auto &MF = B.getMF();
+ const auto &F = MF.getFunction();
+ const DataLayout &DL = MF.getDataLayout();
+
+ CallingConv::ID CC = F.getCallingConv();
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ ArgInfo OrigRetInfo(VRegs, Val->getType());
+ setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
+ SmallVector<ArgInfo, 4> SplitRetInfos;
+
+ splitToValueTypes(
+ OrigRetInfo, SplitRetInfos, DL, MRI, CC,
+ [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
+ unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT);
+ });
+
+ CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
+
+ OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn);
+ return handleAssignments(B, SplitRetInfos, RetHandler);
+}
+
+bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
const Value *Val,
ArrayRef<Register> VRegs) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineFunction &MF = B.getMF();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MFI->setIfReturnsVoid(!Val);
- if (!Val) {
- MIRBuilder.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+ assert(!Val == VRegs.empty() && "Return value without a vreg");
+
+ CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
+ const bool IsShader = AMDGPU::isShader(CC);
+ const bool IsWaveEnd = (IsShader && MFI->returnsVoid()) ||
+ AMDGPU::isKernel(CC);
+ if (IsWaveEnd) {
+ B.buildInstr(AMDGPU::S_ENDPGM)
+ .addImm(0);
return true;
}
- Register VReg = VRegs[0];
-
- const Function &F = MF.getFunction();
- auto &DL = F.getParent()->getDataLayout();
- if (!AMDGPU::isShader(F.getCallingConv()))
- return false;
+ auto const &ST = B.getMF().getSubtarget<GCNSubtarget>();
+ unsigned ReturnOpc =
+ IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
- const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
- SmallVector<EVT, 4> SplitVTs;
- SmallVector<uint64_t, 4> Offsets;
- ArgInfo OrigArg{VReg, Val->getType()};
- setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
- ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
-
- SmallVector<ArgInfo, 8> SplitArgs;
- CCAssignFn *AssignFn = CCAssignFnForReturn(F.getCallingConv(), false);
- for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
- Type *SplitTy = SplitVTs[i].getTypeForEVT(F.getContext());
- SplitArgs.push_back({VRegs[i], SplitTy, OrigArg.Flags, OrigArg.IsFixed});
+ auto Ret = B.buildInstrNoInsert(ReturnOpc);
+ Register ReturnAddrVReg;
+ if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
+ ReturnAddrVReg = MRI.createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass);
+ Ret.addUse(ReturnAddrVReg);
}
- auto RetInstr = MIRBuilder.buildInstrNoInsert(AMDGPU::SI_RETURN_TO_EPILOG);
- OutgoingArgHandler Handler(MIRBuilder, MRI, RetInstr, AssignFn);
- if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+
+ if (!lowerReturnVal(B, Val, VRegs, Ret))
return false;
- MIRBuilder.insertInstr(RetInstr);
+ if (ReturnOpc == AMDGPU::S_SETPC_B64_return) {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ Register LiveInReturn = MF.addLiveIn(TRI->getReturnAddressReg(MF),
+ &AMDGPU::SGPR_64RegClass);
+ B.buildCopy(ReturnAddrVReg, LiveInReturn);
+ }
+
+ // TODO: Handle CalleeSavedRegsViaCopy.
+
+ B.insertInstr(Ret);
return true;
}
-Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
+Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
Type *ParamTy,
uint64_t Offset) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineFunction &MF = B.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
const Function &F = MF.getFunction();
@@ -128,79 +354,37 @@ Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ B.buildConstant(OffsetReg, Offset);
- MIRBuilder.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);
+ B.buildGEP(DstReg, KernArgSegmentVReg, OffsetReg);
return DstReg;
}
-void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
+void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
Type *ParamTy, uint64_t Offset,
unsigned Align,
Register DstReg) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineFunction &MF = B.getMF();
const Function &F = MF.getFunction();
const DataLayout &DL = F.getParent()->getDataLayout();
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
- Register PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
+ Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);
MachineMemOperand *MMO =
MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
- MachineMemOperand::MONonTemporal |
+ MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
TypeSize, Align);
- MIRBuilder.buildLoad(DstReg, PtrReg, *MMO);
-}
-
-static Register findFirstFreeSGPR(CCState &CCInfo) {
- unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
- for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
- if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
- return AMDGPU::SGPR0 + Reg;
- }
- }
- llvm_unreachable("Cannot allocate sgpr");
-}
-
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
- const LLT S32 = LLT::scalar(32);
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- if (Info.hasWorkItemIDX()) {
- Register Reg = AMDGPU::VGPR0;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
-
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
- }
-
- if (Info.hasWorkItemIDY()) {
- Register Reg = AMDGPU::VGPR1;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
-
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
- }
-
- if (Info.hasWorkItemIDZ()) {
- Register Reg = AMDGPU::VGPR2;
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
-
- CCInfo.AllocateReg(Reg);
- Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
- }
+ B.buildLoad(DstReg, PtrReg, *MMO);
}
// Allocate special inputs passed in user SGPRs.
static void allocateHSAUserSGPRs(CCState &CCInfo,
- MachineIRBuilder &MIRBuilder,
+ MachineIRBuilder &B,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) {
@@ -229,8 +413,8 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
const LLT P4 = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
Register VReg = MRI.createGenericVirtualRegister(P4);
MRI.addLiveIn(InputPtrReg, VReg);
- MIRBuilder.getMBB().addLiveIn(InputPtrReg);
- MIRBuilder.buildCopy(VReg, InputPtrReg);
+ B.getMBB().addLiveIn(InputPtrReg);
+ B.buildCopy(VReg, InputPtrReg);
CCInfo.AllocateReg(InputPtrReg);
}
@@ -250,74 +434,22 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
// these from the dispatch pointer.
}
-static void allocateSystemSGPRs(CCState &CCInfo,
- MachineFunction &MF,
- SIMachineFunctionInfo &Info,
- CallingConv::ID CallConv,
- bool IsShader) {
- const LLT S32 = LLT::scalar(32);
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
- if (Info.hasWorkGroupIDX()) {
- Register Reg = Info.addWorkGroupIDX();
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasWorkGroupIDY()) {
- Register Reg = Info.addWorkGroupIDY();
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasWorkGroupIDZ()) {
- unsigned Reg = Info.addWorkGroupIDZ();
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasWorkGroupInfo()) {
- unsigned Reg = Info.addWorkGroupInfo();
- MRI.setType(MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass), S32);
- CCInfo.AllocateReg(Reg);
- }
-
- if (Info.hasPrivateSegmentWaveByteOffset()) {
- // Scratch wave offset passed in system SGPR.
- unsigned PrivateSegmentWaveByteOffsetReg;
-
- if (IsShader) {
- PrivateSegmentWaveByteOffsetReg =
- Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
-
- // This is true if the scratch wave byte offset doesn't have a fixed
- // location.
- if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
- PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
- Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
- }
- } else
- PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
-
- MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
- CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
- }
-}
-
bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
- MachineIRBuilder &MIRBuilder, const Function &F,
+ MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineFunction &MF = B.getMF();
const GCNSubtarget *Subtarget = &MF.getSubtarget<GCNSubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+
const DataLayout &DL = F.getParent()->getDataLayout();
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
- allocateHSAUserSGPRs(CCInfo, MIRBuilder, MF, *TRI, *Info);
+ allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
unsigned i = 0;
const unsigned KernArgBaseAlign = 16;
@@ -343,123 +475,242 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
: MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
- lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, ArgReg);
+ lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg);
if (OrigArgRegs.size() > 1)
- unpackRegs(OrigArgRegs, ArgReg, ArgTy, MIRBuilder);
+ unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
++i;
}
- allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
- allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
+ TLI.allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
+ TLI.allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), false);
return true;
}
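
A minimal standalone sketch of the kernarg offset bookkeeping above: the running offset is rounded up to each argument's ABI alignment, and MinAlign(KernArgBaseAlign, ArgOffset) yields the power-of-two alignment that offset provably has relative to the 16-byte kernarg base. The helper re-implementations and the example signature below are illustrative stand-ins, not LLVM API.

#include <cassert>
#include <cstdint>

// Simplified stand-ins for llvm::alignTo and llvm::MinAlign.
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}
static uint64_t minAlign(uint64_t A, uint64_t B) {
  uint64_t V = A | B;
  return V & (~V + 1); // lowest set bit == largest common power-of-two factor
}

int main() {
  // Hypothetical kernel signature (i32, <4 x float>, i8) with ABI alignments
  // 4, 16 and 1; offsets are relative to the 16-byte aligned kernarg segment.
  uint64_t Off = 0;
  Off = alignTo(Off, 4);        // i32   -> offset 0
  assert(Off == 0 && minAlign(16, Off) == 16);
  Off = alignTo(Off + 4, 16);   // v4f32 -> offset 16
  assert(Off == 16 && minAlign(16, Off) == 16);
  Off = alignTo(Off + 16, 1);   // i8    -> offset 32
  assert(Off == 32 && minAlign(16, Off) == 16);
  return 0;
}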
+// TODO: Move this to generic code
+static void packSplitRegsToOrigType(MachineIRBuilder &B,
+ ArrayRef<Register> OrigRegs,
+ ArrayRef<Register> Regs,
+ LLT LLTy,
+ LLT PartLLT) {
+ if (!LLTy.isVector() && !PartLLT.isVector()) {
+ B.buildMerge(OrigRegs[0], Regs);
+ return;
+ }
+
+ if (LLTy.isVector() && PartLLT.isVector()) {
+ assert(LLTy.getElementType() == PartLLT.getElementType());
+
+ int DstElts = LLTy.getNumElements();
+ int PartElts = PartLLT.getNumElements();
+ if (DstElts % PartElts == 0)
+ B.buildConcatVectors(OrigRegs[0], Regs);
+ else {
+ // Deal with v3s16 split into v2s16
+ assert(PartElts == 2 && DstElts % 2 != 0);
+ int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);
+
+ LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
+ auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs);
+ B.buildExtract(OrigRegs[0], RoundedConcat, 0);
+ }
+
+ return;
+ }
+
+ assert(LLTy.isVector() && !PartLLT.isVector());
+
+ LLT DstEltTy = LLTy.getElementType();
+ if (DstEltTy == PartLLT) {
+ // Vector was trivially scalarized.
+ B.buildBuildVector(OrigRegs[0], Regs);
+ } else if (DstEltTy.getSizeInBits() > PartLLT.getSizeInBits()) {
+ // Deal with vector with 64-bit elements decomposed to 32-bit
+ // registers. Need to create intermediate 64-bit elements.
+ SmallVector<Register, 8> EltMerges;
+ int PartsPerElt = DstEltTy.getSizeInBits() / PartLLT.getSizeInBits();
+
+ assert(DstEltTy.getSizeInBits() % PartLLT.getSizeInBits() == 0);
+
+ for (int I = 0, NumElts = LLTy.getNumElements(); I != NumElts; ++I) {
+ auto Merge = B.buildMerge(DstEltTy,
+ Regs.take_front(PartsPerElt));
+ EltMerges.push_back(Merge.getReg(0));
+ Regs = Regs.drop_front(PartsPerElt);
+ }
+
+ B.buildBuildVector(OrigRegs[0], EltMerges);
+ } else {
+ // Vector was split, and elements promoted to a wider type.
+ LLT BVType = LLT::vector(LLTy.getNumElements(), PartLLT);
+ auto BV = B.buildBuildVector(BVType, Regs);
+ B.buildTrunc(OrigRegs[0], BV);
+ }
+}
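
A small self-contained check of the rounding used for the v3s16 case above: the destination element count is padded up to a whole number of <2 x s16> parts so the concat is legal, and the original value is then extracted from bit offset 0. The MIR in the comments is only a conceptual trace.

#include <cassert>

int main() {
  const int DstElts = 3;   // destination type <3 x s16>
  const int PartElts = 2;  // parts produced by the split: <2 x s16>
  const int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);
  assert(RoundedElts == 4);
  // Conceptually, the builder above then emits:
  //   %concat:_(<4 x s16>) = G_CONCAT_VECTORS %part0:_(<2 x s16>), %part1:_(<2 x s16>)
  //   %orig:_(<3 x s16>)   = G_EXTRACT %concat:_(<4 x s16>), 0
  return 0;
}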
+
bool AMDGPUCallLowering::lowerFormalArguments(
- MachineIRBuilder &MIRBuilder, const Function &F,
+ MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const {
+ CallingConv::ID CC = F.getCallingConv();
+
// The infrastructure for normal calling convention lowering is essentially
// useless for kernels. We want to avoid any kind of legalization or argument
// splitting.
- if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL)
- return lowerFormalArgumentsKernel(MIRBuilder, F, VRegs);
+ if (CC == CallingConv::AMDGPU_KERNEL)
+ return lowerFormalArgumentsKernel(B, F, VRegs);
- // AMDGPU_GS and AMDGP_HS are not supported yet.
- if (F.getCallingConv() == CallingConv::AMDGPU_GS ||
- F.getCallingConv() == CallingConv::AMDGPU_HS)
- return false;
+ const bool IsShader = AMDGPU::isShader(CC);
+ const bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CC);
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineFunction &MF = B.getMF();
+ MachineBasicBlock &MBB = B.getMBB();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = Subtarget.getRegisterInfo();
const DataLayout &DL = F.getParent()->getDataLayout();
- bool IsShader = AMDGPU::isShader(F.getCallingConv());
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(F.getCallingConv(), F.isVarArg(), MF, ArgLocs, F.getContext());
+ CCState CCInfo(CC, F.isVarArg(), MF, ArgLocs, F.getContext());
+
+ if (!IsEntryFunc) {
+ Register ReturnAddrReg = TRI->getReturnAddressReg(MF);
+ Register LiveInReturn = MF.addLiveIn(ReturnAddrReg,
+ &AMDGPU::SGPR_64RegClass);
+ MBB.addLiveIn(ReturnAddrReg);
+ B.buildCopy(LiveInReturn, ReturnAddrReg);
+ }
if (Info->hasImplicitBufferPtr()) {
- unsigned ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
+ Register ImplicitBufferPtrReg = Info->addImplicitBufferPtr(*TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
- unsigned NumArgs = F.arg_size();
- Function::const_arg_iterator CurOrigArg = F.arg_begin();
- const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
+
+ SmallVector<ArgInfo, 32> SplitArgs;
+ unsigned Idx = 0;
unsigned PSInputNum = 0;
- BitVector Skipped(NumArgs);
- for (unsigned i = 0; i != NumArgs; ++i, ++CurOrigArg) {
- EVT ValEVT = TLI.getValueType(DL, CurOrigArg->getType());
-
- // We can only hanlde simple value types at the moment.
- ISD::ArgFlagsTy Flags;
- assert(VRegs[i].size() == 1 && "Can't lower into more than one register");
- ArgInfo OrigArg{VRegs[i][0], CurOrigArg->getType()};
- setArgFlags(OrigArg, i + 1, DL, F);
- Flags.setOrigAlign(DL.getABITypeAlignment(CurOrigArg->getType()));
-
- if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
- !OrigArg.Flags.isInReg() && !OrigArg.Flags.isByVal() &&
- PSInputNum <= 15) {
- if (CurOrigArg->use_empty() && !Info->isPSInputAllocated(PSInputNum)) {
- Skipped.set(i);
- ++PSInputNum;
+
+ for (auto &Arg : F.args()) {
+ if (DL.getTypeStoreSize(Arg.getType()) == 0)
+ continue;
+
+ const bool InReg = Arg.hasAttribute(Attribute::InReg);
+
+ // SGPR arguments to functions not implemented.
+ if (!IsShader && InReg)
+ return false;
+
+ if (Arg.hasAttribute(Attribute::SwiftSelf) ||
+ Arg.hasAttribute(Attribute::SwiftError) ||
+ Arg.hasAttribute(Attribute::Nest))
+ return false;
+
+ if (CC == CallingConv::AMDGPU_PS && !InReg && PSInputNum <= 15) {
+ const bool ArgUsed = !Arg.use_empty();
+ bool SkipArg = !ArgUsed && !Info->isPSInputAllocated(PSInputNum);
+
+ if (!SkipArg) {
+ Info->markPSInputAllocated(PSInputNum);
+ if (ArgUsed)
+ Info->markPSInputEnabled(PSInputNum);
+ }
+
+ ++PSInputNum;
+
+ if (SkipArg) {
+ for (int I = 0, E = VRegs[Idx].size(); I != E; ++I)
+ B.buildUndef(VRegs[Idx][I]);
+
+ ++Idx;
continue;
}
+ }
- Info->markPSInputAllocated(PSInputNum);
- if (!CurOrigArg->use_empty())
- Info->markPSInputEnabled(PSInputNum);
+ ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+ setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
- ++PSInputNum;
+ splitToValueTypes(
+ OrigArg, SplitArgs, DL, MRI, CC,
+ // FIXME: We should probably be passing multiple registers to
+ // handleAssignments to do this
+ [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
+ packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
+ LLTy, PartLLT);
+ });
+
+ ++Idx;
+ }
+
+ // At least one interpolation mode must be enabled or else the GPU will
+ // hang.
+ //
+ // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
+ // set PSInputAddr, the user wants to enable some bits after the compilation
+ // based on run-time states. Since we can't know what the final PSInputEna
+ // will look like, we shouldn't do anything here and the user should take
+ // responsibility for the correct programming.
+ //
+ // Otherwise, the following restrictions apply:
+ // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
+ // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
+ // enabled too.
+ if (CC == CallingConv::AMDGPU_PS) {
+ if ((Info->getPSInputAddr() & 0x7F) == 0 ||
+ ((Info->getPSInputAddr() & 0xF) == 0 &&
+ Info->isPSInputAllocated(11))) {
+ CCInfo.AllocateReg(AMDGPU::VGPR0);
+ CCInfo.AllocateReg(AMDGPU::VGPR1);
+ Info->markPSInputAllocated(0);
+ Info->markPSInputEnabled(0);
}
- CCAssignFn *AssignFn = CCAssignFnForCall(F.getCallingConv(),
- /*IsVarArg=*/false);
-
- if (ValEVT.isVector()) {
- EVT ElemVT = ValEVT.getVectorElementType();
- if (!ValEVT.isSimple())
- return false;
- MVT ValVT = ElemVT.getSimpleVT();
- bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full,
- OrigArg.Flags, CCInfo);
- if (!Res)
- return false;
- } else {
- MVT ValVT = ValEVT.getSimpleVT();
- if (!ValEVT.isSimple())
- return false;
- bool Res =
- AssignFn(i, ValVT, ValVT, CCValAssign::Full, OrigArg.Flags, CCInfo);
-
- // Fail if we don't know how to handle this type.
- if (Res)
- return false;
+ if (Subtarget.isAmdPalOS()) {
+ // For isAmdPalOS, the user does not enable some bits after compilation
+ // based on run-time states; the register values being generated here are
+ // the final ones set in hardware. Therefore we need to apply the
+ // workaround to PSInputAddr and PSInputEnable together. (The case where
+ // a bit is set in PSInputAddr but not PSInputEnable is where the frontend
+ // set up an input arg for a particular interpolation mode, but nothing
+ // uses that input arg. Really we should have an earlier pass that removes
+ // such an arg.)
+ unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
+ if ((PsInputBits & 0x7F) == 0 ||
+ ((PsInputBits & 0xF) == 0 &&
+ (PsInputBits >> 11 & 1)))
+ Info->markPSInputEnabled(
+ countTrailingZeros(Info->getPSInputAddr(), ZB_Undefined));
}
}
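
A hedged sketch of the enable-mask test used in both branches above, following the bit layout the comment gives (PERSP_* in bits 0-3, LINEAR_* in bits 4-6, POS_W_FLOAT at bit 11); the helper name is illustrative only.

#include <cassert>
#include <cstdint>

// Returns true when the mask would trip the hardware restriction described
// above, i.e. the lowering must force-enable input 0.
static bool needsPSInputWorkaround(uint32_t PSInputBits) {
  const bool NoPerspOrLinear = (PSInputBits & 0x7F) == 0;            // bits 0-6
  const bool PosWWithoutPersp =
      (PSInputBits & 0xF) == 0 && ((PSInputBits >> 11) & 1) != 0;    // bit 11
  return NoPerspOrLinear || PosWWithoutPersp;
}

int main() {
  assert(needsPSInputWorkaround(0));          // nothing enabled
  assert(needsPSInputWorkaround(1u << 11));   // only POS_W_FLOAT
  assert(!needsPSInputWorkaround(0x1));       // a PERSP_* input is enabled
  return 0;
}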
- Function::const_arg_iterator Arg = F.arg_begin();
-
- if (F.getCallingConv() == CallingConv::AMDGPU_VS ||
- F.getCallingConv() == CallingConv::AMDGPU_PS) {
- for (unsigned i = 0, OrigArgIdx = 0;
- OrigArgIdx != NumArgs && i != ArgLocs.size(); ++Arg, ++OrigArgIdx) {
- if (Skipped.test(OrigArgIdx))
- continue;
- assert(VRegs[OrigArgIdx].size() == 1 &&
- "Can't lower into more than 1 reg");
- CCValAssign &VA = ArgLocs[i++];
- MRI.addLiveIn(VA.getLocReg(), VRegs[OrigArgIdx][0]);
- MIRBuilder.getMBB().addLiveIn(VA.getLocReg());
- MIRBuilder.buildCopy(VRegs[OrigArgIdx][0], VA.getLocReg());
- }
+ const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+ CCAssignFn *AssignFn = TLI.CCAssignFnForCall(CC, F.isVarArg());
- allocateSystemSGPRs(CCInfo, MF, *Info, F.getCallingConv(), IsShader);
- return true;
+ if (!MBB.empty())
+ B.setInstr(*MBB.begin());
+
+ FormalArgHandler Handler(B, MRI, AssignFn);
+ if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
+ return false;
+
+ if (!IsEntryFunc) {
+ // Special inputs come after user arguments.
+ TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+ }
+
+ // Start adding system SGPRs.
+ if (IsEntryFunc) {
+ TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
+ } else {
+ CCInfo.AllocateReg(Info->getScratchRSrcReg());
+ CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
+ CCInfo.AllocateReg(Info->getFrameOffsetReg());
+ TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
- return false;
+ // Move back to the end of the basic block.
+ B.setMBB(MBB);
+
+ return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 3599659cac6a..53a562586bc0 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -20,26 +20,37 @@
namespace llvm {
class AMDGPUTargetLowering;
+class MachineInstrBuilder;
class AMDGPUCallLowering: public CallLowering {
- Register lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
+ Register lowerParameterPtr(MachineIRBuilder &B, Type *ParamTy,
uint64_t Offset) const;
- void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
- uint64_t Offset, unsigned Align,
- Register DstReg) const;
+ void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
+ unsigned Align, Register DstReg) const;
- public:
+ /// A function of this type is used to perform value split action.
+ using SplitArgTy = std::function<void(ArrayRef<Register>, LLT, LLT, int)>;
+
+ void splitToValueTypes(const ArgInfo &OrigArgInfo,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, MachineRegisterInfo &MRI,
+ CallingConv::ID CallConv,
+ SplitArgTy SplitArg) const;
+
+ bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
+ ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
+
+public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);
- bool lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val,
+ bool lowerReturn(MachineIRBuilder &B, const Value *Val,
ArrayRef<Register> VRegs) const override;
- bool lowerFormalArgumentsKernel(MachineIRBuilder &MIRBuilder,
- const Function &F,
+ bool lowerFormalArgumentsKernel(MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const;
- bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
+ bool lowerFormalArguments(MachineIRBuilder &B, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
static CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg);
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 3688cd77542e..f8a54a61aac2 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -24,22 +24,9 @@ def CC_SI : CallingConv<[
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
- SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47,
- SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55,
- SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63,
- SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71,
- SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79,
- SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87,
- SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95,
- SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103,
- SGPR104, SGPR105
+ SGPR40, SGPR41, SGPR42, SGPR43
]>>>,
- // We have no way of referring to the generated register tuples
- // here, so use a custom function.
- CCIfInReg<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
- CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
-
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
@@ -69,15 +56,7 @@ def RetCC_SI_Shader : CallingConv<[
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
SGPR24, SGPR25, SGPR26, SGPR27, SGPR28, SGPR29, SGPR30, SGPR31,
SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39,
- SGPR40, SGPR41, SGPR42, SGPR43, SGPR44, SGPR45, SGPR46, SGPR47,
- SGPR48, SGPR49, SGPR50, SGPR51, SGPR52, SGPR53, SGPR54, SGPR55,
- SGPR56, SGPR57, SGPR58, SGPR59, SGPR60, SGPR61, SGPR62, SGPR63,
- SGPR64, SGPR65, SGPR66, SGPR67, SGPR68, SGPR69, SGPR70, SGPR71,
- SGPR72, SGPR73, SGPR74, SGPR75, SGPR76, SGPR77, SGPR78, SGPR79,
- SGPR80, SGPR81, SGPR82, SGPR83, SGPR84, SGPR85, SGPR86, SGPR87,
- SGPR88, SGPR89, SGPR90, SGPR91, SGPR92, SGPR93, SGPR94, SGPR95,
- SGPR96, SGPR97, SGPR98, SGPR99, SGPR100, SGPR101, SGPR102, SGPR103,
- SGPR104, SGPR105
+ SGPR40, SGPR41, SGPR42, SGPR43
]>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
@@ -138,7 +117,6 @@ def CC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v3i32, v3f32, v4i32, v4f32, v5i32, v5f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
CCIfType<[v3i32, v3f32], CCAssignToStack<12, 4>>,
@@ -157,7 +135,6 @@ def RetCC_AMDGPU_Func : CallingConv<[
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
- CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>
]>;
def CC_AMDGPU : CallingConv<[
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index b750c6b5f6d2..1640a4a59ee2 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -55,6 +55,12 @@ static cl::opt<bool> WidenLoads(
cl::ReallyHidden,
cl::init(true));
+static cl::opt<bool> UseMul24Intrin(
+ "amdgpu-codegenprepare-mul24",
+ cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(true));
+
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const GCNSubtarget *ST = nullptr;
@@ -509,7 +515,9 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
}
}
- I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
+ Value *NewVal = insertValues(Builder, Ty, ResultVals);
+ NewVal->takeName(&I);
+ I.replaceAllUsesWith(NewVal);
I.eraseFromParent();
return true;
@@ -879,7 +887,7 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
DA->isUniform(&I) && promoteUniformOpToI32(I))
return true;
- if (replaceMulWithMul24(I))
+ if (UseMul24Intrin && replaceMulWithMul24(I))
return true;
bool Changed = false;
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index e80797736363..61ce83b30e00 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -13,9 +13,9 @@
#include "AMDGPUFrameLowering.h"
using namespace llvm;
-AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
- int LAO, unsigned TransAl)
- : TargetFrameLowering(D, StackAl, LAO, TransAl) { }
+AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, Align StackAl,
+ int LAO, Align TransAl)
+ : TargetFrameLowering(D, StackAl, LAO, TransAl) {}
AMDGPUFrameLowering::~AMDGPUFrameLowering() = default;
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 48b64488303e..92e256cf2829 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -25,8 +25,8 @@ namespace llvm {
/// See TargetFrameInfo for more comments.
class AMDGPUFrameLowering : public TargetFrameLowering {
public:
- AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO,
- unsigned TransAl = 1);
+ AMDGPUFrameLowering(StackDirection D, Align StackAl, int LAO,
+ Align TransAl = Align::None());
~AMDGPUFrameLowering() override;
/// \returns The number of 32-bit sub-registers that are used when storing
diff --git a/lib/Target/AMDGPU/AMDGPUGISel.td b/lib/Target/AMDGPU/AMDGPUGISel.td
index cad4c2ef404c..f2be1ca44d34 100644
--- a/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -12,10 +12,6 @@
include "AMDGPU.td"
-def p0 : PtrValueType<i64, 0>;
-def p1 : PtrValueType<i64, 1>;
-def p4 : PtrValueType<i64, 4>;
-
def sd_vsrc0 : ComplexPattern<i32, 1, "">;
def gi_vsrc0 :
GIComplexOperandMatcher<s32, "selectVSRC0">,
@@ -38,6 +34,18 @@ def gi_vop3omods :
GIComplexOperandMatcher<s32, "selectVOP3OMods">,
GIComplexPatternEquiv<VOP3OMods>;
+def gi_vop3omods0clamp0omod :
+ GIComplexOperandMatcher<s32, "selectVOP3Mods0Clamp0OMod">,
+ GIComplexPatternEquiv<VOP3Mods0Clamp0OMod>;
+
+def gi_vop3opselmods0 :
+ GIComplexOperandMatcher<s32, "selectVOP3OpSelMods0">,
+ GIComplexPatternEquiv<VOP3OpSelMods0>;
+
+def gi_vop3opselmods :
+ GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
+ GIComplexPatternEquiv<VOP3OpSelMods>;
+
def gi_smrd_imm :
GIComplexOperandMatcher<s64, "selectSmrdImm">,
GIComplexPatternEquiv<SMRDImm>;
@@ -50,12 +58,19 @@ def gi_smrd_sgpr :
GIComplexOperandMatcher<s64, "selectSmrdSgpr">,
GIComplexPatternEquiv<SMRDSgpr>;
+// FIXME: Why are the atomic versions separated?
def gi_flat_offset :
GIComplexOperandMatcher<s64, "selectFlatOffset">,
GIComplexPatternEquiv<FLATOffset>;
def gi_flat_offset_signed :
GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
GIComplexPatternEquiv<FLATOffsetSigned>;
+def gi_flat_atomic :
+ GIComplexOperandMatcher<s64, "selectFlatOffset">,
+ GIComplexPatternEquiv<FLATAtomic>;
+def gi_flat_signed_atomic :
+ GIComplexOperandMatcher<s64, "selectFlatOffsetSigned">,
+ GIComplexPatternEquiv<FLATSignedAtomic>;
def gi_mubuf_scratch_offset :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffset">,
@@ -64,6 +79,44 @@ def gi_mubuf_scratch_offen :
GIComplexOperandMatcher<s32, "selectMUBUFScratchOffen">,
GIComplexPatternEquiv<MUBUFScratchOffen>;
+def gi_ds_1addr_1offset :
+ GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">,
+ GIComplexPatternEquiv<DS1Addr1Offset>;
+
+
+// Separate load nodes are defined to glue m0 initialization in
+// SelectionDAG. The GISel selector can just insert m0 initialization
+// directly before selecting a glue-less load, so hide this
+// distinction.
+
+def : GINodeEquiv<G_LOAD, AMDGPUld_glue> {
+ let CheckMMOIsNonAtomic = 1;
+}
+
+def : GINodeEquiv<G_STORE, AMDGPUst_glue> {
+ let CheckMMOIsNonAtomic = 1;
+}
+
+def : GINodeEquiv<G_LOAD, AMDGPUatomic_ld_glue> {
+ bit CheckMMOIsAtomic = 1;
+}
+
+
+
+def : GINodeEquiv<G_ATOMIC_CMPXCHG, atomic_cmp_swap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_XCHG, atomic_swap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_ADD, atomic_load_add_glue>;
+def : GINodeEquiv<G_ATOMICRMW_SUB, atomic_load_sub_glue>;
+def : GINodeEquiv<G_ATOMICRMW_AND, atomic_load_and_glue>;
+def : GINodeEquiv<G_ATOMICRMW_OR, atomic_load_or_glue>;
+def : GINodeEquiv<G_ATOMICRMW_XOR, atomic_load_xor_glue>;
+def : GINodeEquiv<G_ATOMICRMW_MIN, atomic_load_min_glue>;
+def : GINodeEquiv<G_ATOMICRMW_MAX, atomic_load_max_glue>;
+def : GINodeEquiv<G_ATOMICRMW_UMIN, atomic_load_umin_glue>;
+def : GINodeEquiv<G_ATOMICRMW_UMAX, atomic_load_umax_glue>;
+def : GINodeEquiv<G_ATOMICRMW_FADD, atomic_load_fadd_glue>;
+
+def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32>;
class GISelSop2Pat <
SDPatternOperator node,
@@ -143,20 +196,6 @@ multiclass GISelVop2IntrPat <
def : GISelSop2Pat <or, S_OR_B32, i32>;
def : GISelVop2Pat <or, V_OR_B32_e32, i32>;
-// FIXME: We can't re-use SelectionDAG patterns here because they match
-// against a custom SDNode and we would need to create a generic machine
-// instruction that is equivalent to the custom SDNode. This would also require
-// us to custom legalize the intrinsic to the new generic machine instruction,
-// but I can't get custom legalizing of intrinsic to work and I'm not sure if
-// this is even supported yet.
-def : GISelVop3Pat2ModsPat <
- int_amdgcn_cvt_pkrtz, V_CVT_PKRTZ_F16_F32_e64, v2f16, f32>;
-
-defm : GISelVop2IntrPat <int_maxnum, V_MAX_F32_e32, f32>;
-def : GISelVop3Pat2ModsPat <int_maxnum, V_MAX_F64, f64>;
-defm : GISelVop2IntrPat <int_minnum, V_MIN_F32_e32, f32>;
-def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>;
-
// Since GlobalISel is more flexible than SelectionDAG, I think we can get
// away with adding patterns for integer types and not legalizing all
// loads and stores to vector types. This should help simplify the load/store
@@ -164,3 +203,6 @@ def : GISelVop3Pat2ModsPat <int_minnum, V_MIN_F64, f64>;
foreach Ty = [i64, p0, p1, p4] in {
defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
}
+
+def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">,
+ GISDNodeXFormEquiv<as_i32timm>;
diff --git a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 0a1f48231b18..85d1ad349157 100644
--- a/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -22,15 +22,17 @@ enum PartialMappingIdx {
PM_SGPR128 = 9,
PM_SGPR256 = 10,
PM_SGPR512 = 11,
- PM_VGPR1 = 12,
- PM_VGPR16 = 16,
- PM_VGPR32 = 17,
- PM_VGPR64 = 18,
- PM_VGPR128 = 19,
- PM_VGPR256 = 20,
- PM_VGPR512 = 21,
- PM_SGPR96 = 22,
- PM_VGPR96 = 23
+ PM_SGPR1024 = 12,
+ PM_VGPR1 = 13,
+ PM_VGPR16 = 17,
+ PM_VGPR32 = 18,
+ PM_VGPR64 = 19,
+ PM_VGPR128 = 20,
+ PM_VGPR256 = 21,
+ PM_VGPR512 = 22,
+ PM_VGPR1024 = 23,
+ PM_SGPR96 = 24,
+ PM_VGPR96 = 25
};
const RegisterBankInfo::PartialMapping PartMappings[] {
@@ -45,6 +47,7 @@ const RegisterBankInfo::PartialMapping PartMappings[] {
{0, 128, SGPRRegBank},
{0, 256, SGPRRegBank},
{0, 512, SGPRRegBank},
+ {0, 1024, SGPRRegBank},
{0, 1, VGPRRegBank}, // VGPR begin
{0, 16, VGPRRegBank},
@@ -53,8 +56,9 @@ const RegisterBankInfo::PartialMapping PartMappings[] {
{0, 128, VGPRRegBank},
{0, 256, VGPRRegBank},
{0, 512, VGPRRegBank},
+ {0, 1024, VGPRRegBank},
{0, 96, SGPRRegBank},
- {0, 96, VGPRRegBank},
+ {0, 96, VGPRRegBank}
};
const RegisterBankInfo::ValueMapping ValMappings[] {
@@ -65,41 +69,43 @@ const RegisterBankInfo::ValueMapping ValMappings[] {
{&PartMappings[1], 1},
// SGPRs
- {&PartMappings[2], 1},
+ {&PartMappings[2], 1}, // 1
{nullptr, 0}, // Illegal power of 2 sizes
{nullptr, 0},
{nullptr, 0},
- {&PartMappings[3], 1},
- {&PartMappings[4], 1},
- {&PartMappings[5], 1},
- {&PartMappings[6], 1},
- {&PartMappings[7], 1},
- {&PartMappings[8], 1},
-
- // VGPRs
- {&PartMappings[9], 1},
+ {&PartMappings[3], 1}, // 16
+ {&PartMappings[4], 1}, // 32
+ {&PartMappings[5], 1}, // 64
+ {&PartMappings[6], 1}, // 128
+ {&PartMappings[7], 1}, // 256
+ {&PartMappings[8], 1}, // 512
+ {&PartMappings[9], 1}, // 1024
+
+ // VGPRs
+ {&PartMappings[10], 1}, // 1
{nullptr, 0},
{nullptr, 0},
{nullptr, 0},
- {&PartMappings[10], 1},
- {&PartMappings[11], 1},
- {&PartMappings[12], 1},
- {&PartMappings[13], 1},
- {&PartMappings[14], 1},
- {&PartMappings[15], 1},
- {&PartMappings[16], 1},
- {&PartMappings[17], 1}
+ {&PartMappings[11], 1}, // 16
+ {&PartMappings[12], 1}, // 32
+ {&PartMappings[13], 1}, // 64
+ {&PartMappings[14], 1}, // 128
+ {&PartMappings[15], 1}, // 256
+ {&PartMappings[16], 1}, // 512
+ {&PartMappings[17], 1}, // 1024
+ {&PartMappings[18], 1},
+ {&PartMappings[19], 1}
};
const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
- /*32-bit op*/ {0, 32, SGPRRegBank},
- /*2x32-bit op*/ {0, 32, SGPRRegBank},
- {32, 32, SGPRRegBank},
-/*<2x32-bit> op*/ {0, 64, SGPRRegBank},
-
- /*32-bit op*/ {0, 32, VGPRRegBank},
- /*2x32-bit op*/ {0, 32, VGPRRegBank},
- {32, 32, VGPRRegBank},
+ {0, 32, SGPRRegBank}, // 32-bit op
+ {0, 32, SGPRRegBank}, // 2x32-bit op
+ {32, 32, SGPRRegBank},
+ {0, 64, SGPRRegBank}, // <2x32-bit> op
+
+ {0, 32, VGPRRegBank}, // 32-bit op
+ {0, 32, VGPRRegBank}, // 2x32-bit op
+ {32, 32, VGPRRegBank},
};
@@ -116,7 +122,7 @@ const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
enum ValueMappingIdx {
SCCStartIdx = 0,
SGPRStartIdx = 2,
- VGPRStartIdx = 12
+ VGPRStartIdx = 13
};
const RegisterBankInfo::ValueMapping *getValueMapping(unsigned BankID,
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index b31de0af5018..9f5bcd8ff5f0 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -218,12 +218,13 @@ MetadataStreamerV2::getHSACodeProps(const MachineFunction &MF,
assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL);
- unsigned MaxKernArgAlign;
+ Align MaxKernArgAlign;
HSACodeProps.mKernargSegmentSize = STM.getKernArgSegmentSize(F,
MaxKernArgAlign);
HSACodeProps.mGroupSegmentFixedSize = ProgramInfo.LDSSize;
HSACodeProps.mPrivateSegmentFixedSize = ProgramInfo.ScratchSize;
- HSACodeProps.mKernargSegmentAlign = std::max(MaxKernArgAlign, 4u);
+ HSACodeProps.mKernargSegmentAlign =
+ std::max(MaxKernArgAlign, Align(4)).value();
HSACodeProps.mWavefrontSize = STM.getWavefrontSize();
HSACodeProps.mNumSGPRs = ProgramInfo.NumSGPR;
HSACodeProps.mNumVGPRs = ProgramInfo.NumVGPR;
@@ -883,7 +884,7 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
auto Kern = HSAMetadataDoc->getMapNode();
- unsigned MaxKernArgAlign;
+ Align MaxKernArgAlign;
Kern[".kernarg_segment_size"] = Kern.getDocument()->getNode(
STM.getKernArgSegmentSize(F, MaxKernArgAlign));
Kern[".group_segment_fixed_size"] =
@@ -891,7 +892,7 @@ MetadataStreamerV3::getHSAKernelProps(const MachineFunction &MF,
Kern[".private_segment_fixed_size"] =
Kern.getDocument()->getNode(ProgramInfo.ScratchSize);
Kern[".kernarg_segment_align"] =
- Kern.getDocument()->getNode(std::max(uint32_t(4), MaxKernArgAlign));
+ Kern.getDocument()->getNode(std::max(Align(4), MaxKernArgAlign).value());
Kern[".wavefront_size"] =
Kern.getDocument()->getNode(STM.getWavefrontSize());
Kern[".sgpr_count"] = Kern.getDocument()->getNode(ProgramInfo.NumSGPR);
diff --git a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 2eecddbd7b01..80ac8ca67bcd 100644
--- a/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -52,7 +52,7 @@ public:
class MetadataStreamerV3 final : public MetadataStreamer {
private:
std::unique_ptr<msgpack::Document> HSAMetadataDoc =
- llvm::make_unique<msgpack::Document>();
+ std::make_unique<msgpack::Document>();
void dump(StringRef HSAMetadataString) const;
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index ea730539f834..f330bd7ebcdd 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -172,8 +172,9 @@ private:
MachineSDNode *buildSMovImm64(SDLoc &DL, uint64_t Val, EVT VT) const;
- SDNode *glueCopyToM0LDSInit(SDNode *N) const;
+ SDNode *glueCopyToOp(SDNode *N, SDValue NewChain, SDValue Glue) const;
SDNode *glueCopyToM0(SDNode *N, SDValue Val) const;
+ SDNode *glueCopyToM0LDSInit(SDNode *N) const;
const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
virtual bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
@@ -186,10 +187,11 @@ private:
bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC) const;
+ SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr,
SDValue &SOffset, SDValue &Offset, SDValue &GLC,
- SDValue &SLC, SDValue &TFE, SDValue &DLC) const;
+ SDValue &SLC, SDValue &TFE, SDValue &DLC,
+ SDValue &SWZ) const;
bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset, SDValue &Offset,
SDValue &SLC) const;
@@ -202,21 +204,20 @@ private:
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset,
SDValue &Offset, SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC) const;
+ SDValue &TFE, SDValue &DLC, SDValue &SWZ) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset, SDValue &SLC) const;
bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset,
SDValue &Offset) const;
+ template <bool IsSigned>
+ bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
+ SDValue &Offset, SDValue &SLC) const;
bool SelectFlatAtomic(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
bool SelectFlatAtomicSigned(SDNode *N, SDValue Addr, SDValue &VAddr,
SDValue &Offset, SDValue &SLC) const;
- template <bool IsSigned>
- bool SelectFlatOffset(SDNode *N, SDValue Addr, SDValue &VAddr,
- SDValue &Offset, SDValue &SLC) const;
-
bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset,
bool &Imm) const;
SDValue Expand32BitAddress(SDValue Addr) const;
@@ -262,6 +263,8 @@ private:
SDValue getHi16Elt(SDValue In) const;
+ SDValue getMaterializedScalarImm32(int64_t Val, const SDLoc &DL) const;
+
void SelectADD_SUB_I64(SDNode *N);
void SelectAddcSubb(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
@@ -282,6 +285,7 @@ private:
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
+ void SelectINTRINSIC_WO_CHAIN(SDNode *N);
void SelectINTRINSIC_VOID(SDNode *N);
protected:
@@ -543,7 +547,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
if (!N->isMachineOpcode()) {
if (N->getOpcode() == ISD::CopyToReg) {
unsigned Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
MachineRegisterInfo &MRI = CurDAG->getMachineFunction().getRegInfo();
return MRI.getRegClass(Reg);
}
@@ -582,19 +586,10 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
}
}
-SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
- const SITargetLowering& Lowering =
- *static_cast<const SITargetLowering*>(getTargetLowering());
-
- assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
-
- SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N),
- Val);
-
- SDValue Glue = M0.getValue(1);
-
+SDNode *AMDGPUDAGToDAGISel::glueCopyToOp(SDNode *N, SDValue NewChain,
+ SDValue Glue) const {
SmallVector <SDValue, 8> Ops;
- Ops.push_back(M0); // Replace the chain.
+ Ops.push_back(NewChain); // Replace the chain.
for (unsigned i = 1, e = N->getNumOperands(); i != e; ++i)
Ops.push_back(N->getOperand(i));
@@ -602,6 +597,16 @@ SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
return CurDAG->MorphNodeTo(N, N->getOpcode(), N->getVTList(), Ops);
}
+SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N, SDValue Val) const {
+ const SITargetLowering& Lowering =
+ *static_cast<const SITargetLowering*>(getTargetLowering());
+
+ assert(N->getOperand(0).getValueType() == MVT::Other && "Expected chain");
+
+ SDValue M0 = Lowering.copyToM0(*CurDAG, N->getOperand(0), SDLoc(N), Val);
+ return glueCopyToOp(N, M0, M0.getValue(1));
+}
+
SDNode *AMDGPUDAGToDAGISel::glueCopyToM0LDSInit(SDNode *N) const {
unsigned AS = cast<MemSDNode>(N)->getAddressSpace();
if (AS == AMDGPUAS::LOCAL_ADDRESS) {
@@ -635,13 +640,13 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
switch (NumVectorElts) {
case 1:
- return AMDGPU::SReg_32_XM0RegClassID;
+ return AMDGPU::SReg_32RegClassID;
case 2:
return AMDGPU::SReg_64RegClassID;
case 3:
return AMDGPU::SGPR_96RegClassID;
case 4:
- return AMDGPU::SReg_128RegClassID;
+ return AMDGPU::SGPR_128RegClassID;
case 5:
return AMDGPU::SGPR_160RegClassID;
case 8:
@@ -713,12 +718,17 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
return; // Already selected.
}
- if (isa<AtomicSDNode>(N) ||
+ // isa<MemSDNode> almost works but is slightly too permissive for some DS
+ // intrinsics.
+ if (Opc == ISD::LOAD || Opc == ISD::STORE || isa<AtomicSDNode>(N) ||
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
Opc == ISD::ATOMIC_LOAD_FADD ||
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX))
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
N = glueCopyToM0LDSInit(N);
+ SelectCode(N);
+ return;
+ }
switch (Opc) {
default:
@@ -781,7 +791,7 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SDValue RC, SubReg0, SubReg1;
SDLoc DL(N);
if (N->getValueType(0) == MVT::i128) {
- RC = CurDAG->getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32);
+ RC = CurDAG->getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32);
SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32);
SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32);
} else if (N->getValueType(0) == MVT::i64) {
@@ -815,14 +825,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
ReplaceNode(N, buildSMovImm64(DL, Imm, N->getValueType(0)));
return;
}
- case ISD::LOAD:
- case ISD::STORE:
- case ISD::ATOMIC_LOAD:
- case ISD::ATOMIC_STORE: {
- N = glueCopyToM0LDSInit(N);
- break;
- }
-
case AMDGPUISD::BFE_I32:
case AMDGPUISD::BFE_U32: {
// There is a scalar version available, but unlike the vector version which
@@ -908,6 +910,10 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectINTRINSIC_W_CHAIN(N);
return;
}
+ case ISD::INTRINSIC_WO_CHAIN: {
+ SelectINTRINSIC_WO_CHAIN(N);
+ return;
+ }
case ISD::INTRINSIC_VOID: {
SelectINTRINSIC_VOID(N);
return;
@@ -961,6 +967,14 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
return true;
}
+SDValue AMDGPUDAGToDAGISel::getMaterializedScalarImm32(int64_t Val,
+ const SDLoc &DL) const {
+ SDNode *Mov = CurDAG->getMachineNode(
+ AMDGPU::S_MOV_B32, DL, MVT::i32,
+ CurDAG->getTargetConstant(Val, DL, MVT::i32));
+ return SDValue(Mov, 0);
+}
+
// FIXME: Should only handle addcarry/subcarry
void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDLoc DL(N);
@@ -1308,7 +1322,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &Offset, SDValue &Offen,
SDValue &Idxen, SDValue &Addr64,
SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC) const {
+ SDValue &TFE, SDValue &DLC,
+ SDValue &SWZ) const {
// Subtarget prefers to use flat instruction
if (Subtarget->useFlatForGlobal())
return false;
@@ -1321,6 +1336,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
TFE = CurDAG->getTargetConstant(0, DL, MVT::i1);
DLC = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1);
Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1);
Offen = CurDAG->getTargetConstant(0, DL, MVT::i1);
@@ -1400,7 +1416,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &VAddr, SDValue &SOffset,
SDValue &Offset, SDValue &GLC,
SDValue &SLC, SDValue &TFE,
- SDValue &DLC) const {
+ SDValue &DLC, SDValue &SWZ) const {
SDValue Ptr, Offen, Idxen, Addr64;
// addr64 bit was removed for volcanic islands.
@@ -1408,7 +1424,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
return false;
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE, DLC))
+ GLC, SLC, TFE, DLC, SWZ))
return false;
ConstantSDNode *C = cast<ConstantSDNode>(Addr64);
@@ -1430,9 +1446,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue &Offset,
SDValue &SLC) const {
SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1);
- SDValue GLC, TFE, DLC;
+ SDValue GLC, TFE, DLC, SWZ;
- return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC);
+ return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
@@ -1557,13 +1573,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &SOffset, SDValue &Offset,
SDValue &GLC, SDValue &SLC,
- SDValue &TFE, SDValue &DLC) const {
+ SDValue &TFE, SDValue &DLC,
+ SDValue &SWZ) const {
SDValue Ptr, VAddr, Offen, Idxen, Addr64;
const SIInstrInfo *TII =
static_cast<const SIInstrInfo *>(Subtarget->getInstrInfo());
if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64,
- GLC, SLC, TFE, DLC))
+ GLC, SLC, TFE, DLC, SWZ))
return false;
if (!cast<ConstantSDNode>(Offen)->getSExtValue() &&
@@ -1585,16 +1602,30 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &Soffset, SDValue &Offset
) const {
- SDValue GLC, SLC, TFE, DLC;
+ SDValue GLC, SLC, TFE, DLC, SWZ;
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC);
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
}
bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc,
SDValue &Soffset, SDValue &Offset,
SDValue &SLC) const {
- SDValue GLC, TFE, DLC;
+ SDValue GLC, TFE, DLC, SWZ;
+
+ return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ);
+}
- return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC);
+// Find a load or store from corresponding pattern root.
+// Roots may be build_vector, bitconvert or their combinations.
+static MemSDNode* findMemSDNode(SDNode *N) {
+ N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
+ if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
+ return MN;
+ assert(isa<BuildVectorSDNode>(N));
+ for (SDValue V : N->op_values())
+ if (MemSDNode *MN =
+ dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
+ return MN;
+ llvm_unreachable("cannot find MemSDNode in the pattern!");
}
template <bool IsSigned>
@@ -1603,8 +1634,95 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
SDValue &VAddr,
SDValue &Offset,
SDValue &SLC) const {
- return static_cast<const SITargetLowering*>(getTargetLowering())->
- SelectFlatOffset(IsSigned, *CurDAG, N, Addr, VAddr, Offset, SLC);
+ int64_t OffsetVal = 0;
+
+ if (Subtarget->hasFlatInstOffsets() &&
+ (!Subtarget->hasFlatSegmentOffsetBug() ||
+ findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
+ CurDAG->isBaseWithConstantOffset(Addr)) {
+ SDValue N0 = Addr.getOperand(0);
+ SDValue N1 = Addr.getOperand(1);
+ uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned AS = findMemSDNode(N)->getAddressSpace();
+ if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
+ } else {
+ // If the offset doesn't fit, put the low bits into the offset field and
+ // add the rest.
+
+ SDLoc DL(N);
+ uint64_t ImmField;
+ const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
+ if (IsSigned) {
+ ImmField = SignExtend64(COffsetVal, NumBits);
+
+ // Don't use a negative offset field if the base offset is positive.
+ // Since the scheduler currently relies on the offset field, doing so
+ // could result in strange scheduling decisions.
+
+ // TODO: Should we not do this in the opposite direction as well?
+ if (static_cast<int64_t>(COffsetVal) > 0) {
+ if (static_cast<int64_t>(ImmField) < 0) {
+ const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
+ ImmField = COffsetVal & OffsetMask;
+ }
+ }
+ } else {
+ // TODO: Should we do this for a negative offset?
+ const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
+ ImmField = COffsetVal & OffsetMask;
+ }
+
+ uint64_t RemainderOffset = COffsetVal - ImmField;
+
+ assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
+ assert(RemainderOffset + ImmField == COffsetVal);
+
+ OffsetVal = ImmField;
+
+ // TODO: Should this try to use a scalar add pseudo if the base address is
+ // uniform and saddr is usable?
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+ DL, MVT::i32, N0, Sub1);
+
+ SDValue AddOffsetLo
+ = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue AddOffsetHi
+ = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+
+ SDNode *Add = CurDAG->getMachineNode(
+ AMDGPU::V_ADD_I32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
+ };
+
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs), 0);
+ }
+ }
+
+ VAddr = Addr;
+ Offset = CurDAG->getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
+ SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1);
+ return true;
}
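
A standalone sketch of the immediate/remainder split performed above when a constant offset does not fit the FLAT offset field. The bit-width handling mirrors the code; signExtend and the field width used in the test are illustrative stand-ins for the SIInstrInfo helpers.

#include <cassert>
#include <cstdint>

struct SplitOffset { int64_t Imm; uint64_t Remainder; };

static int64_t signExtend(uint64_t V, unsigned Bits) {
  return int64_t(V << (64 - Bits)) >> (64 - Bits);
}

// COffsetVal is the full constant offset, NumBits the width of the
// instruction's offset field (signed or unsigned encoding).
static SplitOffset splitFlatOffset(uint64_t COffsetVal, unsigned NumBits,
                                   bool IsSigned) {
  uint64_t ImmField;
  if (IsSigned) {
    ImmField = signExtend(COffsetVal, NumBits);
    // Keep the immediate non-negative when the full offset is positive, so
    // the scheduler still sees a sensible offset field.
    if (int64_t(COffsetVal) > 0 && int64_t(ImmField) < 0)
      ImmField = COffsetVal & ((uint64_t(1) << (NumBits - 1)) - 1);
  } else {
    ImmField = COffsetVal & ((uint64_t(1) << NumBits) - 1);
  }
  return {int64_t(ImmField), COffsetVal - ImmField};
}

int main() {
  // E.g., assuming a 12-bit unsigned field: 0x1234 splits into an immediate
  // of 0x234 plus a remainder of 0x1000 that is added to the base address.
  SplitOffset S = splitFlatOffset(0x1234, 12, /*IsSigned=*/false);
  assert(S.Imm == 0x234 && S.Remainder == 0x1000);
  assert(uint64_t(S.Imm) + S.Remainder == 0x1234);
  return 0;
}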
bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
@@ -1616,10 +1734,10 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomic(SDNode *N,
}
bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
- SDValue Addr,
- SDValue &VAddr,
- SDValue &Offset,
- SDValue &SLC) const {
+ SDValue Addr,
+ SDValue &VAddr,
+ SDValue &Offset,
+ SDValue &SLC) const {
return SelectFlatOffset<true>(N, Addr, VAddr, Offset, SLC);
}
@@ -2158,10 +2276,12 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
// offset field) % 64. Some versions of the programming guide omit the m0
// part, or claim it's from offset 0.
if (ConstantSDNode *ConstOffset = dyn_cast<ConstantSDNode>(BaseOffset)) {
- // If we have a constant offset, try to use the default value for m0 as a
- // base to possibly avoid setting it up.
- glueCopyToM0(N, CurDAG->getTargetConstant(-1, SL, MVT::i32));
- ImmOffset = ConstOffset->getZExtValue() + 1;
+ // If we have a constant offset, try to use the 0 in m0 as the base.
+ // TODO: Look into changing the default m0 initialization value. If the
+ // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
+ // the immediate offset.
+ glueCopyToM0(N, CurDAG->getTargetConstant(0, SL, MVT::i32));
+ ImmOffset = ConstOffset->getZExtValue();
} else {
if (CurDAG->isBaseWithConstantOffset(BaseOffset)) {
ImmOffset = BaseOffset.getConstantOperandVal(1);
@@ -2182,22 +2302,7 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
glueCopyToM0(N, SDValue(M0Base, 0));
}
- SDValue V0;
SDValue Chain = N->getOperand(0);
- SDValue Glue;
- if (HasVSrc) {
- SDValue VSrc0 = N->getOperand(2);
-
- // The manual doesn't mention this, but it seems only v0 works.
- V0 = CurDAG->getRegister(AMDGPU::VGPR0, MVT::i32);
-
- SDValue CopyToV0 = CurDAG->getCopyToReg(
- N->getOperand(0), SL, V0, VSrc0,
- N->getOperand(N->getNumOperands() - 1));
- Chain = CopyToV0;
- Glue = CopyToV0.getValue(1);
- }
-
SDValue OffsetField = CurDAG->getTargetConstant(ImmOffset, SL, MVT::i32);
// TODO: Can this just be removed from the instruction?
@@ -2206,14 +2311,11 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
const unsigned Opc = gwsIntrinToOpcode(IntrID);
SmallVector<SDValue, 5> Ops;
if (HasVSrc)
- Ops.push_back(V0);
+ Ops.push_back(N->getOperand(2));
Ops.push_back(OffsetField);
Ops.push_back(GDS);
Ops.push_back(Chain);
- if (HasVSrc)
- Ops.push_back(Glue);
-
SDNode *Selected = CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
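
A standalone sketch of the GWS addressing the comment above describes: the hardware computes the resource slot as (an opaque base + m0[21:16] + the instruction offset) mod 64, so programming m0 to 0 lets the constant offset be used directly, while the old scheme (m0 = -1, offset + 1) landed on the same slot. The helper below is illustrative only.

#include <cassert>
#include <cstdint>

static unsigned gwsEffectiveOffset(unsigned OpaqueBase, uint32_t M0,
                                   unsigned ImmOffset) {
  unsigned M0Part = (M0 >> 16) & 0x3F; // m0[21:16]
  return (OpaqueBase + M0Part + ImmOffset) % 64;
}

int main() {
  // New scheme: m0 = 0, ImmOffset = 17.  Old scheme: m0 = -1, ImmOffset = 18.
  // Both address the same slot because 63 + c + 1 == c (mod 64).
  assert(gwsEffectiveOffset(0, 0, 17) ==
         gwsEffectiveOffset(0, 0xFFFFFFFFu, 17 + 1));
  return 0;
}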
@@ -2233,6 +2335,28 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
SelectCode(N);
}
+void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
+ unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ unsigned Opcode;
+ switch (IntrID) {
+ case Intrinsic::amdgcn_wqm:
+ Opcode = AMDGPU::WQM;
+ break;
+ case Intrinsic::amdgcn_softwqm:
+ Opcode = AMDGPU::SOFT_WQM;
+ break;
+ case Intrinsic::amdgcn_wwm:
+ Opcode = AMDGPU::WWM;
+ break;
+ default:
+ SelectCode(N);
+ return;
+ }
+
+ SDValue Src = N->getOperand(1);
+ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), {Src});
+}
+
void AMDGPUDAGToDAGISel::SelectINTRINSIC_VOID(SDNode *N) {
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntrID) {
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 39016ed37193..1115d8c23620 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -12,10 +12,6 @@
//
//===----------------------------------------------------------------------===//
-#define AMDGPU_LOG2E_F 1.44269504088896340735992468100189214f
-#define AMDGPU_LN2_F 0.693147180559945309417232121458176568f
-#define AMDGPU_LN10_F 2.30258509299404568401799145468436421f
-
#include "AMDGPUISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
@@ -37,82 +33,9 @@
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
using namespace llvm;
-static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State,
- const TargetRegisterClass *RC,
- unsigned NumRegs) {
- ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
- unsigned RegResult = State.AllocateReg(RegList);
- if (RegResult == AMDGPU::NoRegister)
- return false;
-
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
- return true;
-}
-
-static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- switch (LocVT.SimpleTy) {
- case MVT::i64:
- case MVT::f64:
- case MVT::v2i32:
- case MVT::v2f32:
- case MVT::v4i16:
- case MVT::v4f16: {
- // Up to SGPR0-SGPR105
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::SGPR_64RegClass, 53);
- }
- default:
- return false;
- }
-}
-
-// Allocate up to VGPR31.
-//
-// TODO: Since there are no VGPR alignent requirements would it be better to
-// split into individual scalar registers?
-static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
- CCValAssign::LocInfo LocInfo,
- ISD::ArgFlagsTy ArgFlags, CCState &State) {
- switch (LocVT.SimpleTy) {
- case MVT::i64:
- case MVT::f64:
- case MVT::v2i32:
- case MVT::v2f32:
- case MVT::v4i16:
- case MVT::v4f16: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_64RegClass, 31);
- }
- case MVT::v4i32:
- case MVT::v4f32:
- case MVT::v2i64:
- case MVT::v2f64: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_128RegClass, 29);
- }
- case MVT::v8i32:
- case MVT::v8f32: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_256RegClass, 25);
-
- }
- case MVT::v16i32:
- case MVT::v16f32: {
- return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
- &AMDGPU::VReg_512RegClass, 17);
-
- }
- default:
- return false;
- }
-}
-
#include "AMDGPUGenCallingConv.inc"
// Find a larger type to do a load / store of a vector with.
@@ -208,7 +131,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, MVT::i32, Expand);
}
- for (MVT VT : MVT::integer_vector_valuetypes()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes()) {
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i8, Expand);
@@ -218,6 +141,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v2i16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::v3i16, Expand);
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v3i16, Expand);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v3i16, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Expand);
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Expand);
setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::v4i16, Expand);
@@ -225,8 +151,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
@@ -286,8 +215,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f32, MVT::f16, Expand);
setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
+ setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
+ setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
@@ -571,6 +503,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FABS);
setTargetDAGCombine(ISD::AssertZext);
setTargetDAGCombine(ISD::AssertSext);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
}
//===----------------------------------------------------------------------===//
@@ -630,15 +563,26 @@ static bool hasSourceMods(const SDNode *N) {
case ISD::FREM:
case ISD::INLINEASM:
case ISD::INLINEASM_BR:
- case AMDGPUISD::INTERP_P1:
- case AMDGPUISD::INTERP_P2:
case AMDGPUISD::DIV_SCALE:
+ case ISD::INTRINSIC_W_CHAIN:
// TODO: Should really be looking at the users of the bitcast. These are
// problematic because bitcasts are used to legalize all stores to integer
// types.
case ISD::BITCAST:
return false;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ switch (cast<ConstantSDNode>(N->getOperand(0))->getZExtValue()) {
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_interp_mov:
+ case Intrinsic::amdgcn_interp_p1_f16:
+ case Intrinsic::amdgcn_interp_p2_f16:
+ return false;
+ default:
+ return true;
+ }
+ }
default:
return true;
}
@@ -745,8 +689,9 @@ bool AMDGPUTargetLowering::isLoadBitCastBeneficial(EVT LoadTy, EVT CastTy,
return false;
bool Fast = false;
- return allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), CastTy,
- MMO, &Fast) && Fast;
+ return allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ CastTy, MMO, &Fast) &&
+ Fast;
}
// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
@@ -782,9 +727,8 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
break;
case ISD::LOAD:
{
- const LoadSDNode * L = dyn_cast<LoadSDNode>(N);
- if (L->getMemOperand()->getAddrSpace()
- == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
+ AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
return false;
}
@@ -1199,9 +1143,9 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::FLOG:
- return LowerFLOG(Op, DAG, 1 / AMDGPU_LOG2E_F);
+ return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
case ISD::FLOG10:
- return LowerFLOG(Op, DAG, AMDGPU_LN2_F / AMDGPU_LN10_F);
+ return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
case ISD::FEXP:
return lowerFEXP(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
@@ -1236,7 +1180,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
}
}
-static bool hasDefinedInitializer(const GlobalValue *GV) {
+bool AMDGPUTargetLowering::hasDefinedInitializer(const GlobalValue *GV) {
const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
if (!GVar || !GVar->hasInitializer())
return false;
@@ -2349,30 +2293,13 @@ SDValue AMDGPUTargetLowering::LowerFLOG(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand);
}
-// Return M_LOG2E of appropriate type
-static SDValue getLog2EVal(SelectionDAG &DAG, const SDLoc &SL, EVT VT) {
- switch (VT.getScalarType().getSimpleVT().SimpleTy) {
- case MVT::f32:
- return DAG.getConstantFP(1.44269504088896340735992468100189214f, SL, VT);
- case MVT::f16:
- return DAG.getConstantFP(
- APFloat(APFloat::IEEEhalf(), "1.44269504088896340735992468100189214"),
- SL, VT);
- case MVT::f64:
- return DAG.getConstantFP(
- APFloat(APFloat::IEEEdouble(), "0x1.71547652b82fep+0"), SL, VT);
- default:
- llvm_unreachable("unsupported fp type");
- }
-}
-
// exp2(M_LOG2E_F * f);
SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
- const SDValue K = getLog2EVal(DAG, SL, VT);
+ const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Src, K, Op->getFlags());
return DAG.getNode(ISD::FEXP2, SL, VT, Mul, Op->getFlags());
}
@@ -2836,8 +2763,16 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
static SDValue simplifyI24(SDNode *Node24,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
- SDValue LHS = Node24->getOperand(0);
- SDValue RHS = Node24->getOperand(1);
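+  // This combine runs both on the custom MUL_I24/MUL_U24 nodes and on the raw
+  // llvm.amdgcn.mul.i24/u24 intrinsics; in the intrinsic form the multiply
+  // operands start at index 1, after the intrinsic ID operand.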
+ bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
+
+ SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
+ SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
+ unsigned NewOpcode = Node24->getOpcode();
+ if (IsIntrin) {
+ unsigned IID = cast<ConstantSDNode>(Node24->getOperand(0))->getZExtValue();
+ NewOpcode = IID == Intrinsic::amdgcn_mul_i24 ?
+ AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+ }
APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
@@ -2847,7 +2782,7 @@ static SDValue simplifyI24(SDNode *Node24,
SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
if (DemandedLHS || DemandedRHS)
- return DAG.getNode(Node24->getOpcode(), SDLoc(Node24), Node24->getVTList(),
+ return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
DemandedLHS ? DemandedLHS : LHS,
DemandedRHS ? DemandedRHS : RHS);
@@ -2904,54 +2839,6 @@ bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
return true;
}
-// Find a load or store from corresponding pattern root.
-// Roots may be build_vector, bitconvert or their combinations.
-static MemSDNode* findMemSDNode(SDNode *N) {
- N = AMDGPUTargetLowering::stripBitcast(SDValue(N,0)).getNode();
- if (MemSDNode *MN = dyn_cast<MemSDNode>(N))
- return MN;
- assert(isa<BuildVectorSDNode>(N));
- for (SDValue V : N->op_values())
- if (MemSDNode *MN =
- dyn_cast<MemSDNode>(AMDGPUTargetLowering::stripBitcast(V)))
- return MN;
- llvm_unreachable("cannot find MemSDNode in the pattern!");
-}
-
-bool AMDGPUTargetLowering::SelectFlatOffset(bool IsSigned,
- SelectionDAG &DAG,
- SDNode *N,
- SDValue Addr,
- SDValue &VAddr,
- SDValue &Offset,
- SDValue &SLC) const {
- const GCNSubtarget &ST =
- DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
- int64_t OffsetVal = 0;
-
- if (ST.hasFlatInstOffsets() &&
- (!ST.hasFlatSegmentOffsetBug() ||
- findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
- DAG.isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- int64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
- const SIInstrInfo *TII = ST.getInstrInfo();
- if (TII->isLegalFLATOffset(COffsetVal, findMemSDNode(N)->getAddressSpace(),
- IsSigned)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- }
- }
-
- VAddr = Addr;
- Offset = DAG.getTargetConstant(OffsetVal, SDLoc(), MVT::i16);
- SLC = DAG.getTargetConstant(0, SDLoc(), MVT::i1);
-
- return true;
-}
-
// Replace load of an illegal type with a store of a bitcast to a friendlier
// type.
SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
@@ -3085,6 +2972,19 @@ SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
return SDValue();
}
+
+SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+ switch (IID) {
+ case Intrinsic::amdgcn_mul_i24:
+ case Intrinsic::amdgcn_mul_u24:
+ return simplifyI24(N, DCI);
+ default:
+ return SDValue();
+ }
+}
+
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -4173,6 +4073,8 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::AssertZext:
case ISD::AssertSext:
return performAssertSZExtCombine(N, DCI);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return performIntrinsicWOChainCombine(N, DCI);
}
return SDValue();
}
@@ -4203,14 +4105,28 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
}
+// This may be called multiple times, and nothing prevents creating multiple
+// objects at the same offset. See if we already defined this object.
+static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
+ int64_t Offset) {
+ for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
+ if (MFI.getObjectOffset(I) == Offset) {
+ assert(MFI.getObjectSize(I) == Size);
+ return I;
+ }
+ }
+
+ return MFI.CreateFixedObject(Size, Offset, true);
+}
+
SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
EVT VT,
const SDLoc &SL,
int64_t Offset) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
-  int FI = MFI.CreateFixedObject(VT.getStoreSize(), Offset, true);
+  int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
@@ -4260,7 +4176,7 @@ uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
const AMDGPUSubtarget &ST =
AMDGPUSubtarget::get(getTargetMachine(), MF.getFunction());
unsigned ExplicitArgOffset = ST.getExplicitKernelArgOffset(MF.getFunction());
- unsigned Alignment = ST.getAlignmentForImplicitArgPtr();
+ const Align Alignment = ST.getAlignmentForImplicitArgPtr();
uint64_t ArgOffset = alignTo(MFI->getExplicitKernArgSize(), Alignment) +
ExplicitArgOffset;
switch (Param) {
@@ -4295,6 +4211,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(FRACT)
NODE_NAME_CASE(SETCC)
NODE_NAME_CASE(SETREG)
+ NODE_NAME_CASE(DENORM_MODE)
NODE_NAME_CASE(FMA_W_CHAIN)
NODE_NAME_CASE(FMUL_W_CHAIN)
NODE_NAME_CASE(CLAMP)
@@ -4377,13 +4294,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
- NODE_NAME_CASE(INIT_EXEC)
- NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
- NODE_NAME_CASE(SENDMSG)
- NODE_NAME_CASE(SENDMSGHALT)
- NODE_NAME_CASE(INTERP_MOV)
- NODE_NAME_CASE(INTERP_P1)
- NODE_NAME_CASE(INTERP_P2)
NODE_NAME_CASE(INTERP_P1LL_F16)
NODE_NAME_CASE(INTERP_P1LV_F16)
NODE_NAME_CASE(INTERP_P2_F16)
@@ -4428,6 +4338,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_AND)
NODE_NAME_CASE(BUFFER_ATOMIC_OR)
NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
+ NODE_NAME_CASE(BUFFER_ATOMIC_INC)
+ NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
@@ -4576,9 +4488,9 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
} else if (SelBits == 0x0c) {
- Known.Zero |= 0xff << I;
+ Known.Zero |= 0xFFull << I;
} else if (SelBits > 0x0c) {
- Known.One |= 0xff << I;
+ Known.One |= 0xFFull << I;
}
Sel >>= 8;
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index fe7ad694943d..dea0d1d4343a 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -38,6 +38,7 @@ private:
public:
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG);
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG);
+ static bool hasDefinedInitializer(const GlobalValue *GV);
protected:
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -78,6 +79,7 @@ protected:
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
unsigned Opc, SDValue LHS,
@@ -324,10 +326,6 @@ public:
}
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
-
- bool SelectFlatOffset(bool IsSigned, SelectionDAG &DAG, SDNode *N,
- SDValue Addr, SDValue &VAddr, SDValue &Offset,
- SDValue &SLC) const;
};
namespace AMDGPUISD {
@@ -369,6 +367,9 @@ enum NodeType : unsigned {
// result bit per item in the wavefront.
SETCC,
SETREG,
+
+ DENORM_MODE,
+
// FP ops with input and output chain.
FMA_W_CHAIN,
FMUL_W_CHAIN,
@@ -475,13 +476,6 @@ enum NodeType : unsigned {
BUILD_VERTICAL_VECTOR,
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
- INIT_EXEC,
- INIT_EXEC_FROM_INPUT,
- SENDMSG,
- SENDMSGHALT,
- INTERP_MOV,
- INTERP_P1,
- INTERP_P2,
INTERP_P1LL_F16,
INTERP_P1LV_F16,
INTERP_P2_F16,
@@ -532,6 +526,8 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_AND,
BUFFER_ATOMIC_OR,
BUFFER_ATOMIC_XOR,
+ BUFFER_ATOMIC_INC,
+ BUFFER_ATOMIC_DEC,
BUFFER_ATOMIC_CMPSWAP,
BUFFER_ATOMIC_FADD,
BUFFER_ATOMIC_PK_FADD,
diff --git a/lib/Target/AMDGPU/AMDGPUInline.cpp b/lib/Target/AMDGPU/AMDGPUInline.cpp
index f4df20b8f03e..a83ec23ec054 100644
--- a/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -51,7 +51,7 @@ ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256),
// Inliner constraint to achieve reasonable compilation time
static cl::opt<size_t>
-MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(300),
+MaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
cl::desc("Maximum BB number allowed in a function after inlining"
" (compile time constraint)"));
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 4a8446955496..cf0ce5659951 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -110,39 +110,38 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
// Force dependencies for vector trunc stores
def R600dummy_chain : SDNode<"AMDGPUISD::DUMMY_CHAIN", SDTNone, [SDNPHasChain]>;
-def AMDGPUcos : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
-def AMDGPUsin : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
-
+def AMDGPUcos_impl : SDNode<"AMDGPUISD::COS_HW", SDTFPUnaryOp>;
+def AMDGPUsin_impl : SDNode<"AMDGPUISD::SIN_HW", SDTFPUnaryOp>;
// out = a - floor(a)
-def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
+def AMDGPUfract_impl : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
// out = 1.0 / a
-def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
+def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a)
-def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
+def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a)
-def AMDGPUrcp_legacy : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
-def AMDGPUrsq_legacy : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+def AMDGPUrsq_legacy_impl : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
+def AMDGPUrcp_legacy_impl : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a) result clamped to +/- max_float.
-def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
+def AMDGPUrsq_clamp_impl : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
-def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
+def AMDGPUldexp_impl : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
-def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
-def AMDGPUpknorm_i16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
-def AMDGPUpknorm_u16_f32 : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
-def AMDGPUpk_i16_i32 : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
-def AMDGPUpk_u16_u32 : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
+def AMDGPUpkrtz_f16_f32_impl : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_i16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_I16_F32", AMDGPUFPPackOp>;
+def AMDGPUpknorm_u16_f32_impl : SDNode<"AMDGPUISD::CVT_PKNORM_U16_F32", AMDGPUFPPackOp>;
+def AMDGPUpk_i16_i32_impl : SDNode<"AMDGPUISD::CVT_PK_I16_I32", AMDGPUIntPackOp>;
+def AMDGPUpk_u16_u32_impl : SDNode<"AMDGPUISD::CVT_PK_U16_U32", AMDGPUIntPackOp>;
def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
-def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
+def AMDGPUfp_class_impl : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
// out = max(a, b) a and b are floats, where a nan comparison fails.
// This is not commutative because this gives the second operand:
@@ -285,7 +284,7 @@ def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
-def AMDGPUffbh_i32 : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
+def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>;
@@ -320,7 +319,7 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
[]
>;
-def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
@@ -330,35 +329,6 @@ def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
-def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
- SDTypeProfile<0, 1, [SDTCisInt<0>]>,
- [SDNPHasChain, SDNPInGlue]>;
-
-def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT",
- SDTypeProfile<0, 2,
- [SDTCisInt<0>, SDTCisInt<1>]>,
- [SDNPHasChain, SDNPInGlue]>;
-
-def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
- SDTypeProfile<0, 1, [SDTCisInt<0>]>,
- [SDNPHasChain, SDNPInGlue]>;
-
-def AMDGPUsendmsghalt : SDNode<"AMDGPUISD::SENDMSGHALT",
- SDTypeProfile<0, 1, [SDTCisInt<0>]>,
- [SDNPHasChain, SDNPInGlue]>;
-
-def AMDGPUinterp_mov : SDNode<"AMDGPUISD::INTERP_MOV",
- SDTypeProfile<1, 3, [SDTCisFP<0>]>,
- [SDNPInGlue]>;
-
-def AMDGPUinterp_p1 : SDNode<"AMDGPUISD::INTERP_P1",
- SDTypeProfile<1, 3, [SDTCisFP<0>]>,
- [SDNPInGlue, SDNPOutGlue]>;
-
-def AMDGPUinterp_p2 : SDNode<"AMDGPUISD::INTERP_P2",
- SDTypeProfile<1, 4, [SDTCisFP<0>]>,
- [SDNPInGlue]>;
-
def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
SDTypeProfile<1, 7, [SDTCisFP<0>]>,
[SDNPInGlue, SDNPOutGlue]>;
@@ -425,3 +395,65 @@ def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
>;
+
+
+//===----------------------------------------------------------------------===//
+// Intrinsic/Custom node compatibility PatFrags
+//===----------------------------------------------------------------------===//
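+// These wrappers let an instruction pattern match either the plain
+// llvm.amdgcn.* intrinsic or the equivalent custom AMDGPUISD node, so the
+// same pattern does not need to be written twice.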
+
+def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src),
+ (AMDGPUrcp_impl node:$src)]>;
+def AMDGPUrcp_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rcp_legacy node:$src),
+ (AMDGPUrcp_legacy_impl node:$src)]>;
+
+def AMDGPUrsq_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rsq_legacy node:$src),
+ (AMDGPUrsq_legacy_impl node:$src)]>;
+
+def AMDGPUrsq : PatFrags<(ops node:$src), [(int_amdgcn_rsq node:$src),
+ (AMDGPUrsq_impl node:$src)]>;
+
+def AMDGPUrsq_clamp : PatFrags<(ops node:$src), [(int_amdgcn_rsq_clamp node:$src),
+ (AMDGPUrsq_clamp_impl node:$src)]>;
+
+def AMDGPUsin : PatFrags<(ops node:$src), [(int_amdgcn_sin node:$src),
+ (AMDGPUsin_impl node:$src)]>;
+def AMDGPUcos : PatFrags<(ops node:$src), [(int_amdgcn_cos node:$src),
+ (AMDGPUcos_impl node:$src)]>;
+def AMDGPUfract : PatFrags<(ops node:$src), [(int_amdgcn_fract node:$src),
+ (AMDGPUfract_impl node:$src)]>;
+
+def AMDGPUldexp : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_ldexp node:$src0, node:$src1),
+ (AMDGPUldexp_impl node:$src0, node:$src1)]>;
+
+def AMDGPUfp_class : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_class node:$src0, node:$src1),
+ (AMDGPUfp_class_impl node:$src0, node:$src1)]>;
+
+def AMDGPUfmed3 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(int_amdgcn_fmed3 node:$src0, node:$src1, node:$src2),
+ (AMDGPUfmed3_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUffbh_i32 : PatFrags<(ops node:$src),
+ [(int_amdgcn_sffbh node:$src),
+ (AMDGPUffbh_i32_impl node:$src)]>;
+
+def AMDGPUpkrtz_f16_f32 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_cvt_pkrtz node:$src0, node:$src1),
+ (AMDGPUpkrtz_f16_f32_impl node:$src0, node:$src1)]>;
+
+def AMDGPUpknorm_i16_f32 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_cvt_pknorm_i16 node:$src0, node:$src1),
+ (AMDGPUpknorm_i16_f32_impl node:$src0, node:$src1)]>;
+
+def AMDGPUpknorm_u16_f32 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_cvt_pknorm_u16 node:$src0, node:$src1),
+ (AMDGPUpknorm_u16_f32_impl node:$src0, node:$src1)]>;
+
+def AMDGPUpk_i16_i32 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_cvt_pk_i16 node:$src0, node:$src1),
+ (AMDGPUpk_i16_i32_impl node:$src0, node:$src1)]>;
+
+def AMDGPUpk_u16_u32 : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_cvt_pk_u16 node:$src0, node:$src1),
+ (AMDGPUpk_u16_u32_impl node:$src0, node:$src1)]>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 901a2eaa8829..3cfa9d57ec46 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -19,8 +19,10 @@
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -61,8 +63,14 @@ AMDGPUInstructionSelector::AMDGPUInstructionSelector(
const char *AMDGPUInstructionSelector::getName() { return DEBUG_TYPE; }
+void AMDGPUInstructionSelector::setupMF(MachineFunction &MF, GISelKnownBits &KB,
+ CodeGenCoverage &CoverageInfo) {
+ MRI = &MF.getRegInfo();
+ InstructionSelector::setupMF(MF, KB, CoverageInfo);
+}
+
static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return Reg == AMDGPU::SCC;
auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
@@ -71,7 +79,9 @@ static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
if (RC) {
// FIXME: This is ambiguous for wave32. This could be SCC or VCC, but the
// context of the register bank has been lost.
- if (RC->getID() != AMDGPU::SReg_32_XM0RegClassID)
+    // As a hack, getRegClassForSizeOnBank uses exactly SGPR_32RegClass, which
+    // won't ever be constrained any further.
+ if (RC != &AMDGPU::SGPR_32RegClass)
return false;
const LLT Ty = MRI.getType(Reg);
return Ty.isValid() && Ty.getSizeInBits() == 1;
@@ -83,7 +93,7 @@ static bool isSCC(Register Reg, const MachineRegisterInfo &MRI) {
bool AMDGPUInstructionSelector::isVCC(Register Reg,
const MachineRegisterInfo &MRI) const {
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return Reg == TRI.getVCC();
auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
@@ -102,8 +112,6 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
I.setDesc(TII.get(TargetOpcode::COPY));
const MachineOperand &Src = I.getOperand(1);
@@ -111,33 +119,33 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
Register DstReg = Dst.getReg();
Register SrcReg = Src.getReg();
- if (isVCC(DstReg, MRI)) {
+ if (isVCC(DstReg, *MRI)) {
if (SrcReg == AMDGPU::SCC) {
const TargetRegisterClass *RC
- = TRI.getConstrainedRegClassForOperand(Dst, MRI);
+ = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
if (!RC)
return true;
- return RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
}
- if (!isVCC(SrcReg, MRI)) {
+ if (!isVCC(SrcReg, *MRI)) {
// TODO: Should probably leave the copy and let copyPhysReg expand it.
- if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), MRI))
+ if (!RBI.constrainGenericRegister(DstReg, *TRI.getBoolRC(), *MRI))
return false;
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_CMP_NE_U32_e64), DstReg)
.addImm(0)
.addReg(SrcReg);
- if (!MRI.getRegClassOrNull(SrcReg))
- MRI.setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, MRI));
+ if (!MRI->getRegClassOrNull(SrcReg))
+ MRI->setRegClass(SrcReg, TRI.getConstrainedRegClassForOperand(Src, *MRI));
I.eraseFromParent();
return true;
}
const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(Dst, MRI);
- if (RC && !RBI.constrainGenericRegister(DstReg, *RC, MRI))
+ TRI.getConstrainedRegClassForOperand(Dst, *MRI);
+ if (RC && !RBI.constrainGenericRegister(DstReg, *RC, *MRI))
return false;
// Don't constrain the source register to a class so the def instruction
@@ -148,8 +156,8 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
// with size 1. An SReg_32 with size 1 is ambiguous with wave32.
if (Src.isUndef()) {
const TargetRegisterClass *SrcRC =
- TRI.getConstrainedRegClassForOperand(Src, MRI);
- if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
+ TRI.getConstrainedRegClassForOperand(Src, *MRI);
+ if (SrcRC && !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
return false;
}
@@ -157,30 +165,26 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
}
for (const MachineOperand &MO : I.operands()) {
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ if (Register::isPhysicalRegister(MO.getReg()))
continue;
const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, MRI);
+ TRI.getConstrainedRegClassForOperand(MO, *MRI);
if (!RC)
continue;
- RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
}
return true;
}
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
const Register DefReg = I.getOperand(0).getReg();
- const LLT DefTy = MRI.getType(DefReg);
+ const LLT DefTy = MRI->getType(DefReg);
// TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
const RegClassOrRegBank &RegClassOrBank =
- MRI.getRegClassOrRegBank(DefReg);
+ MRI->getRegClassOrRegBank(DefReg);
const TargetRegisterClass *DefRC
= RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
@@ -196,7 +200,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
return false;
}
- DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, MRI);
+ DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB, *MRI);
if (!DefRC) {
LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
return false;
@@ -204,7 +208,7 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
}
I.setDesc(TII.get(TargetOpcode::PHI));
- return RBI.constrainGenericRegister(DefReg, *DefRC, MRI);
+ return RBI.constrainGenericRegister(DefReg, *DefRC, *MRI);
}
MachineOperand
@@ -214,13 +218,11 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
MachineInstr *MI = MO.getParent();
MachineBasicBlock *BB = MO.getParent()->getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- Register DstReg = MRI.createVirtualRegister(&SubRC);
+ Register DstReg = MRI->createVirtualRegister(&SubRC);
if (MO.isReg()) {
unsigned ComposedSubIdx = TRI.composeSubRegIndices(MO.getSubReg(), SubIdx);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
BuildMI(*BB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), DstReg)
.addReg(Reg, 0, ComposedSubIdx);
@@ -244,10 +246,6 @@ AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
}
}
-static int64_t getConstant(const MachineInstr *MI) {
- return MI->getOperand(1).getCImm()->getSExtValue();
-}
-
static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
switch (Opc) {
case AMDGPU::G_AND:
@@ -262,16 +260,13 @@ static unsigned getLogicalBitOpcode(unsigned Opc, bool Is64) {
}
bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineOperand &Dst = I.getOperand(0);
MachineOperand &Src0 = I.getOperand(1);
MachineOperand &Src1 = I.getOperand(2);
Register DstReg = Dst.getReg();
- unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
- const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
if (DstRB->getID() == AMDGPU::VCCRegBankID) {
const TargetRegisterClass *RC = TRI.getBoolRC();
unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
@@ -282,12 +277,12 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
// The selector for G_ICMP relies on seeing the register bank for the result
// is VCC. In wave32 if we constrain the registers to SReg_32 here, it will
// be ambiguous whether it's a scalar or vector bool.
- if (Src0.isUndef() && !MRI.getRegClassOrNull(Src0.getReg()))
- MRI.setRegClass(Src0.getReg(), RC);
- if (Src1.isUndef() && !MRI.getRegClassOrNull(Src1.getReg()))
- MRI.setRegClass(Src1.getReg(), RC);
+ if (Src0.isUndef() && !MRI->getRegClassOrNull(Src0.getReg()))
+ MRI->setRegClass(Src0.getReg(), RC);
+ if (Src1.isUndef() && !MRI->getRegClassOrNull(Src1.getReg()))
+ MRI->setRegClass(Src1.getReg(), RC);
- return RBI.constrainGenericRegister(DstReg, *RC, MRI);
+ return RBI.constrainGenericRegister(DstReg, *RC, *MRI);
}
// TODO: Should this allow an SCC bank result, and produce a copy from SCC for
@@ -295,14 +290,7 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
if (DstRB->getID() == AMDGPU::SGPRRegBankID) {
unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(), Size > 32);
I.setDesc(TII.get(InstOpc));
-
- const TargetRegisterClass *RC
- = TRI.getConstrainedRegClassForOperand(Dst, MRI);
- if (!RC)
- return false;
- return RBI.constrainGenericRegister(DstReg, *RC, MRI) &&
- RBI.constrainGenericRegister(Src0.getReg(), *RC, MRI) &&
- RBI.constrainGenericRegister(Src1.getReg(), *RC, MRI);
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
return false;
@@ -311,11 +299,10 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
Register DstReg = I.getOperand(0).getReg();
const DebugLoc &DL = I.getDebugLoc();
- unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
- const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
@@ -340,7 +327,7 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
const unsigned Opc = Sub ? AMDGPU::V_SUB_I32_e64 : AMDGPU::V_ADD_I32_e64;
- Register UnusedCarry = MRI.createVirtualRegister(TRI.getWaveMaskRegClass());
+ Register UnusedCarry = MRI->createVirtualRegister(TRI.getWaveMaskRegClass());
MachineInstr *Add
= BuildMI(*BB, &I, DL, TII.get(Opc), DstReg)
.addDef(UnusedCarry, RegState::Dead)
@@ -363,8 +350,8 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
MachineOperand Hi1(getSubOperand64(I.getOperand(1), HalfRC, AMDGPU::sub1));
MachineOperand Hi2(getSubOperand64(I.getOperand(2), HalfRC, AMDGPU::sub1));
- Register DstLo = MRI.createVirtualRegister(&HalfRC);
- Register DstHi = MRI.createVirtualRegister(&HalfRC);
+ Register DstLo = MRI->createVirtualRegister(&HalfRC);
+ Register DstHi = MRI->createVirtualRegister(&HalfRC);
if (IsSALU) {
BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
@@ -375,14 +362,14 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
.add(Hi2);
} else {
const TargetRegisterClass *CarryRC = TRI.getWaveMaskRegClass();
- Register CarryReg = MRI.createVirtualRegister(CarryRC);
+ Register CarryReg = MRI->createVirtualRegister(CarryRC);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADD_I32_e64), DstLo)
.addDef(CarryReg)
.add(Lo1)
.add(Lo2)
.addImm(0);
MachineInstr *Addc = BuildMI(*BB, &I, DL, TII.get(AMDGPU::V_ADDC_U32_e64), DstHi)
- .addDef(MRI.createVirtualRegister(CarryRC), RegState::Dead)
+ .addDef(MRI->createVirtualRegister(CarryRC), RegState::Dead)
.add(Hi1)
.add(Hi2)
.addReg(CarryReg, RegState::Kill)
@@ -399,19 +386,61 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
.addImm(AMDGPU::sub1);
- if (!RBI.constrainGenericRegister(DstReg, RC, MRI))
+ if (!RBI.constrainGenericRegister(DstReg, RC, *MRI))
return false;
I.eraseFromParent();
return true;
}
-bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_UADDO_USUBO(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineFunction *MF = BB->getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- assert(I.getOperand(2).getImm() % 32 == 0);
- unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(2).getImm() / 32);
+ const DebugLoc &DL = I.getDebugLoc();
+ Register Dst0Reg = I.getOperand(0).getReg();
+ Register Dst1Reg = I.getOperand(1).getReg();
+ const bool IsAdd = I.getOpcode() == AMDGPU::G_UADDO;
+
+ if (!isSCC(Dst1Reg, MRI)) {
+    // The names of the opcodes are misleading. v_add_i32/v_sub_i32 have an
+    // unsigned carry out despite the _i32 name. These were renamed in VI to _U32.
+ // FIXME: We should probably rename the opcodes here.
+ unsigned NewOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ I.setDesc(TII.get(NewOpc));
+ I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+ I.addOperand(*MF, MachineOperand::CreateImm(0));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+
+ Register Src0Reg = I.getOperand(2).getReg();
+ Register Src1Reg = I.getOperand(3).getReg();
+ unsigned NewOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), Dst0Reg)
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), Dst1Reg)
+ .addReg(AMDGPU::SCC);
+
+ if (!MRI.getRegClassOrNull(Dst1Reg))
+ MRI.setRegClass(Dst1Reg, &AMDGPU::SReg_32RegClass);
+
+ if (!RBI.constrainGenericRegister(Dst0Reg, AMDGPU::SReg_32RegClass, MRI) ||
+ !RBI.constrainGenericRegister(Src0Reg, AMDGPU::SReg_32RegClass, MRI) ||
+ !RBI.constrainGenericRegister(Src1Reg, AMDGPU::SReg_32RegClass, MRI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ unsigned Offset = I.getOperand(2).getImm();
+ if (Offset % 32 != 0)
+ return false;
+
+ unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32);
const DebugLoc &DL = I.getDebugLoc();
MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY),
I.getOperand(0).getReg())
@@ -419,10 +448,10 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
for (const MachineOperand &MO : Copy->operands()) {
const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, MRI);
+ TRI.getConstrainedRegClassForOperand(MO, *MRI);
if (!RC)
continue;
- RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
+ RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
}
I.eraseFromParent();
return true;
@@ -430,21 +459,19 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
MachineBasicBlock *BB = MI.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
Register DstReg = MI.getOperand(0).getReg();
- LLT DstTy = MRI.getType(DstReg);
- LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ LLT DstTy = MRI->getType(DstReg);
+ LLT SrcTy = MRI->getType(MI.getOperand(1).getReg());
const unsigned SrcSize = SrcTy.getSizeInBits();
if (SrcSize < 32)
return false;
const DebugLoc &DL = MI.getDebugLoc();
- const RegisterBank *DstBank = RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
const unsigned DstSize = DstTy.getSizeInBits();
const TargetRegisterClass *DstRC =
- TRI.getRegClassForSizeOnBank(DstSize, *DstBank, MRI);
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
if (!DstRC)
return false;
@@ -457,12 +484,12 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
MIB.addImm(SubRegs[I]);
const TargetRegisterClass *SrcRC
- = TRI.getConstrainedRegClassForOperand(Src, MRI);
- if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, MRI))
+ = TRI.getConstrainedRegClassForOperand(Src, *MRI);
+ if (SrcRC && !RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI))
return false;
}
- if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI))
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
return false;
MI.eraseFromParent();
@@ -471,25 +498,23 @@ bool AMDGPUInstructionSelector::selectG_MERGE_VALUES(MachineInstr &MI) const {
bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
MachineBasicBlock *BB = MI.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
const int NumDst = MI.getNumOperands() - 1;
MachineOperand &Src = MI.getOperand(NumDst);
Register SrcReg = Src.getReg();
Register DstReg0 = MI.getOperand(0).getReg();
- LLT DstTy = MRI.getType(DstReg0);
- LLT SrcTy = MRI.getType(SrcReg);
+ LLT DstTy = MRI->getType(DstReg0);
+ LLT SrcTy = MRI->getType(SrcReg);
const unsigned DstSize = DstTy.getSizeInBits();
const unsigned SrcSize = SrcTy.getSizeInBits();
const DebugLoc &DL = MI.getDebugLoc();
- const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);
+ const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
const TargetRegisterClass *SrcRC =
- TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, MRI);
- if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI))
+ TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
+ if (!SrcRC || !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
return false;
const unsigned SrcFlags = getUndefRegState(Src.isUndef());
@@ -504,8 +529,8 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
.addReg(SrcReg, SrcFlags, SubRegs[I]);
const TargetRegisterClass *DstRC =
- TRI.getConstrainedRegClassForOperand(Dst, MRI);
- if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, MRI))
+ TRI.getConstrainedRegClassForOperand(Dst, *MRI);
+ if (DstRC && !RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI))
return false;
}
@@ -518,16 +543,13 @@ bool AMDGPUInstructionSelector::selectG_GEP(MachineInstr &I) const {
}
bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
const MachineOperand &MO = I.getOperand(0);
// FIXME: Interface for getConstrainedRegClassForOperand needs work. The
// regbank check here is to know why getConstrainedRegClassForOperand failed.
- const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, MRI);
- if ((!RC && !MRI.getRegBankOrNull(MO.getReg())) ||
- (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, MRI))) {
+ const TargetRegisterClass *RC = TRI.getConstrainedRegClassForOperand(MO, *MRI);
+ if ((!RC && !MRI->getRegBankOrNull(MO.getReg())) ||
+ (RC && RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI))) {
I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF));
return true;
}
@@ -537,44 +559,62 @@ bool AMDGPUInstructionSelector::selectG_IMPLICIT_DEF(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned SubReg = TRI.getSubRegFromChannel(I.getOperand(3).getImm() / 32);
- DebugLoc DL = I.getDebugLoc();
- MachineInstr *Ins = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG))
- .addDef(I.getOperand(0).getReg())
- .addReg(I.getOperand(1).getReg())
- .addReg(I.getOperand(2).getReg())
- .addImm(SubReg);
-
- for (const MachineOperand &MO : Ins->operands()) {
- if (!MO.isReg())
- continue;
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
- continue;
- const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, MRI);
- if (!RC)
- continue;
- RBI.constrainGenericRegister(MO.getReg(), *RC, MRI);
- }
+ Register DstReg = I.getOperand(0).getReg();
+ Register Src0Reg = I.getOperand(1).getReg();
+ Register Src1Reg = I.getOperand(2).getReg();
+ LLT Src1Ty = MRI->getType(Src1Reg);
+
+ unsigned DstSize = MRI->getType(DstReg).getSizeInBits();
+ unsigned InsSize = Src1Ty.getSizeInBits();
+
+ int64_t Offset = I.getOperand(3).getImm();
+ if (Offset % 32 != 0)
+ return false;
+
+ unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
+ if (SubReg == AMDGPU::NoSubRegister)
+ return false;
+
+ const RegisterBank *DstBank = RBI.getRegBank(DstReg, *MRI, TRI);
+ const TargetRegisterClass *DstRC =
+ TRI.getRegClassForSizeOnBank(DstSize, *DstBank, *MRI);
+ if (!DstRC)
+ return false;
+
+ const RegisterBank *Src0Bank = RBI.getRegBank(Src0Reg, *MRI, TRI);
+ const RegisterBank *Src1Bank = RBI.getRegBank(Src1Reg, *MRI, TRI);
+ const TargetRegisterClass *Src0RC =
+ TRI.getRegClassForSizeOnBank(DstSize, *Src0Bank, *MRI);
+ const TargetRegisterClass *Src1RC =
+ TRI.getRegClassForSizeOnBank(InsSize, *Src1Bank, *MRI);
+
+ // Deal with weird cases where the class only partially supports the subreg
+ // index.
+ Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
+ if (!Src0RC)
+ return false;
+
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
+ !RBI.constrainGenericRegister(Src0Reg, *Src0RC, *MRI) ||
+ !RBI.constrainGenericRegister(Src1Reg, *Src1RC, *MRI))
+ return false;
+
+ const DebugLoc &DL = I.getDebugLoc();
+ BuildMI(*BB, &I, DL, TII.get(TargetOpcode::INSERT_SUBREG), DstReg)
+ .addReg(Src0Reg)
+ .addReg(Src1Reg)
+ .addImm(SubReg);
+
I.eraseFromParent();
return true;
}
-bool AMDGPUInstructionSelector::selectG_INTRINSIC(
- MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
- unsigned IntrinsicID = I.getOperand(I.getNumExplicitDefs()).getIntrinsicID();
+bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
+ unsigned IntrinsicID = I.getIntrinsicID();
switch (IntrinsicID) {
- case Intrinsic::maxnum:
- case Intrinsic::minnum:
- case Intrinsic::amdgcn_cvt_pkrtz:
- return selectImpl(I, CoverageInfo);
case Intrinsic::amdgcn_if_break: {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
    // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
// SelectionDAG uses for wave32 vs wave64.
@@ -589,15 +629,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(
I.eraseFromParent();
- for (Register Reg : { DstReg, Src0Reg, Src1Reg }) {
- if (!MRI.getRegClassOrNull(Reg))
- MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
- }
+ for (Register Reg : { DstReg, Src0Reg, Src1Reg })
+ MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
return true;
}
default:
- return selectImpl(I, CoverageInfo);
+ return selectImpl(I, *CoverageInfo);
}
}
@@ -677,17 +715,15 @@ int AMDGPUInstructionSelector::getS_CMPOpcode(CmpInst::Predicate P,
bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = I.getDebugLoc();
- unsigned SrcReg = I.getOperand(2).getReg();
- unsigned Size = RBI.getSizeInBits(SrcReg, MRI, TRI);
+ Register SrcReg = I.getOperand(2).getReg();
+ unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
auto Pred = (CmpInst::Predicate)I.getOperand(1).getPredicate();
- unsigned CCReg = I.getOperand(0).getReg();
- if (isSCC(CCReg, MRI)) {
+ Register CCReg = I.getOperand(0).getReg();
+ if (isSCC(CCReg, *MRI)) {
int Opcode = getS_CMPOpcode(Pred, Size);
if (Opcode == -1)
return false;
@@ -698,7 +734,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
.addReg(AMDGPU::SCC);
bool Ret =
constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI) &&
- RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, MRI);
+ RBI.constrainGenericRegister(CCReg, AMDGPU::SReg_32RegClass, *MRI);
I.eraseFromParent();
return Ret;
}
@@ -712,7 +748,7 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
.add(I.getOperand(2))
.add(I.getOperand(3));
RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(),
- *TRI.getBoolRC(), MRI);
+ *TRI.getBoolRC(), *MRI);
bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
I.eraseFromParent();
return Ret;
@@ -736,19 +772,273 @@ buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
.addImm(Enabled);
}
+static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
+ int64_t C;
+ if (mi_match(Reg, MRI, m_ICst(C)) && C == 0)
+ return true;
+
+ // FIXME: matcher should ignore copies
+ return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0;
+}
+
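+// The buffer store intrinsics pack their cache-policy bits into a single
+// auxiliary immediate; the helpers below pull out glc (bit 0), slc (bit 1),
+// dlc (bit 2) and swz (bit 3).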
+static unsigned extractGLC(unsigned AuxiliaryData) {
+ return AuxiliaryData & 1;
+}
+
+static unsigned extractSLC(unsigned AuxiliaryData) {
+ return (AuxiliaryData >> 1) & 1;
+}
+
+static unsigned extractDLC(unsigned AuxiliaryData) {
+ return (AuxiliaryData >> 2) & 1;
+}
+
+static unsigned extractSWZ(unsigned AuxiliaryData) {
+ return (AuxiliaryData >> 3) & 1;
+}
+
+// Returns Base register, constant offset, and offset def point.
+static std::tuple<Register, unsigned, MachineInstr *>
+getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
+ MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ if (!Def)
+ return std::make_tuple(Reg, 0, nullptr);
+
+ if (Def->getOpcode() == AMDGPU::G_CONSTANT) {
+ unsigned Offset;
+ const MachineOperand &Op = Def->getOperand(1);
+ if (Op.isImm())
+ Offset = Op.getImm();
+ else
+ Offset = Op.getCImm()->getZExtValue();
+
+ return std::make_tuple(Register(), Offset, Def);
+ }
+
+ int64_t Offset;
+ if (Def->getOpcode() == AMDGPU::G_ADD) {
+ // TODO: Handle G_OR used for add case
+ if (mi_match(Def->getOperand(1).getReg(), MRI, m_ICst(Offset)))
+ return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def);
+
+ // FIXME: matcher should ignore copies
+ if (mi_match(Def->getOperand(1).getReg(), MRI, m_Copy(m_ICst(Offset))))
+ return std::make_tuple(Def->getOperand(0).getReg(), Offset, Def);
+ }
+
+ return std::make_tuple(Reg, 0, Def);
+}
+
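+// Pick the MUBUF opcode for a plain (non-format) buffer store: byte/short
+// variants for sub-dword memory accesses, otherwise a dword store widened to
+// the value type's size in dwords; Offen selects the variant with a voffset.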
+static unsigned getBufferStoreOpcode(LLT Ty,
+ const unsigned MemSize,
+ const bool Offen) {
+ const int Size = Ty.getSizeInBits();
+ switch (8 * MemSize) {
+ case 8:
+ return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
+ case 16:
+ return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
+ default:
+ unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
+ if (Size > 32)
+ Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
+ return Opc;
+ }
+}
+
+static unsigned getBufferStoreFormatOpcode(LLT Ty,
+ const unsigned MemSize,
+ const bool Offen) {
+ bool IsD16Packed = Ty.getScalarSizeInBits() == 16;
+ bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits();
+ int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
+
+ if (IsD16Packed) {
+ switch (NumElts) {
+ case 1:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
+ case 2:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact;
+ case 3:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact;
+ case 4:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact;
+ default:
+ return -1;
+ }
+ }
+
+ if (IsD16Unpacked) {
+ switch (NumElts) {
+ case 1:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
+ case 2:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact;
+ case 3:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact;
+ case 4:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact;
+ default:
+ return -1;
+ }
+ }
+
+ switch (NumElts) {
+ case 1:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact;
+ case 2:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact;
+ case 3:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact;
+ case 4:
+ return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact;
+ default:
+ return -1;
+ }
+
+ llvm_unreachable("unhandled buffer store");
+}
+
+// TODO: Move this to combiner
+// Returns base register, imm offset, total constant offset.
+std::tuple<Register, unsigned, unsigned>
+AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B,
+ Register OrigOffset) const {
+ const unsigned MaxImm = 4095;
+ Register BaseReg;
+ unsigned TotalConstOffset;
+ MachineInstr *OffsetDef;
+
+ std::tie(BaseReg, TotalConstOffset, OffsetDef)
+ = getBaseWithConstantOffset(*MRI, OrigOffset);
+
+ unsigned ImmOffset = TotalConstOffset;
+
+ // If the immediate value is too big for the immoffset field, put the value
+ // and -4096 into the immoffset field so that the value that is copied/added
+ // for the voffset field is a multiple of 4096, and it stands more chance
+  // of being CSEd with the copy/add for another similar load/store.
+ // However, do not do that rounding down to a multiple of 4096 if that is a
+ // negative number, as it appears to be illegal to have a negative offset
+ // in the vgpr, even if adding the immediate offset makes it positive.
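+  // For example, a total constant offset of 8200 becomes ImmOffset = 8 with an
+  // overflow of 8192 folded into the base/voffset register.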
+ unsigned Overflow = ImmOffset & ~MaxImm;
+ ImmOffset -= Overflow;
+ if ((int32_t)Overflow < 0) {
+ Overflow += ImmOffset;
+ ImmOffset = 0;
+ }
+
+ if (Overflow != 0) {
+ // In case this is in a waterfall loop, insert offset code at the def point
+ // of the offset, not inside the loop.
+ MachineBasicBlock::iterator OldInsPt = B.getInsertPt();
+ MachineBasicBlock &OldMBB = B.getMBB();
+ B.setInstr(*OffsetDef);
+
+ if (!BaseReg) {
+ BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ B.buildInstr(AMDGPU::V_MOV_B32_e32)
+ .addDef(BaseReg)
+ .addImm(Overflow);
+ } else {
+ Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ B.buildInstr(AMDGPU::V_MOV_B32_e32)
+ .addDef(OverflowVal)
+ .addImm(Overflow);
+
+ Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg)
+ .addReg(BaseReg)
+ .addReg(OverflowVal, RegState::Kill)
+ .addImm(0);
+ BaseReg = NewBaseReg;
+ }
+
+ B.setInsertPt(OldMBB, OldInsPt);
+ }
+
+ return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
+}
+
+bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI,
+ bool IsFormat) const {
+ MachineIRBuilder B(MI);
+ MachineFunction &MF = B.getMF();
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI->getType(VData);
+
+ int Size = Ty.getSizeInBits();
+ if (Size % 32 != 0)
+ return false;
+
+ // FIXME: Verifier should enforce 1 MMO for these intrinsics.
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ const int MemSize = MMO->getSize();
+
+ Register RSrc = MI.getOperand(2).getReg();
+ Register VOffset = MI.getOperand(3).getReg();
+ Register SOffset = MI.getOperand(4).getReg();
+ unsigned AuxiliaryData = MI.getOperand(5).getImm();
+ unsigned ImmOffset;
+ unsigned TotalOffset;
+
+ std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+ if (TotalOffset != 0)
+ MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize);
+
+ const bool Offen = !isZero(VOffset, *MRI);
+
+ int Opc = IsFormat ? getBufferStoreFormatOpcode(Ty, MemSize, Offen) :
+ getBufferStoreOpcode(Ty, MemSize, Offen);
+ if (Opc == -1)
+ return false;
+
+ MachineInstrBuilder MIB = B.buildInstr(Opc)
+ .addUse(VData);
+
+ if (Offen)
+ MIB.addUse(VOffset);
+
+ MIB.addUse(RSrc)
+ .addUse(SOffset)
+ .addImm(ImmOffset)
+ .addImm(extractGLC(AuxiliaryData))
+ .addImm(extractSLC(AuxiliaryData))
+ .addImm(0) // tfe: FIXME: Remove from inst
+ .addImm(extractDLC(AuxiliaryData))
+ .addImm(extractSWZ(AuxiliaryData))
+ .addMemOperand(MMO);
+
+ MI.eraseFromParent();
+
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
- MachineInstr &I, CodeGenCoverage &CoverageInfo) const {
+ MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- unsigned IntrinsicID = I.getOperand(0).getIntrinsicID();
+ unsigned IntrinsicID = I.getIntrinsicID();
switch (IntrinsicID) {
case Intrinsic::amdgcn_exp: {
- int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
- int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
- int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(7).getReg()));
- int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(8).getReg()));
+ int64_t Tgt = I.getOperand(1).getImm();
+ int64_t Enabled = I.getOperand(2).getImm();
+ int64_t Done = I.getOperand(7).getImm();
+ int64_t VM = I.getOperand(8).getImm();
MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
I.getOperand(4).getReg(),
@@ -761,13 +1051,13 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
}
case Intrinsic::amdgcn_exp_compr: {
const DebugLoc &DL = I.getDebugLoc();
- int64_t Tgt = getConstant(MRI.getVRegDef(I.getOperand(1).getReg()));
- int64_t Enabled = getConstant(MRI.getVRegDef(I.getOperand(2).getReg()));
- unsigned Reg0 = I.getOperand(3).getReg();
- unsigned Reg1 = I.getOperand(4).getReg();
- unsigned Undef = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- int64_t Done = getConstant(MRI.getVRegDef(I.getOperand(5).getReg()));
- int64_t VM = getConstant(MRI.getVRegDef(I.getOperand(6).getReg()));
+ int64_t Tgt = I.getOperand(1).getImm();
+ int64_t Enabled = I.getOperand(2).getImm();
+ Register Reg0 = I.getOperand(3).getReg();
+ Register Reg1 = I.getOperand(4).getReg();
+ Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ int64_t Done = I.getOperand(5).getImm();
+ int64_t VM = I.getOperand(6).getImm();
BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
@@ -786,27 +1076,29 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
Register Reg = I.getOperand(1).getReg();
I.eraseFromParent();
- if (!MRI.getRegClassOrNull(Reg))
- MRI.setRegClass(Reg, TRI.getWaveMaskRegClass());
+ if (!MRI->getRegClassOrNull(Reg))
+ MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
return true;
}
+ case Intrinsic::amdgcn_raw_buffer_store:
+ return selectStoreIntrinsic(I, false);
+ case Intrinsic::amdgcn_raw_buffer_store_format:
+ return selectStoreIntrinsic(I, true);
default:
- return selectImpl(I, CoverageInfo);
+ return selectImpl(I, *CoverageInfo);
}
}
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
const DebugLoc &DL = I.getDebugLoc();
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned Size = RBI.getSizeInBits(DstReg, MRI, TRI);
+ Register DstReg = I.getOperand(0).getReg();
+ unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
assert(Size <= 32 || Size == 64);
const MachineOperand &CCOp = I.getOperand(1);
- unsigned CCReg = CCOp.getReg();
- if (isSCC(CCReg, MRI)) {
+ Register CCReg = CCOp.getReg();
+ if (isSCC(CCReg, *MRI)) {
unsigned SelectOpcode = Size == 64 ? AMDGPU::S_CSELECT_B64 :
AMDGPU::S_CSELECT_B32;
MachineInstr *CopySCC = BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
@@ -815,8 +1107,8 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
// The generic constrainSelectedInstRegOperands doesn't work for the scc register
// bank, because it does not cover the register class that we used to represent
// for it. So we need to manually set the register class here.
- if (!MRI.getRegClassOrNull(CCReg))
- MRI.setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, MRI));
+ if (!MRI->getRegClassOrNull(CCReg))
+ MRI->setRegClass(CCReg, TRI.getConstrainedRegClassForOperand(CCOp, *MRI));
MachineInstr *Select = BuildMI(*BB, &I, DL, TII.get(SelectOpcode), DstReg)
.add(I.getOperand(2))
.add(I.getOperand(3));
@@ -845,52 +1137,8 @@ bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
}
bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
- DebugLoc DL = I.getDebugLoc();
- unsigned PtrSize = RBI.getSizeInBits(I.getOperand(1).getReg(), MRI, TRI);
- if (PtrSize != 64) {
- LLVM_DEBUG(dbgs() << "Unhandled address space\n");
- return false;
- }
-
- unsigned StoreSize = RBI.getSizeInBits(I.getOperand(0).getReg(), MRI, TRI);
- unsigned Opcode;
-
- // FIXME: Remove this when integers > s32 naturally selected.
- switch (StoreSize) {
- default:
- return false;
- case 32:
- Opcode = AMDGPU::FLAT_STORE_DWORD;
- break;
- case 64:
- Opcode = AMDGPU::FLAT_STORE_DWORDX2;
- break;
- case 96:
- Opcode = AMDGPU::FLAT_STORE_DWORDX3;
- break;
- case 128:
- Opcode = AMDGPU::FLAT_STORE_DWORDX4;
- break;
- }
-
- MachineInstr *Flat = BuildMI(*BB, &I, DL, TII.get(Opcode))
- .add(I.getOperand(1))
- .add(I.getOperand(0))
- .addImm(0) // offset
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0); // dlc
-
-
- // Now that we selected an opcode, we need to constrain the register
- // operands to use appropriate classes.
- bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI);
-
- I.eraseFromParent();
- return Ret;
+ initM0(I);
+ return selectImpl(I, *CoverageInfo);
}
static int sizeToSubRegIndex(unsigned Size) {
@@ -915,19 +1163,15 @@ static int sizeToSubRegIndex(unsigned Size) {
}
bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
- unsigned DstReg = I.getOperand(0).getReg();
- unsigned SrcReg = I.getOperand(1).getReg();
- const LLT DstTy = MRI.getType(DstReg);
- const LLT SrcTy = MRI.getType(SrcReg);
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+ const LLT DstTy = MRI->getType(DstReg);
+ const LLT SrcTy = MRI->getType(SrcReg);
if (!DstTy.isScalar())
return false;
- const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
- const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, MRI, TRI);
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
if (SrcRB != DstRB)
return false;
@@ -935,9 +1179,9 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
unsigned SrcSize = SrcTy.getSizeInBits();
const TargetRegisterClass *SrcRC
- = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, MRI);
+ = TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
const TargetRegisterClass *DstRC
- = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, MRI);
+ = TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
if (SrcSize > 32) {
int SubRegIdx = sizeToSubRegIndex(DstSize);
@@ -953,8 +1197,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
I.getOperand(1).setSubReg(SubRegIdx);
}
- if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
- !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
return false;
}
@@ -974,20 +1218,18 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock &MBB = *I.getParent();
- MachineFunction &MF = *MBB.getParent();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const unsigned DstReg = I.getOperand(0).getReg();
- const unsigned SrcReg = I.getOperand(1).getReg();
+ const Register DstReg = I.getOperand(0).getReg();
+ const Register SrcReg = I.getOperand(1).getReg();
- const LLT DstTy = MRI.getType(DstReg);
- const LLT SrcTy = MRI.getType(SrcReg);
+ const LLT DstTy = MRI->getType(DstReg);
+ const LLT SrcTy = MRI->getType(SrcReg);
const LLT S1 = LLT::scalar(1);
const unsigned SrcSize = SrcTy.getSizeInBits();
const unsigned DstSize = DstTy.getSizeInBits();
if (!DstTy.isScalar())
return false;
- const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, MRI, TRI);
+ const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
if (SrcBank->getID() == AMDGPU::SCCRegBankID) {
if (SrcTy != S1 || DstSize > 64) // Invalid
@@ -1000,7 +1242,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
// FIXME: Create an extra copy to avoid incorrectly constraining the result
// of the scc producer.
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), TmpReg)
.addReg(SrcReg);
BuildMI(MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
@@ -1010,7 +1252,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
BuildMI(MBB, I, DL, TII.get(Opcode), DstReg)
.addImm(0)
.addImm(Signed ? -1 : 1);
- return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
}
if (SrcBank->getID() == AMDGPU::VCCRegBankID && DstSize <= 32) {
@@ -1024,6 +1267,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
.addImm(0) // src1_modifiers
.addImm(Signed ? -1 : 1) // src1
.addUse(SrcReg);
+ I.eraseFromParent();
return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
}
@@ -1040,6 +1284,7 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
BuildMI(MBB, I, DL, TII.get(AMDGPU::V_AND_B32_e32), DstReg)
.addImm(Mask)
.addReg(SrcReg);
+ I.eraseFromParent();
return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
}
@@ -1049,11 +1294,12 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
.addReg(SrcReg)
.addImm(0) // Offset
.addImm(SrcSize); // Width
+ I.eraseFromParent();
return constrainSelectedInstRegOperands(*ExtI, TII, TRI, RBI);
}
if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
- if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, MRI))
+ if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
return false;
if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
@@ -1061,7 +1307,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
AMDGPU::S_SEXT_I32_I8 : AMDGPU::S_SEXT_I32_I16;
BuildMI(MBB, I, DL, TII.get(SextOpc), DstReg)
.addReg(SrcReg);
- return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}
const unsigned BFE64 = Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64;
@@ -1070,10 +1317,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
// Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
if (DstSize > 32 && SrcSize <= 32) {
// We need a 64-bit register source, but the high bits don't matter.
- unsigned ExtReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned UndefReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
.addReg(SrcReg)
@@ -1085,7 +1330,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
.addReg(ExtReg)
.addImm(SrcSize << 16);
- return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, MRI);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_64RegClass, *MRI);
}
unsigned Mask;
@@ -1099,16 +1345,58 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
.addImm(SrcSize << 16);
}
- return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, MRI);
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, AMDGPU::SReg_32RegClass, *MRI);
}
return false;
}
+static int64_t getFPTrueImmVal(unsigned Size, bool Signed) {
+ switch (Size) {
+ case 16:
+ return Signed ? 0xBC00 : 0x3C00;
+ case 32:
+ return Signed ? 0xbf800000 : 0x3f800000;
+ case 64:
+ return Signed ? 0xbff0000000000000 : 0x3ff0000000000000;
+ default:
+ llvm_unreachable("Invalid FP type size");
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_SITOFP_UITOFP(MachineInstr &I) const {
+ MachineBasicBlock *MBB = I.getParent();
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ Register Src = I.getOperand(1).getReg();
+ if (!isSCC(Src, MRI))
+ return selectImpl(I, *CoverageInfo);
+
+ bool Signed = I.getOpcode() == AMDGPU::G_SITOFP;
+ Register DstReg = I.getOperand(0).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ const unsigned DstSize = DstTy.getSizeInBits();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), AMDGPU::SCC)
+ .addReg(Src);
+
+ unsigned NewOpc =
+ DstSize > 32 ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+ auto MIB = BuildMI(*MBB, I, DL, TII.get(NewOpc), DstReg)
+ .addImm(0)
+ .addImm(getFPTrueImmVal(DstSize, Signed));
+
+ if (!constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+}
+
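For reference, the immediates returned by getFPTrueImmVal above are simply the IEEE-754 bit patterns of +1.0 and -1.0 for f16/f32/f64, so the S_CSELECT emitted by selectG_SITOFP_UITOFP materializes 0.0 for a false boolean and +/-1.0 for a true one (a signed i1 true is -1, hence the negative pattern for G_SITOFP). A minimal standalone check of that assumption, plain C++ and not part of this patch:

#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  // f32: 0x3f800000 is 1.0f and 0xbf800000 is -1.0f.
  float F;
  const uint32_t F32One = 0x3f800000u, F32NegOne = 0xbf800000u;
  std::memcpy(&F, &F32One, sizeof(F));
  assert(F == 1.0f);
  std::memcpy(&F, &F32NegOne, sizeof(F));
  assert(F == -1.0f);

  // f64: 0x3ff0000000000000 is 1.0 and 0xbff0000000000000 is -1.0.
  double D;
  const uint64_t F64One = 0x3ff0000000000000ull, F64NegOne = 0xbff0000000000000ull;
  std::memcpy(&D, &F64One, sizeof(D));
  assert(D == 1.0);
  std::memcpy(&D, &F64NegOne, sizeof(D));
  assert(D == -1.0);

  // f16: 0x3C00 / 0xBC00 are the half-precision encodings of +/-1.0
  // (sign bit, biased exponent 15, zero mantissa); no native C++ type here.
  return 0;
}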
bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineOperand &ImmOp = I.getOperand(1);
// The AMDGPU backend only supports Imm operands and not CImm or FPImm.
@@ -1119,15 +1407,15 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
ImmOp.ChangeToImmediate(ImmOp.getCImm()->getZExtValue());
}
- unsigned DstReg = I.getOperand(0).getReg();
+ Register DstReg = I.getOperand(0).getReg();
unsigned Size;
bool IsSgpr;
- const RegisterBank *RB = MRI.getRegBankOrNull(I.getOperand(0).getReg());
+ const RegisterBank *RB = MRI->getRegBankOrNull(I.getOperand(0).getReg());
if (RB) {
IsSgpr = RB->getID() == AMDGPU::SGPRRegBankID;
- Size = MRI.getType(DstReg).getSizeInBits();
+ Size = MRI->getType(DstReg).getSizeInBits();
} else {
- const TargetRegisterClass *RC = TRI.getRegClassForReg(MRI, DstReg);
+ const TargetRegisterClass *RC = TRI.getRegClassForReg(*MRI, DstReg);
IsSgpr = TRI.isSGPRClass(RC);
Size = TRI.getRegSizeInBits(*RC);
}
@@ -1142,34 +1430,41 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- DebugLoc DL = I.getDebugLoc();
- const TargetRegisterClass *RC = IsSgpr ? &AMDGPU::SReg_32_XM0RegClass :
- &AMDGPU::VGPR_32RegClass;
- unsigned LoReg = MRI.createVirtualRegister(RC);
- unsigned HiReg = MRI.createVirtualRegister(RC);
- const APInt &Imm = APInt(Size, I.getOperand(1).getImm());
-
- BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
- .addImm(Imm.trunc(32).getZExtValue());
+ const DebugLoc &DL = I.getDebugLoc();
- BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
- .addImm(Imm.ashr(32).getZExtValue());
+ APInt Imm(Size, I.getOperand(1).getImm());
- const MachineInstr *RS =
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(LoReg)
- .addImm(AMDGPU::sub0)
- .addReg(HiReg)
- .addImm(AMDGPU::sub1);
+ MachineInstr *ResInst;
+ if (IsSgpr && TII.isInlineConstant(Imm)) {
+ ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_MOV_B64), DstReg)
+ .addImm(I.getOperand(1).getImm());
+ } else {
+ const TargetRegisterClass *RC = IsSgpr ?
+ &AMDGPU::SReg_32RegClass : &AMDGPU::VGPR_32RegClass;
+ Register LoReg = MRI->createVirtualRegister(RC);
+ Register HiReg = MRI->createVirtualRegister(RC);
+
+ BuildMI(*BB, &I, DL, TII.get(Opcode), LoReg)
+ .addImm(Imm.trunc(32).getZExtValue());
+
+ BuildMI(*BB, &I, DL, TII.get(Opcode), HiReg)
+ .addImm(Imm.ashr(32).getZExtValue());
+
+ ResInst = BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+ }
// We can't call constrainSelectedInstRegOperands here, because it doesn't
// work for target independent opcodes
I.eraseFromParent();
const TargetRegisterClass *DstRC =
- TRI.getConstrainedRegClassForOperand(RS->getOperand(0), MRI);
+ TRI.getConstrainedRegClassForOperand(ResInst->getOperand(0), *MRI);
if (!DstRC)
return true;
- return RBI.constrainGenericRegister(DstReg, *DstRC, MRI);
+ return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
}
static bool isConstant(const MachineInstr &MI) {
@@ -1188,13 +1483,13 @@ void AMDGPUInstructionSelector::getAddrModeInfo(const MachineInstr &Load,
GEPInfo GEPInfo(*PtrMI);
- for (unsigned i = 1, e = 3; i < e; ++i) {
+ for (unsigned i = 1; i != 3; ++i) {
const MachineOperand &GEPOp = PtrMI->getOperand(i);
const MachineInstr *OpDef = MRI.getUniqueVRegDef(GEPOp.getReg());
assert(OpDef);
- if (isConstant(*OpDef)) {
- // FIXME: Is it possible to have multiple Imm parts? Maybe if we
- // are lacking other optimizations.
+ if (i == 2 && isConstant(*OpDef)) {
+ // TODO: Could handle constant base + variable offset, but a combine
+ // probably should have commuted it.
assert(GEPInfo.Imm == 0);
GEPInfo.Imm = OpDef->getOperand(1).getCImm()->getSExtValue();
continue;
@@ -1240,16 +1535,26 @@ bool AMDGPUInstructionSelector::hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const {
return false;
}
-bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const {
- // TODO: Can/should we insert m0 initialization here for DS instructions and
- // call the normal selector?
- return false;
+void AMDGPUInstructionSelector::initM0(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+
+ const LLT PtrTy = MRI->getType(I.getOperand(1).getReg());
+ unsigned AS = PtrTy.getAddressSpace();
+ if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) &&
+ STI.ldsRequiresM0Init()) {
+    // If DS instructions require M0 initialization, insert it before selecting.
+ BuildMI(*BB, &I, I.getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addImm(-1);
+ }
+}
+
+bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
+ initM0(I);
+ return selectImpl(I, *CoverageInfo);
}
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineOperand &CondOp = I.getOperand(0);
Register CondReg = CondOp.getReg();
const DebugLoc &DL = I.getDebugLoc();
@@ -1263,11 +1568,12 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
// GlobalISel, we should push that decision into RegBankSelect. Assume for now
// RegBankSelect knows what it's doing if the branch condition is scc, even
// though it currently does not.
- if (isSCC(CondReg, MRI)) {
+ if (isSCC(CondReg, *MRI)) {
CondPhysReg = AMDGPU::SCC;
BrOpcode = AMDGPU::S_CBRANCH_SCC1;
- ConstrainRC = &AMDGPU::SReg_32_XM0RegClass;
- } else if (isVCC(CondReg, MRI)) {
+ // FIXME: Hack for isSCC tests
+ ConstrainRC = &AMDGPU::SGPR_32RegClass;
+ } else if (isVCC(CondReg, *MRI)) {
// FIXME: Do we have to insert an and with exec here, like in SelectionDAG?
// We sort of know, based on the register bank, that a VCC producer ands
// inactive lanes with 0. What if there was a logical operation with vcc
@@ -1279,8 +1585,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
} else
return false;
- if (!MRI.getRegClassOrNull(CondReg))
- MRI.setRegClass(CondReg, ConstrainRC);
+ if (!MRI->getRegClassOrNull(CondReg))
+ MRI->setRegClass(CondReg, ConstrainRC);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), CondPhysReg)
.addReg(CondReg);
@@ -1292,27 +1598,83 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
}
bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- MachineFunction *MF = BB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
-
Register DstReg = I.getOperand(0).getReg();
- const RegisterBank *DstRB = RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
I.setDesc(TII.get(IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32));
if (IsVGPR)
I.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
return RBI.constrainGenericRegister(
- DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, MRI);
+ DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
+}
+
+bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const {
+ uint64_t Align = I.getOperand(2).getImm();
+ const uint64_t Mask = ~((UINT64_C(1) << Align) - 1);
+
+ MachineBasicBlock *BB = I.getParent();
+
+ Register DstReg = I.getOperand(0).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
+ const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
+ unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+ const TargetRegisterClass &RegRC
+ = IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
+
+ LLT Ty = MRI->getType(DstReg);
+
+ const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
+ *MRI);
+ const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
+ *MRI);
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
+ !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
+ return false;
+
+ const DebugLoc &DL = I.getDebugLoc();
+ Register ImmReg = MRI->createVirtualRegister(&RegRC);
+ BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg)
+ .addImm(Mask);
+
+ if (Ty.getSizeInBits() == 32) {
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
+ .addReg(SrcReg)
+ .addReg(ImmReg);
+ I.eraseFromParent();
+ return true;
+ }
+
+ Register HiReg = MRI->createVirtualRegister(&RegRC);
+ Register LoReg = MRI->createVirtualRegister(&RegRC);
+ Register MaskLo = MRI->createVirtualRegister(&RegRC);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(SrcReg, 0, AMDGPU::sub0);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(SrcReg, 0, AMDGPU::sub1);
+
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo)
+ .addReg(LoReg)
+ .addReg(ImmReg);
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
+ .addReg(MaskLo)
+ .addImm(AMDGPU::sub0)
+ .addReg(HiReg)
+ .addImm(AMDGPU::sub1);
+ I.eraseFromParent();
+ return true;
}
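The mask that selectG_PTR_MASK builds above is the usual alignment mask: operand 2 of G_PTR_MASK is the number of low pointer bits to clear, and ~((1 << N) - 1) clears exactly those bits before the AND (split into lo/hi halves for 64-bit pointers). A standalone sketch of that arithmetic, not part of this patch:

#include <cassert>
#include <cstdint>

// Clear the low 'NumBits' bits of a pointer-sized value, matching the
// mask computed in selectG_PTR_MASK.
static uint64_t ptrMask(unsigned NumBits) {
  return ~((UINT64_C(1) << NumBits) - 1);
}

int main() {
  assert(ptrMask(4) == 0xFFFFFFFFFFFFFFF0ull);        // 16-byte alignment
  assert((0x1234567fu & ptrMask(4)) == 0x12345670u);  // low 4 bits cleared
  assert(ptrMask(0) == ~UINT64_C(0));                 // nothing cleared
  return 0;
}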
-bool AMDGPUInstructionSelector::select(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const {
+bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
- if (!isPreISelGenericOpcode(I.getOpcode())) {
+ if (!I.isPreISelOpcode()) {
if (I.isCopy())
return selectCOPY(I);
return true;
@@ -1324,16 +1686,18 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_XOR:
if (selectG_AND_OR_XOR(I))
return true;
- return selectImpl(I, CoverageInfo);
+ return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_ADD:
case TargetOpcode::G_SUB:
- if (selectG_ADD_SUB(I))
+ if (selectImpl(I, *CoverageInfo))
return true;
- LLVM_FALLTHROUGH;
- default:
- return selectImpl(I, CoverageInfo);
+ return selectG_ADD_SUB(I);
+ case TargetOpcode::G_UADDO:
+ case TargetOpcode::G_USUBO:
+ return selectG_UADDO_USUBO(I);
case TargetOpcode::G_INTTOPTR:
case TargetOpcode::G_BITCAST:
+ case TargetOpcode::G_PTRTOINT:
return selectCOPY(I);
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
@@ -1353,32 +1717,40 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
case TargetOpcode::G_INSERT:
return selectG_INSERT(I);
case TargetOpcode::G_INTRINSIC:
- return selectG_INTRINSIC(I, CoverageInfo);
+ return selectG_INTRINSIC(I);
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
- return selectG_INTRINSIC_W_SIDE_EFFECTS(I, CoverageInfo);
+ return selectG_INTRINSIC_W_SIDE_EFFECTS(I);
case TargetOpcode::G_ICMP:
if (selectG_ICMP(I))
return true;
- return selectImpl(I, CoverageInfo);
+ return selectImpl(I, *CoverageInfo);
case TargetOpcode::G_LOAD:
- return selectImpl(I, CoverageInfo);
+ case TargetOpcode::G_ATOMIC_CMPXCHG:
+ case TargetOpcode::G_ATOMICRMW_XCHG:
+ case TargetOpcode::G_ATOMICRMW_ADD:
+ case TargetOpcode::G_ATOMICRMW_SUB:
+ case TargetOpcode::G_ATOMICRMW_AND:
+ case TargetOpcode::G_ATOMICRMW_OR:
+ case TargetOpcode::G_ATOMICRMW_XOR:
+ case TargetOpcode::G_ATOMICRMW_MIN:
+ case TargetOpcode::G_ATOMICRMW_MAX:
+ case TargetOpcode::G_ATOMICRMW_UMIN:
+ case TargetOpcode::G_ATOMICRMW_UMAX:
+ case TargetOpcode::G_ATOMICRMW_FADD:
+ return selectG_LOAD_ATOMICRMW(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
case TargetOpcode::G_STORE:
- if (selectImpl(I, CoverageInfo))
- return true;
return selectG_STORE(I);
case TargetOpcode::G_TRUNC:
return selectG_TRUNC(I);
case TargetOpcode::G_SEXT:
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_ANYEXT:
- if (selectG_SZA_EXT(I)) {
- I.eraseFromParent();
- return true;
- }
-
- return false;
+ return selectG_SZA_EXT(I);
+ case TargetOpcode::G_SITOFP:
+ case TargetOpcode::G_UITOFP:
+ return selectG_SITOFP_UITOFP(I);
case TargetOpcode::G_BRCOND:
return selectG_BRCOND(I);
case TargetOpcode::G_FRAME_INDEX:
@@ -1388,6 +1760,10 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I,
// is checking for G_CONSTANT
I.setDesc(TII.get(AMDGPU::ATOMIC_FENCE));
return true;
+ case TargetOpcode::G_PTR_MASK:
+ return selectG_PTR_MASK(I);
+ default:
+ return selectImpl(I, *CoverageInfo);
}
return false;
}
@@ -1402,14 +1778,14 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
std::pair<Register, unsigned>
AMDGPUInstructionSelector::selectVOP3ModsImpl(
- Register Src, const MachineRegisterInfo &MRI) const {
+ Register Src) const {
unsigned Mods = 0;
- MachineInstr *MI = MRI.getVRegDef(Src);
+ MachineInstr *MI = MRI->getVRegDef(Src);
if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
Src = MI->getOperand(1).getReg();
Mods |= SISrcMods::NEG;
- MI = MRI.getVRegDef(Src);
+ MI = MRI->getVRegDef(Src);
}
if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
@@ -1432,12 +1808,23 @@ AMDGPUInstructionSelector::selectVSRC0(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
- MachineRegisterInfo &MRI
- = Root.getParent()->getParent()->getParent()->getRegInfo();
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }, // src0_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
+ }};
+}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1446,6 +1833,7 @@ AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // omod
}};
}
+
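selectVOP3ModsImpl above peels an outer G_FNEG and then a G_FABS off the source register and records them as source-modifier bits instead of leaving them as separate instructions. A simplified standalone model of that folding, using its own node type and modifier enum purely for illustration (these are not the LLVM data structures):

#include <cassert>

enum Mods { NEG = 1, ABS = 2 };

struct Expr {
  enum Kind { Leaf, FNeg, FAbs } K;
  const Expr *Op = nullptr;
};

// Strip an outer fneg, then an fabs, accumulating modifier bits; mirrors the
// order of the checks in selectVOP3ModsImpl.
static const Expr *foldMods(const Expr *E, unsigned &ModsOut) {
  ModsOut = 0;
  if (E->K == Expr::FNeg) { ModsOut |= NEG; E = E->Op; }
  if (E->K == Expr::FAbs) { ModsOut |= ABS; E = E->Op; }
  return E;
}

int main() {
  Expr X{Expr::Leaf}, A{Expr::FAbs, &X}, N{Expr::FNeg, &A};
  unsigned M;
  const Expr *Src = foldMods(&N, &M);
  assert(Src == &X && M == (NEG | ABS)); // fneg(fabs(x)) -> x with NEG|ABS
  return 0;
}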
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
return {{
@@ -1457,12 +1845,9 @@ AMDGPUInstructionSelector::selectVOP3OMods(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
- MachineRegisterInfo &MRI
- = Root.getParent()->getParent()->getParent()->getRegInfo();
-
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg(), MRI);
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1471,12 +1856,28 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
- MachineRegisterInfo &MRI =
- Root.getParent()->getParent()->getParent()->getRegInfo();
+AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
+ // FIXME: Handle clamp and op_sel
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp
+ }};
+}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const {
+ // FIXME: Handle op_sel
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // src_mods
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
SmallVector<GEPInfo, 4> AddrInfo;
- getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
+ getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
return None;
@@ -1496,11 +1897,8 @@ AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
- MachineRegisterInfo &MRI =
- Root.getParent()->getParent()->getParent()->getRegInfo();
-
SmallVector<GEPInfo, 4> AddrInfo;
- getAddrModeInfo(*Root.getParent(), MRI, AddrInfo);
+ getAddrModeInfo(*Root.getParent(), *MRI, AddrInfo);
if (AddrInfo.empty() || AddrInfo[0].SgprParts.size() != 1)
return None;
@@ -1521,10 +1919,9 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
SmallVector<GEPInfo, 4> AddrInfo;
- getAddrModeInfo(*MI, MRI, AddrInfo);
+ getAddrModeInfo(*MI, *MRI, AddrInfo);
// FIXME: We should shrink the GEP if the offset is known to fit in 32 bits;
// then we can select all ptr + 32-bit offsets, not just immediate offsets.
@@ -1540,7 +1937,7 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
// failed trying to select this load into one of the _IMM variants since
// the _IMM Patterns are considered before the _SGPR patterns.
unsigned PtrReg = GEPInfo.SgprParts[0];
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(GEPInfo.Imm);
return {{
@@ -1553,8 +1950,6 @@ template <bool Signed>
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
- MachineBasicBlock *MBB = MI->getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
InstructionSelector::ComplexRendererFns Default = {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
@@ -1565,12 +1960,12 @@ AMDGPUInstructionSelector::selectFlatOffsetImpl(MachineOperand &Root) const {
if (!STI.hasFlatInstOffsets())
return Default;
- const MachineInstr *OpDef = MRI.getVRegDef(Root.getReg());
+ const MachineInstr *OpDef = MRI->getVRegDef(Root.getReg());
if (!OpDef || OpDef->getOpcode() != AMDGPU::G_GEP)
return Default;
Optional<int64_t> Offset =
- getConstantVRegVal(OpDef->getOperand(2).getReg(), MRI);
+ getConstantVRegVal(OpDef->getOperand(2).getReg(), *MRI);
if (!Offset.hasValue())
return Default;
@@ -1597,12 +1992,6 @@ AMDGPUInstructionSelector::selectFlatOffsetSigned(MachineOperand &Root) const {
return selectFlatOffsetImpl<true>(Root);
}
-// FIXME: Implement
-static bool signBitIsZero(const MachineOperand &Op,
- const MachineRegisterInfo &MRI) {
- return false;
-}
-
static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
auto PSV = PtrInfo.V.dyn_cast<const PseudoSourceValue *>();
return PSV && PSV->isStack();
@@ -1613,12 +2002,11 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
int64_t Offset = 0;
- if (mi_match(Root.getReg(), MRI, m_ICst(Offset))) {
- Register HighBits = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
+ Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// TODO: Should this be inside the render function? The iterator seems to
// move.
@@ -1652,18 +2040,18 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
// offsets.
Optional<int> FI;
Register VAddr = Root.getReg();
- if (const MachineInstr *RootDef = MRI.getVRegDef(Root.getReg())) {
- if (isBaseWithConstantOffset(Root, MRI)) {
+ if (const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg())) {
+ if (isBaseWithConstantOffset(Root, *MRI)) {
const MachineOperand &LHS = RootDef->getOperand(1);
const MachineOperand &RHS = RootDef->getOperand(2);
- const MachineInstr *LHSDef = MRI.getVRegDef(LHS.getReg());
- const MachineInstr *RHSDef = MRI.getVRegDef(RHS.getReg());
+ const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
+ const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
if (LHSDef && RHSDef) {
int64_t PossibleOffset =
RHSDef->getOperand(1).getCImm()->getSExtValue();
if (SIInstrInfo::isLegalMUBUFImmOffset(PossibleOffset) &&
(!STI.privateMemoryResourceIsRangeChecked() ||
- signBitIsZero(LHS, MRI))) {
+ KnownBits->signBitIsZero(LHS.getReg()))) {
if (LHSDef->getOpcode() == AMDGPU::G_FRAME_INDEX)
FI = LHSDef->getOperand(1).getIndex();
else
@@ -1700,15 +2088,30 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
}}};
}
+bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
+ const MachineOperand &Base,
+ int64_t Offset,
+ unsigned OffsetBits) const {
+ if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
+ (OffsetBits == 8 && !isUInt<8>(Offset)))
+ return false;
+
+ if (STI.hasUsableDSOffset() || STI.unsafeDSOffsetFoldingEnabled())
+ return true;
+
+  // On Southern Islands, instructions with a negative base value and an offset
+  // don't seem to work.
+ return KnownBits->signBitIsZero(Base.getReg());
+}
+
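isDSOffsetLegal above rejects offsets that do not fit in the unsigned 16-bit (or 8-bit) DS offset field, and otherwise only needs the known-non-negative-base check on targets where a negative base is a problem. A standalone sketch of the range check, using plain C++ rather than the LLVM isUInt helpers:

#include <cassert>
#include <cstdint>

// True if Offset is representable as an unsigned OffsetBits-bit immediate.
static bool fitsUnsigned(int64_t Offset, unsigned OffsetBits) {
  if (Offset < 0)
    return false;
  return static_cast<uint64_t>(Offset) < (UINT64_C(1) << OffsetBits);
}

int main() {
  assert(fitsUnsigned(65535, 16));                    // max 16-bit DS offset
  assert(!fitsUnsigned(65536, 16));
  assert(!fitsUnsigned(-4, 16));                      // negative offsets not encodable
  assert(fitsUnsigned(255, 8) && !fitsUnsigned(256, 8));
  return 0;
}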
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectMUBUFScratchOffset(
MachineOperand &Root) const {
MachineInstr *MI = Root.getParent();
MachineBasicBlock *MBB = MI->getParent();
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
int64_t Offset = 0;
- if (!mi_match(Root.getReg(), MRI, m_ICst(Offset)) ||
+ if (!mi_match(Root.getReg(), *MRI, m_ICst(Offset)) ||
!SIInstrInfo::isLegalMUBUFImmOffset(Offset))
return {};
@@ -1728,3 +2131,54 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
[=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
}};
}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
+ const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
+ if (!RootDef) {
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
+ }};
+ }
+
+ int64_t ConstAddr = 0;
+ if (isBaseWithConstantOffset(Root, *MRI)) {
+ const MachineOperand &LHS = RootDef->getOperand(1);
+ const MachineOperand &RHS = RootDef->getOperand(2);
+ const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
+ const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
+ if (LHSDef && RHSDef) {
+ int64_t PossibleOffset =
+ RHSDef->getOperand(1).getCImm()->getSExtValue();
+ if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) {
+ // (add n0, c0)
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); }
+ }};
+ }
+ }
+ } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
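+    // TODO: Not handled yet; falls through to the default (base, offset 0) below.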
+
+ } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
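+    // TODO: Constant addresses are not handled yet; the default case below is used.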
+
+ }
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
+ }};
+}
+
+void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
+ const MachineInstr &MI) const {
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI);
+ assert(CstVal && "Expected constant value");
+ MIB.addImm(CstVal.getValue());
+}
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 4f489ddfb23d..d3c83a6a872a 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -35,6 +35,7 @@ class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
class GCNSubtarget;
class MachineInstr;
+class MachineIRBuilder;
class MachineOperand;
class MachineRegisterInfo;
class SIInstrInfo;
@@ -42,14 +43,20 @@ class SIMachineFunctionInfo;
class SIRegisterInfo;
class AMDGPUInstructionSelector : public InstructionSelector {
+private:
+ MachineRegisterInfo *MRI;
+
public:
AMDGPUInstructionSelector(const GCNSubtarget &STI,
const AMDGPURegisterBankInfo &RBI,
const AMDGPUTargetMachine &TM);
- bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ bool select(MachineInstr &I) override;
static const char *getName();
+ void setupMF(MachineFunction &MF, GISelKnownBits &KB,
+ CodeGenCoverage &CoverageInfo) override;
+
private:
struct GEPInfo {
const MachineInstr &GEP;
@@ -72,32 +79,42 @@ private:
bool selectPHI(MachineInstr &I) const;
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;
+ bool selectG_SITOFP_UITOFP(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
bool selectG_AND_OR_XOR(MachineInstr &I) const;
bool selectG_ADD_SUB(MachineInstr &I) const;
+ bool selectG_UADDO_USUBO(MachineInstr &I) const;
bool selectG_EXTRACT(MachineInstr &I) const;
bool selectG_MERGE_VALUES(MachineInstr &I) const;
bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
bool selectG_GEP(MachineInstr &I) const;
bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
bool selectG_INSERT(MachineInstr &I) const;
- bool selectG_INTRINSIC(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
- bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I,
- CodeGenCoverage &CoverageInfo) const;
+ bool selectG_INTRINSIC(MachineInstr &I) const;
+
+ std::tuple<Register, unsigned, unsigned>
+ splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
+
+ bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const;
+
+ bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const;
int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const;
bool selectG_ICMP(MachineInstr &I) const;
bool hasVgprParts(ArrayRef<GEPInfo> AddrInfo) const;
void getAddrModeInfo(const MachineInstr &Load, const MachineRegisterInfo &MRI,
SmallVectorImpl<GEPInfo> &AddrInfo) const;
bool selectSMRD(MachineInstr &I, ArrayRef<GEPInfo> AddrInfo) const;
- bool selectG_LOAD(MachineInstr &I) const;
- bool selectG_SELECT(MachineInstr &I) const;
+
+ void initM0(MachineInstr &I) const;
+ bool selectG_LOAD_ATOMICRMW(MachineInstr &I) const;
bool selectG_STORE(MachineInstr &I) const;
+ bool selectG_SELECT(MachineInstr &I) const;
bool selectG_BRCOND(MachineInstr &I) const;
bool selectG_FRAME_INDEX(MachineInstr &I) const;
+ bool selectG_PTR_MASK(MachineInstr &I) const;
std::pair<Register, unsigned>
- selectVOP3ModsImpl(Register Src, const MachineRegisterInfo &MRI) const;
+ selectVOP3ModsImpl(Register Src) const;
InstructionSelector::ComplexRendererFns
selectVCSRC(MachineOperand &Root) const;
@@ -108,11 +125,18 @@ private:
InstructionSelector::ComplexRendererFns
selectVOP3Mods0(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectVOP3Mods0Clamp0OMod(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
selectVOP3OMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVOP3Mods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
+ selectVOP3OpSelMods0(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectVOP3OpSelMods(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
selectSmrdImm(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectSmrdImm32(MachineOperand &Root) const;
@@ -133,6 +157,16 @@ private:
InstructionSelector::ComplexRendererFns
selectMUBUFScratchOffset(MachineOperand &Root) const;
+ bool isDSOffsetLegal(const MachineRegisterInfo &MRI,
+ const MachineOperand &Base,
+ int64_t Offset, unsigned OffsetBits) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectDS1Addr1Offset(MachineOperand &Root) const;
+
+ void renderTruncImm32(MachineInstrBuilder &MIB,
+ const MachineInstr &MI) const;
+
const SIInstrInfo &TII;
const SIRegisterInfo &TRI;
const AMDGPURegisterBankInfo &RBI;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 61bc415c839d..846e7f577a28 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -75,7 +75,7 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
let isCodeGenOnly = 1;
}
-def TruePredicate : Predicate<"true">;
+def TruePredicate : Predicate<"">;
class PredicateControl {
Predicate SubtargetPredicate = TruePredicate;
@@ -220,80 +220,48 @@ def hi_f16_elt : PatLeaf<
// PatLeafs for floating-point comparisons
//===----------------------------------------------------------------------===//
-def COND_OEQ : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETOEQ || N->get() == ISD::SETEQ;}]
->;
-
-def COND_ONE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETONE || N->get() == ISD::SETNE;}]
->;
-
-def COND_OGT : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETOGT || N->get() == ISD::SETGT;}]
->;
-
-def COND_OGE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETOGE || N->get() == ISD::SETGE;}]
->;
-
-def COND_OLT : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETOLT || N->get() == ISD::SETLT;}]
->;
-
-def COND_OLE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETOLE || N->get() == ISD::SETLE;}]
->;
-
-def COND_O : PatLeaf <(cond), [{return N->get() == ISD::SETO;}]>;
-def COND_UO : PatLeaf <(cond), [{return N->get() == ISD::SETUO;}]>;
+def COND_OEQ : PatFrags<(ops), [(OtherVT SETOEQ), (OtherVT SETEQ)]>;
+def COND_ONE : PatFrags<(ops), [(OtherVT SETONE), (OtherVT SETNE)]>;
+def COND_OGT : PatFrags<(ops), [(OtherVT SETOGT), (OtherVT SETGT)]>;
+def COND_OGE : PatFrags<(ops), [(OtherVT SETOGE), (OtherVT SETGE)]>;
+def COND_OLT : PatFrags<(ops), [(OtherVT SETOLT), (OtherVT SETLT)]>;
+def COND_OLE : PatFrags<(ops), [(OtherVT SETOLE), (OtherVT SETLE)]>;
+def COND_O : PatFrags<(ops), [(OtherVT SETO)]>;
+def COND_UO : PatFrags<(ops), [(OtherVT SETUO)]>;
//===----------------------------------------------------------------------===//
// PatLeafs for unsigned / unordered comparisons
//===----------------------------------------------------------------------===//
-def COND_UEQ : PatLeaf <(cond), [{return N->get() == ISD::SETUEQ;}]>;
-def COND_UNE : PatLeaf <(cond), [{return N->get() == ISD::SETUNE;}]>;
-def COND_UGT : PatLeaf <(cond), [{return N->get() == ISD::SETUGT;}]>;
-def COND_UGE : PatLeaf <(cond), [{return N->get() == ISD::SETUGE;}]>;
-def COND_ULT : PatLeaf <(cond), [{return N->get() == ISD::SETULT;}]>;
-def COND_ULE : PatLeaf <(cond), [{return N->get() == ISD::SETULE;}]>;
+def COND_UEQ : PatFrag<(ops), (OtherVT SETUEQ)>;
+def COND_UNE : PatFrag<(ops), (OtherVT SETUNE)>;
+def COND_UGT : PatFrag<(ops), (OtherVT SETUGT)>;
+def COND_UGE : PatFrag<(ops), (OtherVT SETUGE)>;
+def COND_ULT : PatFrag<(ops), (OtherVT SETULT)>;
+def COND_ULE : PatFrag<(ops), (OtherVT SETULE)>;
// XXX - For some reason the R600 version prefers to use unordered
// for setne?
-def COND_UNE_NE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETUNE || N->get() == ISD::SETNE;}]
->;
+def COND_UNE_NE : PatFrags<(ops), [(OtherVT SETUNE), (OtherVT SETNE)]>;
//===----------------------------------------------------------------------===//
// PatLeafs for signed comparisons
//===----------------------------------------------------------------------===//
-def COND_SGT : PatLeaf <(cond), [{return N->get() == ISD::SETGT;}]>;
-def COND_SGE : PatLeaf <(cond), [{return N->get() == ISD::SETGE;}]>;
-def COND_SLT : PatLeaf <(cond), [{return N->get() == ISD::SETLT;}]>;
-def COND_SLE : PatLeaf <(cond), [{return N->get() == ISD::SETLE;}]>;
+def COND_SGT : PatFrag<(ops), (OtherVT SETGT)>;
+def COND_SGE : PatFrag<(ops), (OtherVT SETGE)>;
+def COND_SLT : PatFrag<(ops), (OtherVT SETLT)>;
+def COND_SLE : PatFrag<(ops), (OtherVT SETLE)>;
//===----------------------------------------------------------------------===//
// PatLeafs for integer equality
//===----------------------------------------------------------------------===//
-def COND_EQ : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETEQ || N->get() == ISD::SETUEQ;}]
->;
-
-def COND_NE : PatLeaf <
- (cond),
- [{return N->get() == ISD::SETNE || N->get() == ISD::SETUNE;}]
->;
+def COND_EQ : PatFrags<(ops), [(OtherVT SETEQ), (OtherVT SETUEQ)]>;
+def COND_NE : PatFrags<(ops), [(OtherVT SETNE), (OtherVT SETUNE)]>;
+// FIXME: Should not need code predicate
+//def COND_NULL : PatLeaf<(OtherVT null_frag)>;
def COND_NULL : PatLeaf <
(cond),
[{(void)N; return false;}]
@@ -335,17 +303,17 @@ def TEX_SHADOW_ARRAY : PatLeaf<
// Load/Store Pattern Fragments
//===----------------------------------------------------------------------===//
+def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
+>;
+
class AddressSpaceList<list<int> AS> {
list<int> AddrSpaces = AS;
}
-class Aligned8Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAlignment() % 8 == 0;
-}]>;
-
-class Aligned16Bytes <dag ops, dag frag> : PatFrag <ops, frag, [{
- return cast<MemSDNode>(N)->getAlignment() >= 16;
-}]>;
+class Aligned<int Bytes> {
+ int MinAlignment = Bytes;
+}
class LoadFrag <SDPatternOperator op> : PatFrag<(ops node:$ptr), (op node:$ptr)>;
@@ -502,6 +470,35 @@ defm atomic_store_#as : binary_atomic_op<atomic_store>;
} // End foreach AddrSpace
+multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
+ foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+ let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
+ defm "_"#as : binary_atomic_op<atomic_op, IsInt>;
+
+ let PredicateCode = [{return (SDValue(N, 0).use_empty());}] in {
+ defm "_"#as#"_noret" : binary_atomic_op<atomic_op, IsInt>;
+ }
+
+ let PredicateCode = [{return !(SDValue(N, 0).use_empty());}] in {
+ defm "_"#as#"_ret" : binary_atomic_op<atomic_op, IsInt>;
+ }
+ }
+ }
+}
+
+defm atomic_swap : ret_noret_binary_atomic_op<atomic_swap>;
+defm atomic_load_add : ret_noret_binary_atomic_op<atomic_load_add>;
+defm atomic_load_and : ret_noret_binary_atomic_op<atomic_load_and>;
+defm atomic_load_max : ret_noret_binary_atomic_op<atomic_load_max>;
+defm atomic_load_min : ret_noret_binary_atomic_op<atomic_load_min>;
+defm atomic_load_or : ret_noret_binary_atomic_op<atomic_load_or>;
+defm atomic_load_sub : ret_noret_binary_atomic_op<atomic_load_sub>;
+defm atomic_load_umax : ret_noret_binary_atomic_op<atomic_load_umax>;
+defm atomic_load_umin : ret_noret_binary_atomic_op<atomic_load_umin>;
+defm atomic_load_xor : ret_noret_binary_atomic_op<atomic_load_xor>;
+defm atomic_load_fadd : ret_noret_binary_atomic_op<atomic_load_fadd, 0>;
+
+
def store_hi16_private : StoreHi16 <truncstorei16>, PrivateAddress;
def truncstorei8_hi16_private : StoreHi16<truncstorei8>, PrivateAddress;
@@ -513,21 +510,31 @@ def store_local_hi16 : StoreHi16 <truncstorei16>, LocalAddress;
def truncstorei8_local_hi16 : StoreHi16<truncstorei8>, LocalAddress;
def atomic_store_local : LocalStore <atomic_store>;
-def load_align8_local : Aligned8Bytes <
- (ops node:$ptr), (load_local node:$ptr)
->;
-def load_align16_local : Aligned16Bytes <
- (ops node:$ptr), (load_local node:$ptr)
->;
+def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+ let MinAlignment = 8;
+}
-def store_align8_local : Aligned8Bytes <
- (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
->;
+def load_align16_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+ let MinAlignment = 16;
+}
+
+def store_align8_local: PatFrag<(ops node:$val, node:$ptr),
+ (store_local node:$val, node:$ptr)>, Aligned<8> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
+
+def store_align16_local: PatFrag<(ops node:$val, node:$ptr),
+ (store_local node:$val, node:$ptr)>, Aligned<16> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
-def store_align16_local : Aligned16Bytes <
- (ops node:$val, node:$ptr), (store_local node:$val, node:$ptr)
->;
def atomic_store_flat : FlatStore <atomic_store>;
def truncstorei8_hi16_flat : StoreHi16<truncstorei8>, FlatStoreAddress;
@@ -547,69 +554,26 @@ class region_binary_atomic_op<SDNode atomic_op> :
}]>;
-def atomic_swap_local : local_binary_atomic_op<atomic_swap>;
-def atomic_load_add_local : local_binary_atomic_op<atomic_load_add>;
-def atomic_load_sub_local : local_binary_atomic_op<atomic_load_sub>;
-def atomic_load_and_local : local_binary_atomic_op<atomic_load_and>;
-def atomic_load_or_local : local_binary_atomic_op<atomic_load_or>;
-def atomic_load_xor_local : local_binary_atomic_op<atomic_load_xor>;
-def atomic_load_nand_local : local_binary_atomic_op<atomic_load_nand>;
-def atomic_load_min_local : local_binary_atomic_op<atomic_load_min>;
-def atomic_load_max_local : local_binary_atomic_op<atomic_load_max>;
-def atomic_load_umin_local : local_binary_atomic_op<atomic_load_umin>;
-def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
-
def mskor_global : PatFrag<(ops node:$val, node:$ptr),
(AMDGPUstore_mskor node:$val, node:$ptr), [{
return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
}]>;
-class AtomicCmpSwapLocal <SDNode cmp_swap_node> : PatFrag<
- (ops node:$ptr, node:$cmp, node:$swap),
- (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
- AtomicSDNode *AN = cast<AtomicSDNode>(N);
- return AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
-}]>;
-
-class AtomicCmpSwapRegion <SDNode cmp_swap_node> : PatFrag<
- (ops node:$ptr, node:$cmp, node:$swap),
- (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
- AtomicSDNode *AN = cast<AtomicSDNode>(N);
- return AN->getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
-}]>;
+let AddressSpaces = StoreAddress_local.AddrSpaces in {
+defm atomic_cmp_swap_local : ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_local_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
+}
-def atomic_cmp_swap_local : AtomicCmpSwapLocal <atomic_cmp_swap>;
+let AddressSpaces = StoreAddress_region.AddrSpaces in {
+defm atomic_cmp_swap_region : ternary_atomic_op<atomic_cmp_swap>;
+defm atomic_cmp_swap_region_m0 : ternary_atomic_op<atomic_cmp_swap_glue>;
+}
class global_binary_atomic_op_frag<SDNode atomic_op> : PatFrag<
(ops node:$ptr, node:$value),
(atomic_op node:$ptr, node:$value),
[{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
-multiclass global_binary_atomic_op<SDNode atomic_op> {
- def "" : global_binary_atomic_op_frag<atomic_op>;
-
- def _noret : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
-
- def _ret : PatFrag<
- (ops node:$ptr, node:$value),
- (atomic_op node:$ptr, node:$value),
- [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
-}
-
-defm atomic_swap_global : global_binary_atomic_op<atomic_swap>;
-defm atomic_add_global : global_binary_atomic_op<atomic_load_add>;
-defm atomic_and_global : global_binary_atomic_op<atomic_load_and>;
-defm atomic_max_global : global_binary_atomic_op<atomic_load_max>;
-defm atomic_min_global : global_binary_atomic_op<atomic_load_min>;
-defm atomic_or_global : global_binary_atomic_op<atomic_load_or>;
-defm atomic_sub_global : global_binary_atomic_op<atomic_load_sub>;
-defm atomic_umax_global : global_binary_atomic_op<atomic_load_umax>;
-defm atomic_umin_global : global_binary_atomic_op<atomic_load_umin>;
-defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
-
// Legacy.
def AMDGPUatomic_cmp_swap_global : PatFrag<
(ops node:$ptr, node:$value),
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 670f6225fbf7..5aba35a19ced 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -11,6 +11,13 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
+#if defined(_MSC_VER) || defined(__MINGW32__)
+// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
+// from the Visual C++ cmath / math.h headers:
+// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
+#define _USE_MATH_DEFINES
+#endif
+
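The guard above exists because MSVC's <cmath>/<math.h> only define M_PI when _USE_MATH_DEFINES is set before inclusion. A minimal standalone illustration, not part of this patch:

#define _USE_MATH_DEFINES // must precede <cmath> on MSVC to expose M_PI
#include <cmath>
#include <cstdio>

int main() {
  std::printf("M_PI = %.17g\n", M_PI);
  return 0;
}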
#include "AMDGPU.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUTargetMachine.h"
@@ -20,6 +27,7 @@
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
@@ -32,7 +40,7 @@ using namespace LegalityPredicates;
static LegalityPredicate isMultiple32(unsigned TypeIdx,
- unsigned MaxSize = 512) {
+ unsigned MaxSize = 1024) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
const LLT EltTy = Ty.getScalarType();
@@ -40,12 +48,27 @@ static LegalityPredicate isMultiple32(unsigned TypeIdx,
};
}
+static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx].getSizeInBits() == Size;
+ };
+}
+
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
return Ty.isVector() &&
Ty.getNumElements() % 2 != 0 &&
- Ty.getElementType().getSizeInBits() < 32;
+ Ty.getElementType().getSizeInBits() < 32 &&
+ Ty.getSizeInBits() % 32 != 0;
+ };
+}
+
+static LegalityPredicate isWideVec16(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ const LLT EltTy = Ty.getScalarType();
+ return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
};
}
@@ -68,6 +91,31 @@ static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
};
}
+// Increase the number of vector elements to reach the next multiple of 32-bit
+// type.
+static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+
+ const LLT EltTy = Ty.getElementType();
+ const int Size = Ty.getSizeInBits();
+ const int EltSize = EltTy.getSizeInBits();
+ const int NextMul32 = (Size + 31) / 32;
+
+ assert(EltSize < 32);
+
+ const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
+ return std::make_pair(TypeIdx, LLT::vector(NewNumElts, EltTy));
+ };
+}
+
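The arithmetic in moreEltsToNext32Bit above rounds the vector's total bit size up to the next multiple of 32 and converts that back into an element count. A standalone worked example of the same computation, not part of this patch:

#include <cassert>

// Number of elements after padding a vector of NumElts elements of EltSize
// bits (EltSize < 32) out to the next 32-bit multiple.
static int newNumElts(int NumElts, int EltSize) {
  const int Size = NumElts * EltSize;
  const int NextMul32 = (Size + 31) / 32;
  return (32 * NextMul32 + EltSize - 1) / EltSize;
}

int main() {
  assert(newNumElts(3, 16) == 4); // v3s16 (48 bits) -> v4s16 (64 bits)
  assert(newNumElts(5, 8) == 8);  // v5s8 (40 bits)  -> v8s8 (64 bits)
  assert(newNumElts(2, 16) == 2); // already a 32-bit multiple, unchanged
  return 0;
}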
+static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
+ };
+}
+
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
@@ -82,7 +130,7 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
};
}
-// Any combination of 32 or 64-bit elements up to 512 bits, and multiples of
+// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
// v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
@@ -94,7 +142,21 @@ static LegalityPredicate isRegisterType(unsigned TypeIdx) {
EltSize == 128 || EltSize == 256;
}
- return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 512;
+ return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
+ };
+}
+
+static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx].getElementType() == Type;
+ };
+}
+
+static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
+ Query.MMODescrs[0].SizeInBits < Ty.getSizeInBits();
};
}
@@ -112,9 +174,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
+ const LLT S96 = LLT::scalar(96);
const LLT S128 = LLT::scalar(128);
const LLT S256 = LLT::scalar(256);
- const LLT S512 = LLT::scalar(512);
+ const LLT S1024 = LLT::scalar(1024);
const LLT V2S16 = LLT::vector(2, 16);
const LLT V4S16 = LLT::vector(4, 16);
@@ -134,6 +197,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT V14S32 = LLT::vector(14, 32);
const LLT V15S32 = LLT::vector(15, 32);
const LLT V16S32 = LLT::vector(16, 32);
+ const LLT V32S32 = LLT::vector(32, 32);
const LLT V2S64 = LLT::vector(2, 64);
const LLT V3S64 = LLT::vector(3, 64);
@@ -142,16 +206,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT V6S64 = LLT::vector(6, 64);
const LLT V7S64 = LLT::vector(7, 64);
const LLT V8S64 = LLT::vector(8, 64);
+ const LLT V16S64 = LLT::vector(16, 64);
std::initializer_list<LLT> AllS32Vectors =
{V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
- V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32};
+ V9S32, V10S32, V11S32, V12S32, V13S32, V14S32, V15S32, V16S32, V32S32};
std::initializer_list<LLT> AllS64Vectors =
- {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64};
+ {V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
+ const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
+ const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
@@ -162,7 +229,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
const std::initializer_list<LLT> AddrSpaces32 = {
- LocalPtr, PrivatePtr
+ LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
};
const std::initializer_list<LLT> FPTypesBase = {
@@ -216,37 +283,34 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
.clampScalar(0, S32, S64)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
+ .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
.widenScalarToNextPow2(0)
.scalarize(0);
- getActionDefinitionsBuilder({G_UADDO, G_SADDO, G_USUBO, G_SSUBO,
+ getActionDefinitionsBuilder({G_UADDO, G_USUBO,
G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
.legalFor({{S32, S1}})
- .clampScalar(0, S32, S32);
+ .clampScalar(0, S32, S32)
+ .scalarize(0); // TODO: Implement.
+
+ getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
+ .lower();
getActionDefinitionsBuilder(G_BITCAST)
- .legalForCartesianProduct({S32, V2S16})
- .legalForCartesianProduct({S64, V2S32, V4S16})
- .legalForCartesianProduct({V2S64, V4S32})
// Don't worry about the size constraint.
- .legalIf(all(isPointer(0), isPointer(1)));
+ .legalIf(all(isRegisterType(0), isRegisterType(1)))
+ // FIXME: Testing hack
+ .legalForCartesianProduct({S16, LLT::vector(2, 8), });
- if (ST.has16BitInsts()) {
- getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({S32, S64, S16})
- .clampScalar(0, S16, S64);
- } else {
- getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({S32, S64})
- .clampScalar(0, S32, S64);
- }
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64, S16})
+ .clampScalar(0, S16, S64);
getActionDefinitionsBuilder(G_IMPLICIT_DEF)
- .legalFor({S1, S32, S64, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
+ .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .clampScalarOrElt(0, S32, S512)
+ .clampScalarOrElt(0, S32, S1024)
.legalIf(isMultiple32(0))
.widenScalarToNextPow2(0, 32)
.clampMaxNumElements(0, S32, 16);
@@ -256,23 +320,33 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// values may not be legal. We need to figure out how to distinguish
// between these two scenarios.
getActionDefinitionsBuilder(G_CONSTANT)
- .legalFor({S1, S32, S64, GlobalPtr,
+ .legalFor({S1, S32, S64, S16, GlobalPtr,
LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
.clampScalar(0, S32, S64)
.widenScalarToNextPow2(0)
.legalIf(isPointer(0));
setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE)
+ .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
+
auto &FPOpActions = getActionDefinitionsBuilder(
- { G_FADD, G_FMUL, G_FNEG, G_FABS, G_FMA, G_FCANONICALIZE})
+ { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
.legalFor({S32, S64});
+ auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
+ .customFor({S32, S64});
+ auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
+ .customFor({S32, S64});
if (ST.has16BitInsts()) {
if (ST.hasVOP3PInsts())
FPOpActions.legalFor({S16, V2S16});
else
FPOpActions.legalFor({S16});
+
+ TrigActions.customFor({S16});
+ FDIVActions.customFor({S16});
}
auto &MinNumMaxNum = getActionDefinitionsBuilder({
@@ -293,22 +367,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
}
- // TODO: Implement
- getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
-
if (ST.hasVOP3PInsts())
FPOpActions.clampMaxNumElements(0, S16, 2);
+
FPOpActions
.scalarize(0)
.clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+ TrigActions
+ .scalarize(0)
+ .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+
+ FDIVActions
+ .scalarize(0)
+ .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
+
+ getActionDefinitionsBuilder({G_FNEG, G_FABS})
+ .legalFor(FPTypesPK16)
+ .clampMaxNumElements(0, S16, 2)
+ .scalarize(0)
+ .clampScalar(0, S16, S64);
+
+ // TODO: Implement
+ getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
+
if (ST.has16BitInsts()) {
- getActionDefinitionsBuilder(G_FSQRT)
+ getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
.legalFor({S32, S64, S16})
.scalarize(0)
.clampScalar(0, S16, S64);
} else {
- getActionDefinitionsBuilder(G_FSQRT)
+ getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
.legalFor({S32, S64})
.scalarize(0)
.clampScalar(0, S32, S64);
@@ -334,23 +423,43 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.clampScalar(0, S32, S64);
+ // Whether this is legal depends on the floating point mode for the function.
+ auto &FMad = getActionDefinitionsBuilder(G_FMAD);
+ if (ST.hasMadF16())
+ FMad.customFor({S32, S16});
+ else
+ FMad.customFor({S32});
+ FMad.scalarize(0)
+ .lower();
+
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
.legalFor({{S64, S32}, {S32, S16}, {S64, S16},
{S32, S1}, {S64, S1}, {S16, S1},
+ {S96, S32},
// FIXME: Hack
{S64, LLT::scalar(33)},
{S32, S8}, {S128, S32}, {S128, S64}, {S32, LLT::scalar(24)}})
.scalarize(0);
- getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
- .legalFor({{S32, S32}, {S64, S32}})
+ // TODO: Split s1->s64 during regbankselect for VALU.
+ auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
+ .legalFor({{S32, S32}, {S64, S32}, {S16, S32}, {S32, S1}, {S16, S1}, {S64, S1}})
.lowerFor({{S32, S64}})
- .customFor({{S64, S64}})
- .scalarize(0);
+ .customFor({{S64, S64}});
+ if (ST.has16BitInsts())
+ IToFP.legalFor({{S16, S16}});
+ IToFP.clampScalar(1, S32, S64)
+ .scalarize(0);
+
+ auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
+ .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
+ if (ST.has16BitInsts())
+ FPToI.legalFor({{S16, S16}});
+ else
+ FPToI.minScalar(1, S32);
- getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
- .legalFor({{S32, S32}, {S32, S64}})
- .scalarize(0);
+ FPToI.minScalar(0, S32)
+ .scalarize(0);
getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
.legalFor({S32, S64})
@@ -374,6 +483,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalForCartesianProduct(AddrSpaces32, {S32})
.scalarize(0);
+ getActionDefinitionsBuilder(G_PTR_MASK)
+ .scalarize(0)
+ .alwaysLegal();
+
setAction({G_BLOCK_ADDR, CodePtr}, Legal);
auto &CmpBuilder =
@@ -415,7 +528,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(1, 32);
// TODO: Expand for > s32
- getActionDefinitionsBuilder(G_BSWAP)
+ getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
.legalFor({S32})
.clampScalar(0, S32, S32)
.scalarize(0);
@@ -491,87 +604,239 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
});
- if (ST.hasFlatAddressSpace()) {
- getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
- .scalarize(0)
- .custom();
- }
+ getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
+ .scalarize(0)
+ .custom();
// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
// handle some operations by just promoting the register during
// selection. There are also d16 loads on GFX9+ which preserve the high bits.
- getActionDefinitionsBuilder({G_LOAD, G_STORE})
- .narrowScalarIf([](const LegalityQuery &Query) {
- unsigned Size = Query.Types[0].getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- return (Size > 32 && MemSize < Size);
- },
- [](const LegalityQuery &Query) {
- return std::make_pair(0, LLT::scalar(32));
- })
- .fewerElementsIf([=](const LegalityQuery &Query) {
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- return (MemSize == 96) &&
- Query.Types[0].isVector() &&
- !ST.hasDwordx3LoadStores();
- },
- [=](const LegalityQuery &Query) {
- return std::make_pair(0, V2S32);
- })
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &Ty0 = Query.Types[0];
-
- unsigned Size = Ty0.getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- if (Size < 32 || (Size > 32 && MemSize < Size))
- return false;
-
- if (Ty0.isVector() && Size != MemSize)
- return false;
-
- // TODO: Decompose private loads into 4-byte components.
- // TODO: Illegal flat loads on SI
- switch (MemSize) {
- case 8:
- case 16:
- return Size == 32;
- case 32:
- case 64:
- case 128:
- return true;
+ auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
+ switch (AS) {
+ // FIXME: Private element size.
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ return 32;
+ // FIXME: Check subtarget
+ case AMDGPUAS::LOCAL_ADDRESS:
+ return ST.useDS128() ? 128 : 64;
+
+ // Treat constant and global as identical. SMRD loads are sometimes usable
+ // for global loads (ideally constant address space should be eliminated)
+ // depending on the context. Legality cannot be context dependent, but
+ // RegBankSelect can split the load as necessary depending on the pointer
+ // register bank/uniformity and if the memory is invariant or not written in
+ // a kernel.
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ return 512;
+ default:
+ return 128;
+ }
+ };
- case 96:
- return ST.hasDwordx3LoadStores();
-
- case 256:
- case 512:
- // TODO: Possibly support loads of i256 and i512 . This will require
- // adding i256 and i512 types to MVT in order for to be able to use
- // TableGen.
- // TODO: Add support for other vector types, this will require
- // defining more value mappings for the new types.
- return Ty0.isVector() && (Ty0.getScalarType().getSizeInBits() == 32 ||
- Ty0.getScalarType().getSizeInBits() == 64);
-
- default:
- return false;
- }
- })
- .clampScalar(0, S32, S64);
+ const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
+ const LLT DstTy = Query.Types[0];
+
+ // Split vector extloads.
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
+ return true;
+
+ const LLT PtrTy = Query.Types[1];
+ unsigned AS = PtrTy.getAddressSpace();
+ if (MemSize > maxSizeForAddrSpace(AS))
+ return true;
+
+ // Catch weird sized loads that don't evenly divide into the access sizes
+ // TODO: May be able to widen depending on alignment etc.
+ unsigned NumRegs = MemSize / 32;
+ if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
+ return true;
+
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+ if (Align < MemSize) {
+ const SITargetLowering *TLI = ST.getTargetLowering();
+ return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
+ }
+
+ return false;
+ };
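A quick stand-alone illustration of the dword-count check inside needToSplitLoad above; the capability flag is a hypothetical stand-in for the real ST.hasDwordx3LoadStores() query:

#include <cassert>

// Hypothetical flag standing in for ST.hasDwordx3LoadStores().
static bool needsSplitForDwordx3(unsigned MemSizeInBits, bool HasDwordx3) {
  unsigned NumRegs = MemSizeInBits / 32;   // same dword count the lambda computes
  return NumRegs == 3 && !HasDwordx3;
}

int main() {
  assert(needsSplitForDwordx3(96, /*HasDwordx3=*/false));   // 3-dword access, no encoding
  assert(!needsSplitForDwordx3(96, /*HasDwordx3=*/true));
  assert(!needsSplitForDwordx3(64, /*HasDwordx3=*/false));  // 2 dwords pass this check
  return 0;
}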
+ unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
+ unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
+ unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
+
+ // TODO: Refine based on subtargets which support unaligned access or 128-bit
+ // LDS
+ // TODO: Unsupported flat for SI.
+
+ for (unsigned Op : {G_LOAD, G_STORE}) {
+ const bool IsStore = Op == G_STORE;
+
+ auto &Actions = getActionDefinitionsBuilder(Op);
+ // Whitelist the common cases.
+ // TODO: Pointer loads
+ // TODO: Wide constant loads
+ // TODO: Only CI+ has 3x loads
+ // TODO: Loads to s16 on gfx9
+ Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
+ {V2S32, GlobalPtr, 64, GlobalAlign32},
+ {V3S32, GlobalPtr, 96, GlobalAlign32},
+ {S96, GlobalPtr, 96, GlobalAlign32},
+ {V4S32, GlobalPtr, 128, GlobalAlign32},
+ {S128, GlobalPtr, 128, GlobalAlign32},
+ {S64, GlobalPtr, 64, GlobalAlign32},
+ {V2S64, GlobalPtr, 128, GlobalAlign32},
+ {V2S16, GlobalPtr, 32, GlobalAlign32},
+ {S32, GlobalPtr, 8, GlobalAlign8},
+ {S32, GlobalPtr, 16, GlobalAlign16},
+
+ {S32, LocalPtr, 32, 32},
+ {S64, LocalPtr, 64, 32},
+ {V2S32, LocalPtr, 64, 32},
+ {S32, LocalPtr, 8, 8},
+ {S32, LocalPtr, 16, 16},
+ {V2S16, LocalPtr, 32, 32},
+
+ {S32, PrivatePtr, 32, 32},
+ {S32, PrivatePtr, 8, 8},
+ {S32, PrivatePtr, 16, 16},
+ {V2S16, PrivatePtr, 32, 32},
+
+ {S32, FlatPtr, 32, GlobalAlign32},
+ {S32, FlatPtr, 16, GlobalAlign16},
+ {S32, FlatPtr, 8, GlobalAlign8},
+ {V2S16, FlatPtr, 32, GlobalAlign32},
+
+ {S32, ConstantPtr, 32, GlobalAlign32},
+ {V2S32, ConstantPtr, 64, GlobalAlign32},
+ {V3S32, ConstantPtr, 96, GlobalAlign32},
+ {V4S32, ConstantPtr, 128, GlobalAlign32},
+ {S64, ConstantPtr, 64, GlobalAlign32},
+ {S128, ConstantPtr, 128, GlobalAlign32},
+ {V2S32, ConstantPtr, 32, GlobalAlign32}});
+ Actions
+ .customIf(typeIs(1, Constant32Ptr))
+ .narrowScalarIf(
+ [=](const LegalityQuery &Query) -> bool {
+ return !Query.Types[0].isVector() && needToSplitLoad(Query);
+ },
+ [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
+ const LLT DstTy = Query.Types[0];
+ const LLT PtrTy = Query.Types[1];
+
+ const unsigned DstSize = DstTy.getSizeInBits();
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+
+ // Split extloads.
+ if (DstSize > MemSize)
+ return std::make_pair(0, LLT::scalar(MemSize));
+
+ if (DstSize > 32 && (DstSize % 32 != 0)) {
+ // FIXME: Need a way to specify non-extload of larger size if
+ // suitably aligned.
+ return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
+ }
+
+ unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
+ if (MemSize > MaxSize)
+ return std::make_pair(0, LLT::scalar(MaxSize));
+
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+ return std::make_pair(0, LLT::scalar(Align));
+ })
+ .fewerElementsIf(
+ [=](const LegalityQuery &Query) -> bool {
+ return Query.Types[0].isVector() && needToSplitLoad(Query);
+ },
+ [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
+ const LLT DstTy = Query.Types[0];
+ const LLT PtrTy = Query.Types[1];
+
+ LLT EltTy = DstTy.getElementType();
+ unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
+
+ // Split if it's too large for the address space.
+ if (Query.MMODescrs[0].SizeInBits > MaxSize) {
+ unsigned NumElts = DstTy.getNumElements();
+ unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
+
+ // FIXME: Refine when odd breakdowns handled
+ // The scalars will need to be re-legalized.
+ if (NumPieces == 1 || NumPieces >= NumElts ||
+ NumElts % NumPieces != 0)
+ return std::make_pair(0, EltTy);
+
+ return std::make_pair(0,
+ LLT::vector(NumElts / NumPieces, EltTy));
+ }
+
+ // Need to split because of alignment.
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+ unsigned EltSize = EltTy.getSizeInBits();
+ if (EltSize > Align &&
+ (EltSize / Align < DstTy.getNumElements())) {
+ return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
+ }
+
+ // May need relegalization for the scalars.
+ return std::make_pair(0, EltTy);
+ })
+ .minScalar(0, S32);
+
+ if (IsStore)
+ Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
+
+ // TODO: Need a bitcast lower option?
+ Actions
+ .legalIf([=](const LegalityQuery &Query) {
+ const LLT Ty0 = Query.Types[0];
+ unsigned Size = Ty0.getSizeInBits();
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+
+ // No extending vector loads.
+ if (Size > MemSize && Ty0.isVector())
+ return false;
+
+ // FIXME: Widening store from alignment not valid.
+ if (MemSize < Size)
+ MemSize = std::max(MemSize, Align);
+
+ switch (MemSize) {
+ case 8:
+ case 16:
+ return Size == 32;
+ case 32:
+ case 64:
+ case 128:
+ return true;
+ case 96:
+ return ST.hasDwordx3LoadStores();
+ case 256:
+ case 512:
+ return true;
+ default:
+ return false;
+ }
+ })
+ .widenScalarToNextPow2(0)
+ // TODO: v3s32->v4s32 with alignment
+ .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
+ }
- // FIXME: Handle alignment requirements.
auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
- .legalForTypesWithMemDesc({
- {S32, GlobalPtr, 8, 8},
- {S32, GlobalPtr, 16, 8},
- {S32, LocalPtr, 8, 8},
- {S32, LocalPtr, 16, 8},
- {S32, PrivatePtr, 8, 8},
- {S32, PrivatePtr, 16, 8}});
+ .legalForTypesWithMemDesc({{S32, GlobalPtr, 8, 8},
+ {S32, GlobalPtr, 16, 2 * 8},
+ {S32, LocalPtr, 8, 8},
+ {S32, LocalPtr, 16, 16},
+ {S32, PrivatePtr, 8, 8},
+ {S32, PrivatePtr, 16, 16},
+ {S32, ConstantPtr, 8, 8},
+ {S32, ConstantPtr, 16, 2 * 8}});
if (ST.hasFlatAddressSpace()) {
- ExtLoads.legalForTypesWithMemDesc({{S32, FlatPtr, 8, 8},
- {S32, FlatPtr, 16, 8}});
+ ExtLoads.legalForTypesWithMemDesc(
+ {{S32, FlatPtr, 8, 8}, {S32, FlatPtr, 16, 16}});
}
ExtLoads.clampScalar(0, S32, S32)
@@ -590,6 +855,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
+ .legalFor({{S32, LocalPtr}});
+
+ getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
+ .lower();
+
// TODO: Pointer types, any 32-bit or 64-bit vector
getActionDefinitionsBuilder(G_SELECT)
.legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16,
@@ -643,7 +914,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return (EltTy.getSizeInBits() == 16 ||
EltTy.getSizeInBits() % 32 == 0) &&
VecTy.getSizeInBits() % 32 == 0 &&
- VecTy.getSizeInBits() <= 512 &&
+ VecTy.getSizeInBits() <= 1024 &&
IdxTy.getSizeInBits() == 32;
})
.clampScalar(EltTypeIdx, S32, S64)
@@ -663,6 +934,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Doesn't handle extract of illegal sizes.
getActionDefinitionsBuilder(Op)
+ .lowerIf(all(typeIs(LitTyIdx, S16), sizeIs(BigTyIdx, 32)))
+ // FIXME: Multiples of 16 should not be legal.
.legalIf([=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
const LLT LitTy = Query.Types[LitTyIdx];
@@ -686,18 +959,36 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
- getActionDefinitionsBuilder(G_BUILD_VECTOR)
- .legalForCartesianProduct(AllS32Vectors, {S32})
- .legalForCartesianProduct(AllS64Vectors, {S64})
- .clampNumElements(0, V16S32, V16S32)
- .clampNumElements(0, V2S64, V8S64)
- .minScalarSameAs(1, 0)
- .legalIf(isRegisterType(0))
- .minScalarOrElt(0, S32);
+ auto &BuildVector = getActionDefinitionsBuilder(G_BUILD_VECTOR)
+ .legalForCartesianProduct(AllS32Vectors, {S32})
+ .legalForCartesianProduct(AllS64Vectors, {S64})
+ .clampNumElements(0, V16S32, V32S32)
+ .clampNumElements(0, V2S64, V16S64)
+ .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
+
+ if (ST.hasScalarPackInsts())
+ BuildVector.legalFor({V2S16, S32});
+
+ BuildVector
+ .minScalarSameAs(1, 0)
+ .legalIf(isRegisterType(0))
+ .minScalarOrElt(0, S32);
+
+ if (ST.hasScalarPackInsts()) {
+ getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
+ .legalFor({V2S16, S32})
+ .lower();
+ } else {
+ getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
+ .lower();
+ }
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
.legalIf(isRegisterType(0));
+ // TODO: Don't fully scalarize v2s16 pieces
+ getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
+
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
@@ -715,14 +1006,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return false;
};
- getActionDefinitionsBuilder(Op)
+ auto &Builder = getActionDefinitionsBuilder(Op)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
// Clamp the little scalar to s8-s256 and make it a power of 2. It's not
// worth considering the multiples of 64 since 2*192 and 2*384 are not
// valid.
.clampScalar(LitTyIdx, S16, S256)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
-
+ .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
+ .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
+ elementTypeIs(1, S16)),
+ changeTo(1, V2S16))
// Break up vectors with weird elements into scalars
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
@@ -730,25 +1024,37 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.fewerElementsIf(
[=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
scalarize(1))
- .clampScalar(BigTyIdx, S32, S512)
- .widenScalarIf(
+ .clampScalar(BigTyIdx, S32, S1024)
+ .lowerFor({{S16, V2S16}});
+
+ if (Op == G_MERGE_VALUES) {
+ Builder.widenScalarIf(
+ // TODO: Use 16-bit shifts if legal for 8-bit values?
[=](const LegalityQuery &Query) {
- const LLT &Ty = Query.Types[BigTyIdx];
- return !isPowerOf2_32(Ty.getSizeInBits()) &&
- Ty.getSizeInBits() % 16 != 0;
+ const LLT Ty = Query.Types[LitTyIdx];
+ return Ty.getSizeInBits() < 32;
},
- [=](const LegalityQuery &Query) {
- // Pick the next power of 2, or a multiple of 64 over 128.
- // Whichever is smaller.
- const LLT &Ty = Query.Types[BigTyIdx];
- unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
- if (NewSizeInBits >= 256) {
- unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
- if (RoundedTo < NewSizeInBits)
- NewSizeInBits = RoundedTo;
- }
- return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
- })
+ changeTo(LitTyIdx, S32));
+ }
+
+ Builder.widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[BigTyIdx];
+ return !isPowerOf2_32(Ty.getSizeInBits()) &&
+ Ty.getSizeInBits() % 16 != 0;
+ },
+ [=](const LegalityQuery &Query) {
+ // Pick the next power of 2, or a multiple of 64 over 128.
+ // Whichever is smaller.
+ const LLT &Ty = Query.Types[BigTyIdx];
+ unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
+ if (NewSizeInBits >= 256) {
+ unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
+ if (RoundedTo < NewSizeInBits)
+ NewSizeInBits = RoundedTo;
+ }
+ return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
+ })
.legalIf([=](const LegalityQuery &Query) {
const LLT &BigTy = Query.Types[BigTyIdx];
const LLT &LitTy = Query.Types[LitTyIdx];
@@ -760,43 +1066,56 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return BigTy.getSizeInBits() % 16 == 0 &&
LitTy.getSizeInBits() % 16 == 0 &&
- BigTy.getSizeInBits() <= 512;
+ BigTy.getSizeInBits() <= 1024;
})
// Any vectors left are the wrong size. Scalarize them.
.scalarize(0)
.scalarize(1);
}
+ getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+
computeTables();
verify(*ST.getInstrInfo());
}
bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
+ MachineIRBuilder &B,
GISelChangeObserver &Observer) const {
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
- return legalizeAddrSpaceCast(MI, MRI, MIRBuilder);
+ return legalizeAddrSpaceCast(MI, MRI, B);
case TargetOpcode::G_FRINT:
- return legalizeFrint(MI, MRI, MIRBuilder);
+ return legalizeFrint(MI, MRI, B);
case TargetOpcode::G_FCEIL:
- return legalizeFceil(MI, MRI, MIRBuilder);
+ return legalizeFceil(MI, MRI, B);
case TargetOpcode::G_INTRINSIC_TRUNC:
- return legalizeIntrinsicTrunc(MI, MRI, MIRBuilder);
+ return legalizeIntrinsicTrunc(MI, MRI, B);
case TargetOpcode::G_SITOFP:
- return legalizeITOFP(MI, MRI, MIRBuilder, true);
+ return legalizeITOFP(MI, MRI, B, true);
case TargetOpcode::G_UITOFP:
- return legalizeITOFP(MI, MRI, MIRBuilder, false);
+ return legalizeITOFP(MI, MRI, B, false);
case TargetOpcode::G_FMINNUM:
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINNUM_IEEE:
case TargetOpcode::G_FMAXNUM_IEEE:
- return legalizeMinNumMaxNum(MI, MRI, MIRBuilder);
+ return legalizeMinNumMaxNum(MI, MRI, B);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
- return legalizeExtractVectorElt(MI, MRI, MIRBuilder);
+ return legalizeExtractVectorElt(MI, MRI, B);
case TargetOpcode::G_INSERT_VECTOR_ELT:
- return legalizeInsertVectorElt(MI, MRI, MIRBuilder);
+ return legalizeInsertVectorElt(MI, MRI, B);
+ case TargetOpcode::G_FSIN:
+ case TargetOpcode::G_FCOS:
+ return legalizeSinCos(MI, MRI, B);
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return legalizeGlobalValue(MI, MRI, B);
+ case TargetOpcode::G_LOAD:
+ return legalizeLoad(MI, MRI, B, Observer);
+ case TargetOpcode::G_FMAD:
+ return legalizeFMad(MI, MRI, B);
+ case TargetOpcode::G_FDIV:
+ return legalizeFDIV(MI, MRI, B);
default:
return false;
}
@@ -807,11 +1126,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
Register AMDGPULegalizerInfo::getSegmentAperture(
unsigned AS,
MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineIRBuilder &B) const {
+ MachineFunction &MF = B.getMF();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const LLT S32 = LLT::scalar(32);
+ assert(AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS);
+
if (ST.hasApertureRegs()) {
// FIXME: Use inline constants (src_{shared, private}_base) instead of
// getreg.
@@ -829,13 +1150,13 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
Register ApertureReg = MRI.createGenericVirtualRegister(S32);
Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- MIRBuilder.buildInstr(AMDGPU::S_GETREG_B32)
+ B.buildInstr(AMDGPU::S_GETREG_B32)
.addDef(GetReg)
.addImm(Encoding);
MRI.setType(GetReg, S32);
- auto ShiftAmt = MIRBuilder.buildConstant(S32, WidthM1 + 1);
- MIRBuilder.buildInstr(TargetOpcode::G_SHL)
+ auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
+ B.buildInstr(TargetOpcode::G_SHL)
.addDef(ApertureReg)
.addUse(GetReg)
.addUse(ShiftAmt.getReg(0));
@@ -846,8 +1167,9 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
Register QueuePtr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
- // FIXME: Placeholder until we can track the input registers.
- MIRBuilder.buildConstant(QueuePtr, 0xdeadbeef);
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
+ return Register();
// Offset into amd_queue_t for group_segment_aperture_base_hi /
// private_segment_aperture_base_hi.
@@ -870,18 +1192,19 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
Register LoadResult = MRI.createGenericVirtualRegister(S32);
Register LoadAddr;
- MIRBuilder.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
- MIRBuilder.buildLoad(LoadResult, LoadAddr, *MMO);
+ B.materializeGEP(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
+ B.buildLoad(LoadResult, LoadAddr, *MMO);
return LoadResult;
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
- MachineFunction &MF = MIRBuilder.getMF();
+ MachineIRBuilder &B) const {
+ MachineFunction &MF = B.getMF();
- MIRBuilder.setInstr(MI);
+ B.setInstr(MI);
+ const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
@@ -899,7 +1222,28 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
- MI.setDesc(MIRBuilder.getTII().get(TargetOpcode::G_BITCAST));
+ MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
+ return true;
+ }
+
+ if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ // Truncate.
+ B.buildExtract(Dst, Src, 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ uint32_t AddrHiVal = Info->get32BitAddressHighBits();
+
+    // FIXME: This is a bit ugly due to merging 2 pointers into another
+    // pointer. Merge operands are required to be the same type, but creating an
+ // extra ptrtoint would be kind of pointless.
+ auto HighAddr = B.buildConstant(
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
+ B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
+ MI.eraseFromParent();
return true;
}
@@ -908,47 +1252,52 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
DestAS == AMDGPUAS::PRIVATE_ADDRESS);
unsigned NullVal = TM.getNullPointerValue(DestAS);
- auto SegmentNull = MIRBuilder.buildConstant(DstTy, NullVal);
- auto FlatNull = MIRBuilder.buildConstant(SrcTy, 0);
+ auto SegmentNull = B.buildConstant(DstTy, NullVal);
+ auto FlatNull = B.buildConstant(SrcTy, 0);
Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
// Extract low 32-bits of the pointer.
- MIRBuilder.buildExtract(PtrLo32, Src, 0);
+ B.buildExtract(PtrLo32, Src, 0);
Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
- MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
- MIRBuilder.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
+ B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
+ B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();
return true;
}
- assert(SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
- SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
+ if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
+ return false;
+
+ if (!ST.hasFlatAddressSpace())
+ return false;
auto SegmentNull =
- MIRBuilder.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
+ B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
auto FlatNull =
- MIRBuilder.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+ B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
- Register ApertureReg = getSegmentAperture(DestAS, MRI, MIRBuilder);
+ Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
+ if (!ApertureReg.isValid())
+ return false;
Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
- MIRBuilder.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
+ B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
// Coerce the type of the low half of the result so we can use merge_values.
- Register SrcAsInt = MRI.createGenericVirtualRegister(LLT::scalar(32));
- MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
+ Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
+ B.buildInstr(TargetOpcode::G_PTRTOINT)
.addDef(SrcAsInt)
.addUse(Src);
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
- MIRBuilder.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
- MIRBuilder.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
+ B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
+ B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
MI.eraseFromParent();
return true;
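For readers following the segment-to-flat path just above, here is a minimal plain-C++ model of the pointer the ptrtoint/merge/select sequence builds. The null values and the aperture value are example assumptions for illustration, not taken from the patch:

#include <cassert>
#include <cstdint>

// Example null values only; the real ones come from TM.getNullPointerValue().
static uint64_t segmentToFlat(uint32_t SegPtr, uint32_t ApertureHi,
                              uint32_t SegmentNull, uint64_t FlatNull) {
  if (SegPtr == SegmentNull)                     // buildICmp + buildSelect
    return FlatNull;
  return (uint64_t(ApertureHi) << 32) | SegPtr;  // buildMerge(SrcAsInt, ApertureReg)
}

int main() {
  const uint32_t ApertureHi = 0x00010000;        // hypothetical aperture high bits
  assert(segmentToFlat(0x1234, ApertureHi, ~0u, 0) ==
         ((uint64_t(ApertureHi) << 32) | 0x1234));
  assert(segmentToFlat(~0u, ApertureHi, ~0u, 0) == 0);   // null maps to null
  return 0;
}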
@@ -956,8 +1305,8 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
bool AMDGPULegalizerInfo::legalizeFrint(
MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
- MIRBuilder.setInstr(MI);
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
Register Src = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(Src);
@@ -966,18 +1315,18 @@ bool AMDGPULegalizerInfo::legalizeFrint(
APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
- auto C1 = MIRBuilder.buildFConstant(Ty, C1Val);
- auto CopySign = MIRBuilder.buildFCopysign(Ty, C1, Src);
+ auto C1 = B.buildFConstant(Ty, C1Val);
+ auto CopySign = B.buildFCopysign(Ty, C1, Src);
// TODO: Should this propagate fast-math-flags?
- auto Tmp1 = MIRBuilder.buildFAdd(Ty, Src, CopySign);
- auto Tmp2 = MIRBuilder.buildFSub(Ty, Tmp1, CopySign);
+ auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
+ auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
- auto C2 = MIRBuilder.buildFConstant(Ty, C2Val);
- auto Fabs = MIRBuilder.buildFAbs(Ty, Src);
+ auto C2 = B.buildFConstant(Ty, C2Val);
+ auto Fabs = B.buildFAbs(Ty, Src);
- auto Cond = MIRBuilder.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
- MIRBuilder.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
+ auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
+ B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
return true;
}
@@ -1124,7 +1473,7 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
MachineIRBuilder HelperBuilder(MI);
GISelObserverWrapper DummyObserver;
LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
- HelperBuilder.setMBB(*MI.getParent());
+ HelperBuilder.setInstr(MI);
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
@@ -1187,6 +1536,194 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
return true;
}
+bool AMDGPULegalizerInfo::legalizeSinCos(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ unsigned Flags = MI.getFlags();
+
+ Register TrigVal;
+ auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
+ if (ST.hasTrigReducedRange()) {
+ auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
+ TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
+ .addUse(MulVal.getReg(0))
+ .setMIFlags(Flags).getReg(0);
+ } else
+ TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
+
+ Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
+ Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
+ B.buildIntrinsic(TrigIntrin, makeArrayRef<Register>(DstReg), false)
+ .addUse(TrigVal)
+ .setMIFlags(Flags);
+ MI.eraseFromParent();
+ return true;
+}
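A small sketch of the scaling legalizeSinCos performs, under the assumption (suggested by the 0.5 / M_PI factor and the optional fract) that the amdgcn sin/cos intrinsics take their operand pre-divided by 2*pi:

#include <cassert>
#include <cmath>

static double sinFromTurns(double Turns) {         // stand-in model for amdgcn_sin
  return std::sin(Turns * 2.0 * M_PI);
}

int main() {
  const double X = 25.0;
  double TrigVal = X * (0.5 / M_PI);               // the multiply the legalizer emits
  double Reduced = TrigVal - std::floor(TrigVal);  // amdgcn_fract: harmless for a 1-turn period
  assert(std::fabs(sinFromTurns(TrigVal) - std::sin(X)) < 1e-9);
  assert(std::fabs(sinFromTurns(Reduced) - std::sin(X)) < 1e-9);
  return 0;
}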
+
+bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
+ Register DstReg, LLT PtrTy,
+ MachineIRBuilder &B, const GlobalValue *GV,
+ unsigned Offset, unsigned GAFlags) const {
+ // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
+ // to the following code sequence:
+ //
+ // For constant address space:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol
+ // s_addc_u32 s1, s1, 0
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // a fixup or relocation is emitted to replace $symbol with a literal
+ // constant, which is a pc-relative offset from the encoding of the $symbol
+ // operand to the global variable.
+ //
+ // For global address space:
+ // s_getpc_b64 s[0:1]
+ // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
+ // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
+ //
+ // s_getpc_b64 returns the address of the s_add_u32 instruction and then
+ // fixups or relocations are emitted to replace $symbol@*@lo and
+ // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
+ // which is a 64-bit pc-relative offset from the encoding of the $symbol
+ // operand to the global variable.
+ //
+ // What we want here is an offset from the value returned by s_getpc
+ // (which is the address of the s_add_u32 instruction) to the global
+ // variable, but since the encoding of $symbol starts 4 bytes after the start
+ // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
+ // small. This requires us to add 4 to the global variable offset in order to
+ // compute the correct address.
+
+ LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+
+ Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
+ B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
+
+ MachineInstrBuilder MIB = B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET)
+ .addDef(PCReg);
+
+ MIB.addGlobalAddress(GV, Offset + 4, GAFlags);
+ if (GAFlags == SIInstrInfo::MO_NONE)
+ MIB.addImm(0);
+ else
+ MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
+
+ B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
+
+ if (PtrTy.getSizeInBits() == 32)
+ B.buildExtract(DstReg, PCReg, 0);
+ return true;
+}
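The Offset + 4 in buildPCRelGlobalAddress follows from the comment above: the fixup is resolved relative to the $symbol literal, which sits 4 bytes past the s_getpc_b64 result. A minimal arithmetic sketch with hypothetical addresses:

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t PC = 0x1000;      // s_getpc_b64 result: address of the s_add_u32
  const uint64_t GV = 0x2000;      // hypothetical global variable address
  const uint64_t Offset = 0;

  // The fixup is computed relative to the $symbol literal at PC + 4.
  uint64_t Fixup = (GV + Offset) - (PC + 4);
  assert(PC + Fixup == GV + Offset - 4);             // 4 bytes short

  // With the extra +4 the legalizer adds, the skew cancels.
  uint64_t AdjustedFixup = (GV + Offset + 4) - (PC + 4);
  assert(PC + AdjustedFixup == GV + Offset);         // correct address
  return 0;
}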
+
+bool AMDGPULegalizerInfo::legalizeGlobalValue(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DstReg);
+ unsigned AS = Ty.getAddressSpace();
+
+ const GlobalValue *GV = MI.getOperand(1).getGlobal();
+ MachineFunction &MF = B.getMF();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ B.setInstr(MI);
+
+ if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
+ if (!MFI->isEntryFunction()) {
+ const Function &Fn = MF.getFunction();
+ DiagnosticInfoUnsupported BadLDSDecl(
+ Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
+ Fn.getContext().diagnose(BadLDSDecl);
+ }
+
+ // TODO: We could emit code to handle the initialization somewhere.
+ if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
+ B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
+ MI.eraseFromParent();
+ return true;
+ }
+
+ const Function &Fn = MF.getFunction();
+ DiagnosticInfoUnsupported BadInit(
+ Fn, "unsupported initializer for address space", MI.getDebugLoc());
+ Fn.getContext().diagnose(BadInit);
+ return true;
+ }
+
+ const SITargetLowering *TLI = ST.getTargetLowering();
+
+ if (TLI->shouldEmitFixup(GV)) {
+ buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (TLI->shouldEmitPCReloc(GV)) {
+ buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
+ MI.eraseFromParent();
+ return true;
+ }
+
+ LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
+
+ MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getGOT(MF),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ 8 /*Size*/, 8 /*Align*/);
+
+ buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
+
+ if (Ty.getSizeInBits() == 32) {
+    // Truncate if this is a 32-bit constant address.
+ auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
+ B.buildExtract(DstReg, Load, 0);
+ } else
+ B.buildLoad(DstReg, GOTAddr, *GOTMMO);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeLoad(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, GISelChangeObserver &Observer) const {
+ B.setInstr(MI);
+ LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(Cast.getReg(0));
+ Observer.changedInstr(MI);
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeFMad(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ assert(Ty.isScalar());
+
+ // TODO: Always legal with future ftz flag.
+ if (Ty == LLT::scalar(32) && !ST.hasFP32Denormals())
+ return true;
+ if (Ty == LLT::scalar(16) && !ST.hasFP16Denormals())
+ return true;
+
+ MachineFunction &MF = B.getMF();
+
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
+ HelperBuilder.setMBB(*MI.getParent());
+ return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
+}
+
// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI) {
@@ -1212,10 +1749,9 @@ Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg) const {
- if (!Arg->isRegister())
+ if (!Arg->isRegister() || !Arg->getRegister().isValid())
return false; // TODO: Handle these
- assert(Arg->getRegister() != 0);
assert(Arg->getRegister().isPhysical());
MachineRegisterInfo &MRI = *B.getMRI();
@@ -1229,19 +1765,30 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
const unsigned Mask = Arg->getMask();
const unsigned Shift = countTrailingZeros<unsigned>(Mask);
- auto ShiftAmt = B.buildConstant(S32, Shift);
- auto LShr = B.buildLShr(S32, LiveIn, ShiftAmt);
- B.buildAnd(DstReg, LShr, B.buildConstant(S32, Mask >> Shift));
+ Register AndMaskSrc = LiveIn;
+
+ if (Shift != 0) {
+ auto ShiftAmt = B.buildConstant(S32, Shift);
+ AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
+ }
+
+ B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
} else
B.buildCopy(DstReg, LiveIn);
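The masked path above extracts a bit-field from a packed input register, skipping the shift when the field starts at bit 0. A stand-alone sketch of the same shift-and-mask arithmetic (field layout hypothetical; __builtin_ctz stands in for countTrailingZeros):

#include <cassert>
#include <cstdint>

static uint32_t extractField(uint32_t Packed, uint32_t Mask) {
  unsigned Shift = __builtin_ctz(Mask);               // countTrailingZeros
  uint32_t Src = Shift ? (Packed >> Shift) : Packed;  // skip the shift when it is 0
  return Src & (Mask >> Shift);                       // buildAnd with Mask >> Shift
}

int main() {
  const uint32_t Mask = 0x3FFu << 16;                 // hypothetical 10-bit field at bit 16
  const uint32_t Packed = (0x123u << 16) | 0xBEEFu;
  assert(extractField(Packed, Mask) == 0x123u);
  return 0;
}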
  // Insert the argument copy if it doesn't already exist.
// FIXME: It seems EmitLiveInCopies isn't called anywhere?
if (!MRI.getVRegDef(LiveIn)) {
+ // FIXME: Should have scoped insert pt
+ MachineBasicBlock &OrigInsBB = B.getMBB();
+ auto OrigInsPt = B.getInsertPt();
+
MachineBasicBlock &EntryMBB = B.getMF().front();
EntryMBB.addLiveIn(Arg->getRegister());
B.setInsertPt(EntryMBB, EntryMBB.begin());
B.buildCopy(LiveIn, Arg->getRegister());
+
+ B.setInsertPt(OrigInsBB, OrigInsPt);
}
return true;
@@ -1272,6 +1819,113 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
return false;
}
+bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
+
+ if (legalizeFastUnsafeFDIV(MI, MRI, B))
+ return true;
+
+ return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register Res = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+
+ uint16_t Flags = MI.getFlags();
+
+ LLT ResTy = MRI.getType(Res);
+ LLT S32 = LLT::scalar(32);
+ LLT S64 = LLT::scalar(64);
+
+ const MachineFunction &MF = B.getMF();
+ bool Unsafe =
+ MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
+
+ if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
+ return false;
+
+ if (!Unsafe && ResTy == S32 && ST.hasFP32Denormals())
+ return false;
+
+ if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
+ // 1 / x -> RCP(x)
+ if (CLHS->isExactlyValue(1.0)) {
+ B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
+ .addUse(RHS)
+ .setMIFlags(Flags);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ // -1 / x -> RCP( FNEG(x) )
+ if (CLHS->isExactlyValue(-1.0)) {
+ auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
+ B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res, false)
+ .addUse(FNeg.getReg(0))
+ .setMIFlags(Flags);
+
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+
+ // x / y -> x * (1.0 / y)
+ if (Unsafe) {
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+ .addUse(RHS)
+ .setMIFlags(Flags);
+ B.buildFMul(Res, LHS, RCP, Flags);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
+}
+
+bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ B.setInstr(MI);
+ Register Res = MI.getOperand(0).getReg();
+ Register LHS = MI.getOperand(2).getReg();
+ Register RHS = MI.getOperand(3).getReg();
+ uint16_t Flags = MI.getFlags();
+
+ LLT S32 = LLT::scalar(32);
+ LLT S1 = LLT::scalar(1);
+
+ auto Abs = B.buildFAbs(S32, RHS, Flags);
+ const APFloat C0Val(1.0f);
+
+ auto C0 = B.buildConstant(S32, 0x6f800000);
+ auto C1 = B.buildConstant(S32, 0x2f800000);
+ auto C2 = B.buildConstant(S32, FloatToBits(1.0f));
+
+ auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
+ auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
+
+ auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
+
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
+ .addUse(Mul0.getReg(0))
+ .setMIFlags(Flags);
+
+ auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
+
+ B.buildFMul(Res, Sel, Mul1, Flags);
+
+ MI.eraseFromParent();
+ return true;
+}
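The constants in legalizeFDIVFastIntrin decode, assuming they are IEEE-754 single-precision bit patterns, to a 2^96 threshold and a 2^-32 scale: when |y| exceeds 2^96, Sel pre-scales the denominator so the reciprocal does not flush to zero, and the final multiply by Sel folds the scale back in. A small self-check sketch:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float fromBits(uint32_t Bits) {
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

int main() {
  assert(fromBits(0x6f800000u) == std::ldexp(1.0f, 96));   // threshold: |y| > 2^96
  assert(fromBits(0x2f800000u) == std::ldexp(1.0f, -32));  // pre-scale y by 2^-32
  assert(fromBits(0x3f800000u) == 1.0f);                   // FloatToBits(1.0f)
  return 0;
}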
+
bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -1306,11 +1960,79 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ unsigned AddrSpace) const {
+ B.setInstr(MI);
+ Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
+ auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
+ B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Handle register layout difference for f16 images for some subtargets.
+Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
+ MachineRegisterInfo &MRI,
+ Register Reg) const {
+ if (!ST.hasUnpackedD16VMem())
+ return Reg;
+
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+ LLT StoreVT = MRI.getType(Reg);
+ assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
+
+ auto Unmerge = B.buildUnmerge(S16, Reg);
+
+ SmallVector<Register, 4> WideRegs;
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
+
+ int NumElts = StoreVT.getNumElements();
+
+ return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+}
+
+bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ bool IsFormat) const {
+ // TODO: Reject f16 format on targets where unsupported.
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(VData);
+
+ B.setInstr(MI);
+
+ const LLT S32 = LLT::scalar(32);
+ const LLT S16 = LLT::scalar(16);
+
+ // Fixup illegal register types for i8 stores.
+ if (Ty == LLT::scalar(8) || Ty == S16) {
+ Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
+ MI.getOperand(1).setReg(AnyExt);
+ return true;
+ }
+
+ if (Ty.isVector()) {
+ if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
+ if (IsFormat)
+ MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
+ return true;
+ }
+
+ return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
+ }
+
+ return Ty == S32;
+}
+
bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
// Replace the use G_BRCOND with the exec manipulate and branch pseudos.
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_if: {
if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI)) {
const SIRegisterInfo *TRI
@@ -1386,6 +2108,22 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
case Intrinsic::amdgcn_dispatch_id:
return legalizePreloadedArgIntrin(MI, MRI, B,
AMDGPUFunctionArgInfo::DISPATCH_ID);
+ case Intrinsic::amdgcn_fdiv_fast:
+ return legalizeFDIVFastIntrin(MI, MRI, B);
+ case Intrinsic::amdgcn_is_shared:
+ return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::LOCAL_ADDRESS);
+ case Intrinsic::amdgcn_is_private:
+ return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
+ case Intrinsic::amdgcn_wavefrontsize: {
+ B.setInstr(MI);
+ B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
+ MI.eraseFromParent();
+ return true;
+ }
+ case Intrinsic::amdgcn_raw_buffer_store:
+ return legalizeRawBufferStore(MI, MRI, B, false);
+ case Intrinsic::amdgcn_raw_buffer_store_format:
+ return legalizeRawBufferStore(MI, MRI, B, true);
default:
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 3f1cc1d265dd..d0fba23a8686 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -16,6 +16,7 @@
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "SIInstrInfo.h"
namespace llvm {
@@ -32,29 +33,44 @@ public:
const GCNTargetMachine &TM);
bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
+ MachineIRBuilder &B,
GISelChangeObserver &Observer) const override;
Register getSegmentAperture(unsigned AddrSpace,
MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeFrint(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder, bool Signed) const;
+ MachineIRBuilder &B, bool Signed) const;
bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const;
+ MachineIRBuilder &B) const;
+ bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
+ bool buildPCRelGlobalAddress(
+ Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
+ unsigned Offset, unsigned GAFlags = SIInstrInfo::MO_NONE) const;
+
+ bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ GISelChangeObserver &Observer) const;
+
+ bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
Register getLiveInRegister(MachineRegisterInfo &MRI,
Register Reg, LLT Ty) const;
@@ -65,10 +81,24 @@ public:
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+ bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, unsigned AddrSpace) const;
+
+ Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ Register Reg) const;
+ bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool IsFormat) const;
bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const override;
+ MachineIRBuilder &B) const override;
};
} // End llvm namespace.
diff --git a/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index ce0a9db7c7f4..2c94e0046651 100644
--- a/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -30,6 +30,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -48,18 +49,10 @@ static cl::list<std::string> UseNative("amdgpu-use-native",
cl::CommaSeparated, cl::ValueOptional,
cl::Hidden);
-#define MATH_PI 3.14159265358979323846264338327950288419716939937511
-#define MATH_E 2.71828182845904523536028747135266249775724709369996
-#define MATH_SQRT2 1.41421356237309504880168872420969807856967187537695
-
-#define MATH_LOG2E 1.4426950408889634073599246810018921374266459541529859
-#define MATH_LOG10E 0.4342944819032518276511289189166050822943970058036665
-// Value of log2(10)
-#define MATH_LOG2_10 3.3219280948873623478703194294893901758648313930245806
-// Value of 1 / log2(10)
-#define MATH_RLOG2_10 0.3010299956639811952137388947244930267681898814621085
-// Value of 1 / M_LOG2E_F = 1 / log2(e)
-#define MATH_RLOG2_E 0.6931471805599453094172321214581765680755001343602552
+#define MATH_PI numbers::pi
+#define MATH_E numbers::e
+#define MATH_SQRT2 numbers::sqrt2
+#define MATH_SQRT1_2 numbers::inv_sqrt2
namespace llvm {
@@ -254,8 +247,8 @@ struct TableEntry {
/* a list of {result, input} */
static const TableEntry tbl_acos[] = {
- {MATH_PI/2.0, 0.0},
- {MATH_PI/2.0, -0.0},
+ {MATH_PI / 2.0, 0.0},
+ {MATH_PI / 2.0, -0.0},
{0.0, 1.0},
{MATH_PI, -1.0}
};
@@ -271,8 +264,8 @@ static const TableEntry tbl_acospi[] = {
static const TableEntry tbl_asin[] = {
{0.0, 0.0},
{-0.0, -0.0},
- {MATH_PI/2.0, 1.0},
- {-MATH_PI/2.0, -1.0}
+ {MATH_PI / 2.0, 1.0},
+ {-MATH_PI / 2.0, -1.0}
};
static const TableEntry tbl_asinh[] = {
{0.0, 0.0},
@@ -287,8 +280,8 @@ static const TableEntry tbl_asinpi[] = {
static const TableEntry tbl_atan[] = {
{0.0, 0.0},
{-0.0, -0.0},
- {MATH_PI/4.0, 1.0},
- {-MATH_PI/4.0, -1.0}
+ {MATH_PI / 4.0, 1.0},
+ {-MATH_PI / 4.0, -1.0}
};
static const TableEntry tbl_atanh[] = {
{0.0, 0.0},
@@ -359,7 +352,7 @@ static const TableEntry tbl_log10[] = {
};
static const TableEntry tbl_rsqrt[] = {
{1.0, 1.0},
- {1.0/MATH_SQRT2, 2.0}
+ {MATH_SQRT1_2, 2.0}
};
static const TableEntry tbl_sin[] = {
{0.0, 0.0},
@@ -868,7 +861,7 @@ static double log2(double V) {
#if _XOPEN_SOURCE >= 600 || defined(_ISOC99_SOURCE) || _POSIX_C_SOURCE >= 200112L
return ::log2(V);
#else
- return log(V) / 0.693147180559945309417;
+ return log(V) / numbers::ln2;
#endif
}
}
@@ -1430,8 +1423,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
B.SetInsertPoint(&*ItNew);
AllocaInst *Alloc = B.CreateAlloca(RetType, 0,
std::string(prefix) + UI->getName());
- Alloc->setAlignment(UCallee->getParent()->getDataLayout()
- .getTypeAllocSize(RetType));
+ Alloc->setAlignment(MaybeAlign(
+ UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
return Alloc;
}
diff --git a/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index a5bac25701a0..e1ae496d9cbc 100644
--- a/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -55,7 +55,7 @@ enum EManglingParam {
};
struct ManglingRule {
- StringRef const Name;
+ const char *Name;
unsigned char Lead[2];
unsigned char Param[5];
@@ -69,7 +69,7 @@ struct ManglingRule {
// Information about library functions with unmangled names.
class UnmangledFuncInfo {
- StringRef const Name;
+ const char *Name;
unsigned NumArgs;
// Table for all lib functions with unmangled names.
@@ -82,7 +82,7 @@ class UnmangledFuncInfo {
public:
using ID = AMDGPULibFunc::EFuncId;
- UnmangledFuncInfo(StringRef _Name, unsigned _NumArgs)
+ constexpr UnmangledFuncInfo(const char *_Name, unsigned _NumArgs)
: Name(_Name), NumArgs(_NumArgs) {}
// Get index to Table by function name.
static bool lookup(StringRef Name, ID &Id);
@@ -133,8 +133,8 @@ unsigned ManglingRule::getNumArgs() const {
// E_ANY - use prev lead type, E_CONSTPTR_ANY - make const pointer out of
// prev lead type, etc. see ParamIterator::getNextParam() for details.
-static const ManglingRule manglingRules[] = {
-{ StringRef(), {0}, {0} },
+static constexpr ManglingRule manglingRules[] = {
+{ "", {0}, {0} },
{ "abs" , {1}, {E_ANY}},
{ "abs_diff" , {1}, {E_ANY,E_COPY}},
{ "acos" , {1}, {E_ANY}},
@@ -682,9 +682,9 @@ bool AMDGPULibFunc::parse(StringRef FuncName, AMDGPULibFunc &F) {
}
if (eatTerm(FuncName, "_Z"))
- F.Impl = make_unique<AMDGPUMangledLibFunc>();
+ F.Impl = std::make_unique<AMDGPUMangledLibFunc>();
else
- F.Impl = make_unique<AMDGPUUnmangledLibFunc>();
+ F.Impl = std::make_unique<AMDGPUUnmangledLibFunc>();
if (F.Impl->parseFuncName(FuncName))
return true;
diff --git a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index 5dd5b3691e0a..e64542a395f0 100644
--- a/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -72,10 +72,10 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
BasicBlock &EntryBlock = *F.begin();
IRBuilder<> Builder(&*EntryBlock.begin());
- const unsigned KernArgBaseAlign = 16; // FIXME: Increase if necessary
+ const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
- unsigned MaxAlign;
+ Align MaxAlign;
  // FIXME: Alignment is broken with explicit arg offset.
const uint64_t TotalKernArgSize = ST.getKernArgSegmentSize(F, MaxAlign);
if (TotalKernArgSize == 0)
@@ -94,12 +94,12 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
for (Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
- unsigned Align = DL.getABITypeAlignment(ArgTy);
+ unsigned ABITypeAlign = DL.getABITypeAlignment(ArgTy);
unsigned Size = DL.getTypeSizeInBits(ArgTy);
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
- uint64_t EltOffset = alignTo(ExplicitArgOffset, Align) + BaseOffset;
- ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+ uint64_t EltOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + BaseOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, ABITypeAlign) + AllocSize;
if (Arg.use_empty())
continue;
@@ -128,8 +128,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
int64_t AlignDownOffset = alignDown(EltOffset, 4);
int64_t OffsetDiff = EltOffset - AlignDownOffset;
- unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
- KernArgBaseAlign);
+ Align AdjustedAlign = commonAlignment(
+ KernArgBaseAlign, DoShiftOpt ? AlignDownOffset : EltOffset);
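commonAlignment here takes over the job of the old MinAlign call: the largest power of two dividing both the 16-byte kernarg base alignment and the element offset. A stand-alone sketch of that computation:

#include <cassert>
#include <cstdint>

static uint64_t commonAlign(uint64_t BaseAlign, uint64_t Offset) {
  uint64_t Bits = BaseAlign | Offset;
  return Bits & (~Bits + 1);          // lowest set bit
}

int main() {
  assert(commonAlign(16, 0) == 16);   // offset 0 keeps the full base alignment
  assert(commonAlign(16, 8) == 8);
  assert(commonAlign(16, 36) == 4);   // 36 = 4 * 9
  return 0;
}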
Value *ArgPtr;
Type *AdjustedArgTy;
@@ -160,7 +160,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
ArgPtr->getName() + ".cast");
LoadInst *Load =
- Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
+ Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign.value());
Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
MDBuilder MDB(Ctx);
@@ -220,8 +220,8 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
}
KernArgSegment->addAttribute(
- AttributeList::ReturnIndex,
- Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
+ AttributeList::ReturnIndex,
+ Attribute::getWithAlignment(Ctx, std::max(KernArgBaseAlign, MaxAlign)));
return true;
}
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index ae4c32c258a7..3760aed87a43 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -211,6 +211,10 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
lowerOperand(MO, MCOp);
OutMI.addOperand(MCOp);
}
+
+ int FIIdx = AMDGPU::getNamedOperandIdx(MCOpcode, AMDGPU::OpName::fi);
+ if (FIIdx >= (int)OutMI.getNumOperands())
+ OutMI.addOperand(MCOperand::createImm(0));
}
bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
index 237490957058..ba72f71f4322 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
@@ -694,7 +694,7 @@ void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
- if (TRI->isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
<< "\n");
// If this is a source register to a PHI we are chaining, it
@@ -734,7 +734,7 @@ void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
const MachineRegisterInfo *MRI,
const TargetRegisterInfo *TRI,
PHILinearize &PHIInfo) {
- if (TRI->isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
LLVM_DEBUG(dbgs() << "Considering Register: " << printReg(Reg, TRI)
<< "\n");
for (auto &UI : MRI->use_operands(Reg)) {
@@ -949,7 +949,7 @@ void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
(IncludeLoopPHI && IsLoopPHI);
if (ShouldReplace) {
- if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ if (Register::isPhysicalRegister(NewRegister)) {
LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
<< printReg(NewRegister, MRI->getTargetRegisterInfo())
<< "\n");
@@ -1016,13 +1016,15 @@ bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) {
// before are no longer register kills.
void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+ (void)TRI; // It's used by LLVM_DEBUG.
+
for (auto MBBI : MBBs) {
MachineBasicBlock *MBB = MBBI;
for (auto &II : *MBB) {
for (auto &RI : II.uses()) {
if (RI.isReg()) {
- unsigned Reg = RI.getReg();
- if (TRI->isVirtualRegister(Reg)) {
+ Register Reg = RI.getReg();
+ if (Register::isVirtualRegister(Reg)) {
if (hasNoDef(Reg, MRI))
continue;
if (!MRI->hasOneDef(Reg)) {
@@ -1402,7 +1404,7 @@ void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest(
unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo(
MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) {
unsigned DestReg = getPHIDestReg(PHI);
- unsigned LinearizeDestReg =
+ Register LinearizeDestReg =
MRI->createVirtualRegister(MRI->getRegClass(DestReg));
PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc());
storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices);
@@ -1890,7 +1892,7 @@ void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled(
if (!Cond[0].isReg())
return;
- unsigned CondReg = Cond[0].getReg();
+ Register CondReg = Cond[0].getReg();
for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) {
(*UI).setIsKill(false);
}
@@ -1929,8 +1931,8 @@ void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *Co
BBSelectReg, TrueBB->getNumber());
} else {
const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg);
- unsigned TrueBBReg = MRI->createVirtualRegister(RegClass);
- unsigned FalseBBReg = MRI->createVirtualRegister(RegClass);
+ Register TrueBBReg = MRI->createVirtualRegister(RegClass);
+ Register FalseBBReg = MRI->createVirtualRegister(RegClass);
TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
TrueBBReg, TrueBB->getNumber());
TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
@@ -1996,7 +1998,7 @@ void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB,
InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI);
}
const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
- unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
+ Register NextDestReg = MRI->createVirtualRegister(RegClass);
bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
LLVM_DEBUG(dbgs() << "Insert Chained PHI\n");
insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
@@ -2056,8 +2058,8 @@ void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
// register, unless it is the outgoing BB select register. We have
// already creaed phi nodes for these.
const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
- unsigned PHIDestReg = MRI->createVirtualRegister(RegClass);
- unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
+ Register PHIDestReg = MRI->createVirtualRegister(RegClass);
+ Register IfSourceReg = MRI->createVirtualRegister(RegClass);
// Create initializer, this value is never used, but is needed
// to satisfy SSA.
LLVM_DEBUG(dbgs() << "Initializer for reg: " << printReg(Reg) << "\n");
@@ -2172,7 +2174,7 @@ void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegio
MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent();
const TargetRegisterClass *RegClass =
MRI->getRegClass(CurrentBackedgeReg);
- unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass);
+ Register NewBackedgeReg = MRI->createVirtualRegister(RegClass);
MachineInstrBuilder BackedgePHI =
BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL,
TII->get(TargetOpcode::PHI), NewBackedgeReg);
@@ -2230,7 +2232,7 @@ void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
I != E;) {
MachineOperand &O = *I;
++I;
- if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+ if (Register::isPhysicalRegister(NewRegister)) {
LLVM_DEBUG(dbgs() << "Trying to substitute physical register: "
<< printReg(NewRegister, MRI->getTargetRegisterInfo())
<< "\n");
@@ -2309,7 +2311,7 @@ MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
} else {
// Handle internal block.
const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn);
- unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass);
+ Register CodeBBSelectReg = MRI->createVirtualRegister(RegClass);
rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg);
bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB;
MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB,
@@ -2446,7 +2448,7 @@ void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
}
const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest);
- unsigned NewDestReg = MRI->createVirtualRegister(RegClass);
+ Register NewDestReg = MRI->createVirtualRegister(RegClass);
LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI);
MachineInstrBuilder MIB =
BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
@@ -2734,9 +2736,9 @@ bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
}
const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI());
unsigned InReg = LRegion->getBBSelectRegIn();
- unsigned InnerSelectReg =
+ Register InnerSelectReg =
MRI->createVirtualRegister(MRI->getRegClass(InReg));
- unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg));
+ Register NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg));
TII->materializeImmediate(*(LRegion->getEntry()),
LRegion->getEntry()->getFirstTerminator(), DL,
NewInReg, Region->getEntry()->getNumber());
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 0d3a1f1a769f..89ca702f577d 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -17,7 +17,6 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
LocalMemoryObjects(),
ExplicitKernArgSize(0),
- MaxKernArgAlign(0),
LDSSize(0),
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 52987e2fa411..9818ab1ef148 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -23,7 +23,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
protected:
uint64_t ExplicitKernArgSize; // Cache for this.
- unsigned MaxKernArgAlign; // Cache for this.
+ Align MaxKernArgAlign; // Cache for this.
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
@@ -47,9 +47,7 @@ public:
return ExplicitKernArgSize;
}
- unsigned getMaxKernArgAlign() const {
- return MaxKernArgAlign;
- }
+ unsigned getMaxKernArgAlign() const { return MaxKernArgAlign.value(); }
unsigned getLDSSize() const {
return LDSSize;
diff --git a/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
new file mode 100644
index 000000000000..5250bf455d71
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -0,0 +1,592 @@
+//=== AMDGPUPrintfRuntimeBinding.cpp - OpenCL printf implementation -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// \file
+//
+// This pass binds printfs to a kernel arg pointer that will be bound to a buffer
+// later by the runtime.
+//
+// This pass traverses the functions in the module and converts
+// each call to printf to a sequence of operations that
+// store the following into the printf buffer:
+// - format string (passed as a module's metadata unique ID)
+// - bitwise copies of printf arguments
+// The backend passes will need to store metadata in the kernel
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "printfToRuntime"
+#define DWORD_ALIGN 4
+
+namespace {
+class LLVM_LIBRARY_VISIBILITY AMDGPUPrintfRuntimeBinding final
+ : public ModulePass {
+
+public:
+ static char ID;
+
+ explicit AMDGPUPrintfRuntimeBinding();
+
+private:
+ bool runOnModule(Module &M) override;
+ void getConversionSpecifiers(SmallVectorImpl<char> &OpConvSpecifiers,
+ StringRef fmt, size_t num_ops) const;
+
+ bool shouldPrintAsStr(char Specifier, Type *OpType) const;
+ bool
+ lowerPrintfForGpu(Module &M,
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ }
+
+ Value *simplify(Instruction *I, const TargetLibraryInfo *TLI) {
+ return SimplifyInstruction(I, {*TD, TLI, DT});
+ }
+
+ const DataLayout *TD;
+ const DominatorTree *DT;
+ SmallVector<CallInst *, 32> Printfs;
+};
+} // namespace
+
+char AMDGPUPrintfRuntimeBinding::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AMDGPUPrintfRuntimeBinding,
+ "amdgpu-printf-runtime-binding", "AMDGPU Printf lowering",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(AMDGPUPrintfRuntimeBinding, "amdgpu-printf-runtime-binding",
+ "AMDGPU Printf lowering", false, false)
+
+char &llvm::AMDGPUPrintfRuntimeBindingID = AMDGPUPrintfRuntimeBinding::ID;
+
+namespace llvm {
+ModulePass *createAMDGPUPrintfRuntimeBinding() {
+ return new AMDGPUPrintfRuntimeBinding();
+}
+} // namespace llvm
+
+AMDGPUPrintfRuntimeBinding::AMDGPUPrintfRuntimeBinding()
+ : ModulePass(ID), TD(nullptr), DT(nullptr) {
+ initializeAMDGPUPrintfRuntimeBindingPass(*PassRegistry::getPassRegistry());
+}
+
+void AMDGPUPrintfRuntimeBinding::getConversionSpecifiers(
+ SmallVectorImpl<char> &OpConvSpecifiers, StringRef Fmt,
+ size_t NumOps) const {
+ // Not all format characters are collected.
+ // At this time the format characters of interest
+ // are %p and %s, which we use to know whether we
+ // are storing a literal string or a
+ // pointer into the printf buffer.
+ static const char ConvSpecifiers[] = "cdieEfgGaosuxXp";
+ size_t CurFmtSpecifierIdx = 0;
+ size_t PrevFmtSpecifierIdx = 0;
+
+ while ((CurFmtSpecifierIdx = Fmt.find_first_of(
+ ConvSpecifiers, CurFmtSpecifierIdx)) != StringRef::npos) {
+ bool ArgDump = false;
+ StringRef CurFmt = Fmt.substr(PrevFmtSpecifierIdx,
+ CurFmtSpecifierIdx - PrevFmtSpecifierIdx);
+ size_t pTag = CurFmt.find_last_of("%");
+ if (pTag != StringRef::npos) {
+ ArgDump = true;
+ while (pTag && CurFmt[--pTag] == '%') {
+ ArgDump = !ArgDump;
+ }
+ }
+
+ if (ArgDump)
+ OpConvSpecifiers.push_back(Fmt[CurFmtSpecifierIdx]);
+
+ PrevFmtSpecifierIdx = ++CurFmtSpecifierIdx;
+ }
+}
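
A host-side sketch of the same scan, using std::string in place of StringRef so it compiles on its own (the helper name collectSpecifiers is illustrative); it shows how the parity test over a run of '%' characters skips the escaped "%%":

#include <cstdio>
#include <string>
#include <vector>

// Standalone copy of the scan above, using std::string in place of StringRef.
static std::vector<char> collectSpecifiers(const std::string &Fmt) {
  static const std::string ConvSpecifiers = "cdieEfgGaosuxXp";
  std::vector<char> Out;
  size_t Cur = 0, Prev = 0;
  while ((Cur = Fmt.find_first_of(ConvSpecifiers, Cur)) != std::string::npos) {
    bool ArgDump = false;
    std::string Piece = Fmt.substr(Prev, Cur - Prev);
    size_t PTag = Piece.find_last_of('%');
    if (PTag != std::string::npos) {
      ArgDump = true;
      // An even-length run of '%' means the '%' is escaped ("%%").
      while (PTag && Piece[--PTag] == '%')
        ArgDump = !ArgDump;
    }
    if (ArgDump)
      Out.push_back(Fmt[Cur]);
    Prev = ++Cur;
  }
  return Out;
}

int main() {
  for (char C : collectSpecifiers("id=%d 100%% name=%s"))
    std::printf("%c ", C); // prints: d s
  std::printf("\n");
}
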
+
+bool AMDGPUPrintfRuntimeBinding::shouldPrintAsStr(char Specifier,
+ Type *OpType) const {
+ if (Specifier != 's')
+ return false;
+ const PointerType *PT = dyn_cast<PointerType>(OpType);
+ if (!PT || PT->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+ return false;
+ Type *ElemType = PT->getContainedType(0);
+ if (ElemType->getTypeID() != Type::IntegerTyID)
+ return false;
+ IntegerType *ElemIType = cast<IntegerType>(ElemType);
+ return ElemIType->getBitWidth() == 8;
+}
+
+bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
+ Module &M, function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
+ LLVMContext &Ctx = M.getContext();
+ IRBuilder<> Builder(Ctx);
+ Type *I32Ty = Type::getInt32Ty(Ctx);
+ unsigned UniqID = 0;
+ // NB: It is important for this string size to be divisible by 4
+ const char NonLiteralStr[4] = "???";
+
+ for (auto CI : Printfs) {
+ unsigned NumOps = CI->getNumArgOperands();
+
+ SmallString<16> OpConvSpecifiers;
+ Value *Op = CI->getArgOperand(0);
+
+ if (auto LI = dyn_cast<LoadInst>(Op)) {
+ Op = LI->getPointerOperand();
+ for (auto Use : Op->users()) {
+ if (auto SI = dyn_cast<StoreInst>(Use)) {
+ Op = SI->getValueOperand();
+ break;
+ }
+ }
+ }
+
+ if (auto I = dyn_cast<Instruction>(Op)) {
+ Value *Op_simplified = simplify(I, &GetTLI(*I->getFunction()));
+ if (Op_simplified)
+ Op = Op_simplified;
+ }
+
+ ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Op);
+
+ if (ConstExpr) {
+ GlobalVariable *GVar = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+
+ StringRef Str("unknown");
+ if (GVar && GVar->hasInitializer()) {
+ auto Init = GVar->getInitializer();
+ if (auto CA = dyn_cast<ConstantDataArray>(Init)) {
+ if (CA->isString())
+ Str = CA->getAsCString();
+ } else if (isa<ConstantAggregateZero>(Init)) {
+ Str = "";
+ }
+ //
+ // We need this call to ascertain
+ // whether we are printing a string
+ // or a pointer. It extracts the conversion
+ // specifiers and fills OpConvSpecifiers.
+ getConversionSpecifiers(OpConvSpecifiers, Str, NumOps - 1);
+ }
+ // Add metadata for the string
+ std::string AStreamHolder;
+ raw_string_ostream Sizes(AStreamHolder);
+ int Sum = DWORD_ALIGN;
+ Sizes << CI->getNumArgOperands() - 1;
+ Sizes << ':';
+ for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() &&
+ ArgCount <= OpConvSpecifiers.size();
+ ArgCount++) {
+ Value *Arg = CI->getArgOperand(ArgCount);
+ Type *ArgType = Arg->getType();
+ unsigned ArgSize = TD->getTypeAllocSizeInBits(ArgType);
+ ArgSize = ArgSize / 8;
+ //
+ // ArgSize by design should be a multiple of DWORD_ALIGN,
+ // expand the arguments that do not follow this rule.
+ //
+ if (ArgSize % DWORD_ALIGN != 0) {
+ llvm::Type *ResType = llvm::Type::getInt32Ty(Ctx);
+ VectorType *LLVMVecType = llvm::dyn_cast<llvm::VectorType>(ArgType);
+ int NumElem = LLVMVecType ? LLVMVecType->getNumElements() : 1;
+ if (LLVMVecType && NumElem > 1)
+ ResType = llvm::VectorType::get(ResType, NumElem);
+ Builder.SetInsertPoint(CI);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+ if (OpConvSpecifiers[ArgCount - 1] == 'x' ||
+ OpConvSpecifiers[ArgCount - 1] == 'X' ||
+ OpConvSpecifiers[ArgCount - 1] == 'u' ||
+ OpConvSpecifiers[ArgCount - 1] == 'o')
+ Arg = Builder.CreateZExt(Arg, ResType);
+ else
+ Arg = Builder.CreateSExt(Arg, ResType);
+ ArgType = Arg->getType();
+ ArgSize = TD->getTypeAllocSizeInBits(ArgType);
+ ArgSize = ArgSize / 8;
+ CI->setOperand(ArgCount, Arg);
+ }
+ if (OpConvSpecifiers[ArgCount - 1] == 'f') {
+ ConstantFP *FpCons = dyn_cast<ConstantFP>(Arg);
+ if (FpCons)
+ ArgSize = 4;
+ else {
+ FPExtInst *FpExt = dyn_cast<FPExtInst>(Arg);
+ if (FpExt && FpExt->getType()->isDoubleTy() &&
+ FpExt->getOperand(0)->getType()->isFloatTy())
+ ArgSize = 4;
+ }
+ }
+ if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) {
+ if (ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Arg)) {
+ GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+ if (GV && GV->hasInitializer()) {
+ Constant *Init = GV->getInitializer();
+ ConstantDataArray *CA = dyn_cast<ConstantDataArray>(Init);
+ if (Init->isZeroValue() || CA->isString()) {
+ size_t SizeStr = Init->isZeroValue()
+ ? 1
+ : (strlen(CA->getAsCString().data()) + 1);
+ size_t Rem = SizeStr % DWORD_ALIGN;
+ size_t NSizeStr = 0;
+ LLVM_DEBUG(dbgs() << "Printf string original size = " << SizeStr
+ << '\n');
+ if (Rem) {
+ NSizeStr = SizeStr + (DWORD_ALIGN - Rem);
+ } else {
+ NSizeStr = SizeStr;
+ }
+ ArgSize = NSizeStr;
+ }
+ } else {
+ ArgSize = sizeof(NonLiteralStr);
+ }
+ } else {
+ ArgSize = sizeof(NonLiteralStr);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Printf ArgSize (in buffer) = " << ArgSize
+ << " for type: " << *ArgType << '\n');
+ Sizes << ArgSize << ':';
+ Sum += ArgSize;
+ }
+ LLVM_DEBUG(dbgs() << "Printf format string in source = " << Str.str()
+ << '\n');
+ for (size_t I = 0; I < Str.size(); ++I) {
+ // The rest of the C escape sequences (e.g. \') are handled correctly
+ // by the MDParser.
+ switch (Str[I]) {
+ case '\a':
+ Sizes << "\\a";
+ break;
+ case '\b':
+ Sizes << "\\b";
+ break;
+ case '\f':
+ Sizes << "\\f";
+ break;
+ case '\n':
+ Sizes << "\\n";
+ break;
+ case '\r':
+ Sizes << "\\r";
+ break;
+ case '\v':
+ Sizes << "\\v";
+ break;
+ case ':':
+ // ':' cannot be scanned by Flex, as it is defined as a delimiter.
+ // Replace it with its octal representation \72.
+ Sizes << "\\72";
+ break;
+ default:
+ Sizes << Str[I];
+ break;
+ }
+ }
+
+ // Insert the printf_alloc call
+ Builder.SetInsertPoint(CI);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ AttributeList Attr = AttributeList::get(Ctx, AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+
+ Type *SizetTy = Type::getInt32Ty(Ctx);
+
+ Type *Tys_alloc[1] = {SizetTy};
+ Type *I8Ptr = PointerType::get(Type::getInt8Ty(Ctx), 1);
+ FunctionType *FTy_alloc = FunctionType::get(I8Ptr, Tys_alloc, false);
+ FunctionCallee PrintfAllocFn =
+ M.getOrInsertFunction(StringRef("__printf_alloc"), FTy_alloc, Attr);
+
+ LLVM_DEBUG(dbgs() << "Printf metadata = " << Sizes.str() << '\n');
+ std::string fmtstr = itostr(++UniqID) + ":" + Sizes.str().c_str();
+ MDString *fmtStrArray = MDString::get(Ctx, fmtstr);
+
+ // Instead of creating global variables, the
+ // printf format strings are extracted
+ // and passed as metadata. This avoids
+ // polluting llvm's symbol tables in this module.
+ // Metadata is going to be extracted
+ // by the backend passes and inserted
+ // into the OpenCL binary as appropriate.
+ StringRef amd("llvm.printf.fmts");
+ NamedMDNode *metaD = M.getOrInsertNamedMetadata(amd);
+ MDNode *myMD = MDNode::get(Ctx, fmtStrArray);
+ metaD->addOperand(myMD);
+ Value *sumC = ConstantInt::get(SizetTy, Sum, false);
+ SmallVector<Value *, 1> alloc_args;
+ alloc_args.push_back(sumC);
+ CallInst *pcall =
+ CallInst::Create(PrintfAllocFn, alloc_args, "printf_alloc_fn", CI);
+
+ //
+ // Insert code to split basicblock with a
+ // piece of hammock code.
+ // basicblock splits after buffer overflow check
+ //
+ ConstantPointerNull *zeroIntPtr =
+ ConstantPointerNull::get(PointerType::get(Type::getInt8Ty(Ctx), 1));
+ ICmpInst *cmp =
+ dyn_cast<ICmpInst>(Builder.CreateICmpNE(pcall, zeroIntPtr, ""));
+ if (!CI->use_empty()) {
+ Value *result =
+ Builder.CreateSExt(Builder.CreateNot(cmp), I32Ty, "printf_res");
+ CI->replaceAllUsesWith(result);
+ }
+ SplitBlock(CI->getParent(), cmp);
+ Instruction *Brnch =
+ SplitBlockAndInsertIfThen(cmp, cmp->getNextNode(), false);
+
+ Builder.SetInsertPoint(Brnch);
+
+ // store unique printf id in the buffer
+ //
+ SmallVector<Value *, 1> ZeroIdxList;
+ ConstantInt *zeroInt =
+ ConstantInt::get(Ctx, APInt(32, StringRef("0"), 10));
+ ZeroIdxList.push_back(zeroInt);
+
+ GetElementPtrInst *BufferIdx =
+ dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create(
+ nullptr, pcall, ZeroIdxList, "PrintBuffID", Brnch));
+
+ Type *idPointer = PointerType::get(I32Ty, AMDGPUAS::GLOBAL_ADDRESS);
+ Value *id_gep_cast =
+ new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch);
+
+ StoreInst *stbuff =
+ new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast);
+ stbuff->insertBefore(Brnch); // to Remove unused variable warning
+
+ SmallVector<Value *, 2> FourthIdxList;
+ ConstantInt *fourInt =
+ ConstantInt::get(Ctx, APInt(32, StringRef("4"), 10));
+
+ FourthIdxList.push_back(fourInt); // 1st 4 bytes hold the printf_id
+ // the following GEP is the buffer pointer
+ BufferIdx = cast<GetElementPtrInst>(GetElementPtrInst::Create(
+ nullptr, pcall, FourthIdxList, "PrintBuffGep", Brnch));
+
+ Type *Int32Ty = Type::getInt32Ty(Ctx);
+ Type *Int64Ty = Type::getInt64Ty(Ctx);
+ for (unsigned ArgCount = 1; ArgCount < CI->getNumArgOperands() &&
+ ArgCount <= OpConvSpecifiers.size();
+ ArgCount++) {
+ Value *Arg = CI->getArgOperand(ArgCount);
+ Type *ArgType = Arg->getType();
+ SmallVector<Value *, 32> WhatToStore;
+ if (ArgType->isFPOrFPVectorTy() &&
+ (ArgType->getTypeID() != Type::VectorTyID)) {
+ Type *IType = (ArgType->isFloatTy()) ? Int32Ty : Int64Ty;
+ if (OpConvSpecifiers[ArgCount - 1] == 'f') {
+ ConstantFP *fpCons = dyn_cast<ConstantFP>(Arg);
+ if (fpCons) {
+ APFloat Val(fpCons->getValueAPF());
+ bool Lost = false;
+ Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+ &Lost);
+ Arg = ConstantFP::get(Ctx, Val);
+ IType = Int32Ty;
+ } else {
+ FPExtInst *FpExt = dyn_cast<FPExtInst>(Arg);
+ if (FpExt && FpExt->getType()->isDoubleTy() &&
+ FpExt->getOperand(0)->getType()->isFloatTy()) {
+ Arg = FpExt->getOperand(0);
+ IType = Int32Ty;
+ }
+ }
+ }
+ Arg = new BitCastInst(Arg, IType, "PrintArgFP", Brnch);
+ WhatToStore.push_back(Arg);
+ } else if (ArgType->getTypeID() == Type::PointerTyID) {
+ if (shouldPrintAsStr(OpConvSpecifiers[ArgCount - 1], ArgType)) {
+ const char *S = NonLiteralStr;
+ if (ConstantExpr *ConstExpr = dyn_cast<ConstantExpr>(Arg)) {
+ GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+ if (GV && GV->hasInitializer()) {
+ Constant *Init = GV->getInitializer();
+ ConstantDataArray *CA = dyn_cast<ConstantDataArray>(Init);
+ if (Init->isZeroValue() || CA->isString()) {
+ S = Init->isZeroValue() ? "" : CA->getAsCString().data();
+ }
+ }
+ }
+ size_t SizeStr = strlen(S) + 1;
+ size_t Rem = SizeStr % DWORD_ALIGN;
+ size_t NSizeStr = 0;
+ if (Rem) {
+ NSizeStr = SizeStr + (DWORD_ALIGN - Rem);
+ } else {
+ NSizeStr = SizeStr;
+ }
+ if (S[0]) {
+ char *MyNewStr = new char[NSizeStr]();
+ strcpy(MyNewStr, S);
+ int NumInts = NSizeStr / 4;
+ int CharC = 0;
+ while (NumInts) {
+ int ANum = *(int *)(MyNewStr + CharC);
+ CharC += 4;
+ NumInts--;
+ Value *ANumV = ConstantInt::get(Int32Ty, ANum, false);
+ WhatToStore.push_back(ANumV);
+ }
+ delete[] MyNewStr;
+ } else {
+ // Empty string; give a hint to the runtime that it is not NULL
+ Value *ANumV = ConstantInt::get(Int32Ty, 0xFFFFFF00, false);
+ WhatToStore.push_back(ANumV);
+ }
+ } else {
+ uint64_t Size = TD->getTypeAllocSizeInBits(ArgType);
+ assert((Size == 32 || Size == 64) && "unsupported size");
+ Type *DstType = (Size == 32) ? Int32Ty : Int64Ty;
+ Arg = new PtrToIntInst(Arg, DstType, "PrintArgPtr", Brnch);
+ WhatToStore.push_back(Arg);
+ }
+ } else if (ArgType->getTypeID() == Type::VectorTyID) {
+ Type *IType = NULL;
+ uint32_t EleCount = cast<VectorType>(ArgType)->getNumElements();
+ uint32_t EleSize = ArgType->getScalarSizeInBits();
+ uint32_t TotalSize = EleCount * EleSize;
+ if (EleCount == 3) {
+ IntegerType *Int32Ty = Type::getInt32Ty(ArgType->getContext());
+ Constant *Indices[4] = {
+ ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 1),
+ ConstantInt::get(Int32Ty, 2), ConstantInt::get(Int32Ty, 2)};
+ Constant *Mask = ConstantVector::get(Indices);
+ ShuffleVectorInst *Shuffle = new ShuffleVectorInst(Arg, Arg, Mask);
+ Shuffle->insertBefore(Brnch);
+ Arg = Shuffle;
+ ArgType = Arg->getType();
+ TotalSize += EleSize;
+ }
+ switch (EleSize) {
+ default:
+ EleCount = TotalSize / 64;
+ IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext()));
+ break;
+ case 8:
+ if (EleCount >= 8) {
+ EleCount = TotalSize / 64;
+ IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext()));
+ } else if (EleCount >= 3) {
+ EleCount = 1;
+ IType = dyn_cast<Type>(Type::getInt32Ty(ArgType->getContext()));
+ } else {
+ EleCount = 1;
+ IType = dyn_cast<Type>(Type::getInt16Ty(ArgType->getContext()));
+ }
+ break;
+ case 16:
+ if (EleCount >= 3) {
+ EleCount = TotalSize / 64;
+ IType = dyn_cast<Type>(Type::getInt64Ty(ArgType->getContext()));
+ } else {
+ EleCount = 1;
+ IType = dyn_cast<Type>(Type::getInt32Ty(ArgType->getContext()));
+ }
+ break;
+ }
+ if (EleCount > 1) {
+ IType = dyn_cast<Type>(VectorType::get(IType, EleCount));
+ }
+ Arg = new BitCastInst(Arg, IType, "PrintArgVect", Brnch);
+ WhatToStore.push_back(Arg);
+ } else {
+ WhatToStore.push_back(Arg);
+ }
+ for (unsigned I = 0, E = WhatToStore.size(); I != E; ++I) {
+ Value *TheBtCast = WhatToStore[I];
+ unsigned ArgSize =
+ TD->getTypeAllocSizeInBits(TheBtCast->getType()) / 8;
+ SmallVector<Value *, 1> BuffOffset;
+ BuffOffset.push_back(ConstantInt::get(I32Ty, ArgSize));
+
+ Type *ArgPointer = PointerType::get(TheBtCast->getType(), 1);
+ Value *CastedGEP =
+ new BitCastInst(BufferIdx, ArgPointer, "PrintBuffPtrCast", Brnch);
+ StoreInst *StBuff = new StoreInst(TheBtCast, CastedGEP, Brnch);
+ LLVM_DEBUG(dbgs() << "inserting store to printf buffer:\n"
+ << *StBuff << '\n');
+ (void)StBuff;
+ if (I + 1 == E && ArgCount + 1 == CI->getNumArgOperands())
+ break;
+ BufferIdx = dyn_cast<GetElementPtrInst>(GetElementPtrInst::Create(
+ nullptr, BufferIdx, BuffOffset, "PrintBuffNextPtr", Brnch));
+ LLVM_DEBUG(dbgs() << "inserting gep to the printf buffer:\n"
+ << *BufferIdx << '\n');
+ }
+ }
+ }
+ }
+
+ // erase the printf calls
+ for (auto CI : Printfs)
+ CI->eraseFromParent();
+
+ Printfs.clear();
+ return true;
+}
+
+bool AMDGPUPrintfRuntimeBinding::runOnModule(Module &M) {
+ Triple TT(M.getTargetTriple());
+ if (TT.getArch() == Triple::r600)
+ return false;
+
+ auto PrintfFunction = M.getFunction("printf");
+ if (!PrintfFunction)
+ return false;
+
+ for (auto &U : PrintfFunction->uses()) {
+ if (auto *CI = dyn_cast<CallInst>(U.getUser())) {
+ if (CI->isCallee(&U))
+ Printfs.push_back(CI);
+ }
+ }
+
+ if (Printfs.empty())
+ return false;
+
+ TD = &M.getDataLayout();
+ auto DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+
+ return lowerPrintfForGpu(M, GetTLI);
+}
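
For reference, the descriptor each call contributes to !llvm.printf.fmts has the shape "UniqID:NumArgs:size1:...:sizeN:format", with the format string escaped as in the switch above. A host-side sketch with a hypothetical buildDescriptor helper (only the '\n' and ':' escapes are reproduced; the real pass derives the sizes from the IR types, widening sub-dword arguments to 4 bytes):

#include <cstdio>
#include <string>
#include <vector>

// Hypothetical helper mirroring the descriptor layout "id:numargs:size:...:fmt",
// with only the '\n' and ':' escapes reproduced here.
static std::string buildDescriptor(unsigned Id, const std::vector<unsigned> &Sizes,
                                   const std::string &Fmt) {
  std::string S = std::to_string(Id) + ":" + std::to_string(Sizes.size()) + ":";
  for (unsigned Sz : Sizes)
    S += std::to_string(Sz) + ":";
  for (char C : Fmt) {
    switch (C) {
    case '\n': S += "\\n"; break;
    case ':':  S += "\\72"; break;
    default:   S += C; break;
    }
  }
  return S;
}

int main() {
  // printf("x=%d y=%f\n", I, F) with F a float extended to double: both
  // arguments occupy 4 bytes in the buffer, giving "1:2:4:4:x=%d y=%f\n".
  std::printf("%s\n", buildDescriptor(1, {4, 4}, "x=%d y=%f\n").c_str());
}
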
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index e4c9d6685d4a..3e9dcca114a3 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -801,7 +801,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
GlobalVariable::NotThreadLocal,
AMDGPUAS::LOCAL_ADDRESS);
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- GV->setAlignment(I.getAlignment());
+ GV->setAlignment(MaybeAlign(I.getAlignment()));
Value *TCntY, *TCntZ;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 815cbc5e26ee..4d78188b3dc3 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -17,9 +17,9 @@
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -33,6 +33,7 @@
#include "AMDGPUGenRegisterBankInfo.def"
using namespace llvm;
+using namespace MIPatternMatch;
namespace {
@@ -84,9 +85,11 @@ public:
};
}
-AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI)
+AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
: AMDGPUGenRegisterBankInfo(),
- TRI(static_cast<const SIRegisterInfo*>(&TRI)) {
+ Subtarget(ST),
+ TRI(Subtarget.getRegisterInfo()),
+ TII(Subtarget.getInstrInfo()) {
// HACK: Until this is fully tablegen'd.
static bool AlreadyInit = false;
@@ -163,11 +166,10 @@ unsigned AMDGPURegisterBankInfo::getBreakDownCost(
const RegisterBank &AMDGPURegisterBankInfo::getRegBankFromRegClass(
const TargetRegisterClass &RC) const {
+ if (&RC == &AMDGPU::SReg_1RegClass)
+ return AMDGPU::VCCRegBank;
- if (TRI->isSGPRClass(&RC))
- return getRegBank(AMDGPU::SGPRRegBankID);
-
- return getRegBank(AMDGPU::VGPRRegBankID);
+ return TRI->isSGPRClass(&RC) ? AMDGPU::SGPRRegBank : AMDGPU::VGPRRegBank;
}
template <unsigned NumOps>
@@ -192,7 +194,8 @@ AMDGPURegisterBankInfo::addMappingFromTable(
Operands[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SizeI);
}
- unsigned MappingID = 0;
+ // getInstrMapping's default mapping uses ID 1, so start at 2.
+ unsigned MappingID = 2;
for (const auto &Entry : Table) {
for (unsigned I = 0; I < NumOps; ++I) {
int OpIdx = RegSrcOpIdx[I];
@@ -210,7 +213,7 @@ AMDGPURegisterBankInfo::addMappingFromTable(
RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsic(
const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_readlane: {
static const OpRegBankEntry<3> Table[2] = {
// Perfectly legal.
@@ -251,7 +254,7 @@ RegisterBankInfo::InstructionMappings
AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_buffer_load: {
static const OpRegBankEntry<3> Table[4] = {
// Perfectly legal.
@@ -303,6 +306,7 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
}
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
+ // FIXME: Should have no register for immediate
static const OpRegBankEntry<1> Table[2] = {
// Perfectly legal.
{ { AMDGPU::SGPRRegBankID }, 1 },
@@ -319,12 +323,15 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
}
}
-static bool isInstrUniform(const MachineInstr &MI) {
+// FIXME: Returns uniform if there's no source value information. This is
+// probably wrong.
+static bool isInstrUniformNonExtLoadAlign4(const MachineInstr &MI) {
if (!MI.hasOneMemOperand())
return false;
const MachineMemOperand *MMO = *MI.memoperands_begin();
- return AMDGPUInstrInfo::isUniformMMO(MMO);
+ return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
+ AMDGPUInstrInfo::isUniformMMO(MMO);
}
RegisterBankInfo::InstructionMappings
@@ -337,6 +344,31 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
InstructionMappings AltMappings;
switch (MI.getOpcode()) {
+ case TargetOpcode::G_CONSTANT: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ if (Size == 1) {
+ static const OpRegBankEntry<1> Table[4] = {
+ { { AMDGPU::VGPRRegBankID }, 1 },
+ { { AMDGPU::SGPRRegBankID }, 1 },
+ { { AMDGPU::VCCRegBankID }, 1 },
+ { { AMDGPU::SCCRegBankID }, 1 }
+ };
+
+ return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_FRAME_INDEX:
+ case TargetOpcode::G_GLOBAL_VALUE: {
+ static const OpRegBankEntry<1> Table[2] = {
+ { { AMDGPU::VGPRRegBankID }, 1 },
+ { { AMDGPU::SGPRRegBankID }, 1 }
+ };
+
+ return addMappingFromTable<1>(MI, MRI, {{ 0 }}, Table);
+ }
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR: {
@@ -408,23 +440,29 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AltMappings.push_back(&VSMapping);
break;
}
- case TargetOpcode::G_LOAD: {
+ case TargetOpcode::G_LOAD:
+ case TargetOpcode::G_ZEXTLOAD:
+ case TargetOpcode::G_SEXTLOAD: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
+ unsigned PtrSize = PtrTy.getSizeInBits();
+ unsigned AS = PtrTy.getAddressSpace();
LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
- // FIXME: Should we be hard coding the size for these mappings?
- if (isInstrUniform(MI)) {
+ if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
+ AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+ isInstrUniformNonExtLoadAlign4(MI)) {
const InstructionMapping &SSMapping = getInstructionMapping(
1, 1, getOperandsMapping(
{AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 64)}),
+ AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize)}),
2); // Num Operands
AltMappings.push_back(&SSMapping);
}
const InstructionMapping &VVMapping = getInstructionMapping(
2, 1, getOperandsMapping(
- {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64)}),
+ {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);
@@ -620,57 +658,53 @@ static LLT getHalfSizedType(LLT Ty) {
///
/// There is additional complexity to try for compare values to identify the
/// unique values used.
-void AMDGPURegisterBankInfo::executeInWaterfallLoop(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- ArrayRef<unsigned> OpIndices) const {
- MachineFunction *MF = MI.getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- MachineBasicBlock::iterator I(MI);
-
- MachineBasicBlock &MBB = *MI.getParent();
- const DebugLoc &DL = MI.getDebugLoc();
-
- // Use a set to avoid extra readfirstlanes in the case where multiple operands
- // are the same register.
- SmallSet<Register, 4> SGPROperandRegs;
- for (unsigned Op : OpIndices) {
- assert(MI.getOperand(Op).isUse());
- Register Reg = MI.getOperand(Op).getReg();
- const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
- if (OpBank->getID() == AMDGPU::VGPRRegBankID)
- SGPROperandRegs.insert(Reg);
- }
-
- // No operands need to be replaced, so no need to loop.
- if (SGPROperandRegs.empty())
- return;
-
- MachineIRBuilder B(MI);
+bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
+ MachineIRBuilder &B,
+ iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs,
+ MachineRegisterInfo &MRI) const {
SmallVector<Register, 4> ResultRegs;
SmallVector<Register, 4> InitResultRegs;
SmallVector<Register, 4> PhiRegs;
- for (MachineOperand &Def : MI.defs()) {
- LLT ResTy = MRI.getType(Def.getReg());
- const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
- ResultRegs.push_back(Def.getReg());
- Register InitReg = B.buildUndef(ResTy).getReg(0);
- Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
- InitResultRegs.push_back(InitReg);
- PhiRegs.push_back(PhiReg);
- MRI.setRegBank(PhiReg, *DefBank);
- MRI.setRegBank(InitReg, *DefBank);
+
+ MachineBasicBlock &MBB = B.getMBB();
+ MachineFunction *MF = &B.getMF();
+
+ const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass();
+ const unsigned WaveAndOpc = Subtarget.isWave32() ?
+ AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const unsigned MovTermOpc = Subtarget.isWave32() ?
+ AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
+ const unsigned XorTermOpc = Subtarget.isWave32() ?
+ AMDGPU::S_XOR_B32_term : AMDGPU::S_XOR_B64_term;
+ const unsigned AndSaveExecOpc = Subtarget.isWave32() ?
+ AMDGPU::S_AND_SAVEEXEC_B32 : AMDGPU::S_AND_SAVEEXEC_B64;
+ const unsigned ExecReg = Subtarget.isWave32() ?
+ AMDGPU::EXEC_LO : AMDGPU::EXEC;
+
+ for (MachineInstr &MI : Range) {
+ for (MachineOperand &Def : MI.defs()) {
+ LLT ResTy = MRI.getType(Def.getReg());
+ const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI);
+ ResultRegs.push_back(Def.getReg());
+ Register InitReg = B.buildUndef(ResTy).getReg(0);
+ Register PhiReg = MRI.createGenericVirtualRegister(ResTy);
+ InitResultRegs.push_back(InitReg);
+ PhiRegs.push_back(PhiReg);
+ MRI.setRegBank(PhiReg, *DefBank);
+ MRI.setRegBank(InitReg, *DefBank);
+ }
}
- Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- Register InitSaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ Register SaveExecReg = MRI.createVirtualRegister(WaveRC);
+ Register InitSaveExecReg = MRI.createVirtualRegister(WaveRC);
// Don't bother using generic instructions/registers for the exec mask.
B.buildInstr(TargetOpcode::IMPLICIT_DEF)
.addDef(InitSaveExecReg);
- Register PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- Register NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register PhiExec = MRI.createVirtualRegister(WaveRC);
+ Register NewExec = MRI.createVirtualRegister(WaveRC);
// To insert the loop we need to split the block. Move everything before this
// point to a new block, and insert a new empty block before this instruction.
@@ -688,7 +722,7 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop(
// Move the rest of the block into a new block.
RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
- RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+ RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end());
MBB.addSuccessor(LoopBB);
RestoreExecBB->addSuccessor(RemainderBB);
@@ -711,164 +745,173 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop(
.addMBB(LoopBB);
}
- // Move the instruction into the loop.
- LoopBB->splice(LoopBB->end(), &MBB, I);
- I = std::prev(LoopBB->end());
+ const DebugLoc &DL = B.getDL();
+
+ // Figure out the iterator range after splicing the instructions.
+ auto NewBegin = std::prev(LoopBB->end());
- B.setInstr(*I);
+ // Move the instruction into the loop. Note we moved everything after
+ // Range.end() already into a new block, so Range.end() is no longer valid.
+ LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
+
+ auto NewEnd = LoopBB->end();
+
+ MachineBasicBlock::iterator I = Range.begin();
+ B.setInsertPt(*LoopBB, I);
Register CondReg;
- for (MachineOperand &Op : MI.uses()) {
- if (!Op.isReg())
- continue;
+ for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
+ for (MachineOperand &Op : MI.uses()) {
+ if (!Op.isReg() || Op.isDef())
+ continue;
- assert(!Op.isDef());
- if (SGPROperandRegs.count(Op.getReg())) {
- LLT OpTy = MRI.getType(Op.getReg());
- unsigned OpSize = OpTy.getSizeInBits();
-
- // Can only do a readlane of 32-bit pieces.
- if (OpSize == 32) {
- // Avoid extra copies in the simple case of one 32-bit register.
- Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- MRI.setType(CurrentLaneOpReg, OpTy);
-
- constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg)
- .addReg(Op.getReg());
-
- Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- bool First = CondReg == AMDGPU::NoRegister;
- if (First)
- CondReg = NewCondReg;
-
- // Compare the just read M0 value to all possible Idx values.
- B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
- .addDef(NewCondReg)
- .addReg(CurrentLaneOpReg)
- .addReg(Op.getReg());
- Op.setReg(CurrentLaneOpReg);
-
- if (!First) {
- Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
-
- // If there are multiple operands to consider, and the conditions.
- B.buildInstr(AMDGPU::S_AND_B64)
- .addDef(AndReg)
- .addReg(NewCondReg)
- .addReg(CondReg);
- CondReg = AndReg;
- }
- } else {
- LLT S32 = LLT::scalar(32);
- SmallVector<Register, 8> ReadlanePieces;
+ if (SGPROperandRegs.count(Op.getReg())) {
+ LLT OpTy = MRI.getType(Op.getReg());
+ unsigned OpSize = OpTy.getSizeInBits();
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
+ // Can only do a readlane of 32-bit pieces.
+ if (OpSize == 32) {
+ // Avoid extra copies in the simple case of one 32-bit register.
+ Register CurrentLaneOpReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ MRI.setType(CurrentLaneOpReg, OpTy);
- bool Is64 = OpSize % 64 == 0;
+ constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(Op.getReg());
- LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
- unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
- : AMDGPU::V_CMP_EQ_U32_e64;
+ Register NewCondReg = MRI.createVirtualRegister(WaveRC);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
+ // Compare the just read M0 value to all possible Idx values.
+ B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(Op.getReg());
+ Op.setReg(CurrentLaneOpReg);
- // Insert the unmerge before the loop.
+ if (!First) {
+ Register AndReg = MRI.createVirtualRegister(WaveRC);
- B.setMBB(MBB);
- auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
- B.setInstr(*I);
+ // If there are multiple operands to consider, AND the conditions together.
+ B.buildInstr(WaveAndOpc)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ } else {
+ LLT S32 = LLT::scalar(32);
+ SmallVector<Register, 8> ReadlanePieces;
- unsigned NumPieces = Unmerge->getNumOperands() - 1;
- for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
- unsigned UnmergePiece = Unmerge.getReg(PieceIdx);
+ // The compares can be done as 64-bit, but the extract needs to be done
+ // in 32-bit pieces.
- Register CurrentLaneOpReg;
- if (Is64) {
- Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
- Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+ bool Is64 = OpSize % 64 == 0;
- MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
- MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
- MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+ LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
+ unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
+ : AMDGPU::V_CMP_EQ_U32_e64;
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegLo)
- .addReg(UnmergePiece, 0, AMDGPU::sub0);
+ // The compares can be done as 64-bit, but the extract needs to be done
+ // in 32-bit pieces.
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegHi)
- .addReg(UnmergePiece, 0, AMDGPU::sub1);
+ // Insert the unmerge before the loop.
- CurrentLaneOpReg =
- B.buildMerge(LLT::scalar(64),
- {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
- .getReg(0);
+ B.setMBB(MBB);
+ auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
+ B.setInstr(*I);
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
+ unsigned NumPieces = Unmerge->getNumOperands() - 1;
+ for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
+ Register UnmergePiece = Unmerge.getReg(PieceIdx);
- if (OpTy.getScalarSizeInBits() == 64) {
- // If we need to produce a 64-bit element vector, so use the
- // merged pieces
- ReadlanePieces.push_back(CurrentLaneOpReg);
+ Register CurrentLaneOpReg;
+ if (Is64) {
+ Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
+ Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegLo)
+ .addReg(UnmergePiece, 0, AMDGPU::sub0);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegHi)
+ .addReg(UnmergePiece, 0, AMDGPU::sub1);
+
+ CurrentLaneOpReg =
+ B.buildMerge(LLT::scalar(64),
+ {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
+ .getReg(0);
+
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
+
+ if (OpTy.getScalarSizeInBits() == 64) {
+ // If we need to produce a 64-bit element vector, use the
+ // merged pieces.
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ } else {
+ // 32-bit element type.
+ ReadlanePieces.push_back(CurrentLaneOpRegLo);
+ ReadlanePieces.push_back(CurrentLaneOpRegHi);
+ }
} else {
- // 32-bit element type.
- ReadlanePieces.push_back(CurrentLaneOpRegLo);
- ReadlanePieces.push_back(CurrentLaneOpRegHi);
+ CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(UnmergePiece);
+ ReadlanePieces.push_back(CurrentLaneOpReg);
}
- } else {
- CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
- MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpReg)
- .addReg(UnmergePiece);
- ReadlanePieces.push_back(CurrentLaneOpReg);
- }
+ Register NewCondReg = MRI.createVirtualRegister(WaveRC);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
- Register NewCondReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
- bool First = CondReg == AMDGPU::NoRegister;
- if (First)
- CondReg = NewCondReg;
+ B.buildInstr(CmpOp)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(UnmergePiece);
- B.buildInstr(CmpOp)
- .addDef(NewCondReg)
- .addReg(CurrentLaneOpReg)
- .addReg(UnmergePiece);
+ if (!First) {
+ Register AndReg = MRI.createVirtualRegister(WaveRC);
- if (!First) {
- Register AndReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+ // If there are multiple operands to consider, AND the conditions together.
+ B.buildInstr(WaveAndOpc)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ }
- // If there are multiple operands to consider, and the conditions.
- B.buildInstr(AMDGPU::S_AND_B64)
- .addDef(AndReg)
- .addReg(NewCondReg)
- .addReg(CondReg);
- CondReg = AndReg;
+ // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
+ // BUILD_VECTOR
+ if (OpTy.isVector()) {
+ auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
+ } else {
+ auto Merge = B.buildMerge(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
}
- }
- // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
- // BUILD_VECTOR
- if (OpTy.isVector()) {
- auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
- } else {
- auto Merge = B.buildMerge(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
+ MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
}
-
- MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID));
}
}
}
@@ -876,16 +919,16 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop(
B.setInsertPt(*LoopBB, LoopBB->end());
// Update EXEC, save the original EXEC value to VCC.
- B.buildInstr(AMDGPU::S_AND_SAVEEXEC_B64)
+ B.buildInstr(AndSaveExecOpc)
.addDef(NewExec)
.addReg(CondReg, RegState::Kill);
MRI.setSimpleHint(NewExec, CondReg);
// Update EXEC, switch all done bits to 0 and all todo bits to 1.
- B.buildInstr(AMDGPU::S_XOR_B64_term)
- .addDef(AMDGPU::EXEC)
- .addReg(AMDGPU::EXEC)
+ B.buildInstr(XorTermOpc)
+ .addDef(ExecReg)
+ .addReg(ExecReg)
.addReg(NewExec);
// XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
@@ -896,14 +939,60 @@ void AMDGPURegisterBankInfo::executeInWaterfallLoop(
.addMBB(LoopBB);
// Save the EXEC mask before the loop.
- BuildMI(MBB, MBB.end(), DL, TII->get(AMDGPU::S_MOV_B64_term), SaveExecReg)
- .addReg(AMDGPU::EXEC);
+ BuildMI(MBB, MBB.end(), DL, TII->get(MovTermOpc), SaveExecReg)
+ .addReg(ExecReg);
// Restore the EXEC mask after the loop.
B.setMBB(*RestoreExecBB);
- B.buildInstr(AMDGPU::S_MOV_B64_term)
- .addDef(AMDGPU::EXEC)
+ B.buildInstr(MovTermOpc)
+ .addDef(ExecReg)
.addReg(SaveExecReg);
+
+ // Restore the insert point before the original instruction.
+ B.setInsertPt(MBB, MBB.end());
+
+ return true;
+}
+
+// Return any unique registers used by \p MI at \p OpIndices that need to be
+// handled in a waterfall loop. Returns these registers in \p
+// SGPROperandRegs. Returns true if there are any operands to handle and a
+// waterfall loop is necessary.
+bool AMDGPURegisterBankInfo::collectWaterfallOperands(
+ SmallSet<Register, 4> &SGPROperandRegs, MachineInstr &MI,
+ MachineRegisterInfo &MRI, ArrayRef<unsigned> OpIndices) const {
+ for (unsigned Op : OpIndices) {
+ assert(MI.getOperand(Op).isUse());
+ Register Reg = MI.getOperand(Op).getReg();
+ const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI);
+ if (OpBank->getID() == AMDGPU::VGPRRegBankID)
+ SGPROperandRegs.insert(Reg);
+ }
+
+ // No operands need to be replaced, so no need to loop.
+ return !SGPROperandRegs.empty();
+}
+
+bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
+ MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const {
+ // Use a set to avoid extra readfirstlanes in the case where multiple operands
+ // are the same register.
+ SmallSet<Register, 4> SGPROperandRegs;
+
+ if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices))
+ return false;
+
+ MachineBasicBlock::iterator I = MI.getIterator();
+ return executeInWaterfallLoop(B, make_range(I, std::next(I)),
+ SGPROperandRegs, MRI);
+}
+
+bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const {
+ MachineIRBuilder B(MI);
+ return executeInWaterfallLoop(B, MI, MRI, OpIndices);
}
// Legalize an operand that must be an SGPR by inserting a readfirstlane.
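
The shape of the loop built here can be illustrated with a small host-side simulation of the waterfall idea (the 8-lane wave and the lane values below are made up): read the value of the first active lane, restrict execution to the lanes that match it, run the body, then retire those lanes until EXEC is empty.

#include <array>
#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical per-lane values of a divergent operand that must be uniform
  // (e.g. a descriptor register fed to a scalar-only instruction).
  std::array<uint32_t, 8> LaneVal = {3, 3, 7, 3, 7, 9, 9, 3};
  uint32_t Exec = 0xff; // stands in for the EXEC mask

  while (Exec) {
    // v_readfirstlane_b32: take the value from the lowest active lane.
    unsigned First = 0;
    while (!((Exec >> First) & 1))
      ++First;
    uint32_t Cur = LaneVal[First];

    // v_cmp_eq + s_and_saveexec: restrict execution to lanes holding Cur.
    uint32_t Match = 0;
    for (unsigned L = 0; L < LaneVal.size(); ++L)
      if (((Exec >> L) & 1) && LaneVal[L] == Cur)
        Match |= 1u << L;

    std::printf("iteration: value %u, lanes 0x%02x\n", (unsigned)Cur,
                (unsigned)Match);

    // s_xor_*_term: retire the handled lanes; loop until none remain.
    Exec &= ~Match;
  }
}
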
@@ -960,8 +1049,13 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
// If the pointer is an SGPR, we have nothing to do.
- if (SrcRegs.empty())
- return false;
+ if (SrcRegs.empty()) {
+ Register PtrReg = MI.getOperand(1).getReg();
+ const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
+ if (PtrBank == &AMDGPU::SGPRRegBank)
+ return false;
+ SrcRegs.push_back(PtrReg);
+ }
assert(LoadSize % MaxNonSmrdLoadSize == 0);
@@ -1013,6 +1107,33 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
return true;
}
+bool AMDGPURegisterBankInfo::applyMappingImage(
+ MachineInstr &MI, const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI, int RsrcIdx) const {
+ const int NumDefs = MI.getNumExplicitDefs();
+
+ // The reported argument index is relative to the IR intrinsic call arguments,
+ // so we need to shift by the number of defs and the intrinsic ID.
+ RsrcIdx += NumDefs + 1;
+
+ // Insert copies to VGPR arguments.
+ applyDefaultMapping(OpdMapper);
+
+ // Fixup any SGPR arguments.
+ SmallVector<unsigned, 4> SGPRIndexes;
+ for (int I = NumDefs, NumOps = MI.getNumOperands(); I != NumOps; ++I) {
+ if (!MI.getOperand(I).isReg())
+ continue;
+
+ // If this intrinsic has a sampler, it immediately follows rsrc.
+ if (I == RsrcIdx || I == RsrcIdx + 1)
+ SGPRIndexes.push_back(I);
+ }
+
+ executeInWaterfallLoop(MI, MRI, SGPRIndexes);
+ return true;
+}
+
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
static void substituteSimpleCopyRegs(
@@ -1024,6 +1145,184 @@ static void substituteSimpleCopyRegs(
}
}
+/// Handle register layout difference for f16 images for some subtargets.
+Register AMDGPURegisterBankInfo::handleD16VData(MachineIRBuilder &B,
+ MachineRegisterInfo &MRI,
+ Register Reg) const {
+ if (!Subtarget.hasUnpackedD16VMem())
+ return Reg;
+
+ const LLT S16 = LLT::scalar(16);
+ LLT StoreVT = MRI.getType(Reg);
+ if (!StoreVT.isVector() || StoreVT.getElementType() != S16)
+ return Reg;
+
+ auto Unmerge = B.buildUnmerge(S16, Reg);
+
+ SmallVector<Register, 4> WideRegs;
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ WideRegs.push_back(Unmerge.getReg(I));
+
+ const LLT S32 = LLT::scalar(32);
+ int NumElts = StoreVT.getNumElements();
+
+ return B.buildMerge(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+}
+
+static std::pair<Register, unsigned>
+getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
+ int64_t Const;
+ if (mi_match(Reg, MRI, m_ICst(Const)))
+ return std::make_pair(Register(), Const);
+
+ Register Base;
+ if (mi_match(Reg, MRI, m_GAdd(m_Reg(Base), m_ICst(Const))))
+ return std::make_pair(Base, Const);
+
+ // TODO: Handle G_OR used for add case
+ return std::make_pair(Reg, 0);
+}
+
+std::pair<Register, unsigned>
+AMDGPURegisterBankInfo::splitBufferOffsets(MachineIRBuilder &B,
+ Register OrigOffset) const {
+ const unsigned MaxImm = 4095;
+ Register BaseReg;
+ unsigned ImmOffset;
+ const LLT S32 = LLT::scalar(32);
+
+ std::tie(BaseReg, ImmOffset) = getBaseWithConstantOffset(*B.getMRI(),
+ OrigOffset);
+
+ unsigned C1 = 0;
+ if (ImmOffset != 0) {
+ // If the immediate value is too big for the immoffset field, put the value
+ // and -4096 into the immoffset field so that the value that is copied/added
+ // for the voffset field is a multiple of 4096, and it stands more chance
+ // of being CSEd with the copy/add for another similar load/store.
+ // However, do not do that rounding down to a multiple of 4096 if that is a
+ // negative number, as it appears to be illegal to have a negative offset
+ // in the vgpr, even if adding the immediate offset makes it positive.
+ unsigned Overflow = ImmOffset & ~MaxImm;
+ ImmOffset -= Overflow;
+ if ((int32_t)Overflow < 0) {
+ Overflow += ImmOffset;
+ ImmOffset = 0;
+ }
+
+ C1 = ImmOffset;
+ if (Overflow != 0) {
+ if (!BaseReg)
+ BaseReg = B.buildConstant(S32, Overflow).getReg(0);
+ else {
+ auto OverflowVal = B.buildConstant(S32, Overflow);
+ BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
+ }
+ }
+ }
+
+ if (!BaseReg)
+ BaseReg = B.buildConstant(S32, 0).getReg(0);
+
+ return {BaseReg, C1};
+}
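
The integer part of the split can be sketched on its own (splitOffset is a hypothetical stand-in that ignores the base-register bookkeeping above): at most the low 12 bits stay in the immediate field, the 4096-aligned remainder goes to the register operand, and the rounding is abandoned if that remainder would be negative as a signed 32-bit value.

#include <cstdint>
#include <cstdio>
#include <utility>

// Hypothetical stand-in for the arithmetic above: keep at most 12 bits
// (<= 4095) in the immediate field and move the 4096-aligned remainder into
// the register operand, unless that remainder would be negative as an int32.
static std::pair<uint32_t, uint32_t> splitOffset(uint32_t Offset) {
  const uint32_t MaxImm = 4095;
  uint32_t Overflow = Offset & ~MaxImm; // multiple of 4096 for the register
  uint32_t Imm = Offset - Overflow;     // low 12 bits for the immediate
  if ((int32_t)Overflow < 0) {
    Overflow += Imm;
    Imm = 0;
  }
  return {Overflow, Imm};
}

int main() {
  auto [RegPart, Imm] = splitOffset(4100); // -> register 4096, immediate 4
  std::printf("reg=%u imm=%u\n", (unsigned)RegPart, (unsigned)Imm);
}
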
+
+static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
+ int64_t C;
+ return mi_match(Reg, MRI, m_ICst(C)) && C == 0;
+}
+
+static unsigned extractGLC(unsigned CachePolicy) {
+ return CachePolicy & 1;
+}
+
+static unsigned extractSLC(unsigned CachePolicy) {
+ return (CachePolicy >> 1) & 1;
+}
+
+static unsigned extractDLC(unsigned CachePolicy) {
+ return (CachePolicy >> 2) & 1;
+}
+
+MachineInstr *
+AMDGPURegisterBankInfo::selectStoreIntrinsic(MachineIRBuilder &B,
+ MachineInstr &MI) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ executeInWaterfallLoop(B, MI, MRI, {2, 4});
+
+ // FIXME: DAG lowering brokenly changes opcode based on FP vs. integer.
+
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(VData);
+
+ int EltSize = Ty.getScalarSizeInBits();
+ int Size = Ty.getSizeInBits();
+
+ // FIXME: Broken integer truncstore.
+ if (EltSize != 32)
+ report_fatal_error("unhandled intrinsic store");
+
+ // FIXME: Verifier should enforce 1 MMO for these intrinsics.
+ const int MemSize = (*MI.memoperands_begin())->getSize();
+
+
+ Register RSrc = MI.getOperand(2).getReg();
+ Register VOffset = MI.getOperand(3).getReg();
+ Register SOffset = MI.getOperand(4).getReg();
+ unsigned CachePolicy = MI.getOperand(5).getImm();
+
+ unsigned ImmOffset;
+ std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
+
+ const bool Offen = !isZero(VOffset, MRI);
+
+ unsigned Opc = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact;
+ switch (8 * MemSize) {
+ case 8:
+ Opc = Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
+ break;
+ case 16:
+ Opc = Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
+ break;
+ default:
+ Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
+ AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
+ if (Size > 32)
+ Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
+ break;
+ }
+
+
+ // Set the insertion point back to the instruction in case it was moved into a
+ // loop.
+ B.setInstr(MI);
+
+ MachineInstrBuilder MIB = B.buildInstr(Opc)
+ .addUse(VData);
+
+ if (Offen)
+ MIB.addUse(VOffset);
+
+ MIB.addUse(RSrc)
+ .addUse(SOffset)
+ .addImm(ImmOffset)
+ .addImm(extractGLC(CachePolicy))
+ .addImm(extractSLC(CachePolicy))
+ .addImm(0) // tfe: FIXME: Remove from inst
+ .addImm(extractDLC(CachePolicy))
+ .cloneMemRefs(MI);
+
+ // FIXME: We need a way to report failure from applyMappingImpl.
+ // Insert constrain copies before inserting the loop.
+ if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
+ report_fatal_error("failed to constrain selected store intrinsic");
+
+ return MIB;
+}
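A hypothetical mirror of the size switch above, for illustration only: the OFFEN vs. OFFSET flavour additionally depends on whether the voffset is a known zero, and stores wider than 32 bits go through getMUBUFOpcode with the dword count.

// Not the patch's own code; a sketch of the width selection only.
enum class StoreWidth { Byte, Short, Dword, DwordXN };

StoreWidth pickStoreWidth(int MemSizeBytes, int ValueSizeBits) {
  switch (8 * MemSizeBytes) {
  case 8:  return StoreWidth::Byte;
  case 16: return StoreWidth::Short;
  default: return ValueSizeBits > 32 ? StoreWidth::DwordXN : StoreWidth::Dword;
  }
}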
+
void AMDGPURegisterBankInfo::applyMappingImpl(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
@@ -1289,12 +1588,202 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MI.eraseFromParent();
return;
}
- case AMDGPU::G_EXTRACT_VECTOR_ELT:
- applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, { 2 });
+ case AMDGPU::G_BUILD_VECTOR:
+ case AMDGPU::G_BUILD_VECTOR_TRUNC: {
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy != LLT::vector(2, 16))
+ break;
+
+ assert(MI.getNumOperands() == 3 && OpdMapper.getVRegs(0).empty());
+ substituteSimpleCopyRegs(OpdMapper, 1);
+ substituteSimpleCopyRegs(OpdMapper, 2);
+
+ const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
+ if (DstBank == &AMDGPU::SGPRRegBank)
+ break; // Can use S_PACK_* instructions.
+
+ MachineIRBuilder B(MI);
+
+ Register Lo = MI.getOperand(1).getReg();
+ Register Hi = MI.getOperand(2).getReg();
+ const LLT S32 = LLT::scalar(32);
+
+ const RegisterBank *BankLo = getRegBank(Lo, MRI, *TRI);
+ const RegisterBank *BankHi = getRegBank(Hi, MRI, *TRI);
+
+ Register ZextLo;
+ Register ShiftHi;
+
+ if (Opc == AMDGPU::G_BUILD_VECTOR) {
+ ZextLo = B.buildZExt(S32, Lo).getReg(0);
+ MRI.setRegBank(ZextLo, *BankLo);
+
+ Register ZextHi = B.buildZExt(S32, Hi).getReg(0);
+ MRI.setRegBank(ZextHi, *BankHi);
+
+ auto ShiftAmt = B.buildConstant(S32, 16);
+ MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
+
+ ShiftHi = B.buildShl(S32, ZextHi, ShiftAmt).getReg(0);
+ MRI.setRegBank(ShiftHi, *BankHi);
+ } else {
+ Register MaskLo = B.buildConstant(S32, 0xffff).getReg(0);
+ MRI.setRegBank(MaskLo, *BankLo);
+
+ auto ShiftAmt = B.buildConstant(S32, 16);
+ MRI.setRegBank(ShiftAmt.getReg(0), *BankHi);
+
+ ShiftHi = B.buildShl(S32, Hi, ShiftAmt).getReg(0);
+ MRI.setRegBank(ShiftHi, *BankHi);
+
+ ZextLo = B.buildAnd(S32, Lo, MaskLo).getReg(0);
+ MRI.setRegBank(ZextLo, *BankLo);
+ }
+
+ auto Or = B.buildOr(S32, ZextLo, ShiftHi);
+ MRI.setRegBank(Or.getReg(0), *DstBank);
+
+ B.buildBitcast(DstReg, Or);
+ MI.eraseFromParent();
+ return;
+ }
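The VGPR path above just packs the two 16-bit halves into one 32-bit register: mask or zero-extend the low element, shift the high element left by 16, OR them and bitcast. For example:

#include <cassert>
#include <cstdint>

void checkV2F16Pack() {
  uint32_t Lo = 0x3C00, Hi = 0x4000;            // half 1.0 and half 2.0
  uint32_t Packed = (Lo & 0xffff) | (Hi << 16); // what the and/zext, shl, or build
  assert(Packed == 0x40003C00);                 // element 0 lands in the low half
}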
+ case AMDGPU::G_EXTRACT_VECTOR_ELT: {
+ SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
+
+ assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
+
+ if (DstRegs.empty()) {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, { 2 });
+ return;
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register IdxReg = MI.getOperand(2).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ (void)DstTy;
+
+ assert(DstTy.getSizeInBits() == 64);
+
+ LLT SrcTy = MRI.getType(SrcReg);
+ const LLT S32 = LLT::scalar(32);
+ LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+
+ MachineIRBuilder B(MI);
+ auto CastSrc = B.buildBitcast(Vec32, SrcReg);
+ auto One = B.buildConstant(S32, 1);
+
+ // Split the vector index into 32-bit pieces. Prepare to move all of the
+ // new instructions into a waterfall loop if necessary.
+ //
+ // Don't put the bitcast or constant in the loop.
+ MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
+
+ // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
+ auto IdxLo = B.buildShl(S32, IdxReg, One);
+ auto IdxHi = B.buildAdd(S32, IdxLo, One);
+ B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
+ B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi);
+
+ const ValueMapping &DstMapping
+ = OpdMapper.getInstrMapping().getOperandMapping(0);
+
+ // FIXME: Should be getting from mapping or not?
+ const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
+ MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank);
+ MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
+ MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
+
+ SmallSet<Register, 4> OpsToWaterfall;
+ if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ // Remove the original instruction to avoid potentially confusing the
+ // waterfall loop logic.
+ B.setInstr(*Span.begin());
+ MI.eraseFromParent();
+ executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
+ OpsToWaterfall, MRI);
+ return;
+ }
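The index arithmetic in this case (and in the insert case below) is just IdxLo = 2 * Idx and IdxHi = 2 * Idx + 1 in the bitcast 32-bit vector; a quick check:

#include <cassert>

void checkSplitIndices() {
  unsigned Idx = 3;           // original 64-bit element index (example)
  unsigned IdxLo = Idx << 1;  // low 32-bit half  -> 6
  unsigned IdxHi = IdxLo + 1; // high 32-bit half -> 7
  assert(IdxLo == 6 && IdxHi == 7);
}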
+ case AMDGPU::G_INSERT_VECTOR_ELT: {
+ SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
+
+ assert(OpdMapper.getVRegs(0).empty());
+ assert(OpdMapper.getVRegs(1).empty());
+ assert(OpdMapper.getVRegs(3).empty());
+
+ if (InsRegs.empty()) {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, { 3 });
+ return;
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ Register InsReg = MI.getOperand(2).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+ LLT InsTy = MRI.getType(InsReg);
+ (void)InsTy;
+
+ assert(InsTy.getSizeInBits() == 64);
+
+ const LLT S32 = LLT::scalar(32);
+ LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+
+ MachineIRBuilder B(MI);
+ auto CastSrc = B.buildBitcast(Vec32, SrcReg);
+ auto One = B.buildConstant(S32, 1);
+
+ // Split the vector index into 32-bit pieces. Prepare to move all of the
+ // new instructions into a waterfall loop if necessary.
+ //
+ // Don't put the bitcast or constant in the loop.
+ MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
+
+ // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
+ auto IdxLo = B.buildShl(S32, IdxReg, One);
+ auto IdxHi = B.buildAdd(S32, IdxLo, One);
+
+ auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
+ auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
+ B.buildBitcast(DstReg, InsHi);
+
+ const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI);
+ const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI);
+ const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI);
+
+ MRI.setRegBank(InsReg, *InsSrcBank);
+ MRI.setRegBank(CastSrc.getReg(0), *SrcBank);
+ MRI.setRegBank(InsLo.getReg(0), *DstBank);
+ MRI.setRegBank(InsHi.getReg(0), *DstBank);
+ MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank);
+
+ SmallSet<Register, 4> OpsToWaterfall;
+ if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
+ MI.eraseFromParent();
+ return;
+ }
+
+ B.setInstr(*Span.begin());
+ MI.eraseFromParent();
+
+ executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
+ OpsToWaterfall, MRI);
return;
+ }
case AMDGPU::G_INTRINSIC: {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ switch (MI.getIntrinsicID()) {
case Intrinsic::amdgcn_s_buffer_load: {
// FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
executeInWaterfallLoop(MI, MRI, { 2, 3 });
@@ -1303,8 +1792,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case Intrinsic::amdgcn_readlane: {
substituteSimpleCopyRegs(OpdMapper, 2);
- assert(empty(OpdMapper.getVRegs(0)));
- assert(empty(OpdMapper.getVRegs(3)));
+ assert(OpdMapper.getVRegs(0).empty());
+ assert(OpdMapper.getVRegs(3).empty());
// Make sure the index is an SGPR. It doesn't make sense to run this in a
// waterfall loop, so assume it's a uniform value.
@@ -1312,9 +1801,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
case Intrinsic::amdgcn_writelane: {
- assert(empty(OpdMapper.getVRegs(0)));
- assert(empty(OpdMapper.getVRegs(2)));
- assert(empty(OpdMapper.getVRegs(3)));
+ assert(OpdMapper.getVRegs(0).empty());
+ assert(OpdMapper.getVRegs(2).empty());
+ assert(OpdMapper.getVRegs(3).empty());
substituteSimpleCopyRegs(OpdMapper, 4); // VGPR input val
constrainOpWithReadfirstlane(MI, MRI, 2); // Source value
@@ -1327,7 +1816,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
break;
}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ auto IntrID = MI.getIntrinsicID();
+ switch (IntrID) {
case Intrinsic::amdgcn_buffer_load: {
executeInWaterfallLoop(MI, MRI, { 2 });
return;
@@ -1335,23 +1825,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
// This is only allowed to execute with 1 lane, so readfirstlane is safe.
- assert(empty(OpdMapper.getVRegs(0)));
+ assert(OpdMapper.getVRegs(0).empty());
substituteSimpleCopyRegs(OpdMapper, 3);
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
return;
}
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_br: {
+ // Only the first lane executes, so readfirstlane is safe.
+ substituteSimpleCopyRegs(OpdMapper, 1);
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ return;
+ }
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all: {
+ // Only the first lane executes, so readfirstlane is safe.
+ constrainOpWithReadfirstlane(MI, MRI, 1); // M0
+ return;
+ }
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// FIXME: Should this use a waterfall loop?
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
return;
}
- default:
+ case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_raw_buffer_load_format:
+ case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_raw_buffer_store:
+ case Intrinsic::amdgcn_raw_buffer_store_format:
+ case Intrinsic::amdgcn_raw_tbuffer_store: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {2, 4});
+ return;
+ }
+ case Intrinsic::amdgcn_struct_buffer_load:
+ case Intrinsic::amdgcn_struct_buffer_store:
+ case Intrinsic::amdgcn_struct_tbuffer_load:
+ case Intrinsic::amdgcn_struct_tbuffer_store: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {2, 5});
+ return;
+ }
+ default: {
+ if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
+ AMDGPU::lookupRsrcIntrinsic(IntrID)) {
+ // Non-images can have complications from operands that allow both SGPR
+ // and VGPR. For now it's too complicated to figure out the final opcode
+ // to derive the register bank from the MCInstrDesc.
+ if (RSrcIntrin->IsImage) {
+ applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
+ return;
+ }
+ }
+
break;
}
+ }
break;
}
- case AMDGPU::G_LOAD: {
+ case AMDGPU::G_LOAD:
+ case AMDGPU::G_ZEXTLOAD:
+ case AMDGPU::G_SEXTLOAD: {
if (applyMappingWideLoad(MI, OpdMapper, MRI))
return;
break;
@@ -1452,25 +1989,71 @@ AMDGPURegisterBankInfo::getDefaultMappingAllVGPR(const MachineInstr &MI) const {
}
const RegisterBankInfo::InstructionMapping &
+AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ int RsrcIdx) const {
+ // The reported argument index is relative to the IR intrinsic call arguments,
+ // so we need to shift by the number of defs and the intrinsic ID.
+ RsrcIdx += MI.getNumExplicitDefs() + 1;
+
+ const int NumOps = MI.getNumOperands();
+ SmallVector<const ValueMapping *, 8> OpdsMapping(NumOps);
+
+ // TODO: Should packed/unpacked D16 difference be reported here as part of
+ // the value mapping?
+ for (int I = 0; I != NumOps; ++I) {
+ if (!MI.getOperand(I).isReg())
+ continue;
+
+ Register OpReg = MI.getOperand(I).getReg();
+ unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
+
+ // FIXME: Probably need a new intrinsic register bank searchable table to
+ // handle arbitrary intrinsics easily.
+ //
+ // If this has a sampler, it immediately follows rsrc.
+ const bool MustBeSGPR = I == RsrcIdx || I == RsrcIdx + 1;
+
+ if (MustBeSGPR) {
+ // This must be an SGPR, so report whatever bank it currently has as legal.
+ unsigned NewBank = getRegBankID(OpReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ OpdsMapping[I] = AMDGPU::getValueMapping(NewBank, Size);
+ } else {
+ // Some operands must be VGPR, and these are easy to copy to.
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ }
+ }
+
+ return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
+}
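A quick check of the operand-index shift above, with hypothetical numbers: for an image load with one def and the rsrc as IR argument 1, the rsrc ends up as machine operand 3 (def, intrinsic ID, argument 0, then the rsrc).

#include <cassert>

void checkRsrcOperandIndex() {
  int RsrcArg = 1;         // rsrc position among the IR call arguments (example)
  int NumExplicitDefs = 1; // e.g. an image load producing one result
  int RsrcIdx = RsrcArg + NumExplicitDefs + 1; // skip the defs and intrinsic ID
  assert(RsrcIdx == 3);    // machine-operand index of the rsrc register
}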
+
+const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
+ SmallVector<const ValueMapping*, 2> OpdsMapping(2);
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
- unsigned PtrSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ Register PtrReg = MI.getOperand(1).getReg();
+ LLT PtrTy = MRI.getType(PtrReg);
+ unsigned AS = PtrTy.getAddressSpace();
+ unsigned PtrSize = PtrTy.getSizeInBits();
const ValueMapping *ValMapping;
const ValueMapping *PtrMapping;
- if (isInstrUniform(MI)) {
+ const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
+
+ if (PtrBank == &AMDGPU::SGPRRegBank &&
+ (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
+ AS != AMDGPUAS::PRIVATE_ADDRESS) &&
+ isInstrUniformNonExtLoadAlign4(MI)) {
// We have a uniform instruction so we want to use an SMRD load
ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
} else {
ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
- // FIXME: What would happen if we used SGPRRegBankID here?
PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
}
@@ -1494,6 +2077,31 @@ AMDGPURegisterBankInfo::getRegBankID(Register Reg,
return Bank ? Bank->getID() : Default;
}
+
+static unsigned regBankUnion(unsigned RB0, unsigned RB1) {
+ return (RB0 == AMDGPU::SGPRRegBankID && RB1 == AMDGPU::SGPRRegBankID) ?
+ AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+}
+
+const RegisterBankInfo::ValueMapping *
+AMDGPURegisterBankInfo::getSGPROpMapping(Register Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ // Lie and claim anything is legal, even though this needs to be an SGPR;
+ // applyMapping will have to deal with it as a waterfall loop.
+ unsigned Bank = getRegBankID(Reg, MRI, TRI, AMDGPU::SGPRRegBankID);
+ unsigned Size = getSizeInBits(Reg, MRI, TRI);
+ return AMDGPU::getValueMapping(Bank, Size);
+}
+
+const RegisterBankInfo::ValueMapping *
+AMDGPURegisterBankInfo::getVGPROpMapping(Register Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ unsigned Size = getSizeInBits(Reg, MRI, TRI);
+ return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+}
+
///
/// This function must return a legal mapping, because
/// AMDGPURegisterBankInfo::getInstrAlternativeMappings() is not called
@@ -1536,7 +2144,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
int ResultBank = -1;
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
- unsigned Reg = MI.getOperand(I).getReg();
+ Register Reg = MI.getOperand(I).getReg();
const RegisterBank *Bank = getRegBank(Reg, MRI, *TRI);
// FIXME: Assuming VGPR for any undetermined inputs.
@@ -1660,7 +2268,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
LLVM_FALLTHROUGH;
}
-
case AMDGPU::G_GEP:
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
@@ -1669,15 +2276,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_LSHR:
case AMDGPU::G_ASHR:
case AMDGPU::G_UADDO:
- case AMDGPU::G_SADDO:
case AMDGPU::G_USUBO:
- case AMDGPU::G_SSUBO:
case AMDGPU::G_UADDE:
case AMDGPU::G_SADDE:
case AMDGPU::G_USUBE:
case AMDGPU::G_SSUBE:
- case AMDGPU::G_UMULH:
- case AMDGPU::G_SMULH:
case AMDGPU::G_SMIN:
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
@@ -1692,17 +2295,32 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FPTOUI:
case AMDGPU::G_FMUL:
case AMDGPU::G_FMA:
+ case AMDGPU::G_FMAD:
case AMDGPU::G_FSQRT:
+ case AMDGPU::G_FFLOOR:
+ case AMDGPU::G_FCEIL:
+ case AMDGPU::G_FRINT:
case AMDGPU::G_SITOFP:
case AMDGPU::G_UITOFP:
case AMDGPU::G_FPTRUNC:
case AMDGPU::G_FPEXT:
case AMDGPU::G_FEXP2:
case AMDGPU::G_FLOG2:
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FCANONICALIZE:
case AMDGPU::G_INTRINSIC_TRUNC:
case AMDGPU::G_INTRINSIC_ROUND:
+ case AMDGPU::G_AMDGPU_FFBH_U32:
+ return getDefaultMappingVOP(MI);
+ case AMDGPU::G_UMULH:
+ case AMDGPU::G_SMULH: {
+ if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
+ }
case AMDGPU::G_IMPLICIT_DEF: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
@@ -1710,12 +2328,19 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_FCONSTANT:
case AMDGPU::G_CONSTANT:
- case AMDGPU::G_FRAME_INDEX:
+ case AMDGPU::G_GLOBAL_VALUE:
case AMDGPU::G_BLOCK_ADDR: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
+ case AMDGPU::G_FRAME_INDEX: {
+ // TODO: This should be the same as other constants, but eliminateFrameIndex
+ // currently assumes VALU uses.
+ unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ break;
+ }
case AMDGPU::G_INSERT: {
unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
AMDGPU::VGPRRegBankID;
@@ -1737,8 +2362,25 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = nullptr;
break;
}
- case AMDGPU::G_MERGE_VALUES:
case AMDGPU::G_BUILD_VECTOR:
+ case AMDGPU::G_BUILD_VECTOR_TRUNC: {
+ LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ if (DstTy == LLT::vector(2, 16)) {
+ unsigned DstSize = DstTy.getSizeInBits();
+ unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned Src0BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned Src1BankID = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned DstBankID = regBankUnion(Src0BankID, Src1BankID);
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(DstBankID, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Src0BankID, SrcSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Src1BankID, SrcSize);
+ break;
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case AMDGPU::G_MERGE_VALUES:
case AMDGPU::G_CONCAT_VECTORS: {
unsigned Bank = isSALUMapping(MI) ?
AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
@@ -1760,6 +2402,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_CTTZ_ZERO_UNDEF:
case AMDGPU::G_CTPOP:
case AMDGPU::G_BSWAP:
+ case AMDGPU::G_BITREVERSE:
case AMDGPU::G_FABS:
case AMDGPU::G_FNEG: {
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
@@ -1848,7 +2491,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
Op3Bank == AMDGPU::SGPRRegBankID &&
(Size == 32 || (Size == 64 &&
(Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
- MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));
+ Subtarget.hasScalarCompareEq64()));
unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;
@@ -1859,14 +2502,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_EXTRACT_VECTOR_ELT: {
- unsigned OutputBankID = isSALUMapping(MI) ?
- AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
+ // A VGPR index can be used for a waterfall loop when indexing an SGPR vector.
+ unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
+ unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank);
- OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
- OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize);
+ OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize);
 // The index can be in either bank if the source vector is a VGPR.
OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize);
@@ -1879,15 +2524,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
- unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
- unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
+ unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
+ MRI, *TRI);
+ unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
- OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
- OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize);
+ OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID,
+ InsertSize);
 // The index can be in either bank if the source vector is a VGPR.
- OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
break;
}
case AMDGPU::G_UNMERGE_VALUES: {
@@ -1903,11 +2551,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_INTRINSIC: {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
+ switch (MI.getIntrinsicID()) {
default:
return getInvalidInstructionMapping();
- case Intrinsic::maxnum:
- case Intrinsic::minnum:
case Intrinsic::amdgcn_div_fmas:
case Intrinsic::amdgcn_trig_preop:
case Intrinsic::amdgcn_sin:
@@ -1938,6 +2584,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_mbcnt_hi:
case Intrinsic::amdgcn_ubfe:
case Intrinsic::amdgcn_sbfe:
+ case Intrinsic::amdgcn_mul_u24:
+ case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_lerp:
case Intrinsic::amdgcn_sad_u8:
case Intrinsic::amdgcn_msad_u8:
@@ -1956,10 +2604,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_udot4:
case Intrinsic::amdgcn_sdot8:
case Intrinsic::amdgcn_udot8:
- case Intrinsic::amdgcn_fdiv_fast:
case Intrinsic::amdgcn_wwm:
case Intrinsic::amdgcn_wqm:
return getDefaultMappingVOP(MI);
+ case Intrinsic::amdgcn_ds_swizzle:
case Intrinsic::amdgcn_ds_permute:
case Intrinsic::amdgcn_ds_bpermute:
case Intrinsic::amdgcn_update_dpp:
@@ -2040,7 +2688,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_readlane: {
// This must be an SGPR, but accept a VGPR.
- unsigned IdxReg = MI.getOperand(3).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize);
@@ -2055,10 +2703,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case Intrinsic::amdgcn_writelane: {
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- unsigned SrcReg = MI.getOperand(2).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
unsigned SrcBank = getRegBankID(SrcReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
- unsigned IdxReg = MI.getOperand(3).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
unsigned IdxSize = MRI.getType(IdxReg).getSizeInBits();
unsigned IdxBank = getRegBankID(IdxReg, MRI, *TRI, AMDGPU::SGPRRegBankID);
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
@@ -2081,9 +2729,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
- switch (MI.getOperand(MI.getNumExplicitDefs()).getIntrinsicID()) {
- default:
- return getInvalidInstructionMapping();
+ auto IntrID = MI.getIntrinsicID();
+ switch (IntrID) {
case Intrinsic::amdgcn_s_getreg:
case Intrinsic::amdgcn_s_memtime:
case Intrinsic::amdgcn_s_memrealtime:
@@ -2123,18 +2770,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
break;
case Intrinsic::amdgcn_exp:
- OpdsMapping[0] = nullptr; // IntrinsicID
- // FIXME: These are immediate values which can't be read from registers.
- OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
// FIXME: Could we support packed types here?
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
- // FIXME: These are immediate values which can't be read from registers.
- OpdsMapping[7] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- OpdsMapping[8] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
break;
case Intrinsic::amdgcn_buffer_load: {
Register RSrc = MI.getOperand(2).getReg(); // SGPR
@@ -2169,11 +2809,97 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
}
- case Intrinsic::amdgcn_end_cf: {
+ case Intrinsic::amdgcn_end_cf:
+ case Intrinsic::amdgcn_init_exec: {
+ unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+ case Intrinsic::amdgcn_else: {
+ unsigned WaveSize = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, WaveSize);
+ break;
+ }
+ case Intrinsic::amdgcn_kill: {
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
+ break;
+ }
+ case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_raw_tbuffer_load: {
+ // FIXME: Should make intrinsic ID the last operand of the instruction,
+ // then this would be the same as store
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_raw_buffer_store:
+ case Intrinsic::amdgcn_raw_buffer_store_format:
+ case Intrinsic::amdgcn_raw_tbuffer_store: {
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_struct_buffer_load:
+ case Intrinsic::amdgcn_struct_tbuffer_load: {
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_struct_buffer_store:
+ case Intrinsic::amdgcn_struct_tbuffer_store: {
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+ break;
+ }
+ case Intrinsic::amdgcn_init_exec_from_input: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ break;
+ }
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_br: {
+ OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+
+ // This must be an SGPR, but accept a VGPR.
+ unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
}
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all: {
+ // This must be an SGPR, but accept a VGPR.
+ unsigned Bank = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[1] = AMDGPU::getValueMapping(Bank, 32);
+ break;
+ }
+ default:
+ if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
+ AMDGPU::lookupRsrcIntrinsic(IntrID)) {
+ // Non-images can have complications from operands that allow both SGPR
+ // and VGPR. For now it's too complicated to figure out the final opcode
+ // to derive the register bank from the MCInstrDesc.
+ if (RSrcIntrin->IsImage)
+ return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
+ }
+
+ return getInvalidInstructionMapping();
}
break;
}
@@ -2216,6 +2942,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
case AMDGPU::G_LOAD:
+ case AMDGPU::G_ZEXTLOAD:
+ case AMDGPU::G_SEXTLOAD:
return getInstrMappingForLoad(MI);
case AMDGPU::G_ATOMICRMW_XCHG:
@@ -2228,6 +2956,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_MIN:
case AMDGPU::G_ATOMICRMW_UMAX:
case AMDGPU::G_ATOMICRMW_UMIN:
+ case AMDGPU::G_ATOMICRMW_FADD:
case AMDGPU::G_ATOMIC_CMPXCHG: {
return getDefaultMappingAllVGPR(MI);
}
@@ -2247,4 +2976,3 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
getOperandsMapping(OpdsMapping),
MI.getNumOperands());
}
-
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index f3a96e2a6128..a14b74961118 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -13,6 +13,8 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
@@ -23,7 +25,9 @@
namespace llvm {
class LLT;
+class GCNSubtarget;
class MachineIRBuilder;
+class SIInstrInfo;
class SIRegisterInfo;
class TargetRegisterInfo;
@@ -36,9 +40,27 @@ protected:
#include "AMDGPUGenRegisterBank.inc"
};
class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
+ const GCNSubtarget &Subtarget;
const SIRegisterInfo *TRI;
-
- void executeInWaterfallLoop(MachineInstr &MI,
+ const SIInstrInfo *TII;
+
+ bool collectWaterfallOperands(
+ SmallSet<Register, 4> &SGPROperandRegs,
+ MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const;
+
+ bool executeInWaterfallLoop(
+ MachineIRBuilder &B,
+ iterator_range<MachineBasicBlock::iterator> Range,
+ SmallSet<Register, 4> &SGPROperandRegs,
+ MachineRegisterInfo &MRI) const;
+
+ bool executeInWaterfallLoop(MachineIRBuilder &B,
+ MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ArrayRef<unsigned> OpIndices) const;
+ bool executeInWaterfallLoop(MachineInstr &MI,
MachineRegisterInfo &MRI,
ArrayRef<unsigned> OpIndices) const;
@@ -47,6 +69,19 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
bool applyMappingWideLoad(MachineInstr &MI,
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI) const;
+ bool
+ applyMappingImage(MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI, int RSrcIdx) const;
+
+ Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ Register Reg) const;
+
+ std::pair<Register, unsigned>
+ splitBufferOffsets(MachineIRBuilder &B, Register Offset) const;
+
+ MachineInstr *selectStoreIntrinsic(MachineIRBuilder &B,
+ MachineInstr &MI) const;
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
@@ -58,6 +93,16 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
const TargetRegisterInfo &TRI,
unsigned Default = AMDGPU::VGPRRegBankID) const;
+ // Return a value mapping for an operand that is required to be an SGPR.
+ const ValueMapping *getSGPROpMapping(Register Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const;
+
+ // Return a value mapping for an operand that is required to be a VGPR.
+ const ValueMapping *getVGPROpMapping(Register Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const;
+
/// Split 64-bit value \p Reg into two 32-bit halves and populate them into \p
/// Regs. This appropriately sets the regbank of the new registers.
void split64BitValueForMapping(MachineIRBuilder &B,
@@ -90,8 +135,13 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo {
const InstructionMapping &getDefaultMappingVOP(const MachineInstr &MI) const;
const InstructionMapping &getDefaultMappingAllVGPR(
const MachineInstr &MI) const;
+
+ const InstructionMapping &getImageMapping(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ int RsrcIdx) const;
+
public:
- AMDGPURegisterBankInfo(const TargetRegisterInfo &TRI);
+ AMDGPURegisterBankInfo(const GCNSubtarget &STI);
unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
unsigned Size) const override;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index 9555694fb106..00f53b157577 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -7,14 +7,14 @@
//===----------------------------------------------------------------------===//
def SGPRRegBank : RegisterBank<"SGPR",
- [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512]
+ [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512, SReg_1024]
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512]
+ [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024]
>;
def SCCRegBank : RegisterBank <"SCC", [SReg_32, SCC_CLASS]>;
// It is helpful to distinguish conditions from ordinary SGPRs.
-def VCCRegBank : RegisterBank <"VCC", [SReg_64]>;
+def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
index 7cffdf1a4dcf..9806e6b0714f 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
@@ -26,19 +26,59 @@ AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
// they are not supported at this time.
//===----------------------------------------------------------------------===//
-unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) {
- static const unsigned SubRegs[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
- AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7, AMDGPU::sub8, AMDGPU::sub9,
- AMDGPU::sub10, AMDGPU::sub11, AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14,
- AMDGPU::sub15, AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
- AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23, AMDGPU::sub24,
- AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27, AMDGPU::sub28, AMDGPU::sub29,
- AMDGPU::sub30, AMDGPU::sub31
- };
-
- assert(Channel < array_lengthof(SubRegs));
- return SubRegs[Channel];
+// Table of NumRegs-sized sub-register pieces at every 32-bit channel offset.
+static const uint16_t SubRegFromChannelTable[][32] = {
+ { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+ AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+ AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+ AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31
+ },
+ {
+ AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, AMDGPU::sub3_sub4,
+ AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, AMDGPU::sub6_sub7, AMDGPU::sub7_sub8,
+ AMDGPU::sub8_sub9, AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12,
+ AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, AMDGPU::sub15_sub16,
+ AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, AMDGPU::sub18_sub19, AMDGPU::sub19_sub20,
+ AMDGPU::sub20_sub21, AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24,
+ AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, AMDGPU::sub27_sub28,
+ AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, AMDGPU::sub30_sub31, AMDGPU::NoSubRegister
+ },
+ {
+ AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5,
+ AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9,
+ AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13,
+ AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17,
+ AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21,
+ AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25,
+ AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29,
+ AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister
+ },
+ {
+ AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6,
+ AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10,
+ AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14,
+ AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18,
+ AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22,
+ AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26,
+ AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30,
+ AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister
+ }
+};
+
+// FIXME: TableGen should generate something to make this manageable for all
+// register classes. At a minimum we could use the opposite of
+// composeSubRegIndices and go up from the base 32-bit subreg.
+unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel, unsigned NumRegs) {
+ const unsigned NumRegIndex = NumRegs - 1;
+
+ assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) &&
+ "Not implemented");
+ assert(Channel < array_lengthof(SubRegFromChannelTable[0]));
+ return SubRegFromChannelTable[NumRegIndex][Channel];
}
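A usage sketch, assuming the AMDGPU generated headers are in scope: NumRegs - 1 picks the table row and Channel the column.

unsigned exampleSubReg() {
  // Row NumRegs - 1 == 1, column Channel == 1 of the table above.
  return AMDGPURegisterInfo::getSubRegFromChannel(/*Channel=*/1, /*NumRegs=*/2);
  // == AMDGPU::sub1_sub2, the 64-bit piece starting at 32-bit channel 1.
}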
void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index 3453a8c1b0b3..9e713ca804a1 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -28,7 +28,7 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
/// \returns the sub reg enum value for the given \p Channel
/// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
- static unsigned getSubRegFromChannel(unsigned Channel);
+ static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1);
void reserveRegisterTuples(BitVector &, unsigned Reg) const;
};
diff --git a/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index f8703c36127a..26b8b7840270 100644
--- a/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -81,6 +81,8 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_umax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_and>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_or>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
@@ -92,6 +94,8 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_umax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_and>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_or>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 1eb9b83456c5..3bb6dd4571c0 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -175,6 +175,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
HasFminFmaxLegacy(true),
EnablePromoteAlloca(false),
HasTrigReducedRange(false),
+ MaxWavesPerEU(10),
LocalMemorySize(0),
WavefrontSize(0)
{ }
@@ -261,6 +262,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
AddNoCarryInsts(false),
HasUnpackedD16VMem(false),
LDSMisalignedBug(false),
+ HasMFMAInlineLiteralBug(false),
ScalarizeGlobal(false),
@@ -278,9 +280,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
TLInfo(TM, *this),
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
+ MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
- RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo()));
+ RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
InstSelector.reset(new AMDGPUInstructionSelector(
*this, *static_cast<AMDGPURegisterBankInfo *>(RegBankInfo.get()), TM));
}
@@ -489,28 +492,28 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
}
uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
- unsigned &MaxAlign) const {
+ Align &MaxAlign) const {
assert(F.getCallingConv() == CallingConv::AMDGPU_KERNEL ||
F.getCallingConv() == CallingConv::SPIR_KERNEL);
const DataLayout &DL = F.getParent()->getDataLayout();
uint64_t ExplicitArgBytes = 0;
- MaxAlign = 1;
+ MaxAlign = Align::None();
for (const Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
- unsigned Align = DL.getABITypeAlignment(ArgTy);
+ const Align Alignment(DL.getABITypeAlignment(ArgTy));
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
- ExplicitArgBytes = alignTo(ExplicitArgBytes, Align) + AllocSize;
- MaxAlign = std::max(MaxAlign, Align);
+ ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
+ MaxAlign = std::max(MaxAlign, Alignment);
}
return ExplicitArgBytes;
}
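A worked example of the loop above, assuming the usual ABI alignments of 4 for i32 and 8 for double:

#include <cstdint>

static uint64_t alignUp(uint64_t X, uint64_t A) { return (X + A - 1) / A * A; }

uint64_t kernArgSizeExample() {
  uint64_t Bytes = 0;
  Bytes = alignUp(Bytes, 4) + 4; // i32 arg:    offset 0, size 4 -> 4
  Bytes = alignUp(Bytes, 8) + 8; // double arg: offset 8, size 8 -> 16
  return Bytes;                  // 16, and MaxAlign would end up as Align(8)
}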
unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
- unsigned &MaxAlign) const {
+ Align &MaxAlign) const {
uint64_t ExplicitArgBytes = getExplicitKernArgSize(F, MaxAlign);
unsigned ExplicitOffset = getExplicitKernelArgOffset(F);
@@ -518,7 +521,7 @@ unsigned AMDGPUSubtarget::getKernArgSegmentSize(const Function &F,
uint64_t TotalSize = ExplicitOffset + ExplicitArgBytes;
unsigned ImplicitBytes = getImplicitArgNumBytes(F);
if (ImplicitBytes != 0) {
- unsigned Alignment = getAlignmentForImplicitArgPtr();
+ const Align Alignment = getAlignmentForImplicitArgPtr();
TotalSize = alignTo(ExplicitArgBytes, Alignment) + ImplicitBytes;
}
@@ -566,7 +569,7 @@ bool GCNSubtarget::hasMadF16() const {
unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
if (getGeneration() >= AMDGPUSubtarget::GFX10)
- return 10;
+ return getMaxWavesPerEU();
if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
if (SGPRs <= 80)
@@ -591,25 +594,12 @@ unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
}
unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
- if (VGPRs <= 24)
- return 10;
- if (VGPRs <= 28)
- return 9;
- if (VGPRs <= 32)
- return 8;
- if (VGPRs <= 36)
- return 7;
- if (VGPRs <= 40)
- return 6;
- if (VGPRs <= 48)
- return 5;
- if (VGPRs <= 64)
- return 4;
- if (VGPRs <= 84)
- return 3;
- if (VGPRs <= 128)
- return 2;
- return 1;
+ unsigned MaxWaves = getMaxWavesPerEU();
+ unsigned Granule = getVGPRAllocGranule();
+ if (VGPRs < Granule)
+ return MaxWaves;
+ unsigned RoundedRegs = ((VGPRs + Granule - 1) / Granule) * Granule;
+ return std::min(std::max(getTotalNumVGPRs() / RoundedRegs, 1u), MaxWaves);
}
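A rough standalone reading of the new formula, with assumed numbers (granule 4, 256 VGPRs per SIMD, 10 waves max); the real values come from the subtarget queries:

#include <algorithm>

unsigned occupancyWithVGPRs(unsigned VGPRs, unsigned Granule = 4,
                            unsigned TotalVGPRs = 256, unsigned MaxWaves = 10) {
  if (VGPRs < Granule)
    return MaxWaves;
  unsigned Rounded = ((VGPRs + Granule - 1) / Granule) * Granule;
  return std::min(std::max(TotalVGPRs / Rounded, 1u), MaxWaves);
}
// With these numbers: occupancyWithVGPRs(40) == 6 and occupancyWithVGPRs(65) == 3.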
unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
@@ -629,6 +619,20 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
+unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
+ unsigned LDSSize,
+ unsigned NumSGPRs,
+ unsigned NumVGPRs) const {
+ unsigned Occupancy =
+ std::min(getMaxWavesPerEU(),
+ getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
+ if (NumSGPRs)
+ Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
+ if (NumVGPRs)
+ Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs));
+ return Occupancy;
+}
+
unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
const Function &F = MF.getFunction();
const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
@@ -878,8 +882,8 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(llvm::make_unique<MemOpClusterMutation>(&InstrInfo));
- Mutations.push_back(llvm::make_unique<FillMFMAShadowMutation>(&InstrInfo));
+ Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
+ Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}
const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) {
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 78c3b823946d..936feb00c62b 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -75,6 +75,7 @@ protected:
bool HasFminFmaxLegacy;
bool EnablePromoteAlloca;
bool HasTrigReducedRange;
+ unsigned MaxWavesPerEU;
int LocalMemorySize;
unsigned WavefrontSize;
@@ -195,8 +196,8 @@ public:
return LocalMemorySize;
}
- unsigned getAlignmentForImplicitArgPtr() const {
- return isAmdHsaOS() ? 8 : 4;
+ Align getAlignmentForImplicitArgPtr() const {
+ return isAmdHsaOS() ? Align(8) : Align(4);
}
/// Returns the offset in bytes from the start of the input buffer
@@ -223,7 +224,9 @@ public:
/// subtarget.
virtual unsigned getMinWavesPerEU() const = 0;
- unsigned getMaxWavesPerEU() const { return 10; }
+ /// \returns Maximum number of waves per execution unit supported by the
+ /// subtarget without any kind of limitation.
+ unsigned getMaxWavesPerEU() const { return MaxWavesPerEU; }
/// Creates value range metadata on an workitemid.* inrinsic call or load.
bool makeLIDRangeMetadata(Instruction *I) const;
@@ -235,16 +238,17 @@ public:
return 16;
return AMDGPU::getIntegerAttribute(F, "amdgpu-implicitarg-num-bytes", 0);
}
- uint64_t getExplicitKernArgSize(const Function &F,
- unsigned &MaxAlign) const;
- unsigned getKernArgSegmentSize(const Function &F,
- unsigned &MaxAlign) const;
+ uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
+ unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
virtual ~AMDGPUSubtarget() {}
};
class GCNSubtarget : public AMDGPUGenSubtargetInfo,
public AMDGPUSubtarget {
+
+ using AMDGPUSubtarget::getMaxWavesPerEU;
+
public:
enum TrapHandlerAbi {
TrapHandlerAbiNone = 0,
@@ -362,6 +366,7 @@ protected:
bool CaymanISA;
bool CFALUBug;
bool LDSMisalignedBug;
+ bool HasMFMAInlineLiteralBug;
bool HasVertexCache;
short TexVTXClauseSize;
bool ScalarizeGlobal;
@@ -416,7 +421,7 @@ public:
return CallLoweringInfo.get();
}
- const InstructionSelector *getInstructionSelector() const override {
+ InstructionSelector *getInstructionSelector() const override {
return InstSelector.get();
}
@@ -544,6 +549,14 @@ public:
return GFX9Insts;
}
+ bool hasScalarPackInsts() const {
+ return GFX9Insts;
+ }
+
+ bool hasScalarMulHiInsts() const {
+ return GFX9Insts;
+ }
+
TrapHandlerAbi getTrapHandlerAbi() const {
return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
}
@@ -611,6 +624,11 @@ public:
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
+ /// \returns If target supports S_DENORM_MODE.
+ bool hasDenormModeInst() const {
+ return getGeneration() >= AMDGPUSubtarget::GFX10;
+ }
+
bool useFlatForGlobal() const {
return FlatForGlobal;
}
@@ -848,9 +866,7 @@ public:
// on the pointer value itself may rely on the alignment / known low bits of
// the pointer. Set this to something above the minimum to avoid needing
// dynamic realignment in common cases.
- unsigned getStackAlignment() const {
- return 16;
- }
+ Align getStackAlignment() const { return Align(16); }
bool enableMachineScheduler() const override {
return true;
@@ -881,12 +897,6 @@ public:
return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
}
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget without any kind of limitation.
- unsigned getMaxWavesPerEU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerEU();
- }
-
/// \returns Number of waves per work group supported by the subtarget and
/// limited by given \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
@@ -944,6 +954,14 @@ public:
return HasDPP;
}
+ bool hasDPPBroadcasts() const {
+ return HasDPP && getGeneration() < GFX10;
+ }
+
+ bool hasDPPWavefrontShifts() const {
+ return HasDPP && getGeneration() < GFX10;
+ }
+
bool hasDPP8() const {
return HasDPP8;
}
@@ -974,6 +992,10 @@ public:
return SGPRInitBug;
}
+ bool hasMFMAInlineLiteralBug() const {
+ return HasMFMAInlineLiteralBug;
+ }
+
bool has12DWordStoreHazard() const {
return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
}
@@ -1036,6 +1058,13 @@ public:
/// VGPRs
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs) const;
+ /// Return occupancy for the given function, taking the used LDS and the
+ /// number of registers into account if provided.
+ /// Note that occupancy can be affected by the scratch allocation as well,
+ /// but we do not have enough information to compute it.
+ unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0,
+ unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
+
/// \returns true if the flat_scratch register should be initialized with the
/// pointer to the wave's scratch memory rather than a size and offset.
bool flatScratchIsPointer() const {
@@ -1226,9 +1255,7 @@ public:
return Gen;
}
- unsigned getStackAlignment() const {
- return 4;
- }
+ Align getStackAlignment() const { return Align(4); }
R600Subtarget &initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS);
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0ea8db04c298..e8cf77161a14 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -238,16 +238,17 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUUseNativeCallsPass(*PR);
initializeAMDGPUSimplifyLibCallsPass(*PR);
initializeAMDGPUInlinerPass(*PR);
+ initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeGCNRegBankReassignPass(*PR);
initializeGCNNSAReassignPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
- return llvm::make_unique<AMDGPUTargetObjectFile>();
+ return std::make_unique<AMDGPUTargetObjectFile>();
}
static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
- return new ScheduleDAGMILive(C, llvm::make_unique<R600SchedStrategy>());
+ return new ScheduleDAGMILive(C, std::make_unique<R600SchedStrategy>());
}
static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
@@ -257,7 +258,7 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
ScheduleDAGMILive *DAG =
- new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
+ new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
@@ -412,6 +413,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
PM.add(createAMDGPUExternalAAWrapperPass());
}
PM.add(createAMDGPUUnifyMetadataPass());
+ PM.add(createAMDGPUPrintfRuntimeBinding());
PM.add(createAMDGPUPropagateAttributesLatePass(this));
if (Internalize) {
PM.add(createInternalizePass(mustPreserveGV));
@@ -482,7 +484,7 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl(
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
+ I = std::make_unique<R600Subtarget>(TargetTriple, GPU, FS, *this);
}
return I.get();
@@ -518,7 +520,7 @@ const GCNSubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = llvm::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
+ I = std::make_unique<GCNSubtarget>(TargetTriple, GPU, FS, *this);
}
I->setScalarizeGlobalBehavior(ScalarizeGlobal);
@@ -659,6 +661,8 @@ void AMDGPUPassConfig::addIRPasses() {
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
+ addPass(createAMDGPUPrintfRuntimeBinding());
+
// This must occur before inlining, as the inliner will not look through
// bitcast calls.
addPass(createAMDGPUFixFunctionBitcastsPass());
@@ -681,12 +685,6 @@ void AMDGPUPassConfig::addIRPasses() {
// without ever running any passes on the second.
addPass(createBarrierNoopPass());
- if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
- // TODO: May want to move later or split into an early and late one.
-
- addPass(createAMDGPUCodeGenPreparePass());
- }
-
// Handle uses of OpenCL image2d_t, image3d_t and sampler_t arguments.
if (TM.getTargetTriple().getArch() == Triple::r600)
addPass(createR600OpenCLImageTypeLoweringPass());
@@ -714,6 +712,11 @@ void AMDGPUPassConfig::addIRPasses() {
}
}
+ if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
+ // TODO: May want to move later or split into an early and late one.
+ addPass(createAMDGPUCodeGenPreparePass());
+ }
+
TargetPassConfig::addIRPasses();
// EarlyCSE is not always strong enough to clean up what LSR produces. For
@@ -1046,7 +1049,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
return true;
if (MFI->ScratchRSrcReg != AMDGPU::PRIVATE_RSRC_REG &&
- !AMDGPU::SReg_128RegClass.contains(MFI->ScratchRSrcReg)) {
+ !AMDGPU::SGPR_128RegClass.contains(MFI->ScratchRSrcReg)) {
return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
}
@@ -1095,7 +1098,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
if (YamlMFI.ArgInfo &&
(parseAndCheckArgument(YamlMFI.ArgInfo->PrivateSegmentBuffer,
- AMDGPU::SReg_128RegClass,
+ AMDGPU::SGPR_128RegClass,
MFI->ArgInfo.PrivateSegmentBuffer, 4, 0) ||
parseAndCheckArgument(YamlMFI.ArgInfo->DispatchPtr,
AMDGPU::SReg_64RegClass, MFI->ArgInfo.DispatchPtr,
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index aaed280a1270..616196ad5ba3 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -57,7 +57,7 @@ using namespace llvm;
static cl::opt<unsigned> UnrollThresholdPrivate(
"amdgpu-unroll-threshold-private",
cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
- cl::init(2500), cl::Hidden);
+ cl::init(2000), cl::Hidden);
static cl::opt<unsigned> UnrollThresholdLocal(
"amdgpu-unroll-threshold-local",
@@ -590,6 +590,61 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
return false;
}
+bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
+ Intrinsic::ID IID) const {
+ switch (IID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax:
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private:
+ OpIndexes.push_back(0);
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
+ IntrinsicInst *II, Value *OldV, Value *NewV) const {
+ auto IntrID = II->getIntrinsicID();
+ switch (IntrID) {
+ case Intrinsic::amdgcn_atomic_inc:
+ case Intrinsic::amdgcn_atomic_dec:
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax: {
+ const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
+ if (!IsVolatile->isZero())
+ return false;
+ Module *M = II->getParent()->getParent()->getParent();
+ Type *DestTy = II->getType();
+ Type *SrcTy = NewV->getType();
+ Function *NewDecl =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+ II->setArgOperand(0, NewV);
+ II->setCalledFunction(NewDecl);
+ return true;
+ }
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private: {
+ unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
+ AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ LLVMContext &Ctx = NewV->getType()->getContext();
+ ConstantInt *NewVal = (TrueAS == NewAS) ?
+ ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
+ II->replaceAllUsesWith(NewVal);
+ II->eraseFromParent();
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
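
As a rough standalone sketch (not the in-tree TTI interface), the rewrite above reduces llvm.amdgcn.is.shared and llvm.amdgcn.is.private to an address-space comparison once the pointer's address space is known. The enum values below follow the usual AMDGPUAS numbering and, like the helper names, are assumptions for illustration only.

#include <cstdio>

// Assumed AMDGPUAS numbering (LOCAL = 3, PRIVATE = 5); illustrative only.
enum AddrSpace { FLAT = 0, GLOBAL = 1, LOCAL = 3, CONSTANT = 4, PRIVATE = 5 };

// Mirrors the ConstantInt::getTrue/getFalse replacement: once NewV has a
// concrete address space, is_shared/is_private collapse to constants.
static bool foldIsShared(AddrSpace NewAS)  { return NewAS == LOCAL; }
static bool foldIsPrivate(AddrSpace NewAS) { return NewAS == PRIVATE; }

int main() {
  std::printf("is_shared(LOCAL)    = %d\n", foldIsShared(LOCAL));    // 1
  std::printf("is_shared(GLOBAL)   = %d\n", foldIsShared(GLOBAL));   // 0
  std::printf("is_private(PRIVATE) = %d\n", foldIsPrivate(PRIVATE)); // 1
  return 0;
}
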
unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp) {
if (ST->hasVOP3PInsts()) {
@@ -638,6 +693,39 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
CommonTTI.getUnrollingPreferences(L, SE, UP);
}
+unsigned GCNTTIImpl::getUserCost(const User *U,
+ ArrayRef<const Value *> Operands) {
+ // Estimate extractelement elimination
+ if (const ExtractElementInst *EE = dyn_cast<ExtractElementInst>(U)) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(EE->getOperand(1));
+ unsigned Idx = -1;
+ if (CI)
+ Idx = CI->getZExtValue();
+ return getVectorInstrCost(EE->getOpcode(), EE->getOperand(0)->getType(),
+ Idx);
+ }
+
+ // Estimate insertelement elimination
+ if (const InsertElementInst *IE = dyn_cast<InsertElementInst>(U)) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
+ unsigned Idx = -1;
+ if (CI)
+ Idx = CI->getZExtValue();
+ return getVectorInstrCost(IE->getOpcode(), IE->getType(), Idx);
+ }
+
+ // Estimate different intrinsics, e.g. llvm.fabs
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+ SmallVector<Value *, 4> Args(II->arg_operands());
+ FastMathFlags FMF;
+ if (auto *FPMO = dyn_cast<FPMathOperator>(II))
+ FMF = FPMO->getFastMathFlags();
+ return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
+ FMF);
+ }
+ return BaseT::getUserCost(U, Operands);
+}
+
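
A hedged sketch of the dispatch pattern in getUserCost above: when the lane index of an extract/insert is a compile-time constant it is forwarded to the per-element cost hook, otherwise the -1 sentinel (wrapped to unsigned) marks the index as unknown. The helper names and cost numbers below are placeholders, not LLVM APIs.

#include <cstdio>

// Placeholder cost hook; the real getVectorInstrCost lives in the TTI hierarchy.
static unsigned vectorInstrCost(unsigned /*Opcode*/, unsigned Idx) {
  return Idx == unsigned(-1) ? 1u : 0u; // illustrative: an unknown lane costs more
}

// Mirrors the pattern above: take the constant index when available, fall back
// to the -1 sentinel when the index is not a compile-time constant.
static unsigned extractElementCost(const int *MaybeConstIdx) {
  unsigned Idx = -1;
  if (MaybeConstIdx)
    Idx = unsigned(*MaybeConstIdx);
  return vectorInstrCost(/*ExtractElement*/ 0, Idx);
}

int main() {
  int Lane = 2;
  std::printf("known lane   -> cost %u\n", extractElementCost(&Lane));
  std::printf("dynamic lane -> cost %u\n", extractElementCost(nullptr));
}
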
unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
return 4 * 128; // XXX - 4 channels. Should these count as vector instead?
}
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 6f1bf5a26f0d..67f7f9074f10 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -46,10 +46,18 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
Triple TargetTriple;
+ const TargetSubtargetInfo *ST;
+ const TargetLoweringBase *TLI;
+
+ const TargetSubtargetInfo *getST() const { return ST; }
+ const TargetLoweringBase *getTLI() const { return TLI; }
+
public:
explicit AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
- : BaseT(TM, F.getParent()->getDataLayout()),
- TargetTriple(TM->getTargetTriple()) {}
+ : BaseT(TM, F.getParent()->getDataLayout()),
+ TargetTriple(TM->getTargetTriple()),
+ ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
+ TLI(ST->getTargetLowering()) {}
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
@@ -183,6 +191,11 @@ public:
return AMDGPUAS::FLAT_ADDRESS;
}
+ bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
+ Intrinsic::ID IID) const;
+ bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
+ Value *OldV, Value *NewV) const;
+
unsigned getVectorSplitCost() { return 0; }
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
@@ -191,7 +204,7 @@ public:
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
- unsigned getInliningThresholdMultiplier() { return 7; }
+ unsigned getInliningThresholdMultiplier() { return 9; }
int getInlinerVectorBonusPercent() { return 0; }
@@ -201,6 +214,7 @@ public:
int getMinMaxReductionCost(Type *Ty, Type *CondTy,
bool IsPairwiseForm,
bool IsUnsigned);
+ unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
};
class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 12f2e9519c9e..101ecfc0c87c 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -1307,8 +1307,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
if (LandBlkHasOtherPred) {
report_fatal_error("Extra register needed to handle CFG");
- unsigned CmpResReg =
- HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+ Register CmpResReg =
+ HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
report_fatal_error("Extra compare instruction needed to handle CFG");
insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET,
CmpResReg, DebugLoc());
@@ -1316,8 +1316,8 @@ int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
// XXX: We are running this after RA, so creating virtual registers will
// cause an assertion failure in the PostRA scheduling pass.
- unsigned InitReg =
- HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
+ Register InitReg =
+ HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
insertCondBranchBefore(LandBlk, I, R600::IF_PREDICATE_SET, InitReg,
DebugLoc());
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 6d678966c98e..9dd511fab57c 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -143,6 +143,7 @@ public:
ImmTyDLC,
ImmTyGLC,
ImmTySLC,
+ ImmTySWZ,
ImmTyTFE,
ImmTyD16,
ImmTyClampSI,
@@ -216,14 +217,15 @@ public:
if (Kind == Token)
return true;
- if (Kind != Expression || !Expr)
- return false;
-
// When parsing operands, we can't always tell if something was meant to be
// a token, like 'gds', or an expression that references a global variable.
// In this case, we assume the string is an expression, and if we need to
// interpret it as a token, then we treat the symbol name as the token.
- return isa<MCSymbolRefExpr>(Expr);
+ return isSymbolRefExpr();
+ }
+
+ bool isSymbolRefExpr() const {
+ return isExpr() && Expr && isa<MCSymbolRefExpr>(Expr);
}
bool isImm() const override {
@@ -274,8 +276,10 @@ public:
isRegClass(AMDGPU::VReg_64RegClassID) ||
isRegClass(AMDGPU::VReg_96RegClassID) ||
isRegClass(AMDGPU::VReg_128RegClassID) ||
+ isRegClass(AMDGPU::VReg_160RegClassID) ||
isRegClass(AMDGPU::VReg_256RegClassID) ||
- isRegClass(AMDGPU::VReg_512RegClassID);
+ isRegClass(AMDGPU::VReg_512RegClassID) ||
+ isRegClass(AMDGPU::VReg_1024RegClassID);
}
bool isVReg32() const {
@@ -286,6 +290,10 @@ public:
return isOff() || isVReg32();
}
+ bool isNull() const {
+ return isRegKind() && getReg() == AMDGPU::SGPR_NULL;
+ }
+
bool isSDWAOperand(MVT type) const;
bool isSDWAFP16Operand() const;
bool isSDWAFP32Operand() const;
@@ -325,6 +333,7 @@ public:
bool isDLC() const { return isImmTy(ImmTyDLC); }
bool isGLC() const { return isImmTy(ImmTyGLC); }
bool isSLC() const { return isImmTy(ImmTySLC); }
+ bool isSWZ() const { return isImmTy(ImmTySWZ); }
bool isTFE() const { return isImmTy(ImmTyTFE); }
bool isD16() const { return isImmTy(ImmTyD16); }
bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); }
@@ -817,6 +826,7 @@ public:
case ImmTyDLC: OS << "DLC"; break;
case ImmTyGLC: OS << "GLC"; break;
case ImmTySLC: OS << "SLC"; break;
+ case ImmTySWZ: OS << "SWZ"; break;
case ImmTyTFE: OS << "TFE"; break;
case ImmTyD16: OS << "D16"; break;
case ImmTyFORMAT: OS << "FORMAT"; break;
@@ -886,7 +896,7 @@ public:
int64_t Val, SMLoc Loc,
ImmTy Type = ImmTyNone,
bool IsFPImm = false) {
- auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser);
+ auto Op = std::make_unique<AMDGPUOperand>(Immediate, AsmParser);
Op->Imm.Val = Val;
Op->Imm.IsFPImm = IsFPImm;
Op->Imm.Type = Type;
@@ -899,7 +909,7 @@ public:
static AMDGPUOperand::Ptr CreateToken(const AMDGPUAsmParser *AsmParser,
StringRef Str, SMLoc Loc,
bool HasExplicitEncodingSize = true) {
- auto Res = llvm::make_unique<AMDGPUOperand>(Token, AsmParser);
+ auto Res = std::make_unique<AMDGPUOperand>(Token, AsmParser);
Res->Tok.Data = Str.data();
Res->Tok.Length = Str.size();
Res->StartLoc = Loc;
@@ -910,7 +920,7 @@ public:
static AMDGPUOperand::Ptr CreateReg(const AMDGPUAsmParser *AsmParser,
unsigned RegNo, SMLoc S,
SMLoc E) {
- auto Op = llvm::make_unique<AMDGPUOperand>(Register, AsmParser);
+ auto Op = std::make_unique<AMDGPUOperand>(Register, AsmParser);
Op->Reg.RegNo = RegNo;
Op->Reg.Mods = Modifiers();
Op->StartLoc = S;
@@ -920,7 +930,7 @@ public:
static AMDGPUOperand::Ptr CreateExpr(const AMDGPUAsmParser *AsmParser,
const class MCExpr *Expr, SMLoc S) {
- auto Op = llvm::make_unique<AMDGPUOperand>(Expression, AsmParser);
+ auto Op = std::make_unique<AMDGPUOperand>(Expression, AsmParser);
Op->Expr = Expr;
Op->StartLoc = S;
Op->EndLoc = S;
@@ -1051,11 +1061,23 @@ private:
std::string &CollectString);
bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
- RegisterKind RegKind, unsigned Reg1,
- unsigned RegNum);
+ RegisterKind RegKind, unsigned Reg1);
bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
- unsigned& RegNum, unsigned& RegWidth,
- unsigned *DwordRegIndex);
+ unsigned& RegNum, unsigned& RegWidth);
+ unsigned ParseRegularReg(RegisterKind &RegKind,
+ unsigned &RegNum,
+    !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc, SWZ:$swz))
+ unsigned ParseSpecialReg(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth);
+ unsigned ParseRegList(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth);
+ bool ParseRegRange(unsigned& Num, unsigned& Width);
+ unsigned getRegularReg(RegisterKind RegKind,
+ unsigned RegNum,
+ unsigned RegWidth);
+
bool isRegister();
bool isRegister(const AsmToken &Token, const AsmToken &NextToken) const;
Optional<StringRef> getGprCountSymbolName(RegisterKind RegKind);
@@ -1306,6 +1328,7 @@ private:
bool validateOpSel(const MCInst &Inst);
bool validateVccOperand(unsigned Reg) const;
bool validateVOP3Literal(const MCInst &Inst) const;
+ unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
@@ -1321,6 +1344,7 @@ private:
void peekTokens(MutableArrayRef<AsmToken> Tokens);
AsmToken::TokenKind getTokenKind() const;
bool parseExpr(int64_t &Imm);
+ bool parseExpr(OperandVector &Operands);
StringRef getTokenStr() const;
AsmToken peekToken();
AsmToken getToken() const;
@@ -1399,9 +1423,12 @@ public:
void cvtSdwaVOP1(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands);
+ void cvtSdwaVOP2e(MCInst &Inst, const OperandVector &Operands);
void cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands);
void cvtSDWA(MCInst &Inst, const OperandVector &Operands,
- uint64_t BasicInstType, bool skipVcc = false);
+ uint64_t BasicInstType,
+ bool SkipDstVcc = false,
+ bool SkipSrcVcc = false);
AMDGPUOperand::Ptr defaultBLGP() const;
AMDGPUOperand::Ptr defaultCBSZ() const;
@@ -1636,8 +1663,8 @@ bool AMDGPUOperand::isSDWAInt32Operand() const {
}
bool AMDGPUOperand::isBoolReg() const {
- return AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] ?
- isSCSrcB64() : isSCSrcB32();
+ return (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize64] && isSCSrcB64()) ||
+ (AsmParser->getFeatureBits()[AMDGPU::FeatureWavefrontSize32] && isSCSrcB32());
}
uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
@@ -1849,6 +1876,8 @@ static bool isInlineValue(unsigned Reg) {
case AMDGPU::SRC_EXECZ:
case AMDGPU::SRC_SCC:
return true;
+ case AMDGPU::SGPR_NULL:
+ return true;
default:
return false;
}
@@ -1870,8 +1899,10 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 2: return AMDGPU::VReg_64RegClassID;
case 3: return AMDGPU::VReg_96RegClassID;
case 4: return AMDGPU::VReg_128RegClassID;
+ case 5: return AMDGPU::VReg_160RegClassID;
case 8: return AMDGPU::VReg_256RegClassID;
case 16: return AMDGPU::VReg_512RegClassID;
+ case 32: return AMDGPU::VReg_1024RegClassID;
}
} else if (Is == IS_TTMP) {
switch (RegWidth) {
@@ -1944,7 +1975,7 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("tba_lo", AMDGPU::TBA_LO)
.Case("tba_hi", AMDGPU::TBA_HI)
.Case("null", AMDGPU::SGPR_NULL)
- .Default(0);
+ .Default(AMDGPU::NoRegister);
}
bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
@@ -1959,8 +1990,7 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
}
bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
- RegisterKind RegKind, unsigned Reg1,
- unsigned RegNum) {
+ RegisterKind RegKind, unsigned Reg1) {
switch (RegKind) {
case IS_SPECIAL:
if (Reg == AMDGPU::EXEC_LO && Reg1 == AMDGPU::EXEC_HI) {
@@ -2008,14 +2038,37 @@ bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
}
}
-static const StringRef Registers[] = {
- { "v" },
- { "s" },
- { "ttmp" },
- { "acc" },
- { "a" },
+struct RegInfo {
+ StringLiteral Name;
+ RegisterKind Kind;
+};
+
+static constexpr RegInfo RegularRegisters[] = {
+ {{"v"}, IS_VGPR},
+ {{"s"}, IS_SGPR},
+ {{"ttmp"}, IS_TTMP},
+ {{"acc"}, IS_AGPR},
+ {{"a"}, IS_AGPR},
};
+static bool isRegularReg(RegisterKind Kind) {
+ return Kind == IS_VGPR ||
+ Kind == IS_SGPR ||
+ Kind == IS_TTMP ||
+ Kind == IS_AGPR;
+}
+
+static const RegInfo* getRegularRegInfo(StringRef Str) {
+ for (const RegInfo &Reg : RegularRegisters)
+ if (Str.startswith(Reg.Name))
+ return &Reg;
+ return nullptr;
+}
+
+static bool getRegNum(StringRef Str, unsigned& Num) {
+ return !Str.getAsInteger(10, Num);
+}
+
bool
AMDGPUAsmParser::isRegister(const AsmToken &Token,
const AsmToken &NextToken) const {
@@ -2029,24 +2082,24 @@ AMDGPUAsmParser::isRegister(const AsmToken &Token,
// A single register like s0 or a range of registers like s[0:1]
- StringRef RegName = Token.getString();
-
- for (StringRef Reg : Registers) {
- if (RegName.startswith(Reg)) {
- if (Reg.size() < RegName.size()) {
- unsigned RegNum;
- // A single register with an index: rXX
- if (!RegName.substr(Reg.size()).getAsInteger(10, RegNum))
- return true;
- } else {
- // A range of registers: r[XX:YY].
- if (NextToken.is(AsmToken::LBrac))
- return true;
- }
+ StringRef Str = Token.getString();
+ const RegInfo *Reg = getRegularRegInfo(Str);
+ if (Reg) {
+ StringRef RegName = Reg->Name;
+ StringRef RegSuffix = Str.substr(RegName.size());
+ if (!RegSuffix.empty()) {
+ unsigned Num;
+ // A single register with an index: rXX
+ if (getRegNum(RegSuffix, Num))
+ return true;
+ } else {
+ // A range of registers: r[XX:YY].
+ if (NextToken.is(AsmToken::LBrac))
+ return true;
}
}
- return getSpecialRegForName(RegName);
+ return getSpecialRegForName(Str) != AMDGPU::NoRegister;
}
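
Restating the recognition rule above outside the MC layer: a token is treated as a register name if it is a regular prefix (v, s, ttmp, acc, a) followed by a decimal index, a bare prefix followed by '[', or one of the special names; otherwise it is left to the expression parser. A small sketch under those assumptions (the special-name list below is abbreviated):

#include <cctype>
#include <cstdio>
#include <string>

static const char *Prefixes[] = {"ttmp", "acc", "v", "s", "a"}; // longest match first

static bool isDecimal(const std::string &S) {
  if (S.empty()) return false;
  for (char C : S)
    if (!std::isdigit(static_cast<unsigned char>(C))) return false;
  return true;
}

// Token alone ("v12") or token followed by '[' ("s" then "[0:3]").
static bool looksLikeRegister(const std::string &Tok, bool NextIsLBrac) {
  for (const char *P : Prefixes) {
    std::string Pref(P);
    if (Tok.compare(0, Pref.size(), Pref) == 0) {
      std::string Suffix = Tok.substr(Pref.size());
      if (!Suffix.empty()) {
        if (isDecimal(Suffix))  // single register with an index: v12
          return true;
      } else if (NextIsLBrac) { // a range: s[0:3]
        return true;
      }
      break;                    // fall through to the special-name check
    }
  }
  // A handful of assumed special names; the real table is much longer.
  return Tok == "vcc" || Tok == "exec" || Tok == "m0" || Tok == "null";
}

int main() {
  std::printf("v12   -> %d\n", looksLikeRegister("v12", false));  // 1
  std::printf("s [   -> %d\n", looksLikeRegister("s", true));     // 1
  std::printf("vfoo  -> %d\n", looksLikeRegister("vfoo", false)); // 0
}
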
bool
@@ -2055,137 +2108,161 @@ AMDGPUAsmParser::isRegister()
return isRegister(getToken(), peekToken());
}
-bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
- unsigned &RegNum, unsigned &RegWidth,
- unsigned *DwordRegIndex) {
- if (DwordRegIndex) { *DwordRegIndex = 0; }
+unsigned
+AMDGPUAsmParser::getRegularReg(RegisterKind RegKind,
+ unsigned RegNum,
+ unsigned RegWidth) {
+
+ assert(isRegularReg(RegKind));
+
+ unsigned AlignSize = 1;
+ if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
+ // SGPR and TTMP registers must be aligned.
+ // Max required alignment is 4 dwords.
+ AlignSize = std::min(RegWidth, 4u);
+ }
+
+ if (RegNum % AlignSize != 0)
+ return AMDGPU::NoRegister;
+
+ unsigned RegIdx = RegNum / AlignSize;
+ int RCID = getRegClass(RegKind, RegWidth);
+ if (RCID == -1)
+ return AMDGPU::NoRegister;
+
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
- if (getLexer().is(AsmToken::Identifier)) {
- StringRef RegName = Parser.getTok().getString();
- if ((Reg = getSpecialRegForName(RegName))) {
- Parser.Lex();
- RegKind = IS_SPECIAL;
- } else {
- unsigned RegNumIndex = 0;
- if (RegName[0] == 'v') {
- RegNumIndex = 1;
- RegKind = IS_VGPR;
- } else if (RegName[0] == 's') {
- RegNumIndex = 1;
- RegKind = IS_SGPR;
- } else if (RegName[0] == 'a') {
- RegNumIndex = RegName.startswith("acc") ? 3 : 1;
- RegKind = IS_AGPR;
- } else if (RegName.startswith("ttmp")) {
- RegNumIndex = strlen("ttmp");
- RegKind = IS_TTMP;
- } else {
- return false;
- }
- if (RegName.size() > RegNumIndex) {
- // Single 32-bit register: vXX.
- if (RegName.substr(RegNumIndex).getAsInteger(10, RegNum))
- return false;
- Parser.Lex();
- RegWidth = 1;
- } else {
- // Range of registers: v[XX:YY]. ":YY" is optional.
- Parser.Lex();
- int64_t RegLo, RegHi;
- if (getLexer().isNot(AsmToken::LBrac))
- return false;
- Parser.Lex();
+ const MCRegisterClass RC = TRI->getRegClass(RCID);
+ if (RegIdx >= RC.getNumRegs())
+ return AMDGPU::NoRegister;
- if (getParser().parseAbsoluteExpression(RegLo))
- return false;
+ return RC.getRegister(RegIdx);
+}
- const bool isRBrace = getLexer().is(AsmToken::RBrac);
- if (!isRBrace && getLexer().isNot(AsmToken::Colon))
- return false;
- Parser.Lex();
+bool
+AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) {
+ int64_t RegLo, RegHi;
+ if (!trySkipToken(AsmToken::LBrac))
+ return false;
- if (isRBrace) {
- RegHi = RegLo;
- } else {
- if (getParser().parseAbsoluteExpression(RegHi))
- return false;
+ if (!parseExpr(RegLo))
+ return false;
- if (getLexer().isNot(AsmToken::RBrac))
- return false;
- Parser.Lex();
- }
- RegNum = (unsigned) RegLo;
- RegWidth = (RegHi - RegLo) + 1;
- }
- }
- } else if (getLexer().is(AsmToken::LBrac)) {
- // List of consecutive registers: [s0,s1,s2,s3]
- Parser.Lex();
- if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, nullptr))
- return false;
- if (RegWidth != 1)
+ if (trySkipToken(AsmToken::Colon)) {
+ if (!parseExpr(RegHi))
return false;
- RegisterKind RegKind1;
- unsigned Reg1, RegNum1, RegWidth1;
- do {
- if (getLexer().is(AsmToken::Comma)) {
- Parser.Lex();
- } else if (getLexer().is(AsmToken::RBrac)) {
- Parser.Lex();
- break;
- } else if (ParseAMDGPURegister(RegKind1, Reg1, RegNum1, RegWidth1, nullptr)) {
- if (RegWidth1 != 1) {
- return false;
- }
- if (RegKind1 != RegKind) {
- return false;
- }
- if (!AddNextRegisterToList(Reg, RegWidth, RegKind1, Reg1, RegNum1)) {
- return false;
- }
- } else {
- return false;
- }
- } while (true);
} else {
- return false;
+ RegHi = RegLo;
}
- switch (RegKind) {
- case IS_SPECIAL:
+
+ if (!trySkipToken(AsmToken::RBrac))
+ return false;
+
+ if (!isUInt<32>(RegLo) || !isUInt<32>(RegHi) || RegLo > RegHi)
+ return false;
+
+ Num = static_cast<unsigned>(RegLo);
+ Width = (RegHi - RegLo) + 1;
+ return true;
+}
+
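
For the range and alignment handling above, a small arithmetic sketch with assumed register numbers: an inclusive range s[N:M] has width M - N + 1, an SGPR or TTMP range must start at a multiple of min(width, 4), and the index into the register class is the start number divided by that alignment.

#include <algorithm>
#include <cstdio>

// Returns the register-class index for an aligned SGPR/TTMP range, or -1 when
// the start is misaligned. Mirrors getRegularReg's AlignSize computation.
static int sgprRangeIndex(unsigned StartNum, unsigned Width) {
  unsigned AlignSize = std::min(Width, 4u); // max required alignment: 4 dwords
  if (StartNum % AlignSize != 0)
    return -1;
  return static_cast<int>(StartNum / AlignSize);
}

int main() {
  // s[8:11]: width 4, start 8 is 4-aligned -> index 2 in the 128-bit class.
  std::printf("s[8:11]  -> width %u, index %d\n", 11u - 8u + 1u, sgprRangeIndex(8, 4));
  // s[6:9]: width 4 but start 6 is not 4-aligned -> rejected.
  std::printf("s[6:9]   -> width %u, index %d\n", 9u - 6u + 1u, sgprRangeIndex(6, 4));
  // s[10:11]: width 2, start 10 is 2-aligned -> index 5 in the 64-bit class.
  std::printf("s[10:11] -> width %u, index %d\n", 11u - 10u + 1u, sgprRangeIndex(10, 2));
}
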
+unsigned
+AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth) {
+ assert(isToken(AsmToken::Identifier));
+ unsigned Reg = getSpecialRegForName(getTokenStr());
+ if (Reg) {
RegNum = 0;
RegWidth = 1;
- break;
- case IS_VGPR:
- case IS_SGPR:
- case IS_AGPR:
- case IS_TTMP:
- {
- unsigned Size = 1;
- if (RegKind == IS_SGPR || RegKind == IS_TTMP) {
- // SGPR and TTMP registers must be aligned. Max required alignment is 4 dwords.
- Size = std::min(RegWidth, 4u);
- }
- if (RegNum % Size != 0)
- return false;
- if (DwordRegIndex) { *DwordRegIndex = RegNum; }
- RegNum = RegNum / Size;
- int RCID = getRegClass(RegKind, RegWidth);
- if (RCID == -1)
- return false;
- const MCRegisterClass RC = TRI->getRegClass(RCID);
- if (RegNum >= RC.getNumRegs())
- return false;
- Reg = RC.getRegister(RegNum);
- break;
+ RegKind = IS_SPECIAL;
+ lex(); // skip register name
+ }
+ return Reg;
+}
+
+unsigned
+AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth) {
+ assert(isToken(AsmToken::Identifier));
+ StringRef RegName = getTokenStr();
+
+ const RegInfo *RI = getRegularRegInfo(RegName);
+ if (!RI)
+ return AMDGPU::NoRegister;
+ lex(); // skip register name
+
+ RegKind = RI->Kind;
+ StringRef RegSuffix = RegName.substr(RI->Name.size());
+ if (!RegSuffix.empty()) {
+ // Single 32-bit register: vXX.
+ if (!getRegNum(RegSuffix, RegNum))
+ return AMDGPU::NoRegister;
+ RegWidth = 1;
+ } else {
+ // Range of registers: v[XX:YY]. ":YY" is optional.
+ if (!ParseRegRange(RegNum, RegWidth))
+ return AMDGPU::NoRegister;
}
- default:
- llvm_unreachable("unexpected register kind");
+ return getRegularReg(RegKind, RegNum, RegWidth);
+}
+
+unsigned
+AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind,
+ unsigned &RegNum,
+ unsigned &RegWidth) {
+ unsigned Reg = AMDGPU::NoRegister;
+
+ if (!trySkipToken(AsmToken::LBrac))
+ return AMDGPU::NoRegister;
+
+ // List of consecutive registers, e.g.: [s0,s1,s2,s3]
+
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth))
+ return AMDGPU::NoRegister;
+ if (RegWidth != 1)
+ return AMDGPU::NoRegister;
+
+ for (; trySkipToken(AsmToken::Comma); ) {
+ RegisterKind NextRegKind;
+ unsigned NextReg, NextRegNum, NextRegWidth;
+
+ if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth))
+ return AMDGPU::NoRegister;
+ if (NextRegWidth != 1)
+ return AMDGPU::NoRegister;
+ if (NextRegKind != RegKind)
+ return AMDGPU::NoRegister;
+ if (!AddNextRegisterToList(Reg, RegWidth, RegKind, NextReg))
+ return AMDGPU::NoRegister;
}
- if (!subtargetHasRegister(*TRI, Reg))
- return false;
- return true;
+ if (!trySkipToken(AsmToken::RBrac))
+ return AMDGPU::NoRegister;
+
+ if (isRegularReg(RegKind))
+ Reg = getRegularReg(RegKind, RegNum, RegWidth);
+
+ return Reg;
+}
+
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind,
+ unsigned &Reg,
+ unsigned &RegNum,
+ unsigned &RegWidth) {
+ Reg = AMDGPU::NoRegister;
+
+ if (isToken(AsmToken::Identifier)) {
+ Reg = ParseSpecialReg(RegKind, RegNum, RegWidth);
+ if (Reg == AMDGPU::NoRegister)
+ Reg = ParseRegularReg(RegKind, RegNum, RegWidth);
+ } else {
+ Reg = ParseRegList(RegKind, RegNum, RegWidth);
+ }
+
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ return Reg != AMDGPU::NoRegister && subtargetHasRegister(*TRI, Reg);
}
Optional<StringRef>
@@ -2241,18 +2318,18 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
SMLoc StartLoc = Tok.getLoc();
SMLoc EndLoc = Tok.getEndLoc();
RegisterKind RegKind;
- unsigned Reg, RegNum, RegWidth, DwordRegIndex;
+ unsigned Reg, RegNum, RegWidth;
- if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, &DwordRegIndex)) {
+ if (!ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth)) {
//FIXME: improve error messages (bug 41303).
Error(StartLoc, "not a valid operand.");
return nullptr;
}
if (AMDGPU::IsaInfo::hasCodeObjectV3(&getSTI())) {
- if (!updateGprCountSymbols(RegKind, DwordRegIndex, RegWidth))
+ if (!updateGprCountSymbols(RegKind, RegNum, RegWidth))
return nullptr;
} else
- KernelScope.usesRegister(RegKind, DwordRegIndex, RegWidth);
+ KernelScope.usesRegister(RegKind, RegNum, RegWidth);
return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc);
}
@@ -2648,7 +2725,6 @@ unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
case AMDGPU::M0:
- case AMDGPU::SGPR_NULL:
return Reg;
default:
break;
@@ -2697,13 +2773,38 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
}
}
+unsigned AMDGPUAsmParser::getConstantBusLimit(unsigned Opcode) const {
+ if (!isGFX10())
+ return 1;
+
+ switch (Opcode) {
+ // 64-bit shift instructions can use only one scalar value input
+ case AMDGPU::V_LSHLREV_B64:
+ case AMDGPU::V_LSHLREV_B64_gfx10:
+ case AMDGPU::V_LSHL_B64:
+ case AMDGPU::V_LSHRREV_B64:
+ case AMDGPU::V_LSHRREV_B64_gfx10:
+ case AMDGPU::V_LSHR_B64:
+ case AMDGPU::V_ASHRREV_I64:
+ case AMDGPU::V_ASHRREV_I64_gfx10:
+ case AMDGPU::V_ASHR_I64:
+ return 1;
+ default:
+ return 2;
+ }
+}
+
bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
const MCOperand &MO = Inst.getOperand(OpIdx);
if (MO.isImm()) {
return !isInlineConstant(Inst, OpIdx);
+ } else if (MO.isReg()) {
+ auto Reg = MO.getReg();
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ return isSGPR(mc2PseudoReg(Reg), TRI) && Reg != SGPR_NULL;
+ } else {
+ return true;
}
- return !MO.isReg() ||
- isSGPR(mc2PseudoReg(MO.getReg()), getContext().getRegisterInfo());
}
bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
@@ -2782,10 +2883,7 @@ bool AMDGPUAsmParser::validateConstantBusLimitations(const MCInst &Inst) {
}
ConstantBusUseCount += NumLiterals;
- if (isGFX10())
- return ConstantBusUseCount <= 2;
-
- return ConstantBusUseCount <= 1;
+ return ConstantBusUseCount <= getConstantBusLimit(Opcode);
}
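
A rough model of the accounting behind getConstantBusLimit and the check above: each distinct SGPR read and each literal takes one constant-bus slot, pre-gfx10 targets allow one slot, gfx10 generally allows two, and the listed 64-bit shifts stay at one even on gfx10. The operand structure below is a simplified stand-in, not MC data.

#include <cstdio>
#include <set>
#include <vector>

struct Operand { bool IsSGPR; unsigned SGPRNum; bool IsLiteral; };

static unsigned constantBusLimit(bool IsGFX10, bool Is64BitShift) {
  if (!IsGFX10) return 1;
  return Is64BitShift ? 1 : 2; // e.g. V_LSHLREV_B64_gfx10 keeps the limit at 1
}

static bool fitsConstantBus(const std::vector<Operand> &Ops, bool IsGFX10,
                            bool Is64BitShift) {
  std::set<unsigned> UniqueSGPRs;
  unsigned Literals = 0;
  for (const Operand &Op : Ops) {
    if (Op.IsSGPR) UniqueSGPRs.insert(Op.SGPRNum); // the same SGPR counts once
    else if (Op.IsLiteral) ++Literals;
  }
  return UniqueSGPRs.size() + Literals <= constantBusLimit(IsGFX10, Is64BitShift);
}

int main() {
  std::vector<Operand> TwoSGPRs = {{true, 0, false}, {true, 1, false}};
  std::printf("gfx9,  two SGPRs: %d\n", fitsConstantBus(TwoSGPRs, false, false)); // 0
  std::printf("gfx10, two SGPRs: %d\n", fitsConstantBus(TwoSGPRs, true, false));  // 1
  std::printf("gfx10 64-bit shift, two SGPRs: %d\n",
              fitsConstantBus(TwoSGPRs, true, true));                             // 0
}
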
bool AMDGPUAsmParser::validateEarlyClobberLimitations(const MCInst &Inst) {
@@ -3212,6 +3310,7 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const {
const int OpIndices[] = { Src0Idx, Src1Idx };
+ unsigned NumExprs = 0;
unsigned NumLiterals = 0;
uint32_t LiteralValue;
@@ -3219,19 +3318,21 @@ bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const {
if (OpIdx == -1) break;
const MCOperand &MO = Inst.getOperand(OpIdx);
- if (MO.isImm() &&
- // Exclude special imm operands (like that used by s_set_gpr_idx_on)
- AMDGPU::isSISrcOperand(Desc, OpIdx) &&
- !isInlineConstant(Inst, OpIdx)) {
- uint32_t Value = static_cast<uint32_t>(MO.getImm());
- if (NumLiterals == 0 || LiteralValue != Value) {
- LiteralValue = Value;
- ++NumLiterals;
+ // Exclude special imm operands (like that used by s_set_gpr_idx_on)
+ if (AMDGPU::isSISrcOperand(Desc, OpIdx)) {
+ if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) {
+ uint32_t Value = static_cast<uint32_t>(MO.getImm());
+ if (NumLiterals == 0 || LiteralValue != Value) {
+ LiteralValue = Value;
+ ++NumLiterals;
+ }
+ } else if (MO.isExpr()) {
+ ++NumExprs;
}
}
}
- return NumLiterals <= 1;
+ return NumLiterals + NumExprs <= 1;
}
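
A short sketch of the rule enforced above: a SOP instruction may carry at most one 32-bit literal, identical literal values are counted once, and a relocatable expression operand also occupies the literal slot. Plain structs stand in for MCOperands here.

#include <cstdio>
#include <optional>
#include <vector>

struct SrcOp {
  std::optional<unsigned> LiteralValue; // set for a non-inline 32-bit immediate
  bool IsExpr = false;                  // e.g. a symbol resolved by a fixup
};

static bool validSOPLiteralUse(const std::vector<SrcOp> &Srcs) {
  unsigned NumLiterals = 0, NumExprs = 0;
  std::optional<unsigned> Seen;
  for (const SrcOp &S : Srcs) {
    if (S.LiteralValue) {
      if (!Seen || *Seen != *S.LiteralValue) { // identical values count once
        Seen = S.LiteralValue;
        ++NumLiterals;
      }
    } else if (S.IsExpr) {
      ++NumExprs;
    }
  }
  return NumLiterals + NumExprs <= 1;
}

int main() {
  std::printf("%d\n", validSOPLiteralUse({{0x1234u, false}, {0x1234u, false}}));    // 1: same value
  std::printf("%d\n", validSOPLiteralUse({{0x1234u, false}, {0x5678u, false}}));    // 0: two literals
  std::printf("%d\n", validSOPLiteralUse({{std::nullopt, true}, {0x1234u, false}})); // 0: expr + literal
}
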
bool AMDGPUAsmParser::validateOpSel(const MCInst &Inst) {
@@ -3267,6 +3368,7 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+ unsigned NumExprs = 0;
unsigned NumLiterals = 0;
uint32_t LiteralValue;
@@ -3274,17 +3376,26 @@ bool AMDGPUAsmParser::validateVOP3Literal(const MCInst &Inst) const {
if (OpIdx == -1) break;
const MCOperand &MO = Inst.getOperand(OpIdx);
- if (!MO.isImm() || !AMDGPU::isSISrcOperand(Desc, OpIdx))
+ if (!MO.isImm() && !MO.isExpr())
+ continue;
+ if (!AMDGPU::isSISrcOperand(Desc, OpIdx))
continue;
- if (!isInlineConstant(Inst, OpIdx)) {
+ if (OpIdx == Src2Idx && (Desc.TSFlags & SIInstrFlags::IsMAI) &&
+ getFeatureBits()[AMDGPU::FeatureMFMAInlineLiteralBug])
+ return false;
+
+ if (MO.isImm() && !isInlineConstant(Inst, OpIdx)) {
uint32_t Value = static_cast<uint32_t>(MO.getImm());
if (NumLiterals == 0 || LiteralValue != Value) {
LiteralValue = Value;
++NumLiterals;
}
+ } else if (MO.isExpr()) {
+ ++NumExprs;
}
}
+ NumLiterals += NumExprs;
return !NumLiterals ||
(NumLiterals == 1 && getFeatureBits()[AMDGPU::FeatureVOP3Literal]);
@@ -3607,37 +3718,44 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER,
Val, ValRange);
- UserSGPRCount += 4;
+ if (Val)
+ UserSGPRCount += 4;
} else if (ID == ".amdhsa_user_sgpr_dispatch_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR, Val,
ValRange);
- UserSGPRCount += 2;
+ if (Val)
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_queue_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR, Val,
ValRange);
- UserSGPRCount += 2;
+ if (Val)
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_kernarg_segment_ptr") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR,
Val, ValRange);
- UserSGPRCount += 2;
+ if (Val)
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_dispatch_id") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID, Val,
ValRange);
- UserSGPRCount += 2;
+ if (Val)
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_flat_scratch_init") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT, Val,
ValRange);
- UserSGPRCount += 2;
+ if (Val)
+ UserSGPRCount += 2;
} else if (ID == ".amdhsa_user_sgpr_private_segment_size") {
PARSE_BITS_ENTRY(KD.kernel_code_properties,
KERNEL_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_SIZE,
Val, ValRange);
- UserSGPRCount += 1;
+ if (Val)
+ UserSGPRCount += 1;
} else if (ID == ".amdhsa_wavefront_size32") {
if (IVersion.Major < 10)
return getParser().Error(IDRange.Start, "directive requires gfx10+",
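
The hunk above makes the user-SGPR tally conditional on each enable bit actually being set to 1. A minimal sketch of that accounting, using the per-feature SGPR widths from the directives above (the struct is illustrative, not the kernel descriptor layout):

#include <cstdio>

struct KernelDescriptorBits {
  bool PrivateSegmentBuffer, DispatchPtr, QueuePtr, KernargSegmentPtr,
       DispatchId, FlatScratchInit, PrivateSegmentSize;
};

// Count a feature's SGPRs only when its enable bit is 1; for example
// ".amdhsa_user_sgpr_dispatch_ptr 0" must not add to the total.
static unsigned userSGPRCount(const KernelDescriptorBits &KD) {
  unsigned Count = 0;
  if (KD.PrivateSegmentBuffer) Count += 4;
  if (KD.DispatchPtr)          Count += 2;
  if (KD.QueuePtr)             Count += 2;
  if (KD.KernargSegmentPtr)    Count += 2;
  if (KD.DispatchId)           Count += 2;
  if (KD.FlatScratchInit)      Count += 2;
  if (KD.PrivateSegmentSize)   Count += 1;
  return Count;
}

int main() {
  KernelDescriptorBits KD{true, true, false, true, false, false, false};
  std::printf("user SGPRs: %u\n", userSGPRCount(KD)); // 4 + 2 + 2 = 8
}
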
@@ -5225,6 +5343,23 @@ AMDGPUAsmParser::parseExpr(int64_t &Imm) {
}
bool
+AMDGPUAsmParser::parseExpr(OperandVector &Operands) {
+ SMLoc S = getLoc();
+
+ const MCExpr *Expr;
+ if (Parser.parseExpression(Expr))
+ return false;
+
+ int64_t IntVal;
+ if (Expr->evaluateAsAbsolute(IntVal)) {
+ Operands.push_back(AMDGPUOperand::CreateImm(this, IntVal, S));
+ } else {
+ Operands.push_back(AMDGPUOperand::CreateExpr(this, Expr, S));
+ }
+ return true;
+}
+
+bool
AMDGPUAsmParser::parseString(StringRef &Val, const StringRef ErrMsg) {
if (isToken(AsmToken::String)) {
Val = getToken().getStringContents();
@@ -5605,25 +5740,29 @@ bool AMDGPUOperand::isGPRIdxMode() const {
OperandMatchResultTy
AMDGPUAsmParser::parseSOppBrTarget(OperandVector &Operands) {
- SMLoc S = Parser.getTok().getLoc();
- switch (getLexer().getKind()) {
- default: return MatchOperand_ParseFail;
- case AsmToken::Integer: {
- int64_t Imm;
- if (getParser().parseAbsoluteExpression(Imm))
- return MatchOperand_ParseFail;
- Operands.push_back(AMDGPUOperand::CreateImm(this, Imm, S));
- return MatchOperand_Success;
- }
+ // Make sure we are not parsing something
+ // that looks like a label or an expression but is not.
+ // This will improve error messages.
+ if (isRegister() || isModifier())
+ return MatchOperand_NoMatch;
- case AsmToken::Identifier:
- Operands.push_back(AMDGPUOperand::CreateExpr(this,
- MCSymbolRefExpr::create(getContext().getOrCreateSymbol(
- Parser.getTok().getString()), getContext()), S));
- Parser.Lex();
- return MatchOperand_Success;
+ if (parseExpr(Operands)) {
+
+ AMDGPUOperand &Opr = ((AMDGPUOperand &)*Operands[Operands.size() - 1]);
+ assert(Opr.isImm() || Opr.isExpr());
+ SMLoc Loc = Opr.getStartLoc();
+
+ // Currently we do not support arbitrary expressions as branch targets.
+ // Only labels and absolute expressions are accepted.
+ if (Opr.isExpr() && !Opr.isSymbolRefExpr()) {
+ Error(Loc, "expected an absolute expression or a label");
+ } else if (Opr.isImm() && !Opr.isS16Imm()) {
+ Error(Loc, "expected a 16-bit signed jump offset");
+ }
}
+
+ return MatchOperand_Success; // avoid excessive error messages
}
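
A compact sketch of the acceptance rule above for sopp branch targets: a plain label (symbol reference) is accepted, an absolute expression must fit a signed 16-bit jump offset, and any other expression form is diagnosed. The enum and helper below are illustrative only.

#include <cstdint>
#include <cstdio>

enum class BrTarget { Label, Absolute, Other };

static bool isS16(int64_t V) { return V >= INT16_MIN && V <= INT16_MAX; }

// Returns true when the operand form is acceptable as a sopp branch target.
static bool acceptBranchTarget(BrTarget Kind, int64_t AbsValue = 0) {
  switch (Kind) {
  case BrTarget::Label:    return true;            // plain symbol reference
  case BrTarget::Absolute: return isS16(AbsValue); // 16-bit signed jump offset
  case BrTarget::Other:    return false;           // arbitrary expressions rejected
  }
  return false;
}

int main() {
  std::printf("label        -> %d\n", acceptBranchTarget(BrTarget::Label));
  std::printf("offset 32    -> %d\n", acceptBranchTarget(BrTarget::Absolute, 32));
  std::printf("offset 70000 -> %d\n", acceptBranchTarget(BrTarget::Absolute, 70000));
  std::printf("sym+sym      -> %d\n", acceptBranchTarget(BrTarget::Other));
}
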
//===----------------------------------------------------------------------===//
@@ -5908,6 +6047,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr},
{"glc", AMDGPUOperand::ImmTyGLC, true, nullptr},
{"slc", AMDGPUOperand::ImmTySLC, true, nullptr},
+ {"swz", AMDGPUOperand::ImmTySWZ, true, nullptr},
{"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr},
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"high", AMDGPUOperand::ImmTyHigh, true, nullptr},
@@ -5941,8 +6081,6 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
};
OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
- unsigned size = Operands.size();
- assert(size > 0);
OperandMatchResultTy res = parseOptionalOpr(Operands);
@@ -5957,17 +6095,13 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operan
// to make sure autogenerated parser of custom operands never hit hardcoded
// mandatory operands.
- if (size == 1 || ((AMDGPUOperand &)*Operands[size - 1]).isRegKind()) {
-
- // We have parsed the first optional operand.
- // Parse as many operands as necessary to skip all mandatory operands.
+ for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) {
+ if (res != MatchOperand_Success ||
+ isToken(AsmToken::EndOfStatement))
+ break;
- for (unsigned i = 0; i < MAX_OPR_LOOKAHEAD; ++i) {
- if (res != MatchOperand_Success ||
- getLexer().is(AsmToken::EndOfStatement)) break;
- if (getLexer().is(AsmToken::Comma)) Parser.Lex();
- res = parseOptionalOpr(Operands);
- }
+ trySkipToken(AsmToken::Comma);
+ res = parseOptionalOpr(Operands);
}
return res;
@@ -6682,7 +6816,11 @@ void AMDGPUAsmParser::cvtSdwaVOP2(MCInst &Inst, const OperandVector &Operands) {
}
void AMDGPUAsmParser::cvtSdwaVOP2b(MCInst &Inst, const OperandVector &Operands) {
- cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true);
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, true, true);
+}
+
+void AMDGPUAsmParser::cvtSdwaVOP2e(MCInst &Inst, const OperandVector &Operands) {
+ cvtSDWA(Inst, Operands, SIInstrFlags::VOP2, false, true);
}
void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
@@ -6690,11 +6828,14 @@ void AMDGPUAsmParser::cvtSdwaVOPC(MCInst &Inst, const OperandVector &Operands) {
}
void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
- uint64_t BasicInstType, bool skipVcc) {
+ uint64_t BasicInstType,
+ bool SkipDstVcc,
+ bool SkipSrcVcc) {
using namespace llvm::AMDGPU::SDWA;
OptionalImmIndexMap OptionalIdx;
- bool skippedVcc = false;
+ bool SkipVcc = SkipDstVcc || SkipSrcVcc;
+ bool SkippedVcc = false;
unsigned I = 1;
const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
@@ -6704,19 +6845,21 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
for (unsigned E = Operands.size(); I != E; ++I) {
AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
- if (skipVcc && !skippedVcc && Op.isReg() &&
+ if (SkipVcc && !SkippedVcc && Op.isReg() &&
(Op.getReg() == AMDGPU::VCC || Op.getReg() == AMDGPU::VCC_LO)) {
// VOP2b (v_add_u32, v_sub_u32 ...) sdwa uses the "vcc" token as dst.
// Skip it if it's 2nd (e.g. v_add_i32_sdwa v1, vcc, v2, v3)
// or 4th (v_addc_u32_sdwa v1, vcc, v2, v3, vcc) operand.
// Skip VCC only if we didn't skip it on previous iteration.
+ // Note that src0 and src1 occupy 2 slots each because of modifiers.
if (BasicInstType == SIInstrFlags::VOP2 &&
- (Inst.getNumOperands() == 1 || Inst.getNumOperands() == 5)) {
- skippedVcc = true;
+ ((SkipDstVcc && Inst.getNumOperands() == 1) ||
+ (SkipSrcVcc && Inst.getNumOperands() == 5))) {
+ SkippedVcc = true;
continue;
} else if (BasicInstType == SIInstrFlags::VOPC &&
Inst.getNumOperands() == 0) {
- skippedVcc = true;
+ SkippedVcc = true;
continue;
}
}
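
The new comment notes that src0 and src1 each occupy two emitted operand slots (value plus modifiers), so the carry-out vcc is seen when the MCInst holds one operand and the carry-in vcc when it holds five. A tiny, assumption-laden sketch of that slot arithmetic:

#include <cstdio>

// Assumed VOP2b SDWA emission order: vdst, then src0 and src1 at two slots
// each (modifiers + register).
static unsigned operandsBeforeSource(unsigned SrcIndex) {
  const unsigned DstSlots = 1, SlotsPerSrc = 2;
  return DstSlots + SrcIndex * SlotsPerSrc;
}

int main() {
  // e.g. v_addc_co_u32_sdwa v1, vcc, v2, v3, vcc ...
  std::printf("carry-out vcc: MCInst holds %u operand(s)\n", operandsBeforeSource(0)); // 1
  std::printf("carry-in  vcc: MCInst holds %u operand(s)\n", operandsBeforeSource(2)); // 5
}
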
@@ -6728,7 +6871,7 @@ void AMDGPUAsmParser::cvtSDWA(MCInst &Inst, const OperandVector &Operands,
} else {
llvm_unreachable("Invalid operand type");
}
- skippedVcc = false;
+ SkippedVcc = false;
}
if (Inst.getOpcode() != AMDGPU::V_NOP_sdwa_gfx10 &&
@@ -6849,6 +6992,14 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand;
case MCK_AttrChan:
return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand;
+ case MCK_SReg_64:
+ case MCK_SReg_64_XEXEC:
+ // Null is defined as a 32-bit register but
+ // it should also be enabled with 64-bit operands.
+ // The following code enables it for SReg_64 operands
+ // used as source and destination. Remaining source
+ // operands are handled in isInlinableImm.
+ return Operand.isNull() ? Match_Success : Match_InvalidOperand;
default:
return Match_InvalidOperand;
}
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 62a19d848af2..1b12550aed88 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -7,13 +7,13 @@
//===----------------------------------------------------------------------===//
def MUBUFAddr32 : ComplexPattern<i64, 9, "SelectMUBUFAddr32">;
-def MUBUFAddr64 : ComplexPattern<i64, 8, "SelectMUBUFAddr64">;
+def MUBUFAddr64 : ComplexPattern<i64, 9, "SelectMUBUFAddr64">;
def MUBUFAddr64Atomic : ComplexPattern<i64, 5, "SelectMUBUFAddr64">;
def MUBUFScratchOffen : ComplexPattern<i64, 4, "SelectMUBUFScratchOffen", [], [SDNPWantParent]>;
def MUBUFScratchOffset : ComplexPattern<i64, 3, "SelectMUBUFScratchOffset", [], [SDNPWantParent], 20>;
-def MUBUFOffset : ComplexPattern<i64, 7, "SelectMUBUFOffset">;
+def MUBUFOffset : ComplexPattern<i64, 8, "SelectMUBUFOffset">;
def MUBUFOffsetNoGLC : ComplexPattern<i64, 3, "SelectMUBUFOffset">;
def MUBUFOffsetAtomic : ComplexPattern<i64, 4, "SelectMUBUFOffset">;
@@ -54,6 +54,17 @@ class MTBUFAddr64Table <bit is_addr64, string Name> {
// MTBUF classes
//===----------------------------------------------------------------------===//
+class MTBUFGetBaseOpcode<string Op> {
+ string ret = !subst("FORMAT_XY", "FORMAT_X",
+ !subst("FORMAT_XYZ", "FORMAT_X",
+ !subst("FORMAT_XYZW", "FORMAT_X", Op)));
+}
+
+class getMTBUFElements<string Op> {
+ int ret = 1;
+}
+
+
class MTBUF_Pseudo <string opName, dag outs, dag ins,
string asmOps, list<dag> pattern=[]> :
InstSI<outs, ins, "", pattern>,
@@ -67,6 +78,9 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
string Mnemonic = opName;
string AsmOperands = asmOps;
+ Instruction Opcode = !cast<Instruction>(NAME);
+ Instruction BaseOpcode = !cast<Instruction>(MTBUFGetBaseOpcode<NAME>.ret);
+
let VM_CNT = 1;
let EXP_CNT = 1;
let MTBUF = 1;
@@ -90,6 +104,7 @@ class MTBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_offset = 1;
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
+ bits<4> elements = 0;
}
class MTBUF_Real <MTBUF_Pseudo ps> :
@@ -126,17 +141,17 @@ class getMTBUFInsDA<list<RegisterClass> vdataList,
RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList));
dag InsNoData = !if(!empty(vaddrList),
(ins SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc),
+ offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz),
(ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset,
- offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc)
+ offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz)
);
dag InsData = !if(!empty(vaddrList),
(ins vdataClass:$vdata, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe, DLC:$dlc),
+ SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz),
(ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc,
SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc,
- SLC:$slc, TFE:$tfe, DLC:$dlc)
+ SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz)
);
dag ret = !if(!empty(vdataList), InsNoData, InsData);
}
@@ -181,51 +196,54 @@ class MTBUF_SetupAddr<int addrKind> {
class MTBUF_Load_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
+ int elems,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MTBUF_Pseudo<opName,
(outs vdataClass:$vdata),
getMTBUFIns<addrKindCopy>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 1;
let mayStore = 0;
+ let elements = elems;
}
multiclass MTBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
- ValueType load_vt = i32,
+ int elems, ValueType load_vt = i32,
SDPatternOperator ld = null_frag> {
- def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ def _OFFSET : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems,
[(set load_vt:$vdata,
(ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i8:$format,
- i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>,
+ i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>,
MTBUFAddr64Table<0, NAME>;
- def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ def _ADDR64 : MTBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems,
[(set load_vt:$vdata,
(ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset,
- i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)))]>,
+ i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>,
MTBUFAddr64Table<1, NAME>;
- def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
+ def _IDXEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
+ def _BOTHEN : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
- def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFSET_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>;
+ def _OFFEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
+ def _IDXEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
+ def _BOTHEN_exact : MTBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
}
}
class MTBUF_Store_Pseudo <string opName,
int addrKind,
RegisterClass vdataClass,
+ int elems,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind,
@@ -233,39 +251,40 @@ class MTBUF_Store_Pseudo <string opName,
: MTBUF_Pseudo<opName,
(outs),
getMTBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
- " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
+ " $vdata, " # getMTBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
pattern>,
MTBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 0;
let mayStore = 1;
+ let elements = elems;
}
multiclass MTBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
- ValueType store_vt = i32,
+ int elems, ValueType store_vt = i32,
SDPatternOperator st = null_frag> {
- def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ def _OFFSET : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe, i1:$dlc))]>,
+ i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MTBUFAddr64Table<0, NAME>;
- def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ def _ADDR64 : MTBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, elems,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i8:$format, i1:$glc,
- i1:$slc, i1:$tfe, i1:$dlc))]>,
+ i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MTBUFAddr64Table<1, NAME>;
- def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
+ def _IDXEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
+ def _BOTHEN : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
- def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFSET_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass, elems>;
+ def _OFFEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, elems>;
+ def _IDXEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, elems>;
+ def _BOTHEN_exact : MTBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, elems>;
}
}
@@ -320,7 +339,7 @@ class MUBUF_Pseudo <string opName, dag outs, dag ins,
bits<1> has_offset = 1;
bits<1> has_slc = 1;
bits<1> has_tfe = 1;
- bits<4> dwords = 0;
+ bits<4> elements = 0;
}
class MUBUF_Real <MUBUF_Pseudo ps> :
@@ -393,18 +412,30 @@ class getMUBUFInsDA<list<RegisterClass> vdataList,
);
dag ret = !con(
!if(!empty(vdataList), InsNoData, InsData),
- !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc))
+ !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz))
);
}
-class getMUBUFDwords<RegisterClass regClass> {
- string regClassAsInt = !cast<string>(regClass);
+class getMUBUFElements<ValueType vt> {
+ // eq does not support ValueType for some reason.
+ string vtAsStr = !cast<string>(vt);
+
int ret =
- !if(!eq(regClassAsInt, !cast<string>(VGPR_32)), 1,
- !if(!eq(regClassAsInt, !cast<string>(VReg_64)), 2,
- !if(!eq(regClassAsInt, !cast<string>(VReg_96)), 3,
- !if(!eq(regClassAsInt, !cast<string>(VReg_128)), 4,
- 0))));
+ !if(!eq(vtAsStr, "f16"), 1,
+ !if(!eq(vtAsStr, "v2f16"), 2,
+ !if(!eq(vtAsStr, "v3f16"), 3,
+ !if(!eq(vtAsStr, "v4f16"), 4,
+ !if(!eq(vt.Size, 32), 1,
+ !if(!eq(vt.Size, 64), 2,
+ !if(!eq(vt.Size, 96), 3,
+ !if(!eq(vt.Size, 128), 4, 0)
+ )
+ )
+ )
+ )
+ )
+ )
+ );
}
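
The !if chain above maps a value type to an element count, special-casing the packed 16-bit float vectors before falling back to size in dwords. A C++ restatement of that mapping (an assumed helper, not generated TableGen output):

#include <cstdio>
#include <string>

// Mirrors getMUBUFElements: f16/v2f16/v3f16/v4f16 count packed elements,
// everything else is measured in 32-bit dwords.
static int mubufElements(const std::string &TypeName, unsigned SizeInBits) {
  if (TypeName == "f16")   return 1;
  if (TypeName == "v2f16") return 2;
  if (TypeName == "v3f16") return 3;
  if (TypeName == "v4f16") return 4;
  switch (SizeInBits) {
  case 32:  return 1;
  case 64:  return 2;
  case 96:  return 3;
  case 128: return 4;
  default:  return 0;
  }
}

int main() {
  std::printf("v4f32 (128 bits) -> %d\n", mubufElements("v4f32", 128)); // 4
  std::printf("v4f16 (64 bits)  -> %d\n", mubufElements("v4f16", 64));  // 4
  std::printf("v2f16 (32 bits)  -> %d\n", mubufElements("v2f16", 32));  // 2
}
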
class getMUBUFIns<int addrKind, list<RegisterClass> vdataList=[], bit isLds = 0> {
@@ -442,18 +473,18 @@ class MUBUF_SetupAddr<int addrKind> {
class MUBUF_Load_Pseudo <string opName,
int addrKind,
- RegisterClass vdataClass,
+ ValueType vdata_vt,
bit HasTiedDest = 0,
bit isLds = 0,
list<dag> pattern=[],
// Workaround bug bz30254
int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
- (outs vdataClass:$vdata),
+ (outs getVregSrcForVT<vdata_vt>.ret:$vdata),
!con(getMUBUFIns<addrKindCopy, [], isLds>.ret,
- !if(HasTiedDest, (ins vdataClass:$vdata_in), (ins))),
+ !if(HasTiedDest, (ins getVregSrcForVT<vdata_vt>.ret:$vdata_in), (ins))),
" $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc" #
- !if(isLds, " lds", "$tfe") # "$dlc",
+ !if(isLds, " lds", "$tfe") # "$dlc" # "$swz",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # !if(isLds, "_lds", "") #
@@ -467,19 +498,19 @@ class MUBUF_Load_Pseudo <string opName,
let Uses = !if(isLds, [EXEC, M0], [EXEC]);
let has_tfe = !if(isLds, 0, 1);
let lds = isLds;
- let dwords = getMUBUFDwords<vdataClass>.ret;
+ let elements = getMUBUFElements<vdata_vt>.ret;
}
class MUBUF_Offset_Load_Pat <Instruction inst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> : Pat <
- (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
- (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))
+ (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
+ (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))
>;
class MUBUF_Addr64_Load_Pat <Instruction inst,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag> : Pat <
- (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
- (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))
+ (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
+ (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))
>;
multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPatternOperator ld = null_frag> {
@@ -490,89 +521,87 @@ multiclass MUBUF_Pseudo_Load_Pats<string BaseInst, ValueType load_vt = i32, SDPa
// FIXME: tfe can't be an operand because it requires a separate
// opcode because it needs an N+1 register class dest register.
-multiclass MUBUF_Pseudo_Loads<string opName, RegisterClass vdataClass,
+multiclass MUBUF_Pseudo_Loads<string opName,
ValueType load_vt = i32,
SDPatternOperator ld = null_frag,
bit TiedDest = 0,
bit isLds = 0> {
- def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>,
+ def _OFFSET : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>,
MUBUFAddr64Table<0, NAME # !if(isLds, "_LDS", "")>;
- def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, TiedDest, isLds>,
+ def _ADDR64 : MUBUF_Load_Pseudo <opName, BUFAddrKind.Addr64, load_vt, TiedDest, isLds>,
MUBUFAddr64Table<1, NAME # !if(isLds, "_LDS", "")>;
- def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
- def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
- def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
+ def _OFFEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>;
+ def _IDXEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>;
+ def _BOTHEN : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, vdataClass, TiedDest, isLds>;
- def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, TiedDest, isLds>;
- def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, TiedDest, isLds>;
- def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, TiedDest, isLds>;
+ def _OFFSET_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.Offset, load_vt, TiedDest, isLds>;
+ def _OFFEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.OffEn, load_vt, TiedDest, isLds>;
+ def _IDXEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.IdxEn, load_vt, TiedDest, isLds>;
+ def _BOTHEN_exact : MUBUF_Load_Pseudo <opName, BUFAddrKind.BothEn, load_vt, TiedDest, isLds>;
}
}
-multiclass MUBUF_Pseudo_Loads_Lds<string opName, RegisterClass vdataClass,
- ValueType load_vt = i32,
+multiclass MUBUF_Pseudo_Loads_Lds<string opName, ValueType load_vt = i32,
SDPatternOperator ld_nolds = null_frag,
SDPatternOperator ld_lds = null_frag> {
- defm NAME : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_nolds>;
- defm _LDS : MUBUF_Pseudo_Loads<opName, vdataClass, load_vt, ld_lds, 0, 1>;
+ defm NAME : MUBUF_Pseudo_Loads<opName, load_vt, ld_nolds>;
+ defm _LDS : MUBUF_Pseudo_Loads<opName, load_vt, ld_lds, 0, 1>;
}
class MUBUF_Store_Pseudo <string opName,
int addrKind,
- RegisterClass vdataClass,
+ ValueType store_vt,
list<dag> pattern=[],
// Workaround bug bz30254
- int addrKindCopy = addrKind,
- RegisterClass vdataClassCopy = vdataClass>
+ int addrKindCopy = addrKind>
: MUBUF_Pseudo<opName,
(outs),
- getMUBUFIns<addrKindCopy, [vdataClassCopy]>.ret,
- " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc",
+ getMUBUFIns<addrKindCopy, [getVregSrcForVT<store_vt>.ret]>.ret,
+ " $vdata, " # getMUBUFAsmOps<addrKindCopy>.ret # "$glc$slc$tfe$dlc$swz",
pattern>,
MUBUF_SetupAddr<addrKindCopy> {
let PseudoInstr = opName # "_" # getAddrName<addrKindCopy>.ret;
let mayLoad = 0;
let mayStore = 1;
let maybeAtomic = 1;
- let dwords = getMUBUFDwords<vdataClass>.ret;
+ let elements = getMUBUFElements<store_vt>.ret;
}
-multiclass MUBUF_Pseudo_Stores<string opName, RegisterClass vdataClass,
+multiclass MUBUF_Pseudo_Stores<string opName,
ValueType store_vt = i32,
SDPatternOperator st = null_frag> {
- def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
+ def _OFFSET : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt,
[(st store_vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MUBUFAddr64Table<0, NAME>;
- def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, vdataClass,
+ def _ADDR64 : MUBUF_Store_Pseudo <opName, BUFAddrKind.Addr64, store_vt,
[(st store_vt:$vdata, (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))]>,
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>,
MUBUFAddr64Table<1, NAME>;
- def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>;
+ def _IDXEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>;
+ def _BOTHEN : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>;
let DisableWQM = 1 in {
- def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, vdataClass>;
- def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, vdataClass>;
- def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass>;
- def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, vdataClass>;
+ def _OFFSET_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.Offset, store_vt>;
+ def _OFFEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.OffEn, store_vt>;
+ def _IDXEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.IdxEn, store_vt>;
+ def _BOTHEN_exact : MUBUF_Store_Pseudo <opName, BUFAddrKind.BothEn, store_vt>;
}
}
class MUBUF_Pseudo_Store_Lds<string opName>
: MUBUF_Pseudo<opName,
(outs),
- (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc),
- " $srsrc, $soffset$offset lds$glc$slc"> {
+ (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz),
+ " $srsrc, $soffset$offset lds$glc$slc$swz"> {
let mayLoad = 0;
let mayStore = 1;
let maybeAtomic = 1;
@@ -686,7 +715,7 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
SDPatternOperator atomic,
- bit isFP = getIsFP<vdataType>.ret> {
+ bit isFP = isFloatType<vdataType>.ret> {
let FPAtomic = isFP in
def _OFFSET : MUBUF_AtomicNoRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass>,
MUBUFAddr64Table <0, NAME>;
@@ -710,7 +739,7 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
RegisterClass vdataClass,
ValueType vdataType,
SDPatternOperator atomic,
- bit isFP = getIsFP<vdataType>.ret> {
+ bit isFP = isFloatType<vdataType>.ret> {
let FPAtomic = isFP in
def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass,
[(set vdataType:$vdata,
@@ -748,107 +777,107 @@ multiclass MUBUF_Pseudo_Atomics <string opName,
//===----------------------------------------------------------------------===//
defm BUFFER_LOAD_FORMAT_X : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_format_x", VGPR_32
+ "buffer_load_format_x", f32
>;
defm BUFFER_LOAD_FORMAT_XY : MUBUF_Pseudo_Loads <
- "buffer_load_format_xy", VReg_64
+ "buffer_load_format_xy", v2f32
>;
defm BUFFER_LOAD_FORMAT_XYZ : MUBUF_Pseudo_Loads <
- "buffer_load_format_xyz", VReg_96
+ "buffer_load_format_xyz", v3f32
>;
defm BUFFER_LOAD_FORMAT_XYZW : MUBUF_Pseudo_Loads <
- "buffer_load_format_xyzw", VReg_128
+ "buffer_load_format_xyzw", v4f32
>;
defm BUFFER_STORE_FORMAT_X : MUBUF_Pseudo_Stores <
- "buffer_store_format_x", VGPR_32
+ "buffer_store_format_x", f32
>;
defm BUFFER_STORE_FORMAT_XY : MUBUF_Pseudo_Stores <
- "buffer_store_format_xy", VReg_64
+ "buffer_store_format_xy", v2f32
>;
defm BUFFER_STORE_FORMAT_XYZ : MUBUF_Pseudo_Stores <
- "buffer_store_format_xyz", VReg_96
+ "buffer_store_format_xyz", v3f32
>;
defm BUFFER_STORE_FORMAT_XYZW : MUBUF_Pseudo_Stores <
- "buffer_store_format_xyzw", VReg_128
+ "buffer_store_format_xyzw", v4f32
>;
let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
defm BUFFER_LOAD_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_x", VGPR_32
+ "buffer_load_format_d16_x", i32
>;
defm BUFFER_LOAD_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xy", VReg_64
+ "buffer_load_format_d16_xy", v2i32
>;
defm BUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xyz", VReg_96
+ "buffer_load_format_d16_xyz", v3i32
>;
defm BUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xyzw", VReg_128
+ "buffer_load_format_d16_xyzw", v4i32
>;
defm BUFFER_STORE_FORMAT_D16_X_gfx80 : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_x", VGPR_32
+ "buffer_store_format_d16_x", i32
>;
defm BUFFER_STORE_FORMAT_D16_XY_gfx80 : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xy", VReg_64
+ "buffer_store_format_d16_xy", v2i32
>;
defm BUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xyz", VReg_96
+ "buffer_store_format_d16_xyz", v3i32
>;
defm BUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xyzw", VReg_128
+ "buffer_store_format_d16_xyzw", v4i32
>;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
defm BUFFER_LOAD_FORMAT_D16_X : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_x", VGPR_32
+ "buffer_load_format_d16_x", f16
>;
defm BUFFER_LOAD_FORMAT_D16_XY : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xy", VGPR_32
+ "buffer_load_format_d16_xy", v2f16
>;
defm BUFFER_LOAD_FORMAT_D16_XYZ : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xyz", VReg_64
+ "buffer_load_format_d16_xyz", v3f16
>;
defm BUFFER_LOAD_FORMAT_D16_XYZW : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_xyzw", VReg_64
+ "buffer_load_format_d16_xyzw", v4f16
>;
defm BUFFER_STORE_FORMAT_D16_X : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_x", VGPR_32
+ "buffer_store_format_d16_x", f16
>;
defm BUFFER_STORE_FORMAT_D16_XY : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xy", VGPR_32
+ "buffer_store_format_d16_xy", v2f16
>;
defm BUFFER_STORE_FORMAT_D16_XYZ : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xyz", VReg_64
+ "buffer_store_format_d16_xyz", v3f16
>;
defm BUFFER_STORE_FORMAT_D16_XYZW : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_xyzw", VReg_64
+ "buffer_store_format_d16_xyzw", v4f16
>;
} // End HasPackedD16VMem.
defm BUFFER_LOAD_UBYTE : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_ubyte", VGPR_32, i32
+ "buffer_load_ubyte", i32
>;
defm BUFFER_LOAD_SBYTE : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_sbyte", VGPR_32, i32
+ "buffer_load_sbyte", i32
>;
defm BUFFER_LOAD_USHORT : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_ushort", VGPR_32, i32
+ "buffer_load_ushort", i32
>;
defm BUFFER_LOAD_SSHORT : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_sshort", VGPR_32, i32
+ "buffer_load_sshort", i32
>;
defm BUFFER_LOAD_DWORD : MUBUF_Pseudo_Loads_Lds <
- "buffer_load_dword", VGPR_32, i32
+ "buffer_load_dword", i32
>;
defm BUFFER_LOAD_DWORDX2 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx2", VReg_64, v2i32
+ "buffer_load_dwordx2", v2i32
>;
defm BUFFER_LOAD_DWORDX3 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx3", VReg_96, v3i32
+ "buffer_load_dwordx3", v3i32
>;
defm BUFFER_LOAD_DWORDX4 : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx4", VReg_128, v4i32
+ "buffer_load_dwordx4", v4i32
>;
defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_UBYTE", i32, extloadi8_global>;
@@ -867,111 +896,111 @@ defm : MUBUF_Pseudo_Load_Pats<"BUFFER_LOAD_DWORDX4", v4i32, load_global>;
// in at least GFX8+ chips. See Bug 37653.
let SubtargetPredicate = isGFX8GFX9 in {
defm BUFFER_LOAD_DWORDX2_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx2", VReg_64, v2i32, null_frag, 0, 1
+ "buffer_load_dwordx2", v2i32, null_frag, 0, 1
>;
defm BUFFER_LOAD_DWORDX3_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx3", VReg_96, untyped, null_frag, 0, 1
+ "buffer_load_dwordx3", v3i32, null_frag, 0, 1
>;
defm BUFFER_LOAD_DWORDX4_LDS : MUBUF_Pseudo_Loads <
- "buffer_load_dwordx4", VReg_128, v4i32, null_frag, 0, 1
+ "buffer_load_dwordx4", v4i32, null_frag, 0, 1
>;
}
defm BUFFER_STORE_BYTE : MUBUF_Pseudo_Stores <
- "buffer_store_byte", VGPR_32, i32, truncstorei8_global
+ "buffer_store_byte", i32, truncstorei8_global
>;
defm BUFFER_STORE_SHORT : MUBUF_Pseudo_Stores <
- "buffer_store_short", VGPR_32, i32, truncstorei16_global
+ "buffer_store_short", i32, truncstorei16_global
>;
defm BUFFER_STORE_DWORD : MUBUF_Pseudo_Stores <
- "buffer_store_dword", VGPR_32, i32, store_global
+ "buffer_store_dword", i32, store_global
>;
defm BUFFER_STORE_DWORDX2 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx2", VReg_64, v2i32, store_global
+ "buffer_store_dwordx2", v2i32, store_global
>;
defm BUFFER_STORE_DWORDX3 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx3", VReg_96, v3i32, store_global
+ "buffer_store_dwordx3", v3i32, store_global
>;
defm BUFFER_STORE_DWORDX4 : MUBUF_Pseudo_Stores <
- "buffer_store_dwordx4", VReg_128, v4i32, store_global
+ "buffer_store_dwordx4", v4i32, store_global
>;
defm BUFFER_ATOMIC_SWAP : MUBUF_Pseudo_Atomics <
- "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global
+ "buffer_atomic_swap", VGPR_32, i32, atomic_swap_global_32
>;
defm BUFFER_ATOMIC_CMPSWAP : MUBUF_Pseudo_Atomics <
"buffer_atomic_cmpswap", VReg_64, v2i32, null_frag
>;
defm BUFFER_ATOMIC_ADD : MUBUF_Pseudo_Atomics <
- "buffer_atomic_add", VGPR_32, i32, atomic_add_global
+ "buffer_atomic_add", VGPR_32, i32, atomic_load_add_global_32
>;
defm BUFFER_ATOMIC_SUB : MUBUF_Pseudo_Atomics <
- "buffer_atomic_sub", VGPR_32, i32, atomic_sub_global
+ "buffer_atomic_sub", VGPR_32, i32, atomic_load_sub_global_32
>;
defm BUFFER_ATOMIC_SMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smin", VGPR_32, i32, atomic_min_global
+ "buffer_atomic_smin", VGPR_32, i32, atomic_load_min_global_32
>;
defm BUFFER_ATOMIC_UMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umin", VGPR_32, i32, atomic_umin_global
+ "buffer_atomic_umin", VGPR_32, i32, atomic_load_umin_global_32
>;
defm BUFFER_ATOMIC_SMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smax", VGPR_32, i32, atomic_max_global
+ "buffer_atomic_smax", VGPR_32, i32, atomic_load_max_global_32
>;
defm BUFFER_ATOMIC_UMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umax", VGPR_32, i32, atomic_umax_global
+ "buffer_atomic_umax", VGPR_32, i32, atomic_load_umax_global_32
>;
defm BUFFER_ATOMIC_AND : MUBUF_Pseudo_Atomics <
- "buffer_atomic_and", VGPR_32, i32, atomic_and_global
+ "buffer_atomic_and", VGPR_32, i32, atomic_load_and_global_32
>;
defm BUFFER_ATOMIC_OR : MUBUF_Pseudo_Atomics <
- "buffer_atomic_or", VGPR_32, i32, atomic_or_global
+ "buffer_atomic_or", VGPR_32, i32, atomic_load_or_global_32
>;
defm BUFFER_ATOMIC_XOR : MUBUF_Pseudo_Atomics <
- "buffer_atomic_xor", VGPR_32, i32, atomic_xor_global
+ "buffer_atomic_xor", VGPR_32, i32, atomic_load_xor_global_32
>;
defm BUFFER_ATOMIC_INC : MUBUF_Pseudo_Atomics <
- "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global
+ "buffer_atomic_inc", VGPR_32, i32, atomic_inc_global_32
>;
defm BUFFER_ATOMIC_DEC : MUBUF_Pseudo_Atomics <
- "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global
+ "buffer_atomic_dec", VGPR_32, i32, atomic_dec_global_32
>;
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global
+ "buffer_atomic_swap_x2", VReg_64, i64, atomic_swap_global_64
>;
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_cmpswap_x2", VReg_128, v2i64, null_frag
>;
defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_add_x2", VReg_64, i64, atomic_add_global
+ "buffer_atomic_add_x2", VReg_64, i64, atomic_load_add_global_64
>;
defm BUFFER_ATOMIC_SUB_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_sub_x2", VReg_64, i64, atomic_sub_global
+ "buffer_atomic_sub_x2", VReg_64, i64, atomic_load_sub_global_64
>;
defm BUFFER_ATOMIC_SMIN_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smin_x2", VReg_64, i64, atomic_min_global
+ "buffer_atomic_smin_x2", VReg_64, i64, atomic_load_min_global_64
>;
defm BUFFER_ATOMIC_UMIN_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umin_x2", VReg_64, i64, atomic_umin_global
+ "buffer_atomic_umin_x2", VReg_64, i64, atomic_load_umin_global_64
>;
defm BUFFER_ATOMIC_SMAX_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_smax_x2", VReg_64, i64, atomic_max_global
+ "buffer_atomic_smax_x2", VReg_64, i64, atomic_load_max_global_64
>;
defm BUFFER_ATOMIC_UMAX_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_umax_x2", VReg_64, i64, atomic_umax_global
+ "buffer_atomic_umax_x2", VReg_64, i64, atomic_load_umax_global_64
>;
defm BUFFER_ATOMIC_AND_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_and_x2", VReg_64, i64, atomic_and_global
+ "buffer_atomic_and_x2", VReg_64, i64, atomic_load_and_global_64
>;
defm BUFFER_ATOMIC_OR_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_or_x2", VReg_64, i64, atomic_or_global
+ "buffer_atomic_or_x2", VReg_64, i64, atomic_load_or_global_64
>;
defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_xor_x2", VReg_64, i64, atomic_xor_global
+ "buffer_atomic_xor_x2", VReg_64, i64, atomic_load_xor_global_64
>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global
+ "buffer_atomic_inc_x2", VReg_64, i64, atomic_inc_global_64
>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global
+ "buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64
>;
let SubtargetPredicate = isGFX8GFX9 in {
@@ -981,58 +1010,75 @@ def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
let SubtargetPredicate = isGFX6 in { // isn't on CI & VI
/*
defm BUFFER_ATOMIC_RSUB : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub">;
-defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap">;
-defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin">;
-defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax">;
defm BUFFER_ATOMIC_RSUB_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_rsub_x2">;
-defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fcmpswap_x2">;
-defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmin_x2">;
-defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <"buffer_atomic_fmax_x2">;
*/
def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
int_amdgcn_buffer_wbinvl1_sc>;
}
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+
+defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fcmpswap", VReg_64, v2f32, null_frag
+>;
+defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fmin", VGPR_32, f32, null_frag
+>;
+defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fmax", VGPR_32, f32, null_frag
+>;
+defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fcmpswap_x2", VReg_128, v2f64, null_frag
+>;
+defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fmin_x2", VReg_64, f64, null_frag
+>;
+defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Pseudo_Atomics <
+ "buffer_atomic_fmax_x2", VReg_64, f64, null_frag
+>;
+
+}
+
let SubtargetPredicate = HasD16LoadStore in {
defm BUFFER_LOAD_UBYTE_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_ubyte_d16", VGPR_32, i32, null_frag, 1
+ "buffer_load_ubyte_d16", i32, null_frag, 1
>;
defm BUFFER_LOAD_UBYTE_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_ubyte_d16_hi", VGPR_32, i32, null_frag, 1
+ "buffer_load_ubyte_d16_hi", i32, null_frag, 1
>;
defm BUFFER_LOAD_SBYTE_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_sbyte_d16", VGPR_32, i32, null_frag, 1
+ "buffer_load_sbyte_d16", i32, null_frag, 1
>;
defm BUFFER_LOAD_SBYTE_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_sbyte_d16_hi", VGPR_32, i32, null_frag, 1
+ "buffer_load_sbyte_d16_hi", i32, null_frag, 1
>;
defm BUFFER_LOAD_SHORT_D16 : MUBUF_Pseudo_Loads <
- "buffer_load_short_d16", VGPR_32, i32, null_frag, 1
+ "buffer_load_short_d16", i32, null_frag, 1
>;
defm BUFFER_LOAD_SHORT_D16_HI : MUBUF_Pseudo_Loads <
- "buffer_load_short_d16_hi", VGPR_32, i32, null_frag, 1
+ "buffer_load_short_d16_hi", i32, null_frag, 1
>;
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Pseudo_Stores <
- "buffer_store_byte_d16_hi", VGPR_32, i32
+ "buffer_store_byte_d16_hi", i32
>;
defm BUFFER_STORE_SHORT_D16_HI : MUBUF_Pseudo_Stores <
- "buffer_store_short_d16_hi", VGPR_32, i32
+ "buffer_store_short_d16_hi", i32
>;
defm BUFFER_LOAD_FORMAT_D16_HI_X : MUBUF_Pseudo_Loads <
- "buffer_load_format_d16_hi_x", VGPR_32
+ "buffer_load_format_d16_hi_x", i32
>;
defm BUFFER_STORE_FORMAT_D16_HI_X : MUBUF_Pseudo_Stores <
- "buffer_store_format_d16_hi_x", VGPR_32
+ "buffer_store_format_d16_hi_x", i32
>;
} // End HasD16LoadStore
@@ -1043,10 +1089,10 @@ def BUFFER_WBINVL1 : MUBUF_Invalidate <"buffer_wbinvl1",
let SubtargetPredicate = HasAtomicFaddInsts in {
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_NO_RTN <
- "buffer_atomic_add_f32", VGPR_32, f32, atomic_add_global
+ "buffer_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
>;
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
- "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global
+ "buffer_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
>;
} // End SubtargetPredicate = HasAtomicFaddInsts
@@ -1055,35 +1101,35 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
// MTBUF Instructions
//===----------------------------------------------------------------------===//
-defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>;
-defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>;
-defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>;
-defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>;
-defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>;
-defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>;
-defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>;
-defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>;
+defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32, 1>;
+defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64, 2>;
+defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96, 3>;
+defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128, 4>;
+defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32, 1>;
+defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64, 2>;
+defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96, 3>;
+defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>;
let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in {
- defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
- defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>;
- defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
- defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>;
- defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>;
- defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>;
+ defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96, 3>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>;
+ defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>;
+ defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in {
- defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>;
- defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>;
- defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>;
- defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>;
- defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>;
- defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>;
- defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>;
+ defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>;
+ defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>;
+ defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>;
+ defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>;
+ defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>;
+ defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>;
} // End HasPackedD16VMem.
let SubtargetPredicate = isGFX7Plus in {
@@ -1118,6 +1164,10 @@ def extract_dlc : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
}]>;
+def extract_swz : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
+}]>;
+
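The extract_glc/extract_slc/extract_dlc/extract_swz SDNodeXForms each peel one bit out of the single auxiliary immediate carried by the buffer intrinsics (renamed from $cachepolicy to $auxiliary in the patterns below). A minimal C++ sketch of that decoding, assuming glc and slc sit in bits 0 and 1 (their XForms are not shown in this hunk); the dlc (>> 2) and swz (>> 3) shifts mirror the transforms above:

#include <cstdint>

// One flag per bit of the auxiliary immediate.
struct BufferAux {
  bool GLC; // bit 0 (assumed; extract_glc not shown in this hunk)
  bool SLC; // bit 1 (assumed; extract_slc not shown in this hunk)
  bool DLC; // bit 2, mirrors extract_dlc
  bool SWZ; // bit 3, mirrors extract_swz
};

static BufferAux decodeAux(uint64_t Aux) {
  return {(Aux & 1) != 0, ((Aux >> 1) & 1) != 0,
          ((Aux >> 2) & 1) != 0, ((Aux >> 3) & 1) != 0};
}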
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
@@ -1125,33 +1175,37 @@ def extract_dlc : SDNodeXForm<imm, [{
multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0)),
+ (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0)),
+ (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$auxiliary, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm)),
+ (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm)),
+ (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
}
@@ -1182,8 +1236,12 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
@@ -1196,36 +1254,40 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_ushort, i32, "BUFFER_LOAD_USHORT">;
multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$auxiliary, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $cachepolicy),
- (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (as_i16imm $offset), (extract_glc $auxiliary),
+ (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $cachepolicy),
- (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (as_i16imm $offset), (extract_glc $auxiliary),
+ (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy),
- (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary),
+ (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
}
@@ -1256,8 +1318,12 @@ let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
@@ -1273,32 +1339,32 @@ multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
(vt (name vt:$vdata_in, v4i32:$rsrc, 0,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0)),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm)),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(vt (name vt:$vdata_in, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0)),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0)),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm)),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
$vdata_in,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
@@ -1316,6 +1382,8 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i32, "BUFFER_ATOMIC_UMAX">;
defm : BufferAtomicPatterns<SIbuffer_atomic_and, i32, "BUFFER_ATOMIC_AND">;
defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">;
defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i32, "BUFFER_ATOMIC_INC">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i32, "BUFFER_ATOMIC_DEC">;
defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">;
@@ -1326,37 +1394,39 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_umax, i64, "BUFFER_ATOMIC_UMAX_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_and, i64, "BUFFER_ATOMIC_AND_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_or, i64, "BUFFER_ATOMIC_OR_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i64, "BUFFER_ATOMIC_XOR_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i64, "BUFFER_ATOMIC_INC_X2">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i64, "BUFFER_ATOMIC_DEC_X2">;
multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, 0,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFSET) $vdata_in, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vdata_in, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0),
(!cast<MUBUF_Pseudo>(opcode # _OFFEN) $vdata_in, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
(name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
$vdata_in,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
@@ -1370,8 +1440,8 @@ defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMI
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1382,8 +1452,8 @@ def : GCNPat<
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ 0, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1394,8 +1464,8 @@ def : GCNPat<
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, 0),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, 0),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1406,8 +1476,8 @@ def : GCNPat<
def : GCNPat<
(SIbuffer_atomic_cmpswap
i32:$data, i32:$cmp, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, imm:$offset,
- imm:$cachepolicy, imm),
+ i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$cachepolicy, timm),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
(REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
@@ -1419,8 +1489,8 @@ def : GCNPat<
class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
PatFrag constant_ld> : GCNPat <
(vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
>;
multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
@@ -1428,12 +1498,12 @@ multiclass MUBUFLoad_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Ins
def : GCNPat <
(vt (atomic_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc))),
- (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
+ (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
>;
def : GCNPat <
(vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))),
- (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0)
+ (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
>;
}
@@ -1454,8 +1524,8 @@ multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
def : GCNPat <
(vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))),
- (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))),
+ (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
>;
}
@@ -1478,12 +1548,12 @@ multiclass MUBUFScratchLoadPat <MUBUF_Pseudo InstrOffen,
def : GCNPat <
(vt (ld (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset))),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0)
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
>;
def : GCNPat <
(vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0)
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
>;
}
@@ -1493,12 +1563,12 @@ multiclass MUBUFScratchLoadPat_D16 <MUBUF_Pseudo InstrOffen,
ValueType vt, PatFrag ld_frag> {
def : GCNPat <
(ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in),
- (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in)
+ (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in)
>;
def : GCNPat <
(ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in),
- (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in)
+ (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in)
>;
}
@@ -1512,7 +1582,10 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, BUFFER_LOAD_SSHORT_OFFSET,
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, extloadi16_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i32, zextloadi16_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;
+
+foreach vt = Reg32Types.types in {
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, vt, load_private>;
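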
+}
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
@@ -1535,16 +1608,16 @@ defm : MUBUFScratchLoadPat_D16<BUFFER_LOAD_SBYTE_D16_OFFEN, BUFFER_LOAD_SBYTE_D1
multiclass MUBUFStore_Atomic_Pattern <MUBUF_Pseudo Instr_ADDR64, MUBUF_Pseudo Instr_OFFSET,
ValueType vt, PatFrag atomic_st> {
- // Store follows atomic op convention so address is forst
+ // Store follows atomic op convention so address is first
def : GCNPat <
(atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
i16:$offset, i1:$slc), vt:$val),
- (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0)
+ (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0)
>;
def : GCNPat <
(atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val),
- (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0)
+ (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0)
>;
}
let SubtargetPredicate = isGFX6GFX7 in {
@@ -1558,8 +1631,8 @@ multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
def : GCNPat <
(st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
- i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)),
- (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc)
+ i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)),
+ (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz)
>;
}
@@ -1573,13 +1646,13 @@ multiclass MUBUFScratchStorePat <MUBUF_Pseudo InstrOffen,
def : GCNPat <
(st vt:$value, (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr,
i32:$soffset, u16imm:$offset)),
- (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0)
+ (InstrOffen rc:$value, $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
>;
def : GCNPat <
(st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset,
u16imm:$offset)),
- (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0)
+ (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0)
>;
}
@@ -1587,7 +1660,11 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET,
defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i32, truncstorei16_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_private>;
defm : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, BUFFER_STORE_SHORT_OFFSET, i16, store_private>;
-defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, i32, store_private>;
+
+foreach vt = Reg32Types.types in {
+defm : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, BUFFER_STORE_DWORD_OFFSET, vt, store_private>;
+}
+
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, BUFFER_STORE_DWORDX2_OFFSET, v2i32, store_private, VReg_64>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX3_OFFEN, BUFFER_STORE_DWORDX3_OFFSET, v3i32, store_private, VReg_96>;
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private, VReg_128>;
@@ -1613,37 +1690,41 @@ defm : MUBUFScratchStorePat <BUFFER_STORE_BYTE_D16_HI_OFFEN, BUFFER_STORE_BYTE_D
multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, 0)),
+ (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, imm)),
+ (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, 0)),
+ (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, 0)),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, imm)),
+ (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset),
(as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
}
@@ -1671,37 +1752,41 @@ let SubtargetPredicate = HasPackedD16VMem in {
multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, 0),
+ (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, imm),
+ (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
- (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, imm:$offset,
- imm:$format, imm:$cachepolicy, 0),
+ (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
+ timm:$format, timm:$auxiliary, 0),
(!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
(as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
- imm:$offset, imm:$format, imm:$cachepolicy, imm),
+ timm:$offset, timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
$vdata,
(REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
$rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
- (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy))
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (extract_swz $auxiliary))
>;
}
@@ -1957,10 +2042,9 @@ defm BUFFER_ATOMIC_OR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03a>;
defm BUFFER_ATOMIC_XOR : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03b>;
defm BUFFER_ATOMIC_INC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03c>;
defm BUFFER_ATOMIC_DEC : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03d>;
-// FIXME-GFX6-GFX7-GFX10: Add following instructions:
-//defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>;
-//defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>;
-//defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>;
+defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03e>;
+defm BUFFER_ATOMIC_FMIN : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x03f>;
+defm BUFFER_ATOMIC_FMAX : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x040>;
defm BUFFER_ATOMIC_SWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x050>;
defm BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x051>;
defm BUFFER_ATOMIC_ADD_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x052>;
@@ -1975,10 +2059,9 @@ defm BUFFER_ATOMIC_XOR_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05b>;
defm BUFFER_ATOMIC_INC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05c>;
defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05d>;
// FIXME-GFX7: Need to handle hazard for BUFFER_ATOMIC_FCMPSWAP_X2 on GFX7.
-// FIXME-GFX6-GFX7-GFX10: Add following instructions:
-//defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
-//defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
-//defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
+defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
+defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
+defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>;
defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>;
@@ -2353,7 +2436,7 @@ let SubtargetPredicate = HasPackedD16VMem in {
def MUBUFInfoTable : GenericTable {
let FilterClass = "MUBUF_Pseudo";
let CppTypeName = "MUBUFInfo";
- let Fields = ["Opcode", "BaseOpcode", "dwords", "has_vaddr", "has_srsrc", "has_soffset"];
+ let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"];
let PrimaryKey = ["Opcode"];
let PrimaryKeyName = "getMUBUFOpcodeHelper";
@@ -2364,7 +2447,26 @@ def getMUBUFInfoFromOpcode : SearchIndex {
let Key = ["Opcode"];
}
-def getMUBUFInfoFromBaseOpcodeAndDwords : SearchIndex {
+def getMUBUFInfoFromBaseOpcodeAndElements : SearchIndex {
let Table = MUBUFInfoTable;
- let Key = ["BaseOpcode", "dwords"];
+ let Key = ["BaseOpcode", "elements"];
+}
+
+def MTBUFInfoTable : GenericTable {
+ let FilterClass = "MTBUF_Pseudo";
+ let CppTypeName = "MTBUFInfo";
+ let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getMTBUFOpcodeHelper";
+}
+
+def getMTBUFInfoFromOpcode : SearchIndex {
+ let Table = MTBUFInfoTable;
+ let Key = ["Opcode"];
+}
+
+def getMTBUFInfoFromBaseOpcodeAndElements : SearchIndex {
+ let Table = MTBUFInfoTable;
+ let Key = ["BaseOpcode", "elements"];
}
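The GenericTable/SearchIndex definitions above have TableGen emit one MUBUFInfo/MTBUFInfo record per pseudo, indexed both by Opcode and by (BaseOpcode, elements), so a buffer opcode can be exchanged for the variant handling a different element count. A rough sketch of the consumer side, with hypothetical record and helper names (the generated accessors are wrapped elsewhere, e.g. in AMDGPUBaseInfo):

#include <cstddef>
#include <cstdint>
#include <optional>

// Hypothetical shape of one generated MTBUFInfo row.
struct MTBUFInfo {
  uint16_t Opcode;
  uint16_t BaseOpcode;
  uint8_t elements;
  bool has_vaddr;
  bool has_srsrc;
  bool has_soffset;
};

// Hypothetical lookup mirroring getMTBUFInfoFromBaseOpcodeAndElements: given a
// base opcode and the element count (X -> 1, XY -> 2, ...), return the
// concrete opcode, if any.
static std::optional<uint16_t>
lookupMTBUFOpcode(const MTBUFInfo *Table, std::size_t N,
                  uint16_t BaseOpcode, uint8_t Elements) {
  for (std::size_t I = 0; I != N; ++I)
    if (Table[I].BaseOpcode == BaseOpcode && Table[I].elements == Elements)
      return Table[I].Opcode;
  return std::nullopt;
}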
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index c52eaaa3fdc5..816ec14a0e98 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -81,6 +81,17 @@ class DS_Real <DS_Pseudo ds> :
// DS Pseudo instructions
+class DS_0A1D_NORET<string opName, RegisterClass rc = VGPR_32>
+: DS_Pseudo<opName,
+ (outs),
+ (ins rc:$data0, offset:$offset, gds:$gds),
+ "$data0$offset$gds"> {
+
+ let has_addr = 0;
+ let has_data1 = 0;
+ let has_vdst = 0;
+}
+
class DS_1A1D_NORET<string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs),
@@ -317,13 +328,16 @@ class DS_GWS <string opName, dag ins, string asmOps>
class DS_GWS_0D <string opName>
: DS_GWS<opName,
- (ins offset:$offset, gds:$gds), "$offset gds">;
+ (ins offset:$offset, gds:$gds), "$offset gds"> {
+ let hasSideEffects = 1;
+}
class DS_GWS_1D <string opName>
: DS_GWS<opName,
(ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> {
let has_gws_data0 = 1;
+ let hasSideEffects = 1;
}
class DS_VOID <string opName> : DS_Pseudo<opName,
@@ -391,11 +405,12 @@ def DS_WRITE_B8_D16_HI : DS_1A1D_NORET<"ds_write_b8_d16_hi">;
def DS_WRITE_B16_D16_HI : DS_1A1D_NORET<"ds_write_b16_d16_hi">;
}
+} // End has_m0_read = 0
+
let SubtargetPredicate = HasDSAddTid in {
-def DS_WRITE_ADDTID_B32 : DS_1A1D_NORET<"ds_write_addtid_b32">;
+def DS_WRITE_ADDTID_B32 : DS_0A1D_NORET<"ds_write_addtid_b32">;
}
-} // End has_m0_read = 0
} // End mayLoad = 0
defm DS_MSKOR_B32 : DS_1A2D_NORET_mc<"ds_mskor_b32">;
@@ -540,13 +555,14 @@ def DS_READ_I8_D16_HI : DS_1A_RET_Tied<"ds_read_i8_d16_hi">;
def DS_READ_U16_D16 : DS_1A_RET_Tied<"ds_read_u16_d16">;
def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
}
+} // End has_m0_read = 0
let SubtargetPredicate = HasDSAddTid in {
-def DS_READ_ADDTID_B32 : DS_1A_RET<"ds_read_addtid_b32">;
-}
-} // End has_m0_read = 0
+def DS_READ_ADDTID_B32 : DS_0A_RET<"ds_read_addtid_b32">;
}
+} // End mayStore = 0
+
def DS_CONSUME : DS_0A_RET<"ds_consume">;
def DS_APPEND : DS_0A_RET<"ds_append">;
def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
@@ -600,13 +616,13 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
//===----------------------------------------------------------------------===//
def : GCNPat <
- (int_amdgcn_ds_swizzle i32:$src, imm:$offset16),
+ (int_amdgcn_ds_swizzle i32:$src, timm:$offset16),
(DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0))
>;
class DSReadPat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
- (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset))),
- (inst $ptr, (as_i16imm $offset), (i1 gds))
+ (vt (frag (DS1Addr1Offset i32:$ptr, i16:$offset))),
+ (inst $ptr, offset:$offset, (i1 gds))
>;
multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
@@ -621,8 +637,8 @@ multiclass DSReadPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
}
class DSReadPat_D16 <DS_Pseudo inst, PatFrag frag, ValueType vt> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$in),
- (inst $ptr, (as_i16imm $offset), (i1 0), $in)
+ (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$in),
+ (inst $ptr, offset:$offset, (i1 0), $in)
>;
defm : DSReadPat_mc <DS_READ_I8, i32, "sextloadi8_local">;
@@ -636,13 +652,20 @@ defm : DSReadPat_mc <DS_READ_I16, i32, "sextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "extloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i32, "zextloadi16_local">;
defm : DSReadPat_mc <DS_READ_U16, i16, "load_local">;
-defm : DSReadPat_mc <DS_READ_B32, i32, "load_local">;
+
+foreach vt = Reg32Types.types in {
+defm : DSReadPat_mc <DS_READ_B32, vt, "load_local">;
+}
+
defm : DSReadPat_mc <DS_READ_B32, i32, "atomic_load_32_local">;
defm : DSReadPat_mc <DS_READ_B64, i64, "atomic_load_64_local">;
let AddedComplexity = 100 in {
-defm : DSReadPat_mc <DS_READ_B64, v2i32, "load_align8_local">;
+foreach vt = VReg_64.RegTypes in {
+defm : DSReadPat_mc <DS_READ_B64, vt, "load_align8_local">;
+}
+
defm : DSReadPat_mc <DS_READ_B128, v4i32, "load_align16_local">;
} // End AddedComplexity = 100
@@ -664,8 +687,8 @@ def : DSReadPat_D16<DS_READ_I8_D16, sextloadi8_d16_lo_local, v2f16>;
}
class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag, int gds=0> : GCNPat <
- (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
- (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
+ (frag vt:$value, (DS1Addr1Offset i32:$ptr, i16:$offset)),
+ (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
>;
multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
@@ -681,8 +704,8 @@ multiclass DSWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
// Irritatingly, atomic_store reverses the order of operands from a
// normal store.
class DSAtomicWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst $ptr, $value, (as_i16imm $offset), (i1 0))
+ (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
+ (inst $ptr, $value, offset:$offset, (i1 0))
>;
multiclass DSAtomicWritePat_mc <DS_Pseudo inst, ValueType vt, string frag> {
@@ -699,9 +722,13 @@ defm : DSWritePat_mc <DS_WRITE_B8, i32, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
-defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local">;
-defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local">;
+
+foreach vt = VGPR_32.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
+}
+
+defm : DSAtomicWritePat_mc <DS_WRITE_B32, i32, "atomic_store_local_32">;
+defm : DSAtomicWritePat_mc <DS_WRITE_B64, i64, "atomic_store_local_64">;
let OtherPredicates = [D16PreservesUnusedBits] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
@@ -736,46 +763,49 @@ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
let AddedComplexity = 100 in {
-defm : DSWritePat_mc <DS_WRITE_B64, v2i32, "store_align8_local">;
+foreach vt = VReg_64.RegTypes in {
+defm : DSWritePat_mc <DS_WRITE_B64, vt, "store_align8_local">;
+}
+
defm : DSWritePat_mc <DS_WRITE_B128, v4i32, "store_align16_local">;
} // End AddedComplexity = 100
class DSAtomicRetPat<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$value),
- (inst $ptr, $value, (as_i16imm $offset), (i1 gds))
+ (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$value),
+ (inst $ptr, getVregSrcForVT<vt>.ret:$value, offset:$offset, (i1 gds))
>;
multiclass DSAtomicRetPat_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0")>;
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local")>;
+ !cast<PatFrag>(frag#"_local_"#vt.Size)>;
}
- def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
+ def : DSAtomicRetPat<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>;
}
class DSAtomicCmpXChg<DS_Pseudo inst, ValueType vt, PatFrag frag, bit gds=0> : GCNPat <
- (frag (DS1Addr1Offset i32:$ptr, i32:$offset), vt:$cmp, vt:$swap),
- (inst $ptr, $cmp, $swap, (as_i16imm $offset), (i1 gds))
+ (frag (DS1Addr1Offset i32:$ptr, i16:$offset), vt:$cmp, vt:$swap),
+ (inst $ptr, getVregSrcForVT<vt>.ret:$cmp, getVregSrcForVT<vt>.ret:$swap, offset:$offset, (i1 gds))
>;
multiclass DSAtomicCmpXChg_mc<DS_Pseudo inst, ValueType vt, string frag> {
let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0")>;
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_local_m0_"#vt.Size)>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicCmpXChg<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local")>;
+ !cast<PatFrag>(frag#"_local_"#vt.Size)>;
}
- def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0"), 1>;
+ def : DSAtomicCmpXChg<inst, vt, !cast<PatFrag>(frag#"_region_m0_"#vt.Size), 1>;
}
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 4ec4be9bc485..ec2e2c4e8b71 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -1095,6 +1095,7 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg64(unsigned Val) const {
case 106: return createRegOperand(VCC);
case 108: return createRegOperand(TBA);
case 110: return createRegOperand(TMA);
+ case 125: return createRegOperand(SGPR_NULL);
case 126: return createRegOperand(EXEC);
case 235: return createRegOperand(SRC_SHARED_BASE);
case 236: return createRegOperand(SRC_SHARED_LIMIT);
@@ -1172,7 +1173,8 @@ MCOperand AMDGPUDisassembler::decodeSDWAVopcDst(unsigned Val) const {
int TTmpIdx = getTTmpIdx(Val);
if (TTmpIdx >= 0) {
- return createSRegOperand(getTtmpClassId(OPW64), TTmpIdx);
+ auto TTmpClsId = getTtmpClassId(IsWave64 ? OPW64 : OPW32);
+ return createSRegOperand(TTmpClsId, TTmpIdx);
} else if (Val > SGPR_MAX) {
return IsWave64 ? decodeSpecialReg64(Val)
: decodeSpecialReg32(Val);
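The decodeSDWAVopcDst change above stops hard-coding the 64-bit ttmp class: with a 32-bit wavefront the SDWA VOPC destination is a single ttmp register, so the class width has to follow IsWave64, just as the existing fall-through already chooses between decodeSpecialReg64 and decodeSpecialReg32. A trivial sketch of that selection using the decoder's OPW32/OPW64 width kinds (helper name is hypothetical):

enum OpWidthTy { OPW32, OPW64 };

// Hypothetical helper: the ttmp register-class width must track the wave size.
static OpWidthTy sdwaVopcDstTtmpWidth(bool IsWave64) {
  return IsWave64 ? OPW64 : OPW32;
}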
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 0550092ce1d6..792e26d21f98 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -322,46 +322,46 @@ def : EGOrCaymanPat<(i32 (atomic_cmp_swap_global_noret i32:$ptr, i32:$cmp, i32:$
defm AtomicSwapPat : AtomicPat <RAT_ATOMIC_XCHG_INT_RTN,
RAT_ATOMIC_XCHG_INT_NORET,
- atomic_swap_global_ret,
- atomic_swap_global_noret>;
+ atomic_swap_global_ret_32,
+ atomic_swap_global_noret_32>;
defm AtomicAddPat : AtomicPat <RAT_ATOMIC_ADD_RTN, RAT_ATOMIC_ADD_NORET,
- atomic_add_global_ret, atomic_add_global_noret>;
+ atomic_load_add_global_ret_32, atomic_load_add_global_noret_32>;
defm AtomicSubPat : AtomicPat <RAT_ATOMIC_SUB_RTN, RAT_ATOMIC_SUB_NORET,
- atomic_sub_global_ret, atomic_sub_global_noret>;
+ atomic_load_sub_global_ret_32, atomic_load_sub_global_noret_32>;
defm AtomicMinPat : AtomicPat <RAT_ATOMIC_MIN_INT_RTN,
RAT_ATOMIC_MIN_INT_NORET,
- atomic_min_global_ret, atomic_min_global_noret>;
+ atomic_load_min_global_ret_32, atomic_load_min_global_noret_32>;
defm AtomicUMinPat : AtomicPat <RAT_ATOMIC_MIN_UINT_RTN,
RAT_ATOMIC_MIN_UINT_NORET,
- atomic_umin_global_ret, atomic_umin_global_noret>;
+ atomic_load_umin_global_ret_32, atomic_load_umin_global_noret_32>;
defm AtomicMaxPat : AtomicPat <RAT_ATOMIC_MAX_INT_RTN,
RAT_ATOMIC_MAX_INT_NORET,
- atomic_max_global_ret, atomic_max_global_noret>;
+ atomic_load_max_global_ret_32, atomic_load_max_global_noret_32>;
defm AtomicUMaxPat : AtomicPat <RAT_ATOMIC_MAX_UINT_RTN,
RAT_ATOMIC_MAX_UINT_NORET,
- atomic_umax_global_ret, atomic_umax_global_noret>;
+ atomic_load_umax_global_ret_32, atomic_load_umax_global_noret_32>;
defm AtomicAndPat : AtomicPat <RAT_ATOMIC_AND_RTN, RAT_ATOMIC_AND_NORET,
- atomic_and_global_ret, atomic_and_global_noret>;
+ atomic_load_and_global_ret_32, atomic_load_and_global_noret_32>;
defm AtomicOrPat : AtomicPat <RAT_ATOMIC_OR_RTN, RAT_ATOMIC_OR_NORET,
- atomic_or_global_ret, atomic_or_global_noret>;
+ atomic_load_or_global_ret_32, atomic_load_or_global_noret_32>;
defm AtomicXorPat : AtomicPat <RAT_ATOMIC_XOR_RTN, RAT_ATOMIC_XOR_NORET,
- atomic_xor_global_ret, atomic_xor_global_noret>;
+ atomic_load_xor_global_ret_32, atomic_load_xor_global_noret_32>;
defm AtomicIncAddPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
RAT_ATOMIC_INC_UINT_NORET,
- atomic_add_global_ret,
- atomic_add_global_noret, 1>;
+ atomic_load_add_global_ret_32,
+ atomic_load_add_global_noret_32, 1>;
defm AtomicIncSubPat : AtomicIncDecPat <RAT_ATOMIC_INC_UINT_RTN,
RAT_ATOMIC_INC_UINT_NORET,
- atomic_sub_global_ret,
- atomic_sub_global_noret, -1>;
+ atomic_load_sub_global_ret_32,
+ atomic_load_sub_global_noret_32, -1>;
defm AtomicDecAddPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
RAT_ATOMIC_DEC_UINT_NORET,
- atomic_add_global_ret,
- atomic_add_global_noret, -1>;
+ atomic_load_add_global_ret_32,
+ atomic_load_add_global_noret_32, -1>;
defm AtomicDecSubPat : AtomicIncDecPat <RAT_ATOMIC_DEC_UINT_RTN,
RAT_ATOMIC_DEC_UINT_NORET,
- atomic_sub_global_ret,
- atomic_sub_global_noret, 1>;
+ atomic_load_sub_global_ret_32,
+ atomic_load_sub_global_noret_32, 1>;
// Should be predicated on FeatureFP64
// def FMA_64 : R600_3OP <
@@ -628,37 +628,37 @@ def LDS_SHORT_WRITE : R600_LDS_1A1D_NORET<0x13, "LDS_SHORT_WRITE",
[(truncstorei16_local i32:$src1, i32:$src0)]
>;
def LDS_ADD_RET : R600_LDS_1A1D_RET <0x20, "LDS_ADD",
- [(set i32:$dst, (atomic_load_add_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_add_local_32 i32:$src0, i32:$src1))]
>;
def LDS_SUB_RET : R600_LDS_1A1D_RET <0x21, "LDS_SUB",
- [(set i32:$dst, (atomic_load_sub_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_sub_local_32 i32:$src0, i32:$src1))]
>;
def LDS_AND_RET : R600_LDS_1A1D_RET <0x29, "LDS_AND",
- [(set i32:$dst, (atomic_load_and_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_and_local_32 i32:$src0, i32:$src1))]
>;
def LDS_OR_RET : R600_LDS_1A1D_RET <0x2a, "LDS_OR",
- [(set i32:$dst, (atomic_load_or_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_or_local_32 i32:$src0, i32:$src1))]
>;
def LDS_XOR_RET : R600_LDS_1A1D_RET <0x2b, "LDS_XOR",
- [(set i32:$dst, (atomic_load_xor_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_xor_local_32 i32:$src0, i32:$src1))]
>;
def LDS_MIN_INT_RET : R600_LDS_1A1D_RET <0x25, "LDS_MIN_INT",
- [(set i32:$dst, (atomic_load_min_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_min_local_32 i32:$src0, i32:$src1))]
>;
def LDS_MAX_INT_RET : R600_LDS_1A1D_RET <0x26, "LDS_MAX_INT",
- [(set i32:$dst, (atomic_load_max_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_max_local_32 i32:$src0, i32:$src1))]
>;
def LDS_MIN_UINT_RET : R600_LDS_1A1D_RET <0x27, "LDS_MIN_UINT",
- [(set i32:$dst, (atomic_load_umin_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_umin_local_32 i32:$src0, i32:$src1))]
>;
def LDS_MAX_UINT_RET : R600_LDS_1A1D_RET <0x28, "LDS_MAX_UINT",
- [(set i32:$dst, (atomic_load_umax_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_load_umax_local_32 i32:$src0, i32:$src1))]
>;
def LDS_WRXCHG_RET : R600_LDS_1A1D_RET <0x2d, "LDS_WRXCHG",
- [(set i32:$dst, (atomic_swap_local i32:$src0, i32:$src1))]
+ [(set i32:$dst, (atomic_swap_local_32 i32:$src0, i32:$src1))]
>;
def LDS_CMPST_RET : R600_LDS_1A2D_RET <0x30, "LDS_CMPST",
- [(set i32:$dst, (atomic_cmp_swap_local i32:$src0, i32:$src1, i32:$src2))]
+ [(set i32:$dst, (atomic_cmp_swap_local_32 i32:$src0, i32:$src1, i32:$src2))]
>;
def LDS_READ_RET : R600_LDS_1A <0x32, "LDS_READ_RET",
[(set (i32 R600_Reg32:$dst), (load_local R600_Reg32:$src0))]
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 889f60dae920..80ee17eba141 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -270,7 +270,7 @@ multiclass FLAT_Atomic_Pseudo<
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = getIsFP<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
(ins VReg_64:$vaddr, data_rc:$vdata, flat_offset:$offset, SLC:$slc),
@@ -300,7 +300,7 @@ multiclass FLAT_Global_Atomic_Pseudo_NO_RTN<
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = getIsFP<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret> {
def "" : FLAT_AtomicNoRet_Pseudo <opName,
(outs),
@@ -333,7 +333,7 @@ multiclass FLAT_Global_Atomic_Pseudo_RTN<
SDPatternOperator atomic = null_frag,
ValueType data_vt = vt,
RegisterClass data_rc = vdst_rc,
- bit isFP = getIsFP<data_vt>.ret> {
+ bit isFP = isFloatType<data_vt>.ret> {
def _RTN : FLAT_AtomicRet_Pseudo <opName,
(outs vdst_rc:$vdst),
@@ -564,76 +564,76 @@ defm GLOBAL_ATOMIC_CMPSWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_cmpswa
v2i64, VReg_128>;
defm GLOBAL_ATOMIC_SWAP : FLAT_Global_Atomic_Pseudo <"global_atomic_swap",
- VGPR_32, i32, atomic_swap_global>;
+ VGPR_32, i32, atomic_swap_global_32>;
defm GLOBAL_ATOMIC_SWAP_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_swap_x2",
- VReg_64, i64, atomic_swap_global>;
+ VReg_64, i64, atomic_swap_global_64>;
defm GLOBAL_ATOMIC_ADD : FLAT_Global_Atomic_Pseudo <"global_atomic_add",
- VGPR_32, i32, atomic_add_global>;
+ VGPR_32, i32, atomic_load_add_global_32>;
defm GLOBAL_ATOMIC_SUB : FLAT_Global_Atomic_Pseudo <"global_atomic_sub",
- VGPR_32, i32, atomic_sub_global>;
+ VGPR_32, i32, atomic_load_sub_global_32>;
defm GLOBAL_ATOMIC_SMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_smin",
- VGPR_32, i32, atomic_min_global>;
+ VGPR_32, i32, atomic_load_min_global_32>;
defm GLOBAL_ATOMIC_UMIN : FLAT_Global_Atomic_Pseudo <"global_atomic_umin",
- VGPR_32, i32, atomic_umin_global>;
+ VGPR_32, i32, atomic_load_umin_global_32>;
defm GLOBAL_ATOMIC_SMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_smax",
- VGPR_32, i32, atomic_max_global>;
+ VGPR_32, i32, atomic_load_max_global_32>;
defm GLOBAL_ATOMIC_UMAX : FLAT_Global_Atomic_Pseudo <"global_atomic_umax",
- VGPR_32, i32, atomic_umax_global>;
+ VGPR_32, i32, atomic_load_umax_global_32>;
defm GLOBAL_ATOMIC_AND : FLAT_Global_Atomic_Pseudo <"global_atomic_and",
- VGPR_32, i32, atomic_and_global>;
+ VGPR_32, i32, atomic_load_and_global_32>;
defm GLOBAL_ATOMIC_OR : FLAT_Global_Atomic_Pseudo <"global_atomic_or",
- VGPR_32, i32, atomic_or_global>;
+ VGPR_32, i32, atomic_load_or_global_32>;
defm GLOBAL_ATOMIC_XOR : FLAT_Global_Atomic_Pseudo <"global_atomic_xor",
- VGPR_32, i32, atomic_xor_global>;
+ VGPR_32, i32, atomic_load_xor_global_32>;
defm GLOBAL_ATOMIC_INC : FLAT_Global_Atomic_Pseudo <"global_atomic_inc",
- VGPR_32, i32, atomic_inc_global>;
+ VGPR_32, i32, atomic_inc_global_32>;
defm GLOBAL_ATOMIC_DEC : FLAT_Global_Atomic_Pseudo <"global_atomic_dec",
- VGPR_32, i32, atomic_dec_global>;
+ VGPR_32, i32, atomic_dec_global_32>;
defm GLOBAL_ATOMIC_ADD_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_add_x2",
- VReg_64, i64, atomic_add_global>;
+ VReg_64, i64, atomic_load_add_global_64>;
defm GLOBAL_ATOMIC_SUB_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_sub_x2",
- VReg_64, i64, atomic_sub_global>;
+ VReg_64, i64, atomic_load_sub_global_64>;
defm GLOBAL_ATOMIC_SMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smin_x2",
- VReg_64, i64, atomic_min_global>;
+ VReg_64, i64, atomic_load_min_global_64>;
defm GLOBAL_ATOMIC_UMIN_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umin_x2",
- VReg_64, i64, atomic_umin_global>;
+ VReg_64, i64, atomic_load_umin_global_64>;
defm GLOBAL_ATOMIC_SMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_smax_x2",
- VReg_64, i64, atomic_max_global>;
+ VReg_64, i64, atomic_load_max_global_64>;
defm GLOBAL_ATOMIC_UMAX_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_umax_x2",
- VReg_64, i64, atomic_umax_global>;
+ VReg_64, i64, atomic_load_umax_global_64>;
defm GLOBAL_ATOMIC_AND_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_and_x2",
- VReg_64, i64, atomic_and_global>;
+ VReg_64, i64, atomic_load_and_global_64>;
defm GLOBAL_ATOMIC_OR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_or_x2",
- VReg_64, i64, atomic_or_global>;
+ VReg_64, i64, atomic_load_or_global_64>;
defm GLOBAL_ATOMIC_XOR_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_xor_x2",
- VReg_64, i64, atomic_xor_global>;
+ VReg_64, i64, atomic_load_xor_global_64>;
defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
- VReg_64, i64, atomic_inc_global>;
+ VReg_64, i64, atomic_inc_global_64>;
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
- VReg_64, i64, atomic_dec_global>;
+ VReg_64, i64, atomic_dec_global_64>;
} // End is_flat_global = 1
} // End SubtargetPredicate = HasFlatGlobalInsts
@@ -686,10 +686,10 @@ let SubtargetPredicate = isGFX10Plus, is_flat_global = 1 in {
let SubtargetPredicate = HasAtomicFaddInsts, is_flat_global = 1 in {
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
- "global_atomic_add_f32", VGPR_32, f32, atomic_add_global
+ "global_atomic_add_f32", VGPR_32, f32, atomic_fadd_global_noret
>;
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
- "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_add_global
+ "global_atomic_pk_add_f16", VGPR_32, v2f16, atomic_pk_fadd_global_noret
>;
} // End SubtargetPredicate = HasAtomicFaddInsts
@@ -777,8 +777,6 @@ def : FlatLoadPat <FLAT_LOAD_USHORT, extloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, zextloadi16_flat, i32>;
def : FlatLoadPat <FLAT_LOAD_USHORT, load_flat, i16>;
def : FlatLoadPat <FLAT_LOAD_SSHORT, sextloadi16_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, i32>;
-def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, v2i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX3, load_flat, v3i32>;
def : FlatLoadPat <FLAT_LOAD_DWORDX4, load_flat, v4i32>;
@@ -787,41 +785,50 @@ def : FlatLoadAtomicPat <FLAT_LOAD_DWORDX2, atomic_load_64_flat, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i32>;
def : FlatStorePat <FLAT_STORE_SHORT, truncstorei16_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORD, store_flat, i32>;
-def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, v2i32, VReg_64>;
+
+foreach vt = Reg32Types.types in {
+def : FlatLoadPat <FLAT_LOAD_DWORD, load_flat, vt>;
+def : FlatStorePat <FLAT_STORE_DWORD, store_flat, vt>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+def : FlatStorePat <FLAT_STORE_DWORDX2, store_flat, vt, VReg_64>;
+def : FlatLoadPat <FLAT_LOAD_DWORDX2, load_flat, vt>;
+}
+
def : FlatStorePat <FLAT_STORE_DWORDX3, store_flat, v3i32, VReg_96>;
def : FlatStorePat <FLAT_STORE_DWORDX4, store_flat, v4i32, VReg_128>;
def : FlatStoreAtomicPat <FLAT_STORE_DWORD, atomic_store_flat_32, i32>;
def : FlatStoreAtomicPat <FLAT_STORE_DWORDX2, atomic_store_flat_64, i64, VReg_64>;
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_RTN, atomic_inc_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
-
-def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
+
+def : FlatAtomicPat <FLAT_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>;
def : FlatAtomicPat <FLAT_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
-def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+def : FlatAtomicPat <FLAT_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>;
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
@@ -847,9 +854,6 @@ def : FlatLoadPat_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
} // End OtherPredicates = [HasFlatAddressSpace]
-def atomic_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_fadd>;
-def atomic_pk_fadd_global : global_binary_atomic_op_frag<SIglobal_atomic_pk_fadd>;
-
let OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10 in {
def : FlatLoadSignedPat <GLOBAL_LOAD_UBYTE, extloadi8_global, i32>;
@@ -863,8 +867,16 @@ def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_USHORT, load_global, i16>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, i32>;
-def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, v2i32>;
+foreach vt = Reg32Types.types in {
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORD, load_global, vt>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, vt, VGPR_32>;
+}
+
+foreach vt = VReg_64.RegTypes in {
+def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX2, load_global, vt>;
+def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, vt, VReg_64>;
+}
+
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX3, load_global, v3i32>;
def : FlatLoadSignedPat <GLOBAL_LOAD_DWORDX4, load_global, v4i32>;
@@ -875,8 +887,6 @@ def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i32, VGPR_32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE, truncstorei8_global, i16, VGPR_32>;
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, truncstorei16_global, i32, VGPR_32>;
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT, store_global, i16, VGPR_32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORD, store_global, i32, VGPR_32>;
-def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32, VReg_64>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX3, store_global, v3i32, VReg_96>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32, VReg_128>;
@@ -902,36 +912,36 @@ def : FlatSignedLoadPat_D16 <GLOBAL_LOAD_SHORT_D16, load_d16_lo_global, v2f16>;
def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORD, store_atomic_global, i32>;
def : FlatStoreSignedAtomicPat <GLOBAL_STORE_DWORDX2, store_atomic_global, i64, VReg_64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_add_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_sub_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_and_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_max_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_umax_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_min_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_umin_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_or_global, i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_RTN, atomic_load_add_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_RTN, atomic_load_sub_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_RTN, atomic_inc_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_RTN, atomic_dec_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_RTN, atomic_load_and_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_RTN, atomic_load_max_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_RTN, atomic_load_umax_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_RTN, atomic_load_min_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_RTN, atomic_load_umin_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global, i32, v2i32>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_xor_global, i32>;
-
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_add_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_sub_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_and_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_max_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_umax_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_min_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_umin_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_or_global, i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
+
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_INC_X2_RTN, atomic_inc_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_DEC_X2_RTN, atomic_dec_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_AND_X2_RTN, atomic_load_and_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMAX_X2_RTN, atomic_load_max_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMAX_X2_RTN, atomic_load_umax_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SMIN_X2_RTN, atomic_load_min_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_UMIN_X2_RTN, atomic_load_umin_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_X2_RTN, atomic_load_or_global_64, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_X2_RTN, atomic_swap_global_64, i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_X2_RTN, AMDGPUatomic_cmp_swap_global, i64, v2i64>;
-def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_xor_global, i64>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_X2_RTN, atomic_load_xor_global_64, i64>;
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global, f32>;
-def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global, v2f16>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_ADD_F32, atomic_fadd_global_noret, f32>;
+def : FlatAtomicPatNoRtn <GLOBAL_ATOMIC_PK_ADD_F16, atomic_pk_fadd_global_noret, v2f16>;
} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
@@ -1174,7 +1184,7 @@ class FLAT_Real_gfx10<bits<7> op, FLAT_Pseudo ps> :
let AssemblerPredicate = isGFX10Plus;
let DecoderNamespace = "GFX10";
- let Inst{11-0} = {offset{12}, offset{10-0}};
+ let Inst{11-0} = offset{11-0};
let Inst{12} = !if(ps.has_dlc, dlc, ps.dlcValue);
let Inst{54-48} = !if(ps.has_saddr, !if(ps.enabled_saddr, saddr, 0x7d), 0x7d);
let Inst{55} = 0;
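
The FLAT_Real_gfx10 hunk above changes the 12-bit offset field from the spliced form {offset{12}, offset{10-0}} to a plain offset{11-0}. A minimal standalone C++ sketch (illustration only, not part of the patch; field widths taken from the hunk) of what the two encodings do to an offset whose bit 11 is set:

#include <cstdint>
#include <cstdio>

// Old splice: bit 12 of the offset followed by bits 10..0 -- bit 11 is dropped.
static uint16_t encodeOld(uint16_t Offset) {
  return static_cast<uint16_t>((((Offset >> 12) & 1) << 11) | (Offset & 0x7ff));
}

// New form: simply the low 12 bits of the offset.
static uint16_t encodeNew(uint16_t Offset) {
  return static_cast<uint16_t>(Offset & 0xfff);
}

int main() {
  // 0x800 has only bit 11 set; the old splice encodes it as 0.
  std::printf("old=0x%03x new=0x%03x\n", encodeOld(0x800), encodeNew(0x800));
  return 0;
}
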
diff --git a/lib/Target/AMDGPU/GCNDPPCombine.cpp b/lib/Target/AMDGPU/GCNDPPCombine.cpp
index e1845e2e8e87..98678873e37c 100644
--- a/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -41,6 +41,7 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -155,8 +156,6 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
RegSubRegPair CombOldVGPR,
bool CombBCZ) const {
assert(MovMI.getOpcode() == AMDGPU::V_MOV_B32_dpp);
- assert(TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst)->getReg() ==
- TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)->getReg());
auto OrigOp = OrigMI.getOpcode();
auto DPPOp = getDPPOp(OrigOp);
@@ -178,7 +177,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
if (OldIdx != -1) {
assert(OldIdx == NumOperands);
assert(isOfRegClass(CombOldVGPR, AMDGPU::VGPR_32RegClass, *MRI));
- DPPInst.addReg(CombOldVGPR.Reg, 0, CombOldVGPR.SubReg);
+ auto *Def = getVRegSubRegDef(CombOldVGPR, *MRI);
+ DPPInst.addReg(CombOldVGPR.Reg, Def ? 0 : RegState::Undef,
+ CombOldVGPR.SubReg);
++NumOperands;
} else {
// TODO: this discards MAC/FMA instructions for now, let's add it later
@@ -195,6 +196,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
assert(0LL == (Mod0->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
DPPInst.addImm(Mod0->getImm());
++NumOperands;
+ } else if (AMDGPU::getNamedOperandIdx(DPPOp,
+ AMDGPU::OpName::src0_modifiers) != -1) {
+ DPPInst.addImm(0);
+ ++NumOperands;
}
auto *Src0 = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
assert(Src0);
@@ -214,6 +219,10 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
assert(0LL == (Mod1->getImm() & ~(SISrcMods::ABS | SISrcMods::NEG)));
DPPInst.addImm(Mod1->getImm());
++NumOperands;
+ } else if (AMDGPU::getNamedOperandIdx(DPPOp,
+ AMDGPU::OpName::src1_modifiers) != -1) {
+ DPPInst.addImm(0);
+ ++NumOperands;
}
if (auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
if (!TII->isOperandLegal(*DPPInst.getInstr(), NumOperands, Src1)) {
@@ -344,6 +353,10 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
auto *DstOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::vdst);
assert(DstOpnd && DstOpnd->isReg());
auto DPPMovReg = DstOpnd->getReg();
+ if (DPPMovReg.isPhysical()) {
+ LLVM_DEBUG(dbgs() << " failed: dpp move writes physreg\n");
+ return false;
+ }
if (execMayBeModifiedBeforeAnyUse(*MRI, DPPMovReg, MovMI)) {
LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
" for all uses\n");
@@ -362,7 +375,13 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
bool BoundCtrlZero = BCZOpnd->getImm();
auto *OldOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::old);
+ auto *SrcOpnd = TII->getNamedOperand(MovMI, AMDGPU::OpName::src0);
assert(OldOpnd && OldOpnd->isReg());
+ assert(SrcOpnd && SrcOpnd->isReg());
+ if (OldOpnd->getReg().isPhysical() || SrcOpnd->getReg().isPhysical()) {
+ LLVM_DEBUG(dbgs() << " failed: dpp move reads physreg\n");
+ return false;
+ }
auto * const OldOpndValue = getOldOpndValue(*OldOpnd);
// OldOpndValue is either undef (IMPLICIT_DEF) or immediate or something else
@@ -408,6 +427,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
dbgs() << ", bound_ctrl=" << CombBCZ << '\n');
SmallVector<MachineInstr*, 4> OrigMIs, DPPMIs;
+ DenseMap<MachineInstr*, SmallVector<unsigned, 4>> RegSeqWithOpNos;
auto CombOldVGPR = getRegSubRegPair(*OldOpnd);
// try to reuse previous old reg if its undefined (IMPLICIT_DEF)
if (CombBCZ && OldOpndValue) { // CombOldVGPR should be undef
@@ -420,13 +440,49 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
OrigMIs.push_back(&MovMI);
bool Rollback = true;
+ SmallVector<MachineOperand*, 16> Uses;
+
for (auto &Use : MRI->use_nodbg_operands(DPPMovReg)) {
+ Uses.push_back(&Use);
+ }
+
+ while (!Uses.empty()) {
+ MachineOperand *Use = Uses.pop_back_val();
Rollback = true;
- auto &OrigMI = *Use.getParent();
+ auto &OrigMI = *Use->getParent();
LLVM_DEBUG(dbgs() << " try: " << OrigMI);
auto OrigOp = OrigMI.getOpcode();
+ if (OrigOp == AMDGPU::REG_SEQUENCE) {
+ Register FwdReg = OrigMI.getOperand(0).getReg();
+ unsigned FwdSubReg = 0;
+
+ if (execMayBeModifiedBeforeAnyUse(*MRI, FwdReg, OrigMI)) {
+ LLVM_DEBUG(dbgs() << " failed: EXEC mask should remain the same"
+ " for all uses\n");
+ break;
+ }
+
+ unsigned OpNo, E = OrigMI.getNumOperands();
+ for (OpNo = 1; OpNo < E; OpNo += 2) {
+ if (OrigMI.getOperand(OpNo).getReg() == DPPMovReg) {
+ FwdSubReg = OrigMI.getOperand(OpNo + 1).getImm();
+ break;
+ }
+ }
+
+ if (!FwdSubReg)
+ break;
+
+ for (auto &Op : MRI->use_nodbg_operands(FwdReg)) {
+ if (Op.getSubReg() == FwdSubReg)
+ Uses.push_back(&Op);
+ }
+ RegSeqWithOpNos[&OrigMI].push_back(OpNo);
+ continue;
+ }
+
if (TII->isVOP3(OrigOp)) {
if (!TII->hasVALU32BitEncoding(OrigOp)) {
LLVM_DEBUG(dbgs() << " failed: VOP3 hasn't e32 equivalent\n");
@@ -447,14 +503,14 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
}
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
- if (&Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
+ if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
OldOpndValue, CombBCZ)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
} else if (OrigMI.isCommutable() &&
- &Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+ Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
auto *BB = OrigMI.getParent();
auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
BB->insert(OrigMI, NewMI);
@@ -475,9 +531,22 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
OrigMIs.push_back(&OrigMI);
}
+ Rollback |= !Uses.empty();
+
for (auto *MI : *(Rollback? &DPPMIs : &OrigMIs))
MI->eraseFromParent();
+ if (!Rollback) {
+ for (auto &S : RegSeqWithOpNos) {
+ if (MRI->use_nodbg_empty(S.first->getOperand(0).getReg())) {
+ S.first->eraseFromParent();
+ continue;
+ }
+ while (!S.second.empty())
+ S.first->getOperand(S.second.pop_back_val()).setIsUndef(true);
+ }
+ }
+
return !Rollback;
}
@@ -498,6 +567,13 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
if (MI.getOpcode() == AMDGPU::V_MOV_B32_dpp && combineDPPMov(MI)) {
Changed = true;
++NumDPPMovsCombined;
+ } else if (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO) {
+ auto Split = TII->expandMovDPP64(MI);
+ for (auto M : { Split.first, Split.second }) {
+ if (combineDPPMov(*M))
+ ++NumDPPMovsCombined;
+ }
+ Changed = true;
}
}
}
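
The combineDPPMov changes above replace the single pass over MRI->use_nodbg_operands(DPPMovReg) with an explicit worklist, so uses reached through a REG_SEQUENCE (matched by subregister index) are visited as well, while RegSeqWithOpNos records which REG_SEQUENCE operands to mark undef or erase on success. A minimal standalone sketch of that worklist idiom, using a toy use graph instead of MachineRegisterInfo:

#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Toy "register -> direct users" graph standing in for MRI use lists.
using UseGraph = std::map<std::string, std::vector<std::string>>;

// Collect all users of Root, forwarding through nodes that act like
// REG_SEQUENCE (here: any name starting with "seq").
static std::vector<std::string> collectUsers(const UseGraph &G,
                                             const std::string &Root) {
  std::vector<std::string> Work{Root}, Found;
  while (!Work.empty()) {
    std::string Reg = Work.back();
    Work.pop_back();
    auto It = G.find(Reg);
    if (It == G.end())
      continue;
    for (const std::string &User : It->second) {
      if (User.rfind("seq", 0) == 0)
        Work.push_back(User); // forward through the sequence's result
      else
        Found.push_back(User);
    }
  }
  return Found;
}

int main() {
  UseGraph G = {{"dpp_mov", {"v_add", "seq0"}}, {"seq0", {"v_mul"}}};
  for (const std::string &U : collectUsers(G, "dpp_mov"))
    std::printf("user: %s\n", U.c_str());
  return 0;
}
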
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 885239e2faed..9528aee4c50e 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -726,7 +726,7 @@ int GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
if (!TRI->isVGPR(MRI, Def.getReg()))
return WaitStatesNeeded;
- unsigned Reg = Def.getReg();
+ Register Reg = Def.getReg();
auto IsHazardFn = [this, Reg, TRI] (MachineInstr *MI) {
int DataIdx = createsVALUHazard(*MI);
return DataIdx >= 0 &&
@@ -792,7 +792,7 @@ int GCNHazardRecognizer::checkRWLaneHazards(MachineInstr *RWLane) {
if (!LaneSelectOp->isReg() || !TRI->isSGPRReg(MRI, LaneSelectOp->getReg()))
return 0;
- unsigned LaneSelectReg = LaneSelectOp->getReg();
+ Register LaneSelectReg = LaneSelectOp->getReg();
auto IsHazardFn = [TII] (MachineInstr *MI) {
return TII->isVALU(*MI);
};
@@ -891,7 +891,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
// Use V_MOB_B32 v?, v?. Register must be alive so use src0 of V_PERMLANE*
// which is always a VGPR and available.
auto *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
- unsigned Reg = Src0->getReg();
+ Register Reg = Src0->getReg();
bool IsUndef = Src0->isUndef();
BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32))
@@ -952,6 +952,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
unsigned SDSTName;
switch (MI->getOpcode()) {
case AMDGPU::V_READLANE_B32:
+ case AMDGPU::V_READLANE_B32_gfx10:
case AMDGPU::V_READFIRSTLANE_B32:
SDSTName = AMDGPU::OpName::vdst;
break;
@@ -976,7 +977,7 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) {
if (!SDST)
return false;
- const unsigned SDSTReg = SDST->getReg();
+ const Register SDSTReg = SDST->getReg();
auto IsHazardFn = [SDSTReg, TRI] (MachineInstr *I) {
return SIInstrInfo::isSMRD(*I) && I->readsRegister(SDSTReg, TRI);
};
@@ -1251,14 +1252,14 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
const int MFMA16x16WritesAGPRAccVgprWriteWaitStates = 7;
const int MFMA32x32WritesAGPRAccVgprWriteWaitStates = 15;
const int MaxWaitStates = 18;
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
unsigned HazardDefLatency = 0;
auto IsOverlappedMFMAFn = [Reg, &IsMFMAFn, &HazardDefLatency, this]
(MachineInstr *MI) {
if (!IsMFMAFn(MI))
return false;
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
if (DstReg == Reg)
return false;
HazardDefLatency = std::max(HazardDefLatency,
@@ -1304,7 +1305,7 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
auto IsAccVgprWriteFn = [Reg, this] (MachineInstr *MI) {
if (MI->getOpcode() != AMDGPU::V_ACCVGPR_WRITE_B32)
return false;
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
return TRI.regsOverlap(Reg, DstReg);
};
@@ -1330,14 +1331,14 @@ int GCNHazardRecognizer::checkMAIHazards(MachineInstr *MI) {
const int MFMA16x16ReadSrcCAccVgprWriteWaitStates = 5;
const int MFMA32x32ReadSrcCAccVgprWriteWaitStates = 13;
const int MaxWaitStates = 13;
- unsigned DstReg = MI->getOperand(0).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
unsigned HazardDefLatency = 0;
auto IsSrcCMFMAFn = [DstReg, &IsMFMAFn, &HazardDefLatency, this]
(MachineInstr *MI) {
if (!IsMFMAFn(MI))
return false;
- unsigned Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
+ Register Reg = TII.getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg();
HazardDefLatency = std::max(HazardDefLatency,
TSchedModel.computeInstrLatency(MI));
return TRI.regsOverlap(Reg, DstReg);
@@ -1376,7 +1377,7 @@ int GCNHazardRecognizer::checkMAILdStHazards(MachineInstr *MI) {
if (!Op.isReg() || !TRI.isVGPR(MF.getRegInfo(), Op.getReg()))
continue;
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
const int AccVgprReadLdStWaitStates = 2;
const int VALUWriteAccVgprReadLdStDepVALUWaitStates = 1;
diff --git a/lib/Target/AMDGPU/GCNILPSched.cpp b/lib/Target/AMDGPU/GCNILPSched.cpp
index 1eb617640c32..39072af7d871 100644
--- a/lib/Target/AMDGPU/GCNILPSched.cpp
+++ b/lib/Target/AMDGPU/GCNILPSched.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Support/Debug.h"
using namespace llvm;
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 3525174223bd..90ab6a14ce20 100644
--- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -237,7 +237,7 @@ public:
GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
StrategyKind S)
- : BaseClass(C, llvm::make_unique<SchedStrategyStub>())
+ : BaseClass(C, std::make_unique<SchedStrategyStub>())
, Context(C)
, Strategy(S)
, UPTracker(*LIS) {
diff --git a/lib/Target/AMDGPU/GCNNSAReassign.cpp b/lib/Target/AMDGPU/GCNNSAReassign.cpp
index 51c4c99cfb18..36a8f74150f5 100644
--- a/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -173,11 +173,11 @@ GCNNSAReassign::CheckNSA(const MachineInstr &MI, bool Fast) const {
bool NSA = false;
for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
const MachineOperand &Op = MI.getOperand(VAddr0Idx + I);
- unsigned Reg = Op.getReg();
- if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+ Register Reg = Op.getReg();
+ if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
return NSA_Status::FIXED;
- unsigned PhysReg = VRM->getPhys(Reg);
+ Register PhysReg = VRM->getPhys(Reg);
if (!Fast) {
if (!PhysReg)
@@ -276,7 +276,7 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
SlotIndex MinInd, MaxInd;
for (unsigned I = 0; I < Info->VAddrDwords; ++I) {
const MachineOperand &Op = MI->getOperand(VAddr0Idx + I);
- unsigned Reg = Op.getReg();
+ Register Reg = Op.getReg();
LiveInterval *LI = &LIS->getInterval(Reg);
if (llvm::find(Intervals, LI) != Intervals.end()) {
// Same register used, unable to make sequential
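
GCNNSAReassign (and several other files in this series) switch operand register variables from unsigned to the llvm::Register value type, whose static predicates replace the old TargetRegisterInfo::isPhysicalRegister / isVirtualRegister helpers. A small illustration of that API, assuming an LLVM development tree is available to compile and link against (this snippet is not part of the patch):

#include "llvm/CodeGen/Register.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  Register Virt = Register::index2VirtReg(0); // first virtual register number
  Register Phys(1);                           // some physical register number
  outs() << "virt is virtual: " << Register::isVirtualRegister(Virt) << '\n';
  outs() << "phys is physical: " << Register::isPhysicalRegister(Phys) << '\n';
  return 0;
}
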
diff --git a/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
index f0d47eaa4ed1..2927d4eb745a 100644
--- a/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ b/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -230,7 +230,7 @@ private:
public:
Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
return Printable([Reg, SubReg, this](raw_ostream &OS) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
OS << llvm::printReg(Reg, TRI);
return;
}
@@ -275,7 +275,7 @@ char GCNRegBankReassign::ID = 0;
char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
- assert (TargetRegisterInfo::isPhysicalRegister(Reg));
+ assert(Register::isPhysicalRegister(Reg));
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
unsigned Size = TRI->getRegSizeInBits(*RC);
@@ -293,7 +293,7 @@ unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
int Bank) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
if (!VRM->isAssignedReg(Reg))
return 0;
@@ -364,7 +364,7 @@ unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
if (!Op.isReg() || Op.isUndef())
continue;
- unsigned R = Op.getReg();
+ Register R = Op.getReg();
if (TRI->hasAGPRs(TRI->getRegClassForReg(*MRI, R)))
continue;
@@ -420,12 +420,12 @@ unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
}
bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
- if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+ if (Register::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
return false;
const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
- unsigned PhysReg = VRM->getPhys(Reg);
+ Register PhysReg = VRM->getPhys(Reg);
if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
return false;
@@ -654,7 +654,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
}
std::sort(BankStalls.begin(), BankStalls.end());
- unsigned OrigReg = VRM->getPhys(C.Reg);
+ Register OrigReg = VRM->getPhys(C.Reg);
LRM->unassign(LI);
while (!BankStalls.empty()) {
BankStall BS = BankStalls.pop_back_val();
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
index 39460fbd8a84..d593204cba05 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -40,7 +40,7 @@ void llvm::printLivesAt(SlotIndex SI,
<< *LIS.getInstructionFromIndex(SI);
unsigned Num = 0;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ const unsigned Reg = Register::index2VirtReg(I);
if (!LIS.hasInterval(Reg))
continue;
const auto &LI = LIS.getInterval(Reg);
@@ -84,7 +84,7 @@ bool llvm::isEqual(const GCNRPTracker::LiveRegSet &S1,
unsigned GCNRegPressure::getRegKind(unsigned Reg,
const MachineRegisterInfo &MRI) {
- assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(Register::isVirtualRegister(Reg));
const auto RC = MRI.getRegClass(Reg);
auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
return STI->isSGPRClass(RC) ?
@@ -183,7 +183,8 @@ bool GCNRegPressure::less(const GCNSubtarget &ST,
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
- OS << "VGPRs: " << getVGPRNum();
+ OS << "VGPRs: " << Value[VGPR32] << ' ';
+ OS << "AGPRs: " << Value[AGPR32];
if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')';
OS << ", SGPRs: " << getSGPRNum();
if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')';
@@ -196,8 +197,7 @@ void GCNRegPressure::print(raw_ostream &OS, const GCNSubtarget *ST) const {
static LaneBitmask getDefRegMask(const MachineOperand &MO,
const MachineRegisterInfo &MRI) {
- assert(MO.isDef() && MO.isReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+ assert(MO.isDef() && MO.isReg() && Register::isVirtualRegister(MO.getReg()));
// We don't rely on read-undef flag because in case of tentative schedule
// tracking it isn't set correctly yet. This works correctly however since
@@ -210,8 +210,7 @@ static LaneBitmask getDefRegMask(const MachineOperand &MO,
static LaneBitmask getUsedRegMask(const MachineOperand &MO,
const MachineRegisterInfo &MRI,
const LiveIntervals &LIS) {
- assert(MO.isUse() && MO.isReg() &&
- TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+ assert(MO.isUse() && MO.isReg() && Register::isVirtualRegister(MO.getReg()));
if (auto SubReg = MO.getSubReg())
return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
@@ -232,7 +231,7 @@ collectVirtualRegUses(const MachineInstr &MI, const LiveIntervals &LIS,
const MachineRegisterInfo &MRI) {
SmallVector<RegisterMaskPair, 8> Res;
for (const auto &MO : MI.operands()) {
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()))
continue;
if (!MO.isUse() || !MO.readsReg())
continue;
@@ -278,7 +277,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI,
const MachineRegisterInfo &MRI) {
GCNRPTracker::LiveRegSet LiveRegs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- auto Reg = TargetRegisterInfo::index2VirtReg(I);
+ auto Reg = Register::index2VirtReg(I);
if (!LIS.hasInterval(Reg))
continue;
auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI);
@@ -329,8 +328,7 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
MaxPressure = max(AtMIPressure, MaxPressure);
for (const auto &MO : MI.defs()) {
- if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()) ||
- MO.isDead())
+ if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()) || MO.isDead())
continue;
auto Reg = MO.getReg();
@@ -408,8 +406,8 @@ void GCNDownwardRPTracker::advanceToNext() {
for (const auto &MO : LastTrackedMI->defs()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MO.getReg();
+ if (!Register::isVirtualRegister(Reg))
continue;
auto &LiveMask = LiveRegs[Reg];
auto PrevMask = LiveMask;
@@ -500,7 +498,7 @@ void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs,
const MachineRegisterInfo &MRI) {
const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ unsigned Reg = Register::index2VirtReg(I);
auto It = LiveRegs.find(Reg);
if (It != LiveRegs.end() && It->second.any())
OS << ' ' << printVRegOrUnit(Reg, TRI) << ':'
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
index e4894418b943..5862cdb04166 100644
--- a/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -214,7 +214,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap;
SmallVector<SlotIndex, 32> LiveIdxs, SRLiveIdxs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
- auto Reg = TargetRegisterInfo::index2VirtReg(I);
+ auto Reg = Register::index2VirtReg(I);
if (!LIS.hasInterval(Reg))
continue;
auto &LI = LIS.getInterval(Reg);
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 4ea990ae490e..973491a70d3c 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -71,8 +71,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// the tracker, so we need to pass those function a non-const copy.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
- std::vector<unsigned> Pressure;
- std::vector<unsigned> MaxPressure;
+ Pressure.clear();
+ MaxPressure.clear();
if (AtTop)
TempTracker.getDownwardPressure(SU->getInstr(), Pressure, MaxPressure);
@@ -103,10 +103,10 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// the analysis to look through dependencies to find the path with the least
// register pressure.
- // We only need to update the RPDelata for instructions that increase
- // register pressure. Instructions that decrease or keep reg pressure
- // the same will be marked as RegExcess in tryCandidate() when they
- // are compared with instructions that increase the register pressure.
+ // We only need to update the RPDelta for instructions that increase register
+ // pressure. Instructions that decrease or keep reg pressure the same will be
+ // marked as RegExcess in tryCandidate() when they are compared with
+ // instructions that increase the register pressure.
if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
@@ -160,6 +160,7 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
if (TryCand.ResDelta == SchedResourceDelta())
TryCand.initResourceDelta(Zone.DAG, SchedModel);
Cand.setBest(TryCand);
+ LLVM_DEBUG(traceCandidate(Cand));
}
}
}
@@ -195,6 +196,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
assert(BotCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(BotCand));
+#ifndef NDEBUG
+ if (VerifyScheduling) {
+ SchedCandidate TCand;
+ TCand.reset(CandPolicy());
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
+ assert(TCand.SU == BotCand.SU &&
+ "Last pick result should correspond to re-picking right now");
+ }
+#endif
}
// Check if the top Q has a better candidate.
@@ -206,6 +216,15 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
assert(TopCand.Reason != NoCand && "failed to find the first candidate");
} else {
LLVM_DEBUG(traceCandidate(TopCand));
+#ifndef NDEBUG
+ if (VerifyScheduling) {
+ SchedCandidate TCand;
+ TCand.reset(CandPolicy());
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
+ assert(TCand.SU == TopCand.SU &&
+ "Last pick result should correspond to re-picking right now");
+ }
+#endif
}
// Pick best from BotCand and TopCand.
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index eaf3dee9ba5d..dd687a930c79 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -40,6 +40,9 @@ class GCNMaxOccupancySchedStrategy final : public GenericScheduler {
const SIRegisterInfo *SRI,
unsigned SGPRPressure, unsigned VGPRPressure);
+ std::vector<unsigned> Pressure;
+ std::vector<unsigned> MaxPressure;
+
unsigned SGPRExcessLimit;
unsigned VGPRExcessLimit;
unsigned SGPRCriticalLimit;
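
The GCNSchedStrategy hunks above stop creating std::vector<unsigned> Pressure/MaxPressure locals inside initCandidate and instead clear and reuse vectors that the header change adds as class members, so the per-candidate pressure queries no longer reallocate. A minimal standalone sketch of the same reuse pattern (toy class, no LLVM types):

#include <cstdio>
#include <vector>

class Scorer {
  // Reused scratch buffer: cleared per call, capacity kept across calls.
  std::vector<unsigned> Pressure;

public:
  unsigned score(unsigned N) {
    Pressure.clear();              // size 0, capacity preserved
    for (unsigned I = 0; I < N; ++I)
      Pressure.push_back(I);       // usually no reallocation after warm-up
    unsigned Sum = 0;
    for (unsigned V : Pressure)
      Sum += V;
    return Sum;
  }
};

int main() {
  Scorer S;
  for (unsigned N : {4u, 8u, 8u})
    std::printf("score(%u) = %u\n", N, S.score(N));
  return 0;
}
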
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 57c0ba26cc3a..1f94ab799122 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -109,7 +109,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
MCContext *Ctx) {
int64_t SignedValue = static_cast<int64_t>(Value);
- switch (static_cast<unsigned>(Fixup.getKind())) {
+ switch (Fixup.getTargetKind()) {
case AMDGPU::fixup_si_sopp_br: {
int64_t BrImm = (SignedValue - 4) / 4;
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 6549a8d7d592..d352219a7a98 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -87,7 +87,7 @@ std::unique_ptr<MCObjectTargetWriter>
llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI,
bool HasRelocationAddend,
uint8_t ABIVersion) {
- return llvm::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
+ return std::make_unique<AMDGPUELFObjectWriter>(Is64Bit, OSABI,
HasRelocationAddend,
ABIVersion);
}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index 01b53432cbb7..a9888e6ed924 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -196,6 +196,10 @@ void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "slc");
}
+void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+}
+
void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "tfe");
@@ -292,35 +296,7 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
}
#endif
- unsigned AltName = AMDGPU::Reg32;
-
- if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SGPR_64RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg64;
- else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SGPR_128RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg128;
- else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SReg_96RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg96;
- else if (MRI.getRegClass(AMDGPU::VReg_160RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SReg_160RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg160;
- else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SGPR_256RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg256;
- else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SGPR_512RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::AReg_512RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg512;
- else if (MRI.getRegClass(AMDGPU::VReg_1024RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::SReg_1024RegClassID).contains(RegNo) ||
- MRI.getRegClass(AMDGPU::AReg_1024RegClassID).contains(RegNo))
- AltName = AMDGPU::Reg1024;
-
- O << getRegisterName(RegNo, AltName);
+ O << getRegisterName(RegNo);
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
@@ -623,9 +599,11 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
case AMDGPU::V_ADD_CO_CI_U32_e32_gfx10:
case AMDGPU::V_SUB_CO_CI_U32_e32_gfx10:
case AMDGPU::V_SUBREV_CO_CI_U32_e32_gfx10:
+ case AMDGPU::V_CNDMASK_B32_dpp_gfx10:
case AMDGPU::V_ADD_CO_CI_U32_dpp_gfx10:
case AMDGPU::V_SUB_CO_CI_U32_dpp_gfx10:
case AMDGPU::V_SUBREV_CO_CI_U32_dpp_gfx10:
+ case AMDGPU::V_CNDMASK_B32_dpp8_gfx10:
case AMDGPU::V_ADD_CO_CI_U32_dpp8_gfx10:
case AMDGPU::V_SUB_CO_CI_U32_dpp8_gfx10:
case AMDGPU::V_SUBREV_CO_CI_U32_dpp8_gfx10:
@@ -689,6 +667,7 @@ void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
switch (MI->getOpcode()) {
default: break;
+ case AMDGPU::V_CNDMASK_B32_sdwa_gfx10:
case AMDGPU::V_ADD_CO_CI_U32_sdwa_gfx10:
case AMDGPU::V_SUB_CO_CI_U32_sdwa_gfx10:
case AMDGPU::V_SUBREV_CO_CI_U32_sdwa_gfx10:
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index b544d1ef3605..66b70831ff9e 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -12,7 +12,6 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUINSTPRINTER_H
-#include "AMDGPUMCTargetDesc.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -26,8 +25,7 @@ public:
//Autogenerated by tblgen
void printInstruction(const MCInst *MI, const MCSubtargetInfo &STI,
raw_ostream &O);
- static const char *getRegisterName(unsigned RegNo,
- unsigned AltIdx = AMDGPU::NoRegAltName);
+ static const char *getRegisterName(unsigned RegNo);
void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
const MCSubtargetInfo &STI) override;
@@ -74,6 +72,8 @@ private:
raw_ostream &O);
void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printSWZ(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 8f11433476f4..c15da8075a34 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -250,7 +250,7 @@ bool AMDGPUTargetAsmStreamer::EmitHSAMetadata(
bool AMDGPUTargetAsmStreamer::EmitCodeEnd() {
const uint32_t Encoded_s_code_end = 0xbf9f0000;
OS << "\t.p2alignl 6, " << Encoded_s_code_end << '\n';
- OS << "\t.fill 32, 4, " << Encoded_s_code_end << '\n';
+ OS << "\t.fill 48, 4, " << Encoded_s_code_end << '\n';
return true;
}
@@ -602,7 +602,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
MCStreamer &OS = getStreamer();
OS.PushSection();
OS.EmitValueToAlignment(64, Encoded_s_code_end, 4);
- for (unsigned I = 0; I < 32; ++I)
+ for (unsigned I = 0; I < 48; ++I)
OS.EmitIntValue(Encoded_s_code_end, 4);
OS.PopSection();
return true;
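
The EmitCodeEnd changes above grow the trailing s_code_end fill from 32 to 48 dwords, emitted after the 64-byte alignment. A small sketch of the resulting padding size, assuming only the 0xbf9f0000 encoding and dword size shown in the hunks:

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t EncodedSCodeEnd = 0xbf9f0000;
  const unsigned DWords = 48, DWordSize = 4;
  std::printf("fill pattern 0x%08x, %u bytes (%u cache lines of 64B)\n",
              EncodedSCodeEnd, DWords * DWordSize, DWords * DWordSize / 64);
  return 0;
}
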
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 4735e6cb2446..f33ad950d5d9 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -26,7 +26,7 @@ def MIMGEncoding : GenericEnum {
// Represent an ISA-level opcode, independent of the encoding and the
// vdata/vaddr size.
-class MIMGBaseOpcode {
+class MIMGBaseOpcode : PredicateControl {
MIMGBaseOpcode BaseOpcode = !cast<MIMGBaseOpcode>(NAME);
bit Store = 0;
bit Atomic = 0;
@@ -291,7 +291,7 @@ multiclass MIMG_NoSampler_Src_Helper <bits<8> op, string asm,
multiclass MIMG_NoSampler <bits<8> op, string asm, bit has_d16, bit mip = 0,
bit isResInfo = 0> {
- def "" : MIMGBaseOpcode, PredicateControl {
+ def "" : MIMGBaseOpcode {
let Coordinates = !if(isResInfo, 0, 1);
let LodOrClampOrMip = mip;
let HasD16 = has_d16;
diff --git a/lib/Target/AMDGPU/R600AsmPrinter.cpp b/lib/Target/AMDGPU/R600AsmPrinter.cpp
index 3fb18862fca8..b29cd75f75cf 100644
--- a/lib/Target/AMDGPU/R600AsmPrinter.cpp
+++ b/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -104,7 +104,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Functions needs to be cacheline (256B) aligned.
- MF.ensureAlignment(8);
+ MF.ensureAlignment(Align(256));
SetupMachineFunction(MF);
diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index 8098b81d1ea2..e4160ac11c86 100644
--- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -303,7 +303,7 @@ private:
if (!MO.isReg())
continue;
if (MO.isDef()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (R600::R600_Reg128RegClass.contains(Reg))
DstMI = Reg;
else
@@ -312,7 +312,7 @@ private:
&R600::R600_Reg128RegClass);
}
if (MO.isUse()) {
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (R600::R600_Reg128RegClass.contains(Reg))
SrcMI = Reg;
else
diff --git a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index c6e8a060d8a0..fd75c41040e1 100644
--- a/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -135,7 +135,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
const R600RegisterInfo &TRI = TII->getRegisterInfo();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
for (unsigned Chan = 0; Chan < 4; ++Chan) {
@@ -155,12 +155,12 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
unsigned Opcode = BMI->getOpcode();
// While not strictly necessary from hw point of view, we force
// all src operands of a dot4 inst to belong to the same slot.
- unsigned Src0 = BMI->getOperand(
- TII->getOperandIdx(Opcode, R600::OpName::src0))
- .getReg();
- unsigned Src1 = BMI->getOperand(
- TII->getOperandIdx(Opcode, R600::OpName::src1))
- .getReg();
+ Register Src0 =
+ BMI->getOperand(TII->getOperandIdx(Opcode, R600::OpName::src0))
+ .getReg();
+ Register Src1 =
+ BMI->getOperand(TII->getOperandIdx(Opcode, R600::OpName::src1))
+ .getReg();
(void) Src0;
(void) Src1;
if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
@@ -205,10 +205,10 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// T0_Z = CUBE T1_X, T1_Z
// T0_W = CUBE T1_Y, T1_Z
for (unsigned Chan = 0; Chan < 4; Chan++) {
- unsigned DstReg = MI.getOperand(
- TII->getOperandIdx(MI, R600::OpName::dst)).getReg();
- unsigned Src0 = MI.getOperand(
- TII->getOperandIdx(MI, R600::OpName::src0)).getReg();
+ Register DstReg =
+ MI.getOperand(TII->getOperandIdx(MI, R600::OpName::dst)).getReg();
+ Register Src0 =
+ MI.getOperand(TII->getOperandIdx(MI, R600::OpName::src0)).getReg();
unsigned Src1 = 0;
// Determine the correct source registers
diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h
index 950e238f4979..283e4d1935ea 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/lib/Target/AMDGPU/R600FrameLowering.h
@@ -15,9 +15,9 @@ namespace llvm {
class R600FrameLowering : public AMDGPUFrameLowering {
public:
- R600FrameLowering(StackDirection D, unsigned StackAl, int LAO,
- unsigned TransAl = 1) :
- AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+ R600FrameLowering(StackDirection D, Align StackAl, int LAO,
+ Align TransAl = Align::None())
+ : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~R600FrameLowering() override;
void emitPrologue(MachineFunction &MF,
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index f80a53ba1dc6..659458b0b752 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -41,6 +41,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
+#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>
#include <iterator>
@@ -334,8 +335,8 @@ R600TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
case R600::MASK_WRITE: {
- unsigned maskedRegister = MI.getOperand(0).getReg();
- assert(TargetRegisterInfo::isVirtualRegister(maskedRegister));
+ Register maskedRegister = MI.getOperand(0).getReg();
+ assert(Register::isVirtualRegister(maskedRegister));
MachineInstr * defInstr = MRI.getVRegDef(maskedRegister);
TII->addFlag(*defInstr, 0, MO_FLAG_MASK);
break;
@@ -782,7 +783,7 @@ SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
return TrigVal;
// On R600 hw, COS/SIN input must be between -Pi and Pi.
return DAG.getNode(ISD::FMUL, DL, VT, TrigVal,
- DAG.getConstantFP(3.14159265359, DL, MVT::f32));
+ DAG.getConstantFP(numbers::pif, DL, MVT::f32));
}
SDValue R600TargetLowering::LowerSHLParts(SDValue Op, SelectionDAG &DAG) const {
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index d9e839fe2035..04a5e93f6213 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -97,8 +97,8 @@ bool R600InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI) const {
for (MachineInstr::const_mop_iterator I = MBBI->operands_begin(),
E = MBBI->operands_end(); I != E; ++I) {
- if (I->isReg() && !TargetRegisterInfo::isVirtualRegister(I->getReg()) &&
- I->isUse() && RI.isPhysRegLiveAcrossClauses(I->getReg()))
+ if (I->isReg() && !Register::isVirtualRegister(I->getReg()) && I->isUse() &&
+ RI.isPhysRegLiveAcrossClauses(I->getReg()))
return false;
}
return true;
@@ -242,8 +242,7 @@ bool R600InstrInfo::readsLDSSrcReg(const MachineInstr &MI) const {
for (MachineInstr::const_mop_iterator I = MI.operands_begin(),
E = MI.operands_end();
I != E; ++I) {
- if (!I->isReg() || !I->isUse() ||
- TargetRegisterInfo::isVirtualRegister(I->getReg()))
+ if (!I->isReg() || !I->isUse() || Register::isVirtualRegister(I->getReg()))
continue;
if (R600::R600_LDS_SRC_REGRegClass.contains(I->getReg()))
@@ -294,7 +293,7 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
for (unsigned j = 0; j < 8; j++) {
MachineOperand &MO =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][0]));
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
@@ -317,7 +316,7 @@ R600InstrInfo::getSrcs(MachineInstr &MI) const {
if (SrcIdx < 0)
break;
MachineOperand &MO = MI.getOperand(SrcIdx);
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (Reg == R600::ALU_CONST) {
MachineOperand &Sel =
MI.getOperand(getOperandIdx(MI.getOpcode(), OpTable[j][1]));
@@ -348,7 +347,7 @@ R600InstrInfo::ExtractSrcs(MachineInstr &MI,
unsigned i = 0;
for (const auto &Src : getSrcs(MI)) {
++i;
- unsigned Reg = Src.first->getReg();
+ Register Reg = Src.first->getReg();
int Index = RI.getEncodingValue(Reg) & 0xff;
if (Reg == R600::OQAP) {
Result.push_back(std::make_pair(Index, 0U));
@@ -865,7 +864,7 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
if (idx < 0)
return false;
- unsigned Reg = MI.getOperand(idx).getReg();
+ Register Reg = MI.getOperand(idx).getReg();
switch (Reg) {
default: return false;
case R600::PRED_SEL_ONE:
@@ -1038,7 +1037,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
- unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+ Register OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
if (OffsetReg == R600::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, MI.getOperand(DstOpIdx).getReg(),
getIndirectAddrRegClass()->getRegister(Address));
@@ -1052,7 +1051,7 @@ bool R600InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
unsigned RegIndex = MI.getOperand(RegOpIdx).getImm();
unsigned Channel = MI.getOperand(ChanOpIdx).getImm();
unsigned Address = calculateIndirectAddress(RegIndex, Channel);
- unsigned OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
+ Register OffsetReg = MI.getOperand(OffsetOpIdx).getReg();
if (OffsetReg == R600::INDIRECT_BASE_ADDR) {
buildMovInstr(MBB, MI, getIndirectAddrRegClass()->getRegister(Address),
MI.getOperand(ValOpIdx).getReg());
@@ -1193,8 +1192,7 @@ int R600InstrInfo::getIndirectIndexBegin(const MachineFunction &MF) const {
const TargetRegisterClass *IndirectRC = getIndirectAddrRegClass();
for (std::pair<unsigned, unsigned> LI : MRI.liveins()) {
unsigned Reg = LI.first;
- if (TargetRegisterInfo::isVirtualRegister(Reg) ||
- !IndirectRC->contains(Reg))
+ if (Register::isVirtualRegister(Reg) || !IndirectRC->contains(Reg))
continue;
unsigned RegIndex;
diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp
index 34267a909b5e..7569a2629539 100644
--- a/lib/Target/AMDGPU/R600MachineScheduler.cpp
+++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp
@@ -183,7 +183,7 @@ isPhysicalRegCopy(MachineInstr *MI) {
if (MI->getOpcode() != R600::COPY)
return false;
- return !TargetRegisterInfo::isVirtualRegister(MI->getOperand(1).getReg());
+ return !Register::isVirtualRegister(MI->getOperand(1).getReg());
}
void R600SchedStrategy::releaseTopNode(SUnit *SU) {
@@ -209,7 +209,7 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
bool R600SchedStrategy::regBelongsToClass(unsigned Reg,
const TargetRegisterClass *RC) const {
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!Register::isVirtualRegister(Reg)) {
return RC->contains(Reg);
} else {
return MRI->getRegClass(Reg) == RC;
@@ -270,7 +270,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
}
// Is the result already a member of an X/Y/Z/W class?
- unsigned DestReg = MI->getOperand(0).getReg();
+ Register DestReg = MI->getOperand(0).getReg();
if (regBelongsToClass(DestReg, &R600::R600_TReg32_XRegClass) ||
regBelongsToClass(DestReg, &R600::R600_AddrRegClass))
return AluT_X;
@@ -357,7 +357,7 @@ void R600SchedStrategy::AssignSlot(MachineInstr* MI, unsigned Slot) {
if (DstIndex == -1) {
return;
}
- unsigned DestReg = MI->getOperand(DstIndex).getReg();
+ Register DestReg = MI->getOperand(DstIndex).getReg();
// PressureRegister crashes if an operand is defined and used in the same inst
// and we try to constrain its regclass
for (MachineInstr::mop_iterator It = MI->operands_begin(),
diff --git a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 9f1cb6582b5c..cec7f563f480 100644
--- a/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -58,7 +58,7 @@ using namespace llvm;
static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
assert(MRI.isSSA());
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return false;
const MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
return MI && MI->isImplicitDef();
@@ -197,17 +197,17 @@ unsigned getReassignedChan(
MachineInstr *R600VectorRegMerger::RebuildVector(
RegSeqInfo *RSI, const RegSeqInfo *BaseRSI,
const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const {
- unsigned Reg = RSI->Instr->getOperand(0).getReg();
+ Register Reg = RSI->Instr->getOperand(0).getReg();
MachineBasicBlock::iterator Pos = RSI->Instr;
MachineBasicBlock &MBB = *Pos->getParent();
DebugLoc DL = Pos->getDebugLoc();
- unsigned SrcVec = BaseRSI->Instr->getOperand(0).getReg();
+ Register SrcVec = BaseRSI->Instr->getOperand(0).getReg();
DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
E = RSI->RegToChan.end(); It != E; ++It) {
- unsigned DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
+ Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
unsigned SubReg = (*It).first;
unsigned Swizzle = (*It).second;
unsigned Chan = getReassignedChan(RemapChan, Swizzle);
@@ -350,7 +350,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
MachineInstr &MI = *MII;
if (MI.getOpcode() != R600::REG_SEQUENCE) {
if (TII->get(MI.getOpcode()).TSFlags & R600_InstFlag::TEX_INST) {
- unsigned Reg = MI.getOperand(1).getReg();
+ Register Reg = MI.getOperand(1).getReg();
for (MachineRegisterInfo::def_instr_iterator
It = MRI->def_instr_begin(Reg), E = MRI->def_instr_end();
It != E; ++It) {
@@ -363,7 +363,7 @@ bool R600VectorRegMerger::runOnMachineFunction(MachineFunction &Fn) {
RegSeqInfo RSI(*MRI, &MI);
// Are all uses of MI swizzeable?
- unsigned Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(0).getReg();
if (!areAllUsesSwizzeable(Reg))
continue;
diff --git a/lib/Target/AMDGPU/R600Packetizer.cpp b/lib/Target/AMDGPU/R600Packetizer.cpp
index df200baf11c1..176269f9b68c 100644
--- a/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -90,7 +90,7 @@ private:
if (DstIdx == -1) {
continue;
}
- unsigned Dst = BI->getOperand(DstIdx).getReg();
+ Register Dst = BI->getOperand(DstIdx).getReg();
if (isTrans || TII->isTransOnly(*BI)) {
Result[Dst] = R600::PS;
continue;
@@ -136,7 +136,7 @@ private:
int OperandIdx = TII->getOperandIdx(MI.getOpcode(), Ops[i]);
if (OperandIdx < 0)
continue;
- unsigned Src = MI.getOperand(OperandIdx).getReg();
+ Register Src = MI.getOperand(OperandIdx).getReg();
const DenseMap<unsigned, unsigned>::const_iterator It = PVs.find(Src);
if (It != PVs.end())
MI.getOperand(OperandIdx).setReg(It->second);
diff --git a/lib/Target/AMDGPU/R600RegisterInfo.cpp b/lib/Target/AMDGPU/R600RegisterInfo.cpp
index 685df74490fe..ef12c1d24594 100644
--- a/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -93,7 +93,7 @@ const RegClassWeight &R600RegisterInfo::getRegClassWeight(
}
bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
- assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(!Register::isVirtualRegister(Reg));
switch (Reg) {
case R600::OQAP:
diff --git a/lib/Target/AMDGPU/SIAddIMGInit.cpp b/lib/Target/AMDGPU/SIAddIMGInit.cpp
index f8094e35816c..ee011286b8ff 100644
--- a/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ b/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -129,7 +129,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
continue;
// Create a register for the initialization value.
- unsigned PrevDst =
+ Register PrevDst =
MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
unsigned NewDst = 0; // Final initialized value will be in here
@@ -150,7 +150,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
NewDst =
MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
// Initialize dword
- unsigned SubReg =
+ Register SubReg =
MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
.addImm(0);
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index a0e1ec6ac235..23ef56afc39c 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -99,7 +99,10 @@ enum : uint64_t {
FPAtomic = UINT64_C(1) << 53,
// Is an MFMA instruction.
- IsMAI = UINT64_C(1) << 54
+ IsMAI = UINT64_C(1) << 54,
+
+ // Is a DOT instruction.
+ IsDOT = UINT64_C(1) << 55
};
// v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -444,6 +447,7 @@ namespace DPP {
enum DppCtrl : unsigned {
QUAD_PERM_FIRST = 0,
+ QUAD_PERM_ID = 0xE4, // identity permutation
QUAD_PERM_LAST = 0xFF,
DPP_UNUSED1 = 0x100,
ROW_SHL0 = 0x100,
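For reference, the QUAD_PERM_ID constant added above follows directly from the DPP quad_perm encoding: dpp_ctrl packs four 2-bit source-lane selectors into bits [7:0], and the identity permutation selects lane i for output lane i. A minimal standalone sketch (plain C++, not part of the patch) that derives the value:

    #include <cstdio>

    int main() {
      // quad_perm: bits [2*i+1 : 2*i] pick the source lane for output lane i.
      // The identity permutation maps each output lane to itself.
      unsigned Ctrl = 0;
      for (unsigned Lane = 0; Lane < 4; ++Lane)
        Ctrl |= Lane << (2 * Lane);  // builds 0b11'10'01'00
      std::printf("0x%X\n", Ctrl);   // prints 0xE4
      return 0;
    }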
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 624953963cf4..65286751c12d 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -113,10 +113,16 @@ class SIFixSGPRCopies : public MachineFunctionPass {
public:
static char ID;
+ MachineRegisterInfo *MRI;
+ const SIRegisterInfo *TRI;
+ const SIInstrInfo *TII;
+
SIFixSGPRCopies() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
+ void processPHINode(MachineInstr &MI);
+
StringRef getPassName() const override { return "SI Fix SGPR copies"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -148,7 +154,7 @@ static bool hasVectorOperands(const MachineInstr &MI,
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
if (!MI.getOperand(i).isReg() ||
- !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+ !Register::isVirtualRegister(MI.getOperand(i).getReg()))
continue;
if (TRI->hasVectorRegisters(MRI.getRegClass(MI.getOperand(i).getReg())))
@@ -161,21 +167,19 @@ static std::pair<const TargetRegisterClass *, const TargetRegisterClass *>
getCopyRegClasses(const MachineInstr &Copy,
const SIRegisterInfo &TRI,
const MachineRegisterInfo &MRI) {
- unsigned DstReg = Copy.getOperand(0).getReg();
- unsigned SrcReg = Copy.getOperand(1).getReg();
+ Register DstReg = Copy.getOperand(0).getReg();
+ Register SrcReg = Copy.getOperand(1).getReg();
- const TargetRegisterClass *SrcRC =
- TargetRegisterInfo::isVirtualRegister(SrcReg) ?
- MRI.getRegClass(SrcReg) :
- TRI.getPhysRegClass(SrcReg);
+ const TargetRegisterClass *SrcRC = Register::isVirtualRegister(SrcReg)
+ ? MRI.getRegClass(SrcReg)
+ : TRI.getPhysRegClass(SrcReg);
// We don't really care about the subregister here.
// SrcRC = TRI.getSubRegClass(SrcRC, Copy.getOperand(1).getSubReg());
- const TargetRegisterClass *DstRC =
- TargetRegisterInfo::isVirtualRegister(DstReg) ?
- MRI.getRegClass(DstReg) :
- TRI.getPhysRegClass(DstReg);
+ const TargetRegisterClass *DstRC = Register::isVirtualRegister(DstReg)
+ ? MRI.getRegClass(DstReg)
+ : TRI.getPhysRegClass(DstReg);
return std::make_pair(SrcRC, DstRC);
}
@@ -199,10 +203,10 @@ static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
const SIInstrInfo *TII) {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
auto &Src = MI.getOperand(1);
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = Src.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
- !TargetRegisterInfo::isVirtualRegister(DstReg))
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = Src.getReg();
+ if (!Register::isVirtualRegister(SrcReg) ||
+ !Register::isVirtualRegister(DstReg))
return false;
for (const auto &MO : MRI.reg_nodbg_operands(DstReg)) {
@@ -238,7 +242,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
MachineRegisterInfo &MRI) {
assert(MI.isRegSequence());
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
if (!TRI->isSGPRClass(MRI.getRegClass(DstReg)))
return false;
@@ -250,7 +254,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
return false;
// It is illegal to have vreg inputs to a physreg defining reg_sequence.
- if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
+ if (Register::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
return false;
const TargetRegisterClass *SrcRC, *DstRC;
@@ -281,7 +285,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
bool IsAGPR = TRI->hasAGPRs(DstRC);
for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
- unsigned SrcReg = MI.getOperand(I).getReg();
+ Register SrcReg = MI.getOperand(I).getReg();
unsigned SrcSubReg = MI.getOperand(I).getSubReg();
const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
@@ -291,7 +295,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
SrcRC = TRI->getSubRegClass(SrcRC, SrcSubReg);
const TargetRegisterClass *NewSrcRC = TRI->getEquivalentVGPRClass(SrcRC);
- unsigned TmpReg = MRI.createVirtualRegister(NewSrcRC);
+ Register TmpReg = MRI.createVirtualRegister(NewSrcRC);
BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(AMDGPU::COPY),
TmpReg)
@@ -299,7 +303,7 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
if (IsAGPR) {
const TargetRegisterClass *NewSrcRC = TRI->getEquivalentAGPRClass(SrcRC);
- unsigned TmpAReg = MRI.createVirtualRegister(NewSrcRC);
+ Register TmpAReg = MRI.createVirtualRegister(NewSrcRC);
unsigned Opc = NewSrcRC == &AMDGPU::AGPR_32RegClass ?
AMDGPU::V_ACCVGPR_WRITE_B32 : AMDGPU::COPY;
BuildMI(*MI.getParent(), &MI, MI.getDebugLoc(), TII->get(Opc),
@@ -315,52 +319,6 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
return true;
}
-static bool phiHasVGPROperands(const MachineInstr &PHI,
- const MachineRegisterInfo &MRI,
- const SIRegisterInfo *TRI,
- const SIInstrInfo *TII) {
- for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
- unsigned Reg = PHI.getOperand(i).getReg();
- if (TRI->hasVGPRs(MRI.getRegClass(Reg)))
- return true;
- }
- return false;
-}
-
-static bool phiHasBreakDef(const MachineInstr &PHI,
- const MachineRegisterInfo &MRI,
- SmallSet<unsigned, 8> &Visited) {
- for (unsigned i = 1; i < PHI.getNumOperands(); i += 2) {
- unsigned Reg = PHI.getOperand(i).getReg();
- if (Visited.count(Reg))
- continue;
-
- Visited.insert(Reg);
-
- MachineInstr *DefInstr = MRI.getVRegDef(Reg);
- switch (DefInstr->getOpcode()) {
- default:
- break;
- case AMDGPU::SI_IF_BREAK:
- return true;
- case AMDGPU::PHI:
- if (phiHasBreakDef(*DefInstr, MRI, Visited))
- return true;
- }
- }
- return false;
-}
-
-static bool hasTerminatorThatModifiesExec(const MachineBasicBlock &MBB,
- const TargetRegisterInfo &TRI) {
- for (MachineBasicBlock::const_iterator I = MBB.getFirstTerminator(),
- E = MBB.end(); I != E; ++I) {
- if (I->modifiesRegister(AMDGPU::EXEC, &TRI))
- return true;
- }
- return false;
-}
-
static bool isSafeToFoldImmIntoCopy(const MachineInstr *Copy,
const MachineInstr *MoveImm,
const SIInstrInfo *TII,
@@ -422,12 +380,6 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
return false;
}
-static bool predsHasDivergentTerminator(MachineBasicBlock *MBB,
- const TargetRegisterInfo *TRI) {
- return searchPredecessors(MBB, nullptr, [TRI](MachineBasicBlock *MBB) {
- return hasTerminatorThatModifiesExec(*MBB, *TRI); });
-}
-
// Checks if there is potential path From instruction To instruction.
// If CutOff is specified and it sits in between of that path we ignore
// a higher portion of the path and report it is not reachable.
@@ -468,6 +420,7 @@ getFirstNonPrologue(MachineBasicBlock *MBB, const TargetInstrInfo *TII) {
// execution.
static bool hoistAndMergeSGPRInits(unsigned Reg,
const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo *TRI,
MachineDominatorTree &MDT,
const TargetInstrInfo *TII) {
// List of inits by immediate value.
@@ -482,7 +435,7 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
for (auto &MI : MRI.def_instructions(Reg)) {
MachineOperand *Imm = nullptr;
- for (auto &MO: MI.operands()) {
+ for (auto &MO : MI.operands()) {
if ((MO.isReg() && ((MO.isDef() && MO.getReg() != Reg) || !MO.isDef())) ||
(!MO.isImm() && !MO.isReg()) || (MO.isImm() && Imm)) {
Imm = nullptr;
@@ -587,8 +540,44 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
}
}
- for (auto MI : MergedInstrs)
- MI->removeFromParent();
+ // Remove initializations that were merged into another.
+ for (auto &Init : Inits) {
+ auto &Defs = Init.second;
+ auto I = Defs.begin();
+ while (I != Defs.end()) {
+ if (MergedInstrs.count(*I)) {
+ (*I)->eraseFromParent();
+ I = Defs.erase(I);
+ } else
+ ++I;
+ }
+ }
+
+ // Try to schedule SGPR initializations as early as possible in the MBB.
+ for (auto &Init : Inits) {
+ auto &Defs = Init.second;
+ for (auto MI : Defs) {
+ auto MBB = MI->getParent();
+ MachineInstr &BoundaryMI = *getFirstNonPrologue(MBB, TII);
+ MachineBasicBlock::reverse_iterator B(BoundaryMI);
+ // Check if B should actually be a boundary. If not, set the previous
+ // instruction as the boundary instead.
+ if (!TII->isBasicBlockPrologue(*B))
+ B++;
+
+ auto R = std::next(MI->getReverseIterator());
+ const unsigned Threshold = 50;
+ // Search until B or Threshold for a place to insert the initialization.
+ for (unsigned I = 0; R != B && I < Threshold; ++R, ++I)
+ if (R->readsRegister(Reg, TRI) || R->definesRegister(Reg, TRI) ||
+ TII->isSchedulingBoundary(*R, MBB, *MBB->getParent()))
+ break;
+
+ // Move to directly after R.
+ if (&*--R != MI)
+ MBB->splice(*R, MBB, MI);
+ }
+ }
if (Changed)
MRI.clearKillFlags(Reg);
@@ -598,9 +587,9 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const SIInstrInfo *TII = ST.getInstrInfo();
+ MRI = &MF.getRegInfo();
+ TRI = ST.getRegisterInfo();
+ TII = ST.getInstrInfo();
MDT = &getAnalysis<MachineDominatorTree>();
SmallVector<MachineInstr *, 16> Worklist;
@@ -617,22 +606,39 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
continue;
case AMDGPU::COPY:
case AMDGPU::WQM:
+ case AMDGPU::SOFT_WQM:
case AMDGPU::WWM: {
- // If the destination register is a physical register there isn't really
- // much we can do to fix this.
- if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
- continue;
+ Register DstReg = MI.getOperand(0).getReg();
const TargetRegisterClass *SrcRC, *DstRC;
- std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
+ std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, *MRI);
+
+ if (!Register::isVirtualRegister(DstReg)) {
+ // If the destination register is a physical register there isn't
+ // really much we can do to fix this.
+ // Some special instructions use M0 as an input. Some even only use
+ // the first lane. Insert a readfirstlane and hope for the best.
+ if (DstReg == AMDGPU::M0 && TRI->hasVectorRegisters(SrcRC)) {
+ Register TmpReg
+ = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+ BuildMI(MBB, MI, MI.getDebugLoc(),
+ TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
+ .add(MI.getOperand(1));
+ MI.getOperand(1).setReg(TmpReg);
+ }
+
+ continue;
+ }
+
if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
- unsigned SrcReg = MI.getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!Register::isVirtualRegister(SrcReg)) {
TII->moveToVALU(MI, MDT);
break;
}
- MachineInstr *DefMI = MRI.getVRegDef(SrcReg);
+ MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
@@ -651,70 +657,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
}
case AMDGPU::PHI: {
- unsigned Reg = MI.getOperand(0).getReg();
- if (!TRI->isSGPRClass(MRI.getRegClass(Reg)))
- break;
-
- // We don't need to fix the PHI if the common dominator of the
- // two incoming blocks terminates with a uniform branch.
- bool HasVGPROperand = phiHasVGPROperands(MI, MRI, TRI, TII);
- if (MI.getNumExplicitOperands() == 5 && !HasVGPROperand) {
- MachineBasicBlock *MBB0 = MI.getOperand(2).getMBB();
- MachineBasicBlock *MBB1 = MI.getOperand(4).getMBB();
-
- if (!predsHasDivergentTerminator(MBB0, TRI) &&
- !predsHasDivergentTerminator(MBB1, TRI)) {
- LLVM_DEBUG(dbgs()
- << "Not fixing PHI for uniform branch: " << MI << '\n');
- break;
- }
- }
-
- // If a PHI node defines an SGPR and any of its operands are VGPRs,
- // then we need to move it to the VALU.
- //
- // Also, if a PHI node defines an SGPR and has all SGPR operands
- // we must move it to the VALU, because the SGPR operands will
- // all end up being assigned the same register, which means
- // there is a potential for a conflict if different threads take
- // different control flow paths.
- //
- // For Example:
- //
- // sgpr0 = def;
- // ...
- // sgpr1 = def;
- // ...
- // sgpr2 = PHI sgpr0, sgpr1
- // use sgpr2;
- //
- // Will Become:
- //
- // sgpr2 = def;
- // ...
- // sgpr2 = def;
- // ...
- // use sgpr2
- //
- // The one exception to this rule is when one of the operands
- // is defined by a SI_BREAK, SI_IF_BREAK, or SI_ELSE_BREAK
- // instruction. In this case, there we know the program will
- // never enter the second block (the loop) without entering
- // the first block (where the condition is computed), so there
- // is no chance for values to be over-written.
-
- SmallSet<unsigned, 8> Visited;
- if (HasVGPROperand || !phiHasBreakDef(MI, MRI, Visited)) {
- LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
- TII->moveToVALU(MI, MDT);
- }
-
+ processPHINode(MI);
break;
}
case AMDGPU::REG_SEQUENCE:
if (TRI->hasVectorRegisters(TII->getOpRegClass(MI, 0)) ||
!hasVectorOperands(MI, TRI)) {
- foldVGPRCopyIntoRegSequence(MI, TRI, TII, MRI);
+ foldVGPRCopyIntoRegSequence(MI, TRI, TII, *MRI);
continue;
}
@@ -724,9 +673,9 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
break;
case AMDGPU::INSERT_SUBREG: {
const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
- DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
- Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
- Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+ DstRC = MRI->getRegClass(MI.getOperand(0).getReg());
+ Src0RC = MRI->getRegClass(MI.getOperand(1).getReg());
+ Src1RC = MRI->getRegClass(MI.getOperand(2).getReg());
if (TRI->isSGPRClass(DstRC) &&
(TRI->hasVectorRegisters(Src0RC) ||
TRI->hasVectorRegisters(Src1RC))) {
@@ -735,12 +684,159 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
}
break;
}
+ case AMDGPU::V_WRITELANE_B32: {
+ // Some architectures allow more than one constant bus access without
+ // SGPR restriction
+ if (ST.getConstantBusLimit(MI.getOpcode()) != 1)
+ break;
+
+ // Writelane is special in that it can use SGPR and M0 (which would
+ // normally count as using the constant bus twice - but in this case it
+ // is allowed since the lane selector doesn't count as a use of the
+ // constant bus). However, it is still required to abide by the 1 SGPR
+ // rule. Apply a fix here as we might have multiple SGPRs after
+ // legalizing VGPRs to SGPRs
+ int Src0Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+ int Src1Idx =
+ AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
+ MachineOperand &Src0 = MI.getOperand(Src0Idx);
+ MachineOperand &Src1 = MI.getOperand(Src1Idx);
+
+ // Check to see if the instruction violates the 1 SGPR rule
+ if ((Src0.isReg() && TRI->isSGPRReg(*MRI, Src0.getReg()) &&
+ Src0.getReg() != AMDGPU::M0) &&
+ (Src1.isReg() && TRI->isSGPRReg(*MRI, Src1.getReg()) &&
+ Src1.getReg() != AMDGPU::M0)) {
+
+ // Check for trivially easy constant prop into one of the operands.
+ // If this is the case then perform the operation now to resolve the SGPR
+ // issue. If we don't do that here we will always insert a mov to m0
+ // that can't be resolved in a later operand folding pass.
+ bool Resolved = false;
+ for (MachineOperand *MO : {&Src0, &Src1}) {
+ if (Register::isVirtualRegister(MO->getReg())) {
+ MachineInstr *DefMI = MRI->getVRegDef(MO->getReg());
+ if (DefMI && TII->isFoldableCopy(*DefMI)) {
+ const MachineOperand &Def = DefMI->getOperand(0);
+ if (Def.isReg() &&
+ MO->getReg() == Def.getReg() &&
+ MO->getSubReg() == Def.getSubReg()) {
+ const MachineOperand &Copied = DefMI->getOperand(1);
+ if (Copied.isImm() &&
+ TII->isInlineConstant(APInt(64, Copied.getImm(), true))) {
+ MO->ChangeToImmediate(Copied.getImm());
+ Resolved = true;
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ if (!Resolved) {
+ // Haven't managed to resolve by replacing an SGPR with an immediate
+ // Move src1 to be in M0
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(AMDGPU::COPY), AMDGPU::M0)
+ .add(Src1);
+ Src1.ChangeToRegister(AMDGPU::M0, false);
+ }
+ }
+ break;
+ }
}
}
}
if (MF.getTarget().getOptLevel() > CodeGenOpt::None && EnableM0Merge)
- hoistAndMergeSGPRInits(AMDGPU::M0, MRI, *MDT, TII);
+ hoistAndMergeSGPRInits(AMDGPU::M0, *MRI, TRI, *MDT, TII);
return true;
}
+
+void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
+ unsigned numVGPRUses = 0;
+ bool AllAGPRUses = true;
+ SetVector<const MachineInstr *> worklist;
+ SmallSet<const MachineInstr *, 4> Visited;
+ worklist.insert(&MI);
+ Visited.insert(&MI);
+ while (!worklist.empty()) {
+ const MachineInstr *Instr = worklist.pop_back_val();
+ unsigned Reg = Instr->getOperand(0).getReg();
+ for (const auto &Use : MRI->use_operands(Reg)) {
+ const MachineInstr *UseMI = Use.getParent();
+ AllAGPRUses &= (UseMI->isCopy() &&
+ TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg())) ||
+ TRI->isAGPR(*MRI, Use.getReg());
+ if (UseMI->isCopy() || UseMI->isRegSequence()) {
+ if (UseMI->isCopy() &&
+ UseMI->getOperand(0).getReg().isPhysical() &&
+ !TRI->isSGPRReg(*MRI, UseMI->getOperand(0).getReg())) {
+ numVGPRUses++;
+ }
+ if (Visited.insert(UseMI).second)
+ worklist.insert(UseMI);
+
+ continue;
+ }
+
+ if (UseMI->isPHI()) {
+ const TargetRegisterClass *UseRC = MRI->getRegClass(Use.getReg());
+ if (!TRI->isSGPRReg(*MRI, Use.getReg()) &&
+ UseRC != &AMDGPU::VReg_1RegClass)
+ numVGPRUses++;
+ continue;
+ }
+
+ const TargetRegisterClass *OpRC =
+ TII->getOpRegClass(*UseMI, UseMI->getOperandNo(&Use));
+ if (!TRI->isSGPRClass(OpRC) && OpRC != &AMDGPU::VS_32RegClass &&
+ OpRC != &AMDGPU::VS_64RegClass) {
+ numVGPRUses++;
+ }
+ }
+ }
+
+ Register PHIRes = MI.getOperand(0).getReg();
+ const TargetRegisterClass *RC0 = MRI->getRegClass(PHIRes);
+ if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
+ LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
+ MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
+ }
+
+ bool hasVGPRInput = false;
+ for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
+ unsigned InputReg = MI.getOperand(i).getReg();
+ MachineInstr *Def = MRI->getVRegDef(InputReg);
+ if (TRI->isVectorRegister(*MRI, InputReg)) {
+ if (Def->isCopy()) {
+ unsigned SrcReg = Def->getOperand(1).getReg();
+ const TargetRegisterClass *RC =
+ TRI->getRegClassForReg(*MRI, SrcReg);
+ if (TRI->isSGPRClass(RC))
+ continue;
+ }
+ hasVGPRInput = true;
+ break;
+ }
+ else if (Def->isCopy() &&
+ TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
+ hasVGPRInput = true;
+ break;
+ }
+ }
+
+ if ((!TRI->isVectorRegister(*MRI, PHIRes) &&
+ RC0 != &AMDGPU::VReg_1RegClass) &&
+ (hasVGPRInput || numVGPRUses > 1)) {
+ LLVM_DEBUG(dbgs() << "Fixing PHI: " << MI);
+ TII->moveToVALU(MI);
+ }
+ else {
+ LLVM_DEBUG(dbgs() << "Legalizing PHI: " << MI);
+ TII->legalizeOperands(MI, MDT);
+ }
+
+}
diff --git a/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
index 5b834c8de13a..a0119297b112 100644
--- a/lib/Target/AMDGPU/SIFixupVectorISel.cpp
+++ b/lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -91,8 +91,7 @@ static bool findSRegBaseAndIndex(MachineOperand *Op,
Worklist.push_back(Op);
while (!Worklist.empty()) {
MachineOperand *WOp = Worklist.pop_back_val();
- if (!WOp->isReg() ||
- !TargetRegisterInfo::isVirtualRegister(WOp->getReg()))
+ if (!WOp->isReg() || !Register::isVirtualRegister(WOp->getReg()))
continue;
MachineInstr *DefInst = MRI.getUniqueVRegDef(WOp->getReg());
switch (DefInst->getOpcode()) {
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 74d77d328019..4eac03168760 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -142,16 +142,20 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
switch (Opc) {
case AMDGPU::V_MAC_F32_e64:
case AMDGPU::V_MAC_F16_e64:
- case AMDGPU::V_FMAC_F32_e64: {
+ case AMDGPU::V_FMAC_F32_e64:
+ case AMDGPU::V_FMAC_F16_e64: {
// Special case for mac. Since this is replaced with mad when folded into
// src2, we need to check the legality for the final instruction.
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (static_cast<int>(OpNo) == Src2Idx) {
- bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
- bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64;
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64;
unsigned Opc = IsFMA ?
- AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
+ (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
const MCInstrDesc &MadDesc = TII->get(Opc);
return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
}
@@ -235,9 +239,11 @@ static bool updateOperand(FoldCandidate &Fold,
if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();
- auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
- if (Liveness != MachineBasicBlock::LQR_Dead)
+ auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16);
+ if (Liveness != MachineBasicBlock::LQR_Dead) {
+ LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
return false;
+ }
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
int Op32 = Fold.getShrinkOpcode();
@@ -248,7 +254,7 @@ static bool updateOperand(FoldCandidate &Fold,
bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
- unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
+ Register NewReg0 = MRI.createVirtualRegister(Dst0RC);
MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
@@ -314,12 +320,15 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
- Opc == AMDGPU::V_FMAC_F32_e64) &&
+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
(int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
- bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
- bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+ bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F16_e64;
+ bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
+ Opc == AMDGPU::V_FMAC_F32_e64;
unsigned NewOpc = IsFMA ?
- AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+ (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
+ (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
// to fold the operand.
@@ -435,7 +444,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
return false;
- if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
+ if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
+ TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
return true;
}
@@ -443,8 +453,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
if (!OpToFold.isReg())
return false;
- unsigned UseReg = OpToFold.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(UseReg))
+ Register UseReg = OpToFold.getReg();
+ if (!Register::isVirtualRegister(UseReg))
return false;
if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
@@ -481,6 +491,9 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
return false; // Can only fold splat constants
}
+ if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
+ return false;
+
FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
return true;
}
@@ -518,7 +531,7 @@ void SIFoldOperands::foldOperand(
// REG_SEQUENCE instructions, so we have to fold them into the
// uses of REG_SEQUENCE.
if (UseMI->isRegSequence()) {
- unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
+ Register RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
MachineRegisterInfo::use_iterator Next;
@@ -569,15 +582,18 @@ void SIFoldOperands::foldOperand(
OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
if (FoldingImmLike && UseMI->isCopy()) {
- unsigned DestReg = UseMI->getOperand(0).getReg();
- const TargetRegisterClass *DestRC
- = TargetRegisterInfo::isVirtualRegister(DestReg) ?
- MRI->getRegClass(DestReg) :
- TRI->getPhysRegClass(DestReg);
-
- unsigned SrcReg = UseMI->getOperand(1).getReg();
- if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
- TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ Register DestReg = UseMI->getOperand(0).getReg();
+
+ // Don't fold into a copy to a physical register. Doing so would interfere
+ // with the register coalescer's logic, which would avoid redundant
+ // initializations.
+ if (DestReg.isPhysical())
+ return;
+
+ const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg);
+
+ Register SrcReg = UseMI->getOperand(1).getReg();
+ if (SrcReg.isVirtual()) { // XXX - This can be an assert?
const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
MachineRegisterInfo::use_iterator NextUse;
@@ -613,10 +629,17 @@ void SIFoldOperands::foldOperand(
return;
UseMI->setDesc(TII->get(MovOp));
+ MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
+ MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
+ while (ImpOpI != ImpOpE) {
+ MachineInstr::mop_iterator Tmp = ImpOpI;
+ ImpOpI++;
+ UseMI->RemoveOperand(UseMI->getOperandNo(Tmp));
+ }
CopiesToReplace.push_back(UseMI);
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
- TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
+ Register::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
!UseMI->getOperand(1).getSubReg()) {
@@ -677,6 +700,9 @@ void SIFoldOperands::foldOperand(
// =>
// %sgpr1 = COPY %sgpr0
UseMI->setDesc(TII->get(AMDGPU::COPY));
+ UseMI->getOperand(1).setReg(OpToFold.getReg());
+ UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+ UseMI->getOperand(1).setIsKill(false);
UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
return;
}
@@ -708,7 +734,7 @@ void SIFoldOperands::foldOperand(
// Split 64-bit constants into 32-bits for folding.
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
- unsigned UseReg = UseOp.getReg();
+ Register UseReg = UseOp.getReg();
const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
@@ -810,7 +836,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
if (Op.isReg()) {
// If this has a subregister, it obviously is a register source.
if (Op.getSubReg() != AMDGPU::NoSubRegister ||
- !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+ !Register::isVirtualRegister(Op.getReg()))
return &Op;
MachineInstr *Def = MRI.getVRegDef(Op.getReg());
@@ -1073,6 +1099,13 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
Copy->addImplicitDefUseOperands(*MF);
for (FoldCandidate &Fold : FoldList) {
+ if (Fold.isReg() && Register::isVirtualRegister(Fold.OpToFold->getReg())) {
+ Register Reg = Fold.OpToFold->getReg();
+ MachineInstr *DefMI = Fold.OpToFold->getParent();
+ if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
+ execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
+ continue;
+ }
if (updateOperand(Fold, *TII, *TRI, *ST)) {
// Clear kill flags.
if (Fold.isReg()) {
@@ -1316,6 +1349,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
for (MachineBasicBlock *MBB : depth_first(&MF)) {
MachineBasicBlock::iterator I, Next;
+
+ MachineOperand *CurrentKnownM0Val = nullptr;
for (I = MBB->begin(); I != MBB->end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
@@ -1328,6 +1363,25 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
!tryFoldOMod(MI))
tryFoldClamp(MI);
+
+ // Saw an unknown clobber of m0, so we no longer know what it is.
+ if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
+ CurrentKnownM0Val = nullptr;
+ continue;
+ }
+
+ // Specially track simple redefs of m0 to the same value in a block, so we
+ // can erase the later ones.
+ if (MI.getOperand(0).getReg() == AMDGPU::M0) {
+ MachineOperand &NewM0Val = MI.getOperand(1);
+ if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
+ MI.eraseFromParent();
+ continue;
+ }
+
+ // We aren't tracking other physical registers
+ CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ?
+ nullptr : &NewM0Val;
continue;
}
@@ -1339,8 +1393,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (!FoldingImm && !OpToFold.isReg())
continue;
- if (OpToFold.isReg() &&
- !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
+ if (OpToFold.isReg() && !Register::isVirtualRegister(OpToFold.getReg()))
continue;
// Prevent folding operands backwards in the function. For example,
@@ -1350,8 +1403,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
// ...
// %vgpr0 = V_MOV_B32_e32 1, implicit %exec
MachineOperand &Dst = MI.getOperand(0);
- if (Dst.isReg() &&
- !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+ if (Dst.isReg() && !Register::isVirtualRegister(Dst.getReg()))
continue;
foldInstOperand(MI, OpToFold);
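The m0 handling added to SIFoldOperands above can be pictured with a small standalone model: within one basic block, remember the last immediate written to m0 and drop any later write of the identical value; the real pass additionally forgets the tracked value when another instruction clobbers m0, and compares operands with MachineOperand::isIdenticalTo. The sketch below uses invented names (Write, elideRedundantM0Writes) and plain data, not the LLVM API.

    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    struct Write {
      std::string Reg; // destination register name
      int Imm;         // immediate being written
    };

    // Drop writes that re-set m0 to the value it is already known to hold;
    // a write of a different value simply updates the tracked value.
    static std::vector<Write>
    elideRedundantM0Writes(const std::vector<Write> &Block) {
      std::optional<int> KnownM0;
      std::vector<Write> Out;
      for (const Write &W : Block) {
        if (W.Reg == "m0") {
          if (KnownM0 && *KnownM0 == W.Imm)
            continue;      // identical redefinition: erase it
          KnownM0 = W.Imm; // new known value for m0
        }
        Out.push_back(W);
      }
      return Out;
    }

    int main() {
      std::vector<Write> Block = {{"m0", 5}, {"v0", 1}, {"m0", 5}, {"m0", 7}};
      std::cout << elideRedundantM0Writes(Block).size() << '\n'; // prints 3
      return 0;
    }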
diff --git a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
index f3c9ad63a80a..26bae5734df7 100644
--- a/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
+++ b/lib/Target/AMDGPU/SIFormMemoryClauses.cpp
@@ -120,7 +120,7 @@ static bool isValidClauseInst(const MachineInstr &MI, bool IsVMEMClause) {
return false;
// If this is a load instruction where the result has been coalesced with an operand, then we cannot clause it.
for (const MachineOperand &ResMO : MI.defs()) {
- unsigned ResReg = ResMO.getReg();
+ Register ResReg = ResMO.getReg();
for (const MachineOperand &MO : MI.uses()) {
if (!MO.isReg() || MO.isDef())
continue;
@@ -144,7 +144,7 @@ static unsigned getMopState(const MachineOperand &MO) {
S |= RegState::Kill;
if (MO.isEarlyClobber())
S |= RegState::EarlyClobber;
- if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) && MO.isRenamable())
+ if (Register::isPhysicalRegister(MO.getReg()) && MO.isRenamable())
S |= RegState::Renamable;
return S;
}
@@ -152,7 +152,7 @@ static unsigned getMopState(const MachineOperand &MO) {
template <typename Callable>
void SIFormMemoryClauses::forAllLanes(unsigned Reg, LaneBitmask LaneMask,
Callable Func) const {
- if (LaneMask.all() || TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ if (LaneMask.all() || Register::isPhysicalRegister(Reg) ||
LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) {
Func(0);
return;
@@ -216,7 +216,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
// If it is tied we will need to write same register as we read.
if (MO.isTied())
@@ -227,7 +227,7 @@ bool SIFormMemoryClauses::canBundle(const MachineInstr &MI,
if (Conflict == Map.end())
continue;
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return false;
LaneBitmask Mask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
@@ -265,13 +265,13 @@ void SIFormMemoryClauses::collectRegUses(const MachineInstr &MI,
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!Reg)
continue;
- LaneBitmask Mask = TargetRegisterInfo::isVirtualRegister(Reg) ?
- TRI->getSubRegIndexLaneMask(MO.getSubReg()) :
- LaneBitmask::getAll();
+ LaneBitmask Mask = Register::isVirtualRegister(Reg)
+ ? TRI->getSubRegIndexLaneMask(MO.getSubReg())
+ : LaneBitmask::getAll();
RegUse &Map = MO.isDef() ? Defs : Uses;
auto Loc = Map.find(Reg);
@@ -389,7 +389,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
for (auto &&R : Defs) {
unsigned Reg = R.first;
Uses.erase(Reg);
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
continue;
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
@@ -397,7 +397,7 @@ bool SIFormMemoryClauses::runOnMachineFunction(MachineFunction &MF) {
for (auto &&R : Uses) {
unsigned Reg = R.first;
- if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
continue;
LIS->removeInterval(Reg);
LIS->createAndComputeVirtRegInterval(Reg);
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index feab6bed2603..ed07ed100a19 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -112,6 +112,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(MMO);
return;
}
@@ -132,6 +133,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(MMO);
}
@@ -157,6 +159,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(MMO);
return;
}
@@ -177,6 +180,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(MMO);
}
@@ -202,15 +206,15 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
DebugLoc DL;
MachineBasicBlock::iterator I = MBB.begin();
- unsigned FlatScratchInitReg
- = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
+ Register FlatScratchInitReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
MachineRegisterInfo &MRI = MF.getRegInfo();
MRI.addLiveIn(FlatScratchInitReg);
MBB.addLiveIn(FlatScratchInitReg);
- unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
- unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
+ Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+ Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
@@ -424,8 +428,8 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
// We need to insert initialization of the scratch resource descriptor.
- unsigned PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+ Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
+ AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
if (ST.isAmdHsaOrMesa(F)) {
@@ -539,9 +543,9 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
if (ST.isAmdPalOS()) {
// The pointer to the GIT is formed from the offset passed in and either
// the amdgpu-git-ptr-high function attribute or the top part of the PC
- unsigned RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- unsigned RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
- unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+ Register RsrcLo = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ Register RsrcHi = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -601,14 +605,14 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
- unsigned Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
- unsigned Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
+ Register Rsrc2 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub2);
+ Register Rsrc3 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub3);
// Use relocations to get the pointer, and setup the other bits manually.
uint64_t Rsrc23 = TII->getScratchRsrcWords23();
if (MFI->hasImplicitBufferPtr()) {
- unsigned Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
+ Register Rsrc01 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0_sub1);
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
const MCInstrDesc &Mov64 = TII->get(AMDGPU::S_MOV_B64);
@@ -640,8 +644,8 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
MBB.addLiveIn(MFI->getImplicitBufferPtrUserSGPR());
}
} else {
- unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
- unsigned Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+ Register Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ Register Rsrc1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
BuildMI(MBB, I, DL, SMovB32, Rsrc0)
.addExternalSymbol("SCRATCH_RSRC_DWORD0")
@@ -669,6 +673,8 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
case TargetStackID::NoAlloc:
case TargetStackID::SGPRSpill:
return true;
+ case TargetStackID::SVEVector:
+ return false;
}
llvm_unreachable("Invalid TargetStackID::Value");
}
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index c644f4726e2c..d9970fd6b4b8 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -20,9 +20,9 @@ class GCNSubtarget;
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
- SIFrameLowering(StackDirection D, unsigned StackAl, int LAO,
- unsigned TransAl = 1) :
- AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
+ SIFrameLowering(StackDirection D, Align StackAl, int LAO,
+ Align TransAl = Align::None())
+ : AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~SIFrameLowering() override = default;
void emitEntryFunctionPrologue(MachineFunction &MF,
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index db0782e2bf3e..56ebf9c06741 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -20,11 +20,11 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APInt.h"
@@ -35,6 +35,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
@@ -44,6 +45,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
@@ -115,7 +117,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
- addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
@@ -125,10 +127,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
- addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
- addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
+ addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
- addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
+ addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
@@ -141,12 +143,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
if (Subtarget->has16BitInsts()) {
- addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
- addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
// Unless there are also VOP3P operations, no operations are really legal.
- addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
- addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+ addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
@@ -178,6 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v32i32, Custom);
setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+ setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
@@ -215,31 +218,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
- setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
-
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
-
- setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
- setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
-
setOperationAction(ISD::BRCOND, MVT::Other, Custom);
setOperationAction(ISD::BR_CC, MVT::i1, Expand);
setOperationAction(ISD::BR_CC, MVT::i32, Expand);
@@ -653,6 +635,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FADD, MVT::v4f16, Custom);
setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+ setOperationAction(ISD::FMA, MVT::v4f16, Custom);
setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom);
@@ -687,6 +670,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, VT, Custom);
}
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
+
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+ setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
+
setTargetDAGCombine(ISD::ADD);
setTargetDAGCombine(ISD::ADDCARRY);
setTargetDAGCombine(ISD::SUB);
@@ -768,19 +778,22 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // TODO: Consider splitting all arguments into 32-bit pieces.
- if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+ if (CC == CallingConv::AMDGPU_KERNEL)
+ return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
+
+ if (VT.isVector()) {
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
if (Size == 32)
return ScalarVT.getSimpleVT();
- if (Size == 64)
+ if (Size > 32)
return MVT::i32;
if (Size == 16 && Subtarget->has16BitInsts())
return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
- }
+ } else if (VT.getSizeInBits() > 32)
+ return MVT::i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
@@ -788,7 +801,10 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+ if (CC == CallingConv::AMDGPU_KERNEL)
+ return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
+
+ if (VT.isVector()) {
unsigned NumElts = VT.getVectorNumElements();
EVT ScalarVT = VT.getScalarType();
unsigned Size = ScalarVT.getSizeInBits();
@@ -796,12 +812,13 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
if (Size == 32)
return NumElts;
- if (Size == 64)
- return 2 * NumElts;
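+  // Scalars wider than 32 bits are split into 32-bit registers:
+  // (Size + 31) / 32 per element, e.g. 2 for a 64-bit element and
+  // 4 for a 128-bit element.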
+ if (Size > 32)
+ return NumElts * ((Size + 31) / 32);
if (Size == 16 && Subtarget->has16BitInsts())
- return (VT.getVectorNumElements() + 1) / 2;
- }
+ return (NumElts + 1) / 2;
+ } else if (VT.getSizeInBits() > 32)
+ return (VT.getSizeInBits() + 31) / 32;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
@@ -821,10 +838,10 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
return NumIntermediates;
}
- if (Size == 64) {
+ if (Size > 32) {
RegisterVT = MVT::i32;
IntermediateVT = RegisterVT;
- NumIntermediates = 2 * NumElts;
+ NumIntermediates = NumElts * ((Size + 31) / 32);
return NumIntermediates;
}
@@ -901,7 +918,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = MFI->getImagePSV(
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
CI.getArgOperand(RsrcIntr->RsrcArg));
- Info.align = 0;
+ Info.align.reset();
} else {
Info.ptrVal = MFI->getBufferPSV(
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
@@ -947,7 +964,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
- Info.align = 0;
+ Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
@@ -964,7 +981,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = MFI->getBufferPSV(
*MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
CI.getArgOperand(1));
- Info.align = 0;
+ Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
@@ -978,7 +995,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
->getPointerElementType());
Info.ptrVal = CI.getOperand(0);
- Info.align = 0;
+ Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
@@ -988,7 +1005,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
- Info.align = 0;
+ Info.align.reset();
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
@@ -1012,7 +1029,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// This is an abstract access, but we need to specify a type and size.
Info.memVT = MVT::i32;
Info.size = 4;
- Info.align = 4;
+ Info.align = Align(4);
Info.flags = MachineMemOperand::MOStore;
if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
@@ -1215,21 +1232,12 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
return true;
}
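+// Size-only implementation of the misaligned access check; the EVT-based
+// allowsMisalignedMemoryAccesses below performs its type checks and then
+// forwards here.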
-bool SITargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
- bool *IsFast) const {
+bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
+ unsigned Size, unsigned AddrSpace, unsigned Align,
+ MachineMemOperand::Flags Flags, bool *IsFast) const {
if (IsFast)
*IsFast = false;
- // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
- // which isn't a simple VT.
- // Until MVT is extended to handle this, simply check for the size and
- // rely on the condition below: allow accesses if the size is a multiple of 4.
- if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
- VT.getStoreSize() > 16)) {
- return false;
- }
-
if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
AddrSpace == AMDGPUAS::REGION_ADDRESS) {
// ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
@@ -1268,7 +1276,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
}
// Smaller than dword value must be aligned.
- if (VT.bitsLT(MVT::i32))
+ if (Size < 32)
return false;
// 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
@@ -1277,7 +1285,26 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
if (IsFast)
*IsFast = true;
- return VT.bitsGT(MVT::i32) && Align % 4 == 0;
+ return Size >= 32 && Align >= 4;
+}
+
+bool SITargetLowering::allowsMisalignedMemoryAccesses(
+ EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ bool *IsFast) const {
+ if (IsFast)
+ *IsFast = false;
+
+ // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
+ // which isn't a simple VT.
+ // Until MVT is extended to handle this, simply check for the size and
+ // rely on the condition below: allow accesses if the size is a multiple of 4.
+ if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 &&
+ VT.getStoreSize() > 16)) {
+ return false;
+ }
+
+ return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
+ Align, Flags, IsFast);
}
EVT SITargetLowering::getOptimalMemOpType(
@@ -1336,9 +1363,9 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const {
TargetLoweringBase::LegalizeTypeAction
SITargetLowering::getPreferredVectorAction(MVT VT) const {
- if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
- return TypeSplitVector;
-
+ int NumElts = VT.getVectorNumElements();
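+  // Vectors of 16-bit (or smaller) elements with a non-power-of-2 length,
+  // e.g. v3i16, are widened rather than split.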
+ if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16))
+ return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
@@ -1562,7 +1589,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
// entire split argument.
if (Arg->Flags.isSplit()) {
while (!Arg->Flags.isSplitEnd()) {
- assert(!Arg->VT.isVector() &&
+ assert((!Arg->VT.isVector() ||
+ Arg->VT.getScalarSizeInBits() == 16) &&
"unexpected vector split in ps argument type");
if (!SkipArg)
Splits.push_back(*Arg);
@@ -1589,29 +1617,32 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
}
// Allocate special inputs passed in VGPRs.
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const {
+ const LLT S32 = LLT::scalar(32);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
if (Info.hasWorkItemIDX()) {
- unsigned Reg = AMDGPU::VGPR0;
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ Register Reg = AMDGPU::VGPR0;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDY()) {
- unsigned Reg = AMDGPU::VGPR1;
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ Register Reg = AMDGPU::VGPR1;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
}
if (Info.hasWorkItemIDZ()) {
- unsigned Reg = AMDGPU::VGPR2;
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ Register Reg = AMDGPU::VGPR2;
+ MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
CCInfo.AllocateReg(Reg);
Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
@@ -1642,7 +1673,8 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
assert(Reg != AMDGPU::NoRegister);
MachineFunction &MF = CCInfo.getMachineFunction();
- MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+ MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
return ArgDescriptor::createRegister(Reg, Mask);
}
@@ -1671,10 +1703,10 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
-static void allocateSpecialInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const {
const unsigned Mask = 0x3ff;
ArgDescriptor Arg;
@@ -1692,10 +1724,11 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
-static void allocateSpecialInputSGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateSpecialInputSGPRs(
+ CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const {
auto &ArgInfo = Info.getArgInfo();
// TODO: Unify handling with private memory pointers.
@@ -1728,10 +1761,10 @@ static void allocateSpecialInputSGPRs(CCState &CCInfo,
}
// Allocate special inputs passed in user SGPRs.
-static void allocateHSAUserSGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) {
+void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const {
if (Info.hasImplicitBufferPtr()) {
unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
@@ -1758,9 +1791,12 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
}
if (Info.hasKernargSegmentPtr()) {
- unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
- MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
CCInfo.AllocateReg(InputPtrReg);
+
+ Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
+ MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
}
if (Info.hasDispatchID()) {
@@ -1780,32 +1816,32 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
}
// Allocate special input registers that are initialized per-wave.
-static void allocateSystemSGPRs(CCState &CCInfo,
- MachineFunction &MF,
- SIMachineFunctionInfo &Info,
- CallingConv::ID CallConv,
- bool IsShader) {
+void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ SIMachineFunctionInfo &Info,
+ CallingConv::ID CallConv,
+ bool IsShader) const {
if (Info.hasWorkGroupIDX()) {
unsigned Reg = Info.addWorkGroupIDX();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDY()) {
unsigned Reg = Info.addWorkGroupIDY();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupIDZ()) {
unsigned Reg = Info.addWorkGroupIDZ();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
if (Info.hasWorkGroupInfo()) {
unsigned Reg = Info.addWorkGroupInfo();
- MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
+ MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
@@ -1860,7 +1896,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// resource. For the Code Object V2 ABI, this will be the first 4 user
// SGPR inputs. We can reserve those and use them directly.
- unsigned PrivateSegmentBufferReg =
+ Register PrivateSegmentBufferReg =
Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
Info.setScratchRSrcReg(PrivateSegmentBufferReg);
} else {
@@ -1921,7 +1957,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
//
// FIXME: Should not do this if inline asm is reading/writing these
// registers.
- unsigned PreloadedSP = Info.getPreloadedReg(
+ Register PreloadedSP = Info.getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
Info.setStackPtrOffsetReg(PreloadedSP);
@@ -1971,7 +2007,7 @@ void SITargetLowering::insertCopiesSplitCSR(
else
llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- unsigned NewVR = MRI->createVirtualRegister(RC);
+ Register NewVR = MRI->createVirtualRegister(RC);
// Create copy from CSR to a virtual register.
Entry->addLiveIn(*I);
BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
@@ -2134,7 +2170,7 @@ SDValue SITargetLowering::LowerFormalArguments(
assert(VA.isRegLoc() && "Parameter must be in a register!");
- unsigned Reg = VA.getLocReg();
+ Register Reg = VA.getLocReg();
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
EVT ValVT = VA.getValVT();
@@ -2652,6 +2688,15 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsThisReturn = false;
MachineFunction &MF = DAG.getMachineFunction();
+ if (Callee.isUndef() || isNullConstant(Callee)) {
+ if (!CLI.IsTailCall) {
+ for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I)
+ InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT));
+ }
+
+ return Chain;
+ }
+
if (IsVarArg) {
return lowerUnhandledCall(CLI, InVals,
"unsupported call to variadic function ");
@@ -2782,7 +2827,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
int32_t Offset = LocMemOffset;
SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
- unsigned Align = 0;
+ MaybeAlign Alignment;
if (IsTailCall) {
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
@@ -2790,8 +2835,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Flags.getByValSize() : VA.getValVT().getStoreSize();
// FIXME: We can have better than the minimum byval required alignment.
- Align = Flags.isByVal() ? Flags.getByValAlign() :
- MinAlign(Subtarget->getStackAlignment(), Offset);
+ Alignment =
+ Flags.isByVal()
+ ? MaybeAlign(Flags.getByValAlign())
+ : commonAlignment(Subtarget->getStackAlignment(), Offset);
Offset = Offset + FPDiff;
int FI = MFI.CreateFixedObject(OpSize, Offset, true);
@@ -2810,7 +2857,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
} else {
DstAddr = PtrOff;
DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
- Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset);
+ Alignment =
+ commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
}
if (Outs[i].Flags.isByVal()) {
@@ -2825,7 +2873,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
MemOpChains.push_back(Cpy);
} else {
- SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align);
+ SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo,
+ Alignment ? Alignment->value() : 0);
MemOpChains.push_back(Store);
}
}
@@ -2937,9 +2986,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
IsThisReturn ? OutVals[0] : SDValue());
}
-unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const {
- unsigned Reg = StringSwitch<unsigned>(RegName)
+Register SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const {
+ Register Reg = StringSwitch<Register>(RegName)
.Case("m0", AMDGPU::M0)
.Case("exec", AMDGPU::EXEC)
.Case("exec_lo", AMDGPU::EXEC_LO)
@@ -2947,7 +2996,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
.Case("flat_scratch", AMDGPU::FLAT_SCR)
.Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
.Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
- .Default(AMDGPU::NoRegister);
+ .Default(Register());
if (Reg == AMDGPU::NoRegister) {
report_fatal_error(Twine("invalid register name \""
@@ -3055,6 +3104,20 @@ splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
return std::make_pair(LoopBB, RemainderBB);
}
+/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
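+/// Bundling keeps the required wait directly attached to \p MI so that nothing
+/// can be scheduled between the two instructions.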
+void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ auto I = MI.getIterator();
+ auto E = std::next(I);
+
+ BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+ .addImm(0);
+
+ MIBundleBuilder Bundler(*MBB, I, E);
+ finalizeBundle(*MBB, Bundler.begin());
+}
+
MachineBasicBlock *
SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -3066,12 +3129,13 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
MachineBasicBlock *RemainderBB;
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- MachineBasicBlock::iterator Prev = std::prev(MI.getIterator());
+ // Apparently kill flags are only valid if the def is in the same block?
+ if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
+ Src->setIsKill(false);
std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
MachineBasicBlock::iterator I = LoopBB->end();
- MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
@@ -3081,23 +3145,9 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
.addImm(0)
.addImm(EncodedReg);
- // This is a pain, but we're not allowed to have physical register live-ins
- // yet. Insert a pair of copies if the VGPR0 hack is necessary.
- if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
- unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0)
- .add(*Src);
+ bundleInstWithWaitcnt(MI);
- BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg())
- .addReg(Data0);
-
- MRI.setSimpleHint(Data0, Src->getReg());
- }
-
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
-
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
// Load and check TRAP_STS.MEM_VIOL
BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
@@ -3138,10 +3188,10 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
MachineBasicBlock::iterator I = LoopBB.begin();
const TargetRegisterClass *BoolRC = TRI->getBoolRC();
- unsigned PhiExec = MRI.createVirtualRegister(BoolRC);
- unsigned NewExec = MRI.createVirtualRegister(BoolRC);
- unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned CondReg = MRI.createVirtualRegister(BoolRC);
+ Register PhiExec = MRI.createVirtualRegister(BoolRC);
+ Register NewExec = MRI.createVirtualRegister(BoolRC);
+ Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register CondReg = MRI.createVirtualRegister(BoolRC);
BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
.addReg(InitReg)
@@ -3240,9 +3290,9 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
MachineBasicBlock::iterator I(&MI);
const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
- unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC);
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -3315,7 +3365,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
SetOn->getOperand(3).setIsUndef();
} else {
- unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
.add(*Idx)
.addImm(Offset);
@@ -3351,8 +3401,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
@@ -3390,8 +3440,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
- unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
@@ -3442,7 +3492,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
MachineFunction *MF = MBB.getParent();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Dst = MI.getOperand(0).getReg();
+ Register Dst = MI.getOperand(0).getReg();
const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
@@ -3505,7 +3555,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
const DebugLoc &DL = MI.getDebugLoc();
- unsigned PhiReg = MRI.createVirtualRegister(VecRC);
+ Register PhiReg = MRI.createVirtualRegister(VecRC);
auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg,
Offset, UseGPRIdxMode, false);
@@ -3564,22 +3614,22 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
- unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
Src0, BoolRC, AMDGPU::sub0,
- &AMDGPU::SReg_32_XM0RegClass);
+ &AMDGPU::SReg_32RegClass);
MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
Src0, BoolRC, AMDGPU::sub1,
- &AMDGPU::SReg_32_XM0RegClass);
+ &AMDGPU::SReg_32RegClass);
MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
Src1, BoolRC, AMDGPU::sub0,
- &AMDGPU::SReg_32_XM0RegClass);
+ &AMDGPU::SReg_32RegClass);
MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
Src1, BoolRC, AMDGPU::sub1,
- &AMDGPU::SReg_32_XM0RegClass);
+ &AMDGPU::SReg_32RegClass);
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
@@ -3632,8 +3682,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
// S_CMOV_B64 exec, -1
MachineInstr *FirstMI = &*BB->begin();
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned InputReg = MI.getOperand(0).getReg();
- unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register InputReg = MI.getOperand(0).getReg();
+ Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
bool Found = false;
// Move the COPY of the input reg to the beginning, so that we can use it.
@@ -3707,16 +3757,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src0 = MI.getOperand(1).getReg();
- unsigned Src1 = MI.getOperand(2).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
const DebugLoc &DL = MI.getDebugLoc();
- unsigned SrcCond = MI.getOperand(3).getReg();
+ Register SrcCond = MI.getOperand(3).getReg();
- unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC);
+ Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
.addReg(SrcCond);
@@ -3814,8 +3864,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::DS_GWS_SEMA_P:
case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
case AMDGPU::DS_GWS_BARRIER:
- if (getSubtarget()->hasGWSAutoReplay())
+ // An s_waitcnt 0 is required to be the instruction immediately following.
+ if (getSubtarget()->hasGWSAutoReplay()) {
+ bundleInstWithWaitcnt(MI);
return BB;
+ }
+
return emitGWSMemViolTestLoop(MI, BB);
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
@@ -3939,6 +3993,30 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
}
+SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ assert(VT == MVT::v4i16 || VT == MVT::v4f16);
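+  // e.g. an FMA on v4f16 is lowered as two v2f16 FMAs on the low and high
+  // halves, and the results are concatenated back into a v4f16.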
+
+ SDValue Lo0, Hi0;
+ std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ SDValue Lo1, Hi1;
+ std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+ SDValue Lo2, Hi2;
+ std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2);
+
+ SDLoc SL(Op);
+
+ SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2,
+ Op->getFlags());
+ SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2,
+ Op->getFlags());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -3991,6 +4069,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMINNUM:
case ISD::FMAXNUM:
return lowerFMINNUM_FMAXNUM(Op, DAG);
+ case ISD::FMA:
+ return splitTernaryVectorOp(Op, DAG);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
@@ -4070,6 +4150,41 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL);
}
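+// Common lowering for buffer load intrinsics: use the D16 node for 16-bit
+// format loads, the byte/short path for sub-dword scalar loads, and otherwise
+// bitcast through an equivalent legal type when the requested type is not
+// directly legal.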
+SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
+ SelectionDAG &DAG,
+ ArrayRef<SDValue> Ops) const {
+ SDLoc DL(M);
+ EVT LoadVT = M->getValueType(0);
+ EVT EltType = LoadVT.getScalarType();
+ EVT IntVT = LoadVT.changeTypeToInteger();
+
+ bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
+
+ unsigned Opc =
+ IsFormat ? AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD;
+
+ if (IsD16) {
+ return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
+ }
+
+ // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
+ if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
+ return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
+
+ if (isTypeLegal(LoadVT)) {
+ return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
+ M->getMemOperand(), DAG);
+ }
+
+ EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
+ SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
+ SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
+ M->getMemOperand(), DAG);
+ return DAG.getMergeValues(
+ {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
+ DL);
+}
+
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -4196,8 +4311,14 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
}
case ISD::INTRINSIC_W_CHAIN: {
if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
- Results.push_back(Res);
- Results.push_back(Res.getValue(1));
+ if (Res.getOpcode() == ISD::MERGE_VALUES) {
+ // FIXME: Hacky
+ Results.push_back(Res.getOperand(0));
+ Results.push_back(Res.getOperand(1));
+ } else {
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ }
return;
}
@@ -4935,11 +5056,8 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
// small. This requires us to add 4 to the global variable offset in order to
// compute the correct address.
- unsigned LoFlags = GAFlags;
- if (LoFlags == SIInstrInfo::MO_NONE)
- LoFlags = SIInstrInfo::MO_REL32;
SDValue PtrLo =
- DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags);
+ DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
SDValue PtrHi;
if (GAFlags == SIInstrInfo::MO_NONE) {
PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
@@ -5563,14 +5681,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
SDValue Ops[] = {
- DAG.getEntryNode(), // Chain
- Rsrc, // rsrc
- DAG.getConstant(0, DL, MVT::i32), // vindex
- {}, // voffset
- {}, // soffset
- {}, // offset
- DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ DAG.getEntryNode(), // Chain
+ Rsrc, // rsrc
+ DAG.getConstant(0, DL, MVT::i32), // vindex
+ {}, // voffset
+ {}, // soffset
+ {}, // offset
+ DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
// Use the alignment to ensure that the required offsets will fit into the
@@ -5579,7 +5697,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
for (unsigned i = 0; i < NumLoads; ++i) {
- Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32);
+ Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
Ops, LoadVT, MMO));
}
@@ -5758,45 +5876,31 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
- case Intrinsic::amdgcn_interp_mov: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
- SDValue Glue = M0.getValue(1);
- return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3), Glue);
- }
- case Intrinsic::amdgcn_interp_p1: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
- SDValue Glue = M0.getValue(1);
- return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3), Glue);
- }
- case Intrinsic::amdgcn_interp_p2: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
- SDValue Glue = SDValue(M0.getNode(), 1);
- return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1),
- Op.getOperand(2), Op.getOperand(3), Op.getOperand(4),
- Glue);
- }
case Intrinsic::amdgcn_interp_p1_f16: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5));
- SDValue Glue = M0.getValue(1);
+ SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
+ Op.getOperand(5), SDValue());
if (getSubtarget()->getLDSBankCount() == 16) {
// 16 bank LDS
- SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
- DAG.getConstant(2, DL, MVT::i32), // P0
- Op.getOperand(2), // Attrchan
- Op.getOperand(3), // Attr
- Glue);
+
+ // FIXME: This will implicitly insert a second CopyToReg to M0.
+ SDValue S = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
+ DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32),
+ DAG.getConstant(2, DL, MVT::i32), // P0
+ Op.getOperand(2), // Attrchan
+ Op.getOperand(3), // Attr
+ Op.getOperand(5)); // m0
+
SDValue Ops[] = {
Op.getOperand(1), // Src0
Op.getOperand(2), // Attrchan
Op.getOperand(3), // Attr
- DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
S, // Src2 - holds two f16 values selected by high
- DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+ DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
Op.getOperand(4), // high
- DAG.getConstant(0, DL, MVT::i1), // $clamp
- DAG.getConstant(0, DL, MVT::i32) // $omod
+ DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
+ DAG.getTargetConstant(0, DL, MVT::i32) // $omod
};
return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
} else {
@@ -5805,28 +5909,28 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), // Src0
Op.getOperand(2), // Attrchan
Op.getOperand(3), // Attr
- DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
Op.getOperand(4), // high
- DAG.getConstant(0, DL, MVT::i1), // $clamp
- DAG.getConstant(0, DL, MVT::i32), // $omod
- Glue
+ DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
+ DAG.getTargetConstant(0, DL, MVT::i32), // $omod
+ ToM0.getValue(1)
};
return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
}
}
case Intrinsic::amdgcn_interp_p2_f16: {
- SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6));
- SDValue Glue = SDValue(M0.getNode(), 1);
+ SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
+ Op.getOperand(6), SDValue());
SDValue Ops[] = {
Op.getOperand(2), // Src0
Op.getOperand(3), // Attrchan
Op.getOperand(4), // Attr
- DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers
+ DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
Op.getOperand(1), // Src2
- DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers
+ DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
Op.getOperand(5), // high
- DAG.getConstant(0, DL, MVT::i1), // $clamp
- Glue
+ DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
+ ToM0.getValue(1)
};
return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
}
@@ -5947,16 +6051,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}
- case Intrinsic::amdgcn_wqm: {
- SDValue Src = Op.getOperand(1);
- return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
- 0);
- }
- case Intrinsic::amdgcn_wwm: {
- SDValue Src = Op.getOperand(1);
- return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
- 0);
- }
case Intrinsic::amdgcn_fmad_ftz:
return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
Op.getOperand(2), Op.getOperand(3));
@@ -5977,6 +6071,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SIInstrInfo::MO_ABS32_LO);
return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
}
+ case Intrinsic::amdgcn_is_shared:
+ case Intrinsic::amdgcn_is_private: {
+ SDLoc SL(Op);
+ unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ?
+ AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS;
+ SDValue Aperture = getSegmentAperture(AS, SL, DAG);
+ SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32,
+ Op.getOperand(1));
+
+ SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
+ DAG.getConstant(1, SL, MVT::i32));
+ return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -5986,6 +6093,30 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
+// This function computes an appropriate offset to pass to
+// MachineMemOperand::setOffset() based on the offset inputs to
+// an intrinsic. If any of the offsets are non-constant or
+// if VIndex is non-zero, this function returns 0. Otherwise,
+// it returns the sum of VOffset, SOffset, and Offset.
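+// For example, if VOffset, SOffset and Offset fold to the constants 16, 4
+// and 8 (and VIndex, when present, is a constant 0), the result is
+// 16 + 4 + 8 = 28.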
+static unsigned getBufferOffsetForMMO(SDValue VOffset,
+ SDValue SOffset,
+ SDValue Offset,
+ SDValue VIndex = SDValue()) {
+
+ if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) ||
+ !isa<ConstantSDNode>(Offset))
+ return 0;
+
+ if (VIndex) {
+ if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue())
+ return 0;
+ }
+
+ return cast<ConstantSDNode>(VOffset)->getSExtValue() +
+ cast<ConstantSDNode>(SOffset)->getSExtValue() +
+ cast<ConstantSDNode>(Offset)->getSExtValue();
+}
+
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@@ -6128,17 +6259,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
+
unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ?
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
EVT VT = Op.getValueType();
EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
EVT LoadVT = Op.getValueType();
if (LoadVT.getScalarType() == MVT::f16)
@@ -6155,6 +6291,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format: {
+ const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format;
+
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -6163,32 +6301,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Offsets.first, // voffset
Op.getOperand(4), // soffset
Offsets.second, // offset
- Op.getOperand(5), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(5), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
- unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ?
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
- EVT VT = Op.getValueType();
- EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
-
- // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 ||
- LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
-
- return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand(), DAG);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5]));
+ return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format: {
+ const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format;
+
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -6197,29 +6321,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Offsets.first, // voffset
Op.getOperand(5), // soffset
Offsets.second, // offset
- Op.getOperand(6), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(6), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
- unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ?
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT;
-
- EVT VT = Op.getValueType();
- EVT IntVT = VT.changeTypeToInteger();
auto *M = cast<MemSDNode>(Op);
- EVT LoadVT = Op.getValueType();
-
- if (LoadVT.getScalarType() == MVT::f16)
- return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16,
- M, DAG, Ops);
-
- // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
- if (LoadVT.getScalarType() == MVT::i8 ||
- LoadVT.getScalarType() == MVT::i16)
- return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M);
-
- return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT,
- M->getMemOperand(), DAG);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5],
+ Ops[2]));
+ return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
}
case Intrinsic::amdgcn_tbuffer_load: {
MemSDNode *M = cast<MemSDNode>(Op);
@@ -6239,9 +6348,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(4), // voffset
Op.getOperand(5), // soffset
Op.getOperand(6), // offset
- DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -6264,8 +6373,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(4), // soffset
Offsets.second, // offset
Op.getOperand(5), // format
- Op.getOperand(6), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(6), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -6288,8 +6397,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // soffset
Offsets.second, // offset
Op.getOperand(6), // format
- Op.getOperand(7), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -6321,13 +6430,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
unsigned Opcode = 0;
switch (IntrID) {
@@ -6377,7 +6490,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_umax:
case Intrinsic::amdgcn_raw_buffer_atomic_and:
case Intrinsic::amdgcn_raw_buffer_atomic_or:
- case Intrinsic::amdgcn_raw_buffer_atomic_xor: {
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec: {
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -6388,11 +6503,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(5), // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
unsigned Opcode = 0;
switch (IntrID) {
@@ -6426,6 +6542,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_xor:
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
+ break;
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
+ break;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -6442,7 +6564,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_umax:
case Intrinsic::amdgcn_struct_buffer_atomic_and:
case Intrinsic::amdgcn_struct_buffer_atomic_or:
- case Intrinsic::amdgcn_struct_buffer_atomic_xor: {
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec: {
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
SDValue Ops[] = {
Op.getOperand(0), // Chain
@@ -6453,11 +6577,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(6), // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
+ Ops[3]));
unsigned Opcode = 0;
switch (IntrID) {
@@ -6491,6 +6617,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_xor:
Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR;
break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_INC;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC;
+ break;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -6512,12 +6644,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -6534,10 +6670,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(6), // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7]));
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -6554,10 +6691,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op.getOperand(7), // soffset
Offsets.second, // offset
Op.getOperand(8), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
EVT VT = Op.getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7],
+ Ops[4]));
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
@@ -6686,23 +6825,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
}
- case Intrinsic::amdgcn_s_sendmsg:
- case Intrinsic::amdgcn_s_sendmsghalt: {
- unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
- AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
- Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
- SDValue Glue = Chain.getValue(1);
- return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
- Op.getOperand(2), Glue);
- }
- case Intrinsic::amdgcn_init_exec: {
- return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
- Op.getOperand(2));
- }
- case Intrinsic::amdgcn_init_exec_from_input: {
- return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
- Op.getOperand(2), Op.getOperand(3));
- }
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -6733,9 +6855,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(5), // voffset
Op.getOperand(6), // soffset
Op.getOperand(7), // offset
- DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -6759,8 +6881,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(6), // soffset
Offsets.second, // offset
Op.getOperand(7), // format
- Op.getOperand(8), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idexen
+ Op.getOperand(8), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -6784,8 +6906,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(5), // soffset
Offsets.second, // offset
Op.getOperand(6), // format
- Op.getOperand(7), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idexen
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -6813,14 +6935,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
@@ -6833,10 +6959,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_raw_buffer_store_format: {
+ const bool IsFormat =
+ IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;
+
SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ EVT VDataVT = VData.getValueType();
+ EVT EltType = VDataVT.getScalarType();
+ bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
if (IsD16)
VData = handleD16VData(VData, DAG);
+
+ if (!isTypeLegal(VDataVT)) {
+ VData =
+ DAG.getNode(ISD::BITCAST, DL,
+ getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
+ }
+
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
SDValue Ops[] = {
Chain,
@@ -6846,18 +6984,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Offsets.first, // voffset
Op.getOperand(5), // soffset
Offsets.second, // offset
- Op.getOperand(6), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(6), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
- unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
- AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+ unsigned Opc =
+ IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
- EVT VDataType = VData.getValueType().getScalarType();
- if (VDataType == MVT::i8 || VDataType == MVT::i16)
- return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+ if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
+ return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
M->getMemoryVT(), M->getMemOperand());
@@ -6865,10 +7003,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_store:
case Intrinsic::amdgcn_struct_buffer_store_format: {
+ const bool IsFormat =
+ IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;
+
SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ EVT VDataVT = VData.getValueType();
+ EVT EltType = VDataVT.getScalarType();
+ bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
+
if (IsD16)
VData = handleD16VData(VData, DAG);
+
+ if (!isTypeLegal(VDataVT)) {
+ VData =
+ DAG.getNode(ISD::BITCAST, DL,
+ getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
+ }
+
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
SDValue Ops[] = {
Chain,
@@ -6878,17 +7029,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Offsets.first, // voffset
Op.getOperand(6), // soffset
Offsets.second, // offset
- Op.getOperand(7), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
};
unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
MemSDNode *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
+ Ops[3]));
// Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
EVT VDataType = VData.getValueType().getScalarType();
- if (VDataType == MVT::i8 || VDataType == MVT::i16)
+ if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
@@ -6908,13 +7061,17 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue(), // voffset -- will be set by setBufferOffsets
SDValue(), // soffset -- will be set by setBufferOffsets
SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
};
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
EVT VT = Op.getOperand(2).getValueType();
auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD
: AMDGPUISD::BUFFER_ATOMIC_FADD;
@@ -6987,7 +7144,7 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
Overflow += ImmOffset;
ImmOffset = 0;
}
- C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
+ C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
if (Overflow) {
auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
if (!N0)
@@ -7001,14 +7158,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
if (!N0)
N0 = DAG.getConstant(0, DL, MVT::i32);
if (!C1)
- C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
+ C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
return {N0, SDValue(C1, 0)};
}
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
-void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
SelectionDAG &DAG, SDValue *Offsets,
unsigned Align) const {
SDLoc DL(CombinedOffset);
@@ -7018,8 +7175,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
- Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
- return;
+ Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
+ return SOffset + ImmOffset;
}
}
if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7031,13 +7188,14 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
Subtarget, Align)) {
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
- Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
- return;
+ Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
+ return 0;
}
}
Offsets[0] = CombinedOffset;
Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
- Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
+ Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
+ return 0;
}
// Handle 8 bit and 16 bit buffer loads
@@ -7053,9 +7211,10 @@ SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
Ops, IntVT,
M->getMemOperand());
- SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
- LoadVT.getScalarType(), BufferLoad);
- return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+ SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
+ LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
+
+ return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
}
// Handle 8 bit and 16 bit buffer stores
@@ -7063,6 +7222,9 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
EVT VDataType, SDLoc DL,
SDValue Ops[],
MemSDNode *M) const {
+ if (VDataType == MVT::f16)
+ Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
+
SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
Ops[1] = BufferStoreExt;
unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
@@ -7215,8 +7377,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
"Custom lowering for non-i32 vectors hasn't been implemented.");
- if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- *Load->getMemOperand())) {
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ MemVT, *Load->getMemOperand())) {
SDValue Ops[2];
std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
return DAG.getMergeValues(Ops, DL);
@@ -7505,6 +7667,19 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
}
+// Returns the immediate value for setting the F32 denorm mode when using the
+// S_DENORM_MODE instruction.
+static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
+ const SDLoc &SL, const GCNSubtarget *ST) {
+ assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
+ int DPDenormModeDefault = ST->hasFP64Denormals()
+ ? FP_DENORM_FLUSH_NONE
+ : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+ int Mode = SPDenormMode | (DPDenormModeDefault << 2);
+ return DAG.getTargetConstant(Mode, SL, MVT::i32);
+}
+
SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
return FastLowered;
@@ -7531,16 +7706,26 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
(4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
(1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
-
const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
if (!Subtarget->hasFP32Denormals()) {
SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
- SL, MVT::i32);
- SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
- DAG.getEntryNode(),
- EnableDenormValue, BitField);
+
+ SDValue EnableDenorm;
+ if (Subtarget->hasDenormModeInst()) {
+ const SDValue EnableDenormValue =
+ getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
+
+ EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
+ DAG.getEntryNode(), EnableDenormValue);
+ } else {
+ const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+ SL, MVT::i32);
+ EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+ DAG.getEntryNode(), EnableDenormValue,
+ BitField);
+ }
+
SDValue Ops[3] = {
NegDivScale0,
EnableDenorm.getValue(0),
@@ -7562,19 +7747,29 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
NumeratorScaled, Mul);
- SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+ SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
NumeratorScaled, Fma3);
if (!Subtarget->hasFP32Denormals()) {
- const SDValue DisableDenormValue =
- DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
- SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
- Fma4.getValue(1),
- DisableDenormValue,
- BitField,
- Fma4.getValue(2));
+
+ SDValue DisableDenorm;
+ if (Subtarget->hasDenormModeInst()) {
+ const SDValue DisableDenormValue =
+ getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
+
+ DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
+ Fma4.getValue(1), DisableDenormValue,
+ Fma4.getValue(2));
+ } else {
+ const SDValue DisableDenormValue =
+ DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+
+ DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+ Fma4.getValue(1), DisableDenormValue,
+ BitField, Fma4.getValue(2));
+ }
SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
DisableDenorm, DAG.getRoot());
@@ -7684,8 +7879,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
assert(VT.isVector() &&
Store->getValue().getValueType().getScalarType() == MVT::i32);
- if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- *Store->getMemOperand())) {
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ VT, *Store->getMemOperand())) {
return expandUnalignedStore(Store, DAG);
}
@@ -10065,7 +10260,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
// Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
// to try understanding copies to physical registers.
if (SrcVal.getValueType() == MVT::i1 &&
- TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
+ Register::isPhysicalRegister(DestReg->getReg())) {
SDLoc SL(Node);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
SDValue VReg = DAG.getRegister(
@@ -10218,7 +10413,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MachineOperand &Op = MI.getOperand(I);
if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
- !TargetRegisterInfo::isVirtualRegister(Op.getReg()) ||
+ !Register::isVirtualRegister(Op.getReg()) ||
!TRI->isAGPR(MRI, Op.getReg()))
continue;
auto *Src = MRI.getUniqueVRegDef(Op.getReg());
@@ -10256,7 +10451,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
Node->use_begin()->isMachineOpcode() &&
Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
!Node->use_begin()->hasAnyUseOfValue(0))) {
- unsigned Def = MI.getOperand(0).getReg();
+ Register Def = MI.getOperand(0).getReg();
// Change this into a noret atomic.
MI.setDesc(TII->get(NoRetAtomicOp));
@@ -10300,7 +10495,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
// Combine the constants and the pointer.
const SDValue Ops1[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
Ptr,
DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
SubRegHi,
@@ -10330,7 +10525,7 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
const SDValue Ops[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
PtrLo,
DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
PtrHi,
@@ -10364,7 +10559,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, nullptr);
case 32:
case 16:
- RC = &AMDGPU::SReg_32_XM0RegClass;
+ RC = &AMDGPU::SReg_32RegClass;
break;
case 64:
RC = &AMDGPU::SGPR_64RegClass;
@@ -10373,7 +10568,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
RC = &AMDGPU::SReg_96RegClass;
break;
case 128:
- RC = &AMDGPU::SReg_128RegClass;
+ RC = &AMDGPU::SGPR_128RegClass;
break;
case 160:
RC = &AMDGPU::SReg_160RegClass;
@@ -10415,6 +10610,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
}
break;
case 'a':
+ if (!Subtarget->hasMAIInsts())
+ break;
switch (VT.getSizeInBits()) {
default:
return std::make_pair(0U, nullptr);
@@ -10548,9 +10745,9 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
-unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
- const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
- const unsigned CacheLineAlign = 6; // log2(64)
+Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+ const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
+ const Align CacheLineAlign = Align(64);
// Pre-GFX10 target did not benefit from loop alignment
if (!ML || DisableLoopAlignment ||
@@ -10578,7 +10775,7 @@ unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
// If inner loop block is aligned assume in average half of the alignment
// size to be added as nops.
if (MBB != Header)
- LoopSize += (1 << MBB->getAlignment()) / 2;
+ LoopSize += MBB->getAlignment().value() / 2;
for (const MachineInstr &MI : *MBB) {
LoopSize += TII->getInstSizeInBytes(MI);
@@ -10644,7 +10841,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
const MachineRegisterInfo &MRI = MF->getRegInfo();
const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
unsigned Reg = R->getReg();
- if (TRI.isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return !TRI.isSGPRReg(MRI, Reg);
if (MRI.isLiveIn(Reg)) {
@@ -10683,12 +10880,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
case ISD::INTRINSIC_W_CHAIN:
return AMDGPU::isIntrinsicSourceOfDivergence(
cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
- // In some cases intrinsics that are a source of divergence have been
- // lowered to AMDGPUISD so we also need to check those too.
- case AMDGPUISD::INTERP_MOV:
- case AMDGPUISD::INTERP_P1:
- case AMDGPUISD::INTERP_P2:
- return true;
}
return false;
}
@@ -10748,3 +10939,110 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
}
+
+const TargetRegisterClass *
+SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+ const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
+ return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
+ : &AMDGPU::SReg_32RegClass;
+ if (!TRI->isSGPRClass(RC) && !isDivergent)
+ return TRI->getEquivalentSGPRClass(RC);
+ else if (TRI->isSGPRClass(RC) && isDivergent)
+ return TRI->getEquivalentVGPRClass(RC);
+
+ return RC;
+}
+
+static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+ if (!Visited.insert(V).second)
+ return false;
+ bool Result = false;
+ for (auto U : V->users()) {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
+ if (V == U->getOperand(1)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ Result = false;
+ break;
+ case Intrinsic::amdgcn_if_break:
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else:
+ Result = true;
+ break;
+ }
+ }
+ if (V == U->getOperand(0)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ Result = false;
+ break;
+ case Intrinsic::amdgcn_end_cf:
+ case Intrinsic::amdgcn_loop:
+ Result = true;
+ break;
+ }
+ }
+ } else {
+ Result = hasCFUser(U, Visited);
+ }
+ if (Result)
+ break;
+ }
+ return Result;
+}
+
+bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
+ const Value *V) const {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_if_break:
+ return true;
+ }
+ }
+ if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+ if (const IntrinsicInst *Intrinsic =
+ dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else: {
+ ArrayRef<unsigned> Indices = ExtValue->getIndices();
+ if (Indices.size() == 1 && Indices[0] == 1) {
+ return true;
+ }
+ }
+ }
+ }
+ }
+ if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+ if (isa<InlineAsm>(CI->getCalledValue())) {
+ const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+ ImmutableCallSite CS(CI);
+ TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
+ MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+ for (auto &TC : TargetConstraints) {
+ if (TC.Type == InlineAsm::isOutput) {
+ ComputeConstraintToUse(TC, SDValue());
+ unsigned AssignedReg;
+ const TargetRegisterClass *RC;
+ std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
+ SIRI, TC.ConstraintCode, TC.ConstraintVT);
+ if (RC) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
+ return true;
+ else if (SIRI->isSGPRClass(RC))
+ return true;
+ }
+ }
+ }
+ }
+ }
+ SmallPtrSet<const Value *, 16> Visited;
+ return hasCFUser(V, Visited);
+}
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 21a215e16ce7..f0102feb65c4 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -94,6 +94,9 @@ private:
SelectionDAG &DAG, ArrayRef<SDValue> Ops,
bool IsIntrinsic = false) const;
+ SDValue lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, SelectionDAG &DAG,
+ ArrayRef<SDValue> Ops) const;
+
// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
// dwordx4 if on SI.
SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
@@ -183,6 +186,7 @@ private:
unsigned isCFIntrinsic(const SDNode *Intr) const;
+public:
/// \returns True if fixup needs to be emitted for given global value \p GV,
/// false otherwise.
bool shouldEmitFixup(const GlobalValue *GV) const;
@@ -195,11 +199,14 @@ private:
/// global value \p GV, false otherwise.
bool shouldEmitPCReloc(const GlobalValue *GV) const;
+private:
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
- void setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
- SDValue *Offsets, unsigned Align = 4) const;
+ /// \returns 0 if there is a non-constant offset or if the offset is 0.
+ /// Otherwise returns the constant offset.
+ unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
+ SDValue *Offsets, unsigned Align = 4) const;
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
@@ -235,6 +242,11 @@ public:
bool canMergeStoresTo(unsigned AS, EVT MemVT,
const SelectionDAG &DAG) const override;
+ bool allowsMisalignedMemoryAccessesImpl(
+ unsigned Size, unsigned AS, unsigned Align,
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+ bool *IsFast = nullptr) const;
+
bool allowsMisalignedMemoryAccesses(
EVT VT, unsigned AS, unsigned Align,
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
@@ -309,12 +321,13 @@ public:
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
- unsigned getRegisterByName(const char* RegName, EVT VT,
- SelectionDAG &DAG) const override;
+ Register getRegisterByName(const char* RegName, EVT VT,
+ const MachineFunction &MF) const override;
MachineBasicBlock *splitKillBlock(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ void bundleInstWithWaitcnt(MachineInstr &MI) const;
MachineBasicBlock *emitGWSMemViolTestLoop(MachineInstr &MI,
MachineBasicBlock *BB) const;
@@ -330,6 +343,7 @@ public:
bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+ SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -374,7 +388,37 @@ public:
unsigned Depth = 0) const override;
AtomicExpansionKind shouldExpandAtomicRMWInIR(AtomicRMWInst *) const override;
- unsigned getPrefLoopAlignment(MachineLoop *ML) const override;
+ virtual const TargetRegisterClass *
+ getRegClassFor(MVT VT, bool isDivergent) const override;
+ virtual bool requiresUniformRegister(MachineFunction &MF,
+ const Value *V) const override;
+ Align getPrefLoopAlignment(MachineLoop *ML) const override;
+
+ void allocateHSAUserSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+
+ void allocateSystemSGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ SIMachineFunctionInfo &Info,
+ CallingConv::ID CallConv,
+ bool IsShader) const;
+
+ void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+ void allocateSpecialInputSGPRs(
+ CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+
+ void allocateSpecialInputVGPRs(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
};
} // End namespace llvm
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index c89d5b71ec5c..dcb04e426584 100644
--- a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -1483,12 +1483,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (BI.Incoming) {
if (!Brackets)
- Brackets = llvm::make_unique<WaitcntBrackets>(*BI.Incoming);
+ Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
else
*Brackets = *BI.Incoming;
} else {
if (!Brackets)
- Brackets = llvm::make_unique<WaitcntBrackets>(ST);
+ Brackets = std::make_unique<WaitcntBrackets>(ST);
else
Brackets->clear();
}
@@ -1508,7 +1508,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (!MoveBracketsToSucc) {
MoveBracketsToSucc = &SuccBI;
} else {
- SuccBI.Incoming = llvm::make_unique<WaitcntBrackets>(*Brackets);
+ SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
}
} else if (SuccBI.Incoming->merge(*Brackets)) {
SuccBI.Dirty = true;
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 561a16c3e351..4dcbe92861f2 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -124,6 +124,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// This bit indicates that this is one of MFMA instructions.
field bit IsMAI = 0;
+ // This bit indicates that this is one of DOT instructions.
+ field bit IsDOT = 0;
+
// These need to be kept in sync with the enum in SIInstrFlags.
let TSFlags{0} = SALU;
let TSFlags{1} = VALU;
@@ -189,6 +192,8 @@ class InstSI <dag outs, dag ins, string asm = "",
let TSFlags{54} = IsMAI;
+ let TSFlags{55} = IsDOT;
+
let SchedRW = [Write32Bit];
field bits<1> DisableSIDecoder = 0;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index ba8ed6993a56..d97e6a62971b 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -318,8 +318,25 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
const MachineOperand *SOffset = getNamedOperand(LdSt, AMDGPU::OpName::soffset);
- if (SOffset && SOffset->isReg())
- return false;
+ if (SOffset && SOffset->isReg()) {
+ // We can only handle this if it's a stack access, as any other resource
+ // would require reporting multiple base registers.
+ const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (AddrReg && !AddrReg->isFI())
+ return false;
+
+ const MachineOperand *RSrc = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
+ const SIMachineFunctionInfo *MFI
+ = LdSt.getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
+ if (RSrc->getReg() != MFI->getScratchRSrcReg())
+ return false;
+
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ BaseOp = SOffset;
+ Offset = OffsetImm->getImm();
+ return true;
+ }
const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
if (!AddrReg)
@@ -458,9 +475,9 @@ bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
const MachineRegisterInfo &MRI =
FirstLdSt.getParent()->getParent()->getRegInfo();
- const unsigned Reg = FirstDst->getReg();
+ const Register Reg = FirstDst->getReg();
- const TargetRegisterClass *DstRC = TargetRegisterInfo::isVirtualRegister(Reg)
+ const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
? MRI.getRegClass(Reg)
: RI.getPhysRegClass(Reg);
@@ -807,7 +824,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
"Not a VGPR32 reg");
if (Cond.size() == 1) {
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(Cond[0]);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
@@ -820,7 +837,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
assert(Cond[0].isImm() && "Cond[0] is not an immediate");
switch (Cond[0].getImm()) {
case SIInstrInfo::SCC_TRUE: {
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64), SReg)
.addImm(-1)
@@ -834,7 +851,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
break;
}
case SIInstrInfo::SCC_FALSE: {
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_CSELECT_B32
: AMDGPU::S_CSELECT_B64), SReg)
.addImm(0)
@@ -850,7 +867,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::VCCNZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
@@ -864,7 +881,7 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
case SIInstrInfo::VCCZ: {
MachineOperand RegOp = Cond[1];
RegOp.setImplicit(false);
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
BuildMI(MBB, I, DL, get(AMDGPU::COPY), SReg)
.add(RegOp);
BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
@@ -876,8 +893,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
break;
}
case SIInstrInfo::EXECNZ: {
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
- unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
@@ -894,8 +911,8 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
break;
}
case SIInstrInfo::EXECZ: {
- unsigned SReg = MRI.createVirtualRegister(BoolXExecRC);
- unsigned SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
+ Register SReg = MRI.createVirtualRegister(BoolXExecRC);
+ Register SReg2 = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(MBB, I, DL, get(ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32
: AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
.addImm(0);
@@ -925,7 +942,7 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
const DebugLoc &DL,
unsigned SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
.addImm(Value)
.addReg(SrcReg);
@@ -938,7 +955,7 @@ unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
const DebugLoc &DL,
unsigned SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- unsigned Reg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
.addImm(Value)
.addReg(SrcReg);
@@ -1052,12 +1069,12 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// The SGPR spill/restore instructions only work on numbered SGPRs, so we need
// to make sure we are using the correct register class.
- if (TargetRegisterInfo::isVirtualRegister(SrcReg) && SpillSize == 4) {
+ if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
}
- MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc)
+ BuildMI(MBB, MI, DL, OpDesc)
.addReg(SrcReg, getKillRegState(isKill)) // data
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
@@ -1068,11 +1085,6 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// correctly handled.
if (RI.spillSGPRToVGPR())
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
- if (ST.hasScalarStores()) {
- // m0 is used for offset to scalar stores if used to spill.
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
- }
-
return;
}
@@ -1083,7 +1095,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
if (RI.hasAGPRs(RC)) {
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MIB.addReg(Tmp, RegState::Define);
}
MIB.addReg(SrcReg, getKillRegState(isKill)) // data
@@ -1182,24 +1194,18 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
- if (TargetRegisterInfo::isVirtualRegister(DestReg) && SpillSize == 4) {
+ if (Register::isVirtualRegister(DestReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
}
if (RI.spillSGPRToVGPR())
FrameInfo.setStackID(FrameIndex, TargetStackID::SGPRSpill);
- MachineInstrBuilder Spill = BuildMI(MBB, MI, DL, OpDesc, DestReg)
+ BuildMI(MBB, MI, DL, OpDesc, DestReg)
.addFrameIndex(FrameIndex) // addr
.addMemOperand(MMO)
.addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
.addReg(MFI->getStackPtrOffsetReg(), RegState::Implicit);
-
- if (ST.hasScalarStores()) {
- // m0 is used for offset to scalar stores if used to spill.
- Spill.addReg(AMDGPU::M0, RegState::ImplicitDefine | RegState::Dead);
- }
-
return;
}
@@ -1208,7 +1214,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
if (RI.hasAGPRs(RC)) {
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MIB.addReg(Tmp, RegState::Define);
}
MIB.addFrameIndex(FrameIndex) // vaddr
@@ -1242,13 +1248,13 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
if (!AMDGPU::isShader(MF->getFunction().getCallingConv()) &&
WorkGroupSize > WavefrontSize) {
- unsigned TIDIGXReg
- = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
- unsigned TIDIGYReg
- = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
- unsigned TIDIGZReg
- = MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
- unsigned InputPtrReg =
+ Register TIDIGXReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
+ Register TIDIGYReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
+ Register TIDIGZReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
+ Register InputPtrReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
for (unsigned Reg : {TIDIGXReg, TIDIGYReg, TIDIGZReg}) {
if (!Entry.isLiveIn(Reg))
@@ -1410,9 +1416,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
break;
case AMDGPU::V_MOV_B64_PSEUDO: {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
- unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
+ Register Dst = MI.getOperand(0).getReg();
+ Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
+ Register DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
const MachineOperand &SrcOp = MI.getOperand(1);
// FIXME: Will this work for 64-bit floating point immediates?
@@ -1437,6 +1443,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
+ case AMDGPU::V_MOV_B64_DPP_PSEUDO: {
+ expandMovDPP64(MI);
+ break;
+ }
case AMDGPU::V_SET_INACTIVE_B32: {
unsigned NotOpc = ST.isWave32() ? AMDGPU::S_NOT_B32 : AMDGPU::S_NOT_B64;
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
@@ -1469,7 +1479,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case AMDGPU::V_MOVRELD_B32_V8:
case AMDGPU::V_MOVRELD_B32_V16: {
const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
- unsigned VecReg = MI.getOperand(0).getReg();
+ Register VecReg = MI.getOperand(0).getReg();
bool IsUndef = MI.getOperand(1).isUndef();
unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
assert(VecReg == MI.getOperand(1).getReg());
@@ -1492,9 +1502,9 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case AMDGPU::SI_PC_ADD_REL_OFFSET: {
MachineFunction &MF = *MBB.getParent();
- unsigned Reg = MI.getOperand(0).getReg();
- unsigned RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
- unsigned RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
+ Register Reg = MI.getOperand(0).getReg();
+ Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0);
+ Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1);
// Create a bundle so these instructions won't be re-ordered by the
// post-RA scheduler.
@@ -1531,7 +1541,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
break;
}
case TargetOpcode::BUNDLE: {
- if (!MI.mayLoad())
+ if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
return false;
// If it is a load it must be a memory clause
@@ -1550,6 +1560,64 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
+std::pair<MachineInstr*, MachineInstr*>
+SIInstrInfo::expandMovDPP64(MachineInstr &MI) const {
+ assert(MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO);
+
+ MachineBasicBlock &MBB = *MI.getParent();
+ DebugLoc DL = MBB.findDebugLoc(MI);
+ MachineFunction *MF = MBB.getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned Part = 0;
+ MachineInstr *Split[2];
+
+ for (auto Sub : { AMDGPU::sub0, AMDGPU::sub1 }) {
+ auto MovDPP = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_dpp));
+ if (Dst.isPhysical()) {
+ MovDPP.addDef(RI.getSubReg(Dst, Sub));
+ } else {
+ assert(MRI.isSSA());
+ auto Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ MovDPP.addDef(Tmp);
+ }
+
+ for (unsigned I = 1; I <= 2; ++I) { // old and src operands.
+ const MachineOperand &SrcOp = MI.getOperand(I);
+ assert(!SrcOp.isFPImm());
+ if (SrcOp.isImm()) {
+ APInt Imm(64, SrcOp.getImm());
+ Imm.ashrInPlace(Part * 32);
+ MovDPP.addImm(Imm.getLoBits(32).getZExtValue());
+ } else {
+ assert(SrcOp.isReg());
+ Register Src = SrcOp.getReg();
+ if (Src.isPhysical())
+ MovDPP.addReg(RI.getSubReg(Src, Sub));
+ else
+ MovDPP.addReg(Src, SrcOp.isUndef() ? RegState::Undef : 0, Sub);
+ }
+ }
+
+ for (unsigned I = 3; I < MI.getNumExplicitOperands(); ++I)
+ MovDPP.addImm(MI.getOperand(I).getImm());
+
+ Split[Part] = MovDPP;
+ ++Part;
+ }
+
+ if (Dst.isVirtual())
+ BuildMI(MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(Split[0]->getOperand(0).getReg())
+ .addImm(AMDGPU::sub0)
+ .addReg(Split[1]->getOperand(0).getReg())
+ .addImm(AMDGPU::sub1);
+
+ MI.eraseFromParent();
+ return std::make_pair(Split[0], Split[1]);
+}
+
bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0,
unsigned Src0OpName,
@@ -1574,7 +1642,7 @@ bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
MachineOperand &RegOp,
MachineOperand &NonRegOp) {
- unsigned Reg = RegOp.getReg();
+ Register Reg = RegOp.getReg();
unsigned SubReg = RegOp.getSubReg();
bool IsKill = RegOp.isKill();
bool IsDead = RegOp.isDead();
@@ -1646,7 +1714,8 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
// This needs to be implemented because the source modifiers may be inserted
// between the true commutable operands, and the base
// TargetInstrInfo::commuteInstruction uses it.
-bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
+bool SIInstrInfo::findCommutedOpIndices(const MachineInstr &MI,
+ unsigned &SrcOpIdx0,
unsigned &SrcOpIdx1) const {
return findCommutedOpIndices(MI.getDesc(), SrcOpIdx0, SrcOpIdx1);
}
@@ -1710,7 +1779,7 @@ unsigned SIInstrInfo::insertIndirectBranch(MachineBasicBlock &MBB,
// FIXME: Virtual register workaround for RegScavenger not working with empty
// blocks.
- unsigned PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register PCReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
auto I = MBB.end();
@@ -2163,7 +2232,7 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
SmallVector<unsigned, 8> Regs;
for (int Idx = 0; Idx != NElts; ++Idx) {
- unsigned DstElt = MRI.createVirtualRegister(EltRC);
+ Register DstElt = MRI.createVirtualRegister(EltRC);
Regs.push_back(DstElt);
unsigned SubIdx = SubIndices[Idx];
@@ -2327,7 +2396,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
UseMI.RemoveOperand(
AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
- unsigned Src1Reg = Src1->getReg();
+ Register Src1Reg = Src1->getReg();
unsigned Src1SubReg = Src1->getSubReg();
Src0->setReg(Src1Reg);
Src0->setSubReg(Src1SubReg);
@@ -2367,12 +2436,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MRI->hasOneUse(Src0->getReg())) {
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
Src0Inlined = true;
- } else if ((RI.isPhysicalRegister(Src0->getReg()) &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
- (RI.isVirtualRegister(Src0->getReg()) &&
- (ST.getConstantBusLimit(Opc) <= 1 &&
- RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
+ } else if ((Register::isPhysicalRegister(Src0->getReg()) &&
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(RI.getPhysRegClass(Src0->getReg())))) ||
+ (Register::isVirtualRegister(Src0->getReg()) &&
+ (ST.getConstantBusLimit(Opc) <= 1 &&
+ RI.isSGPRClass(MRI->getRegClass(Src0->getReg())))))
return false;
// VGPR is okay as Src0 - fallthrough
}
@@ -2385,10 +2454,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
MRI->hasOneUse(Src1->getReg()) &&
commuteInstruction(UseMI)) {
Src0->ChangeToImmediate(Def->getOperand(1).getImm());
- } else if ((RI.isPhysicalRegister(Src1->getReg()) &&
- RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
- (RI.isVirtualRegister(Src1->getReg()) &&
- RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+ } else if ((Register::isPhysicalRegister(Src1->getReg()) &&
+ RI.isSGPRClass(RI.getPhysRegClass(Src1->getReg()))) ||
+ (Register::isVirtualRegister(Src1->getReg()) &&
+ RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
return false;
// VGPR is okay as Src1 - fallthrough
}
@@ -2472,8 +2541,7 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
}
bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA) const {
+ const MachineInstr &MIb) const {
assert((MIa.mayLoad() || MIa.mayStore()) &&
"MIa must load from or modify a memory location");
assert((MIb.mayLoad() || MIb.mayStore()) &&
@@ -2664,6 +2732,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
+ MI.getOpcode() == AMDGPU::S_DENORM_MODE ||
changesVGPRIndexingMode(MI);
}
@@ -2865,8 +2934,16 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
if (OpInfo.RegClass < 0)
return false;
- if (MO.isImm() && isInlineConstant(MO, OpInfo))
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+
+ if (MO.isImm() && isInlineConstant(MO, OpInfo)) {
+ if (isMAI(MI) && ST.hasMFMAInlineLiteralBug() &&
+ OpNo == (unsigned)AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+ AMDGPU::OpName::src2))
+ return false;
return RI.opCanUseInlineConstant(OpInfo.OperandType);
+ }
if (!RI.opCanUseLiteralConstant(OpInfo.OperandType))
return false;
@@ -2874,8 +2951,6 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
if (!isVOP3(MI) || !AMDGPU::isSISrcOperand(InstDesc, OpNo))
return true;
- const MachineFunction *MF = MI.getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
return ST.hasVOP3Literal();
}
@@ -3036,7 +3111,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
if (!MO.isUse())
return false;
- if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ if (Register::isVirtualRegister(MO.getReg()))
return RI.isSGPRClass(MRI.getRegClass(MO.getReg()));
// Null is free
@@ -3093,7 +3168,8 @@ static bool shouldReadExec(const MachineInstr &MI) {
return true;
}
- if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
+ if (MI.isPreISelOpcode() ||
+ SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
SIInstrInfo::isSALU(MI) ||
SIInstrInfo::isSMRD(MI))
return false;
@@ -3104,7 +3180,7 @@ static bool shouldReadExec(const MachineInstr &MI) {
static bool isSubRegOf(const SIRegisterInfo &TRI,
const MachineOperand &SuperVec,
const MachineOperand &SubReg) {
- if (TargetRegisterInfo::isPhysicalRegister(SubReg.getReg()))
+ if (Register::isPhysicalRegister(SubReg.getReg()))
return TRI.isSubRegister(SuperVec.getReg(), SubReg.getReg());
return SubReg.getSubReg() != AMDGPU::NoSubRegister &&
@@ -3144,8 +3220,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (!Op.isReg())
continue;
- unsigned Reg = Op.getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg) && !RC->contains(Reg)) {
+ Register Reg = Op.getReg();
+ if (!Register::isVirtualRegister(Reg) && !RC->contains(Reg)) {
ErrInfo = "inlineasm operand has incorrect register class.";
return false;
}
@@ -3209,9 +3285,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
continue;
if (RegClass != -1) {
- unsigned Reg = MI.getOperand(i).getReg();
- if (Reg == AMDGPU::NoRegister ||
- TargetRegisterInfo::isVirtualRegister(Reg))
+ Register Reg = MI.getOperand(i).getReg();
+ if (Reg == AMDGPU::NoRegister || Register::isVirtualRegister(Reg))
continue;
const TargetRegisterClass *RC = RI.getRegClass(RegClass);
@@ -3304,7 +3379,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
ErrInfo =
"Dst register should be tied to implicit use of preserved register";
return false;
- } else if (TargetRegisterInfo::isPhysicalRegister(TiedMO.getReg()) &&
+ } else if (Register::isPhysicalRegister(TiedMO.getReg()) &&
Dst.getReg() != TiedMO.getReg()) {
ErrInfo = "Dst register should use same physical register as preserved";
return false;
@@ -3409,6 +3484,32 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ // Special case for writelane - this can break the multiple constant bus rule,
+ // but still can't use more than one SGPR register
+ if (Desc.getOpcode() == AMDGPU::V_WRITELANE_B32) {
+ unsigned SGPRCount = 0;
+ Register SGPRUsed = AMDGPU::NoRegister;
+
+ for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
+ if (OpIdx == -1)
+ break;
+
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+
+ if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
+ if (MO.isReg() && MO.getReg() != AMDGPU::M0) {
+ if (MO.getReg() != SGPRUsed)
+ ++SGPRCount;
+ SGPRUsed = MO.getReg();
+ }
+ }
+ if (SGPRCount > ST.getConstantBusLimit(Opcode)) {
+ ErrInfo = "WRITELANE instruction violates constant bus restriction";
+ return false;
+ }
+ }
+ }
+
// Verify misc. restrictions on specific instructions.
if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
@@ -3609,7 +3710,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (DC >= DppCtrl::BCAST15 && DC <= DppCtrl::BCAST31 &&
ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
ErrInfo = "Invalid dpp_ctrl value: "
- "broadcats are not supported on GFX10+";
+ "broadcasts are not supported on GFX10+";
return false;
}
if (DC >= DppCtrl::ROW_SHARE_FIRST && DC <= DppCtrl::ROW_XMASK_LAST &&
@@ -3631,6 +3732,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
case AMDGPU::PHI: return AMDGPU::PHI;
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;
+ case AMDGPU::SOFT_WQM: return AMDGPU::SOFT_WQM;
case AMDGPU::WWM: return AMDGPU::WWM;
case AMDGPU::S_MOV_B32: {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
@@ -3708,9 +3810,9 @@ const TargetRegisterClass *SIInstrInfo::getOpRegClass(const MachineInstr &MI,
const MCInstrDesc &Desc = get(MI.getOpcode());
if (MI.isVariadic() || OpNo >= Desc.getNumOperands() ||
Desc.OpInfo[OpNo].RegClass == -1) {
- unsigned Reg = MI.getOperand(OpNo).getReg();
+ Register Reg = MI.getOperand(OpNo).getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return MRI.getRegClass(Reg);
return RI.getPhysRegClass(Reg);
}
@@ -3741,7 +3843,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
else
VRC = &AMDGPU::VGPR_32RegClass;
- unsigned Reg = MRI.createVirtualRegister(VRC);
+ Register Reg = MRI.createVirtualRegister(VRC);
DebugLoc DL = MBB->findDebugLoc(I);
BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
MO.ChangeToRegister(Reg, false);
@@ -3756,7 +3858,7 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
const {
MachineBasicBlock *MBB = MI->getParent();
DebugLoc DL = MI->getDebugLoc();
- unsigned SubReg = MRI.createVirtualRegister(SubRC);
+ Register SubReg = MRI.createVirtualRegister(SubRC);
if (SuperReg.getSubReg() == AMDGPU::NoSubRegister) {
BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), SubReg)
@@ -3768,7 +3870,7 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI,
// value so we don't need to worry about merging its subreg index with the
// SubIdx passed to this function. The register coalescer should be able to
// eliminate this extra copy.
- unsigned NewSuperReg = MRI.createVirtualRegister(SuperRC);
+ Register NewSuperReg = MRI.createVirtualRegister(SuperRC);
BuildMI(*MBB, MI, DL, get(TargetOpcode::COPY), NewSuperReg)
.addReg(SuperReg.getReg(), 0, SuperReg.getSubReg());
@@ -3814,11 +3916,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
if (!MO.isReg())
return false;
- unsigned Reg = MO.getReg();
- const TargetRegisterClass *RC =
- TargetRegisterInfo::isVirtualRegister(Reg) ?
- MRI.getRegClass(Reg) :
- RI.getPhysRegClass(Reg);
+ Register Reg = MO.getReg();
+ const TargetRegisterClass *RC = Register::isVirtualRegister(Reg)
+ ? MRI.getRegClass(Reg)
+ : RI.getPhysRegClass(Reg);
const SIRegisterInfo *TRI =
static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
@@ -3935,13 +4036,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
if (Opc == AMDGPU::V_WRITELANE_B32) {
const DebugLoc &DL = MI.getDebugLoc();
if (Src0.isReg() && RI.isVGPR(MRI, Src0.getReg())) {
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src0);
Src0.ChangeToRegister(Reg, false);
}
if (Src1.isReg() && RI.isVGPR(MRI, Src1.getReg())) {
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src1);
@@ -3967,7 +4068,7 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
// select is uniform.
if (Opc == AMDGPU::V_READLANE_B32 && Src1.isReg() &&
RI.isVGPR(MRI, Src1.getReg())) {
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
const DebugLoc &DL = MI.getDebugLoc();
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src1);
@@ -4003,7 +4104,7 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
MI.setDesc(get(CommutedOpc));
- unsigned Src0Reg = Src0.getReg();
+ Register Src0Reg = Src0.getReg();
unsigned Src0SubReg = Src0.getSubReg();
bool Src0Kill = Src0.isKill();
@@ -4039,13 +4140,13 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
MachineOperand &Src2 = MI.getOperand(VOP3Idx[2]);
const DebugLoc &DL = MI.getDebugLoc();
if (Src1.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src1.getReg()))) {
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src1);
Src1.ChangeToRegister(Reg, false);
}
if (Src2.isReg() && !RI.isSGPRClass(MRI.getRegClass(Src2.getReg()))) {
- unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*MI.getParent(), MI, DL, get(AMDGPU::V_READFIRSTLANE_B32), Reg)
.add(Src2);
Src2.ChangeToRegister(Reg, false);
@@ -4113,12 +4214,12 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
MachineRegisterInfo &MRI) const {
const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
- unsigned DstReg = MRI.createVirtualRegister(SRC);
+ Register DstReg = MRI.createVirtualRegister(SRC);
unsigned SubRegs = RI.getRegSizeInBits(*VRC) / 32;
if (RI.hasAGPRs(VRC)) {
VRC = RI.getEquivalentVGPRClass(VRC);
- unsigned NewSrcReg = MRI.createVirtualRegister(VRC);
+ Register NewSrcReg = MRI.createVirtualRegister(VRC);
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
get(TargetOpcode::COPY), NewSrcReg)
.addReg(SrcReg);
@@ -4134,7 +4235,7 @@ unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
SmallVector<unsigned, 8> SRegs;
for (unsigned i = 0; i < SubRegs; ++i) {
- unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
.addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
@@ -4176,7 +4277,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
MachineOperand &Op,
MachineRegisterInfo &MRI,
const DebugLoc &DL) const {
- unsigned OpReg = Op.getReg();
+ Register OpReg = Op.getReg();
unsigned OpSubReg = Op.getSubReg();
const TargetRegisterClass *OpRC = RI.getSubClassWithSubReg(
@@ -4186,7 +4287,7 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
if (DstRC == OpRC)
return;
- unsigned DstReg = MRI.createVirtualRegister(DstRC);
+ Register DstReg = MRI.createVirtualRegister(DstRC);
MachineInstr *Copy =
BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
@@ -4198,8 +4299,19 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
return;
// Try to eliminate the copy if it is copying an immediate value.
- if (Def->isMoveImmediate())
+ if (Def->isMoveImmediate() && DstRC != &AMDGPU::VReg_1RegClass)
FoldImmediate(*Copy, *Def, OpReg, &MRI);
+
+ bool ImpDef = Def->isImplicitDef();
+ while (!ImpDef && Def && Def->isCopy()) {
+ if (Def->getOperand(1).getReg().isPhysical())
+ break;
+ Def = MRI.getUniqueVRegDef(Def->getOperand(1).getReg());
+ ImpDef = Def && Def->isImplicitDef();
+ }
+ if (!RI.isSGPRClass(DstRC) && !Copy->readsRegister(AMDGPU::EXEC, &RI) &&
+ !ImpDef)
+ Copy->addOperand(MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
}
// Emit the actual waterfall loop, executing the wrapped instruction for each
@@ -4223,18 +4335,18 @@ emitLoadSRsrcFromVGPRLoop(const SIInstrInfo &TII, MachineRegisterInfo &MRI,
MachineBasicBlock::iterator I = LoopBB.begin();
- unsigned VRsrc = Rsrc.getReg();
+ Register VRsrc = Rsrc.getReg();
unsigned VRsrcUndef = getUndefRegState(Rsrc.isUndef());
- unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
- unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
- unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
- unsigned AndCond = MRI.createVirtualRegister(BoolXExecRC);
- unsigned SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+ Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+ Register AndCond = MRI.createVirtualRegister(BoolXExecRC);
+ Register SRsrcSub0 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SRsrcSub1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SRsrcSub2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SRsrcSub3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
// Beginning of the loop, read the next Rsrc variant.
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), SRsrcSub0)
@@ -4302,7 +4414,7 @@ static void loadSRsrcFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+ Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
// Save the EXEC mask
BuildMI(MBB, I, DL, TII.get(MovExecOpc), SaveExec).addReg(Exec);
@@ -4370,10 +4482,10 @@ extractRsrcPtr(const SIInstrInfo &TII, MachineInstr &MI, MachineOperand &Rsrc) {
AMDGPU::sub0_sub1, &AMDGPU::VReg_64RegClass);
// Create an empty resource descriptor
- unsigned Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
- unsigned SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
- unsigned NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+ Register Zero64 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register SRsrcFormatLo = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register SRsrcFormatHi = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register NewSRsrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
uint64_t RsrcDataFormat = TII.getDefaultRsrcDataFormat();
// Zero64 = 0
@@ -4430,7 +4542,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
if (!MI.getOperand(i).isReg() ||
- !TargetRegisterInfo::isVirtualRegister(MI.getOperand(i).getReg()))
+ !Register::isVirtualRegister(MI.getOperand(i).getReg()))
continue;
const TargetRegisterClass *OpRC =
MRI.getRegClass(MI.getOperand(i).getReg());
@@ -4447,8 +4559,16 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
if (VRC || !RI.isSGPRClass(getOpRegClass(MI, 0))) {
if (!VRC) {
assert(SRC);
- VRC = RI.hasAGPRs(getOpRegClass(MI, 0)) ? RI.getEquivalentAGPRClass(SRC)
- : RI.getEquivalentVGPRClass(SRC);
+ if (getOpRegClass(MI, 0) == &AMDGPU::VReg_1RegClass) {
+ VRC = &AMDGPU::VReg_1RegClass;
+ } else
+ VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
+ ? RI.getEquivalentAGPRClass(SRC)
+ : RI.getEquivalentVGPRClass(SRC);
+ } else {
+ VRC = RI.hasAGPRs(getOpRegClass(MI, 0))
+ ? RI.getEquivalentAGPRClass(VRC)
+ : RI.getEquivalentVGPRClass(VRC);
}
RC = VRC;
} else {
@@ -4458,7 +4578,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Update all the operands so they have the same type.
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
MachineOperand &Op = MI.getOperand(I);
- if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+ if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
continue;
// MI is a PHI instruction.
@@ -4483,7 +4603,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// subregister index types e.g. sub0_sub1 + sub2 + sub3
for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) {
MachineOperand &Op = MI.getOperand(I);
- if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+ if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg()))
continue;
const TargetRegisterClass *OpRC = MRI.getRegClass(Op.getReg());
@@ -4502,8 +4622,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
// Legalize INSERT_SUBREG
// src0 must have the same register class as dst
if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) {
- unsigned Dst = MI.getOperand(0).getReg();
- unsigned Src0 = MI.getOperand(1).getReg();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
if (DstRC != Src0RC) {
@@ -4577,13 +4697,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
if (VAddr && AMDGPU::getIfAddr64Inst(MI.getOpcode()) != -1) {
// This is already an ADDR64 instruction so we need to add the pointer
// extracted from the resource descriptor to the current value of VAddr.
- unsigned NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register NewVAddrLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
const auto *BoolXExecRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
- unsigned CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
+ Register CondReg0 = MRI.createVirtualRegister(BoolXExecRC);
+ Register CondReg1 = MRI.createVirtualRegister(BoolXExecRC);
unsigned RsrcPtr, NewSRsrc;
std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
@@ -4623,7 +4743,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
unsigned RsrcPtr, NewSRsrc;
std::tie(RsrcPtr, NewSRsrc) = extractRsrcPtr(*this, MI, *Rsrc);
- unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata);
MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset);
MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset);
@@ -4661,6 +4781,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI,
MIB.addImm(TFE->getImm());
}
+ MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz));
+
MIB.cloneMemRefs(MI);
Addr64 = MIB;
} else {
@@ -4933,8 +5055,8 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
unsigned NewDstReg = AMDGPU::NoRegister;
if (HasDst) {
- unsigned DstReg = Inst.getOperand(0).getReg();
- if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ Register DstReg = Inst.getOperand(0).getReg();
+ if (Register::isPhysicalRegister(DstReg))
continue;
// Update the destination register class.
@@ -4943,7 +5065,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
continue;
if (Inst.isCopy() &&
- TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
+ Register::isVirtualRegister(Inst.getOperand(1).getReg()) &&
NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
// Instead of creating a copy where src and dst are the same register
// class, we just replace all uses of dst with src. These kinds of
@@ -4988,8 +5110,8 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
MachineBasicBlock &MBB = *Inst.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- unsigned OldDstReg = Inst.getOperand(0).getReg();
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register OldDstReg = Inst.getOperand(0).getReg();
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned Opc = Inst.getOpcode();
assert(Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32);
@@ -5022,8 +5144,8 @@ void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src = Inst.getOperand(1);
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
unsigned SubOp = ST.hasAddNoCarry() ?
AMDGPU::V_SUB_U32_e32 : AMDGPU::V_SUB_I32_e32;
@@ -5052,7 +5174,7 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
MachineOperand &Src1 = Inst.getOperand(2);
if (ST.hasDLInsts()) {
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register NewDest = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src0, MRI, DL);
legalizeGenericOperand(MBB, MII, &AMDGPU::VGPR_32RegClass, Src1, MRI, DL);
@@ -5072,8 +5194,8 @@ void SIInstrInfo::lowerScalarXnor(SetVectorType &Worklist,
bool Src1IsSGPR = Src1.isReg() &&
RI.isSGPRClass(MRI.getRegClass(Src1.getReg()));
MachineInstr *Xor;
- unsigned Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Temp = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
// Build a pair of scalar instructions and add them to the work list.
// The next iteration over the work list will lower these to the vector
@@ -5117,8 +5239,8 @@ void SIInstrInfo::splitScalarNotBinop(SetVectorType &Worklist,
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
MachineInstr &Op = *BuildMI(MBB, MII, DL, get(Opcode), Interm)
.add(Src0)
@@ -5146,8 +5268,8 @@ void SIInstrInfo::splitScalarBinOpN2(SetVectorType& Worklist,
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
- unsigned NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register NewDest = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MachineInstr &Not = *BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B32), Interm)
.add(Src1);
@@ -5189,16 +5311,16 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
- unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
+ Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
AMDGPU::sub1, Src0SubRC);
- unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
+ Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
- unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+ Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
@@ -5226,12 +5348,12 @@ void SIInstrInfo::splitScalar64BitAddSub(SetVectorType &Worklist,
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
- unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register FullDestReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned CarryReg = MRI.createVirtualRegister(CarryRC);
- unsigned DeadCarryReg = MRI.createVirtualRegister(CarryRC);
+ Register CarryReg = MRI.createVirtualRegister(CarryRC);
+ Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
MachineOperand &Dest = Inst.getOperand(0);
MachineOperand &Src0 = Inst.getOperand(1);
@@ -5327,17 +5449,17 @@ void SIInstrInfo::splitScalar64BitBinaryOp(SetVectorType &Worklist,
const TargetRegisterClass *NewDestRC = RI.getEquivalentVGPRClass(DestRC);
const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
- unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
+ Register DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
.add(SrcReg0Sub0)
.add(SrcReg1Sub0);
- unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
+ Register DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
.add(SrcReg0Sub1)
.add(SrcReg1Sub1);
- unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
+ Register FullDestReg = MRI.createVirtualRegister(NewDestRC);
BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
.addImm(AMDGPU::sub0)
@@ -5368,7 +5490,7 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
const TargetRegisterClass *DestRC = MRI.getRegClass(Dest.getReg());
- unsigned Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register Interm = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
MachineOperand* Op0;
MachineOperand* Op1;
@@ -5384,7 +5506,7 @@ void SIInstrInfo::splitScalar64BitXnor(SetVectorType &Worklist,
BuildMI(MBB, MII, DL, get(AMDGPU::S_NOT_B64), Interm)
.add(*Op0);
- unsigned NewDest = MRI.createVirtualRegister(DestRC);
+ Register NewDest = MRI.createVirtualRegister(DestRC);
MachineInstr &Xor = *BuildMI(MBB, MII, DL, get(AMDGPU::S_XOR_B64), NewDest)
.addReg(Interm)
@@ -5411,8 +5533,8 @@ void SIInstrInfo::splitScalar64BitBCNT(
MRI.getRegClass(Src.getReg()) :
&AMDGPU::SGPR_32RegClass;
- unsigned MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register MidReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
const TargetRegisterClass *SrcSubRC = RI.getSubRegClass(SrcRC, AMDGPU::sub0);
@@ -5451,9 +5573,9 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
Offset == 0 && "Not implemented");
if (BitWidth < 32) {
- unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register MidRegHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo)
.addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0)
@@ -5476,8 +5598,8 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
}
MachineOperand &Src = Inst.getOperand(1);
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass);
BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e64), TmpReg)
.addImm(31)
@@ -5506,6 +5628,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
switch (UseMI.getOpcode()) {
case AMDGPU::COPY:
case AMDGPU::WQM:
+ case AMDGPU::SOFT_WQM:
case AMDGPU::WWM:
case AMDGPU::REG_SEQUENCE:
case AMDGPU::PHI:
@@ -5531,7 +5654,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
MachineRegisterInfo &MRI,
MachineInstr &Inst) const {
- unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineBasicBlock *MBB = Inst.getParent();
MachineOperand &Src0 = Inst.getOperand(1);
MachineOperand &Src1 = Inst.getOperand(2);
@@ -5539,8 +5662,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
switch (Inst.getOpcode()) {
case AMDGPU::S_PACK_LL_B32_B16: {
- unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// FIXME: Can do a lot better if we know the high bits of src0 or src1 are
// 0.
@@ -5558,7 +5681,7 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
break;
}
case AMDGPU::S_PACK_LH_B32_B16: {
- unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
.addImm(0xffff);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
@@ -5568,8 +5691,8 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
break;
}
case AMDGPU::S_PACK_HH_B32_B16: {
- unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
.addImm(16)
.add(Src0);
@@ -5623,17 +5746,27 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
case AMDGPU::REG_SEQUENCE:
case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:
+ case AMDGPU::SOFT_WQM:
case AMDGPU::WWM: {
const TargetRegisterClass *SrcRC = getOpRegClass(Inst, 1);
if (RI.hasAGPRs(SrcRC)) {
if (RI.hasAGPRs(NewDstRC))
return nullptr;
- NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
+ switch (Inst.getOpcode()) {
+ case AMDGPU::PHI:
+ case AMDGPU::REG_SEQUENCE:
+ case AMDGPU::INSERT_SUBREG:
+ NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
+ break;
+ default:
+ NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
+ }
+
if (!NewDstRC)
return nullptr;
} else {
- if (RI.hasVGPRs(NewDstRC))
+ if (RI.hasVGPRs(NewDstRC) || NewDstRC == &AMDGPU::VReg_1RegClass)
return nullptr;
NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
@@ -5686,7 +5819,7 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
return MO.getReg();
// If this could be a VGPR or an SGPR, Check the dynamic register class.
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
const TargetRegisterClass *RegRC = MRI.getRegClass(Reg);
if (RI.isSGPRClass(RegRC))
UsedSGPRs[i] = Reg;
@@ -5941,7 +6074,7 @@ void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo();
if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
- unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
MachineInstr *SIIF =
BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg)
.add(Branch->getOperand(0))
@@ -5968,8 +6101,8 @@ void SIInstrInfo::convertNonUniformLoopRegion(
if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
- unsigned DstReg = MRI.createVirtualRegister(RI.getBoolRC());
- unsigned BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register DstReg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register BackEdgeReg = MRI.createVirtualRegister(RI.getBoolRC());
MachineInstrBuilder HeaderPHIBuilder =
BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg);
for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(),
@@ -5979,7 +6112,7 @@ void SIInstrInfo::convertNonUniformLoopRegion(
HeaderPHIBuilder.addReg(BackEdgeReg);
} else {
MachineBasicBlock *PMBB = *PI;
- unsigned ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
+ Register ZeroReg = MRI.createVirtualRegister(RI.getBoolRC());
materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(),
ZeroReg, 0);
HeaderPHIBuilder.addReg(ZeroReg);
@@ -6063,13 +6196,30 @@ SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
- unsigned UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
+ Register UnusedCarry = MRI.createVirtualRegister(RI.getBoolRC());
MRI.setRegAllocationHint(UnusedCarry, 0, RI.getVCC());
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
.addReg(UnusedCarry, RegState::Define | RegState::Dead);
}
+MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register DestReg,
+ RegScavenger &RS) const {
+ if (ST.hasAddNoCarry())
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e32), DestReg);
+
+ Register UnusedCarry = RS.scavengeRegister(RI.getBoolRC(), I, 0, false);
+ // TODO: Users need to deal with this.
+ if (!UnusedCarry.isValid())
+ return MachineInstrBuilder();
+
+ return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
+ .addReg(UnusedCarry, RegState::Define | RegState::Dead);
+}
+
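A minimal caller-side sketch of the contract the TODO above describes, assuming it lives inside the AMDGPU backend in the LLVM tree (the wrapper name beginAddNoCarry is hypothetical): the scavenging overload hands back an empty MachineInstrBuilder when no carry register is available, so callers must test it before appending source operands.

#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/RegisterScavenging.h"

using namespace llvm;

// Hypothetical wrapper: returns true and leaves a partially built add at I on
// success; returns false if scavenging a carry register failed.
static bool beginAddNoCarry(const SIInstrInfo &TII, MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator I, const DebugLoc &DL,
                            Register DestReg, RegScavenger &RS,
                            MachineInstrBuilder &Out) {
  Out = TII.getAddNoCarry(MBB, I, DL, DestReg, RS);
  // An empty builder means no carry register could be scavenged on a
  // pre-gfx9 subtarget; appending operands to it would dereference null.
  return Out.getInstr() != nullptr;
}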
bool SIInstrInfo::isKillTerminator(unsigned Opcode) {
switch (Opcode) {
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
@@ -6115,7 +6265,21 @@ bool SIInstrInfo::isBufferSMRD(const MachineInstr &MI) const {
return false;
const auto RCID = MI.getDesc().OpInfo[Idx].RegClass;
- return RCID == AMDGPU::SReg_128RegClassID;
+ return RI.getRegClass(RCID)->hasSubClassEq(&AMDGPU::SGPR_128RegClass);
+}
+
+unsigned SIInstrInfo::getNumFlatOffsetBits(unsigned AddrSpace,
+ bool Signed) const {
+ if (!ST.hasFlatInstOffsets())
+ return 0;
+
+ if (ST.hasFlatSegmentOffsetBug() && AddrSpace == AMDGPUAS::FLAT_ADDRESS)
+ return 0;
+
+ if (ST.getGeneration() >= AMDGPUSubtarget::GFX10)
+ return Signed ? 12 : 11;
+
+ return Signed ? 13 : 12;
}
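The helper above reports how wide the immediate offset field of a FLAT-style instruction is for the current subtarget, and the legality check in isLegalFLATOffset below follows from that width. A standalone sketch of that relationship, assuming the widths from the code above (12 signed / 11 unsigned bits on gfx10, 13 / 12 on earlier targets; the zero-width cases for missing or buggy flat offsets are omitted):

#include <cassert>
#include <cstdint>

// Simplified mirror of the widths returned above.
static unsigned numFlatOffsetBits(bool IsGFX10Plus, bool Signed) {
  if (IsGFX10Plus)
    return Signed ? 12 : 11;
  return Signed ? 13 : 12;
}

// Legal immediate-offset range implied by a field of the given width.
static bool isLegalOffset(int64_t Offset, unsigned Bits, bool Signed) {
  if (Bits == 0)
    return Offset == 0;
  int64_t Min = Signed ? -(int64_t(1) << (Bits - 1)) : 0;
  int64_t Max = Signed ? (int64_t(1) << (Bits - 1)) - 1
                       : (int64_t(1) << Bits) - 1;
  return Offset >= Min && Offset <= Max;
}

int main() {
  // gfx10 signed offsets: 12 bits -> [-2048, 2047].
  assert(isLegalOffset(2047, numFlatOffsetBits(true, true), true));
  assert(!isLegalOffset(2048, numFlatOffsetBits(true, true), true));
  // Pre-gfx10 unsigned offsets: 12 bits -> [0, 4095].
  assert(isLegalOffset(4095, numFlatOffsetBits(false, false), false));
  assert(!isLegalOffset(-1, numFlatOffsetBits(false, false), false));
  return 0;
}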
bool SIInstrInfo::isLegalFLATOffset(int64_t Offset, unsigned AddrSpace,
@@ -6254,7 +6418,7 @@ static bool followSubRegDef(MachineInstr &MI,
MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
MachineRegisterInfo &MRI) {
assert(MRI.isSSA());
- if (!TargetRegisterInfo::isVirtualRegister(P.Reg))
+ if (!Register::isVirtualRegister(P.Reg))
return nullptr;
auto RSR = P;
@@ -6265,8 +6429,7 @@ MachineInstr *llvm::getVRegSubRegDef(const TargetInstrInfo::RegSubRegPair &P,
case AMDGPU::COPY:
case AMDGPU::V_MOV_B32_e32: {
auto &Op1 = MI->getOperand(1);
- if (Op1.isReg() &&
- TargetRegisterInfo::isVirtualRegister(Op1.getReg())) {
+ if (Op1.isReg() && Register::isVirtualRegister(Op1.getReg())) {
if (Op1.isUndef())
return nullptr;
RSR = getRegSubRegPair(Op1);
@@ -6360,3 +6523,40 @@ bool llvm::execMayBeModifiedBeforeAnyUse(const MachineRegisterInfo &MRI,
return true;
}
}
+
+MachineInstr *SIInstrInfo::createPHIDestinationCopy(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator LastPHIIt,
+ const DebugLoc &DL, Register Src, Register Dst) const {
+ auto Cur = MBB.begin();
+ if (Cur != MBB.end())
+ do {
+ if (!Cur->isPHI() && Cur->readsRegister(Dst))
+ return BuildMI(MBB, Cur, DL, get(TargetOpcode::COPY), Dst).addReg(Src);
+ ++Cur;
+ } while (Cur != MBB.end() && Cur != LastPHIIt);
+
+ return TargetInstrInfo::createPHIDestinationCopy(MBB, LastPHIIt, DL, Src,
+ Dst);
+}
+
+MachineInstr *SIInstrInfo::createPHISourceCopy(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator InsPt,
+ const DebugLoc &DL, Register Src, Register SrcSubReg, Register Dst) const {
+ if (InsPt != MBB.end() &&
+ (InsPt->getOpcode() == AMDGPU::SI_IF ||
+ InsPt->getOpcode() == AMDGPU::SI_ELSE ||
+ InsPt->getOpcode() == AMDGPU::SI_IF_BREAK) &&
+ InsPt->definesRegister(Src)) {
+ InsPt++;
+ return BuildMI(MBB, InsPt, InsPt->getDebugLoc(),
+ get(ST.isWave32() ? AMDGPU::S_MOV_B32_term
+ : AMDGPU::S_MOV_B64_term),
+ Dst)
+ .addReg(Src, 0, SrcSubReg)
+ .addReg(AMDGPU::EXEC, RegState::Implicit);
+ }
+ return TargetInstrInfo::createPHISourceCopy(MBB, InsPt, DL, Src, SrcSubReg,
+ Dst);
+}
+
+bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 3ff35da0b963..be463442c888 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -173,7 +173,7 @@ public:
}
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
- AliasAnalysis *AA) const override;
+ AAResults *AA) const override;
bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
int64_t &Offset1,
@@ -229,6 +229,14 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
+ // Splits a V_MOV_B64_DPP_PSEUDO opcode into a pair of v_mov_b32_dpp
+ // instructions. Returns a pair of generated instructions.
+ // Can split either post-RA with physical registers or pre-RA with
+ // virtual registers. In the latter case the IR needs to be in SSA form
+ // and a REG_SEQUENCE is produced to define the original register.
+ std::pair<MachineInstr*, MachineInstr*>
+ expandMovDPP64(MachineInstr &MI) const;
+
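The split is purely a lo/hi decomposition: each half is moved by its own v_mov_b32_dpp and the result is reassembled, which is what the REG_SEQUENCE expresses pre-RA. A standalone sketch of that decomposition (the DPP lane shuffle itself and register-class details are not modeled here):

#include <cassert>
#include <cstdint>

// Treat the 64-bit value as two independent 32-bit halves.
static uint32_t lo32(uint64_t V) { return static_cast<uint32_t>(V); }
static uint32_t hi32(uint64_t V) { return static_cast<uint32_t>(V >> 32); }
static uint64_t join(uint32_t Lo, uint32_t Hi) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  uint64_t Src = 0x0123456789abcdefULL;
  // Two independent 32-bit "moves", then the REG_SEQUENCE-like recombination.
  uint32_t Lo = lo32(Src), Hi = hi32(Src);
  assert(join(Lo, Hi) == Src);
  return 0;
}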
// Returns an opcode that can be used to move a value to a \p DstRC
// register. If there is no hardware instruction that can store to \p
// DstRC, then AMDGPU::COPY is returned.
@@ -242,7 +250,7 @@ public:
return commuteOpcode(MI.getOpcode());
}
- bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+ bool findCommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2) const override;
bool findCommutedOpIndices(MCInstrDesc Desc, unsigned & SrcOpIdx0,
@@ -303,8 +311,7 @@ public:
bool
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
- const MachineInstr &MIb,
- AliasAnalysis *AA = nullptr) const override;
+ const MachineInstr &MIb) const override;
bool isFoldableCopy(const MachineInstr &MI) const;
@@ -578,6 +585,14 @@ public:
return get(Opcode).TSFlags & SIInstrFlags::IsMAI;
}
+ static bool isDOT(const MachineInstr &MI) {
+ return MI.getDesc().TSFlags & SIInstrFlags::IsDOT;
+ }
+
+ bool isDOT(uint16_t Opcode) const {
+ return get(Opcode).TSFlags & SIInstrFlags::IsDOT;
+ }
+
static bool isScalarUnit(const MachineInstr &MI) {
return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
}
@@ -954,6 +969,19 @@ public:
bool isBasicBlockPrologue(const MachineInstr &MI) const override;
+ MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsPt,
+ const DebugLoc &DL, Register Src,
+ Register Dst) const override;
+
+ MachineInstr *createPHISourceCopy(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator InsPt,
+ const DebugLoc &DL, Register Src,
+ Register SrcSubReg,
+ Register Dst) const override;
+
+ bool isWave32() const;
+
/// Return a partially built integer add instruction without carry.
/// Caller must add source operands.
/// For pre-GFX9 it will generate unused carry destination operand.
@@ -963,6 +991,12 @@ public:
const DebugLoc &DL,
unsigned DestReg) const;
+ MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register DestReg,
+ RegScavenger &RS) const;
+
static bool isKillTerminator(unsigned Opcode);
const MCInstrDesc &getKillTerminatorFromPseudo(unsigned Opcode) const;
@@ -970,6 +1004,8 @@ public:
return isUInt<12>(Imm);
}
+ unsigned getNumFlatOffsetBits(unsigned AddrSpace, bool Signed) const;
+
/// Returns if \p Offset is legal for the subtarget as the offset to a FLAT
/// encoded instruction. If \p Signed, this is for an instruction that
/// interprets the offset as signed.
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index c382c816e0b4..1eecbf555613 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -84,7 +84,7 @@ def SDTtbuffer_load : SDTypeProfile<1, 8,
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
SDTCisVT<6, i32>, // format(imm)
- SDTCisVT<7, i32>, // cachecontrol(imm)
+ SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm)
SDTCisVT<8, i1> // idxen(imm)
]>;
@@ -102,7 +102,7 @@ def SDTtbuffer_store : SDTypeProfile<0, 9,
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
SDTCisVT<6, i32>, // format(imm)
- SDTCisVT<7, i32>, // cachecontrol(imm)
+ SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm)
SDTCisVT<8, i1> // idxen(imm)
]>;
@@ -119,7 +119,7 @@ def SDTBufferLoad : SDTypeProfile<1, 7,
SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
- SDTCisVT<6, i32>, // cachepolicy(imm)
+ SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm)
SDTCisVT<7, i1>]>; // idxen(imm)
def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
@@ -145,7 +145,7 @@ def SDTBufferStore : SDTypeProfile<0, 8,
SDTCisVT<3, i32>, // voffset(VGPR)
SDTCisVT<4, i32>, // soffset(SGPR)
SDTCisVT<5, i32>, // offset(imm)
- SDTCisVT<6, i32>, // cachepolicy(imm)
+ SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm)
SDTCisVT<7, i1>]>; // idxen(imm)
def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore,
@@ -198,6 +198,8 @@ def SIbuffer_atomic_umax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_UMAX">;
def SIbuffer_atomic_and : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_AND">;
def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
+def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
+def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>;
def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>;
@@ -264,6 +266,11 @@ def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
[SDNPMayLoad, SDNPMemOperand, SDNPHasChain]
>;
+def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE",
+ SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]
+>;
+
//===----------------------------------------------------------------------===//
// ValueType helpers
//===----------------------------------------------------------------------===//
@@ -277,7 +284,9 @@ class isFloatType<ValueType SrcVT> {
!if(!eq(SrcVT.Value, f64.Value), 1,
!if(!eq(SrcVT.Value, v2f16.Value), 1,
!if(!eq(SrcVT.Value, v4f16.Value), 1,
- 0)))));
+ !if(!eq(SrcVT.Value, v2f32.Value), 1,
+ !if(!eq(SrcVT.Value, v2f64.Value), 1,
+ 0)))))));
}
class isIntType<ValueType SrcVT> {
@@ -300,14 +309,36 @@ class isPackedType<ValueType SrcVT> {
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
-defm atomic_inc_global : global_binary_atomic_op<SIatomic_inc>;
-defm atomic_dec_global : global_binary_atomic_op<SIatomic_dec>;
+foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
+let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
+
-def atomic_inc_local : local_binary_atomic_op<SIatomic_inc>;
-def atomic_dec_local : local_binary_atomic_op<SIatomic_dec>;
-def atomic_load_fadd_local : local_binary_atomic_op<atomic_load_fadd>;
-def atomic_load_fmin_local : local_binary_atomic_op<SIatomic_fmin>;
-def atomic_load_fmax_local : local_binary_atomic_op<SIatomic_fmax>;
+defm atomic_inc_#as : binary_atomic_op<SIatomic_inc>;
+defm atomic_dec_#as : binary_atomic_op<SIatomic_dec>;
+defm atomic_load_fmin_#as : binary_atomic_op<SIatomic_fmin, 0>;
+defm atomic_load_fmax_#as : binary_atomic_op<SIatomic_fmax, 0>;
+
+
+} // End let AddressSpaces = ...
+} // End foreach AddrSpace
+
+def atomic_fadd_global_noret : PatFrag<
+ (ops node:$ptr, node:$value),
+ (SIglobal_atomic_fadd node:$ptr, node:$value)> {
+ // FIXME: Move this
+ let MemoryVT = f32;
+ let IsAtomic = 1;
+ let AddressSpaces = StoreAddress_global.AddrSpaces;
+}
+
+def atomic_pk_fadd_global_noret : PatFrag<
+ (ops node:$ptr, node:$value),
+ (SIglobal_atomic_pk_fadd node:$ptr, node:$value)> {
+ // FIXME: Move this
+ let MemoryVT = v2f16;
+ let IsAtomic = 1;
+ let AddressSpaces = StoreAddress_global.AddrSpaces;
+}
//===----------------------------------------------------------------------===//
// SDNodes PatFrags for loads/stores with a glue input.
@@ -328,10 +359,12 @@ def AMDGPUatomic_ld_glue : SDNode <"ISD::ATOMIC_LOAD", SDTAtomicLoad,
>;
def unindexedload_glue : PatFrag <(ops node:$ptr), (AMDGPUld_glue node:$ptr)> {
+ let IsLoad = 1;
let IsUnindexed = 1;
}
def load_glue : PatFrag <(ops node:$ptr), (unindexedload_glue node:$ptr)> {
+ let IsLoad = 1;
let IsNonExtLoad = 1;
}
@@ -347,14 +380,15 @@ def atomic_load_64_glue : PatFrag<(ops node:$ptr),
let MemoryVT = i64;
}
-def extload_glue : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> {
+def extload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> {
let IsLoad = 1;
let IsAnyExtLoad = 1;
}
-def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr), [{
- return cast<LoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD;
-}]>;
+def sextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> {
+ let IsLoad = 1;
+ let IsSignExtLoad = 1;
+}
def zextload_glue : PatFrag<(ops node:$ptr), (unindexedload_glue node:$ptr)> {
let IsLoad = 1;
@@ -391,25 +425,50 @@ def sextloadi16_glue : PatFrag<(ops node:$ptr), (sextload_glue node:$ptr)> {
let MemoryVT = i16;
}
-def load_glue_align8 : Aligned8Bytes <
- (ops node:$ptr), (load_glue node:$ptr)
->;
-def load_glue_align16 : Aligned16Bytes <
- (ops node:$ptr), (load_glue node:$ptr)
->;
+let IsLoad = 1, AddressSpaces = LoadAddress_local.AddrSpaces in {
+def load_local_m0 : PatFrag<(ops node:$ptr), (load_glue node:$ptr)> {
+ let IsNonExtLoad = 1;
+}
-def load_local_m0 : LoadFrag<load_glue>, LocalAddress;
-def sextloadi8_local_m0 : LoadFrag<sextloadi8_glue>, LocalAddress;
-def sextloadi16_local_m0 : LoadFrag<sextloadi16_glue>, LocalAddress;
-def extloadi8_local_m0 : LoadFrag<extloadi8_glue>, LocalAddress;
-def zextloadi8_local_m0 : LoadFrag<zextloadi8_glue>, LocalAddress;
-def extloadi16_local_m0 : LoadFrag<extloadi16_glue>, LocalAddress;
-def zextloadi16_local_m0 : LoadFrag<zextloadi16_glue>, LocalAddress;
-def load_align8_local_m0 : LoadFrag <load_glue_align8>, LocalAddress;
-def load_align16_local_m0 : LoadFrag <load_glue_align16>, LocalAddress;
-def atomic_load_32_local_m0 : LoadFrag<atomic_load_32_glue>, LocalAddress;
-def atomic_load_64_local_m0 : LoadFrag<atomic_load_64_glue>, LocalAddress;
+let MemoryVT = i8 in {
+def extloadi8_local_m0 : PatFrag<(ops node:$ptr), (extloadi8_glue node:$ptr)>;
+def sextloadi8_local_m0 : PatFrag<(ops node:$ptr), (sextloadi8_glue node:$ptr)>;
+def zextloadi8_local_m0 : PatFrag<(ops node:$ptr), (zextloadi8_glue node:$ptr)>;
+}
+
+let MemoryVT = i16 in {
+def extloadi16_local_m0 : PatFrag<(ops node:$ptr), (extloadi16_glue node:$ptr)>;
+def sextloadi16_local_m0 : PatFrag<(ops node:$ptr), (sextloadi16_glue node:$ptr)>;
+def zextloadi16_local_m0 : PatFrag<(ops node:$ptr), (zextloadi16_glue node:$ptr)>;
+}
+
+def load_align8_local_m0 : PatFrag<(ops node:$ptr),
+ (load_local_m0 node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+ let MinAlignment = 8;
+}
+def load_align16_local_m0 : PatFrag<(ops node:$ptr),
+ (load_local_m0 node:$ptr)> {
+ let IsLoad = 1;
+ let IsNonExtLoad = 1;
+ let MinAlignment = 16;
+}
+
+} // End IsLoad = 1
+
+let IsAtomic = 1, AddressSpaces = LoadAddress_local.AddrSpaces in {
+def atomic_load_32_local_m0 : PatFrag<(ops node:$ptr),
+ (atomic_load_32_glue node:$ptr)> {
+ let MemoryVT = i32;
+}
+def atomic_load_64_local_m0 : PatFrag<(ops node:$ptr),
+ (atomic_load_64_glue node:$ptr)> {
+ let MemoryVT = i64;
+}
+
+} // End let AddressSpaces = LoadAddress_local.AddrSpaces
def AMDGPUst_glue : SDNode <"ISD::STORE", SDTStore,
@@ -420,50 +479,88 @@ def AMDGPUatomic_st_glue : SDNode <"ISD::ATOMIC_STORE", SDTAtomicStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand, SDNPInGlue]
>;
-def atomic_store_glue : PatFrag<(ops node:$ptr, node:$val),
- (AMDGPUatomic_st_glue node:$ptr, node:$val)> {
-}
-
def unindexedstore_glue : PatFrag<(ops node:$val, node:$ptr),
- (AMDGPUst_glue node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAddressingMode() == ISD::UNINDEXED;
-}]>;
+ (AMDGPUst_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsUnindexed = 1;
+}
def store_glue : PatFrag<(ops node:$val, node:$ptr),
- (unindexedstore_glue node:$val, node:$ptr), [{
- return !cast<StoreSDNode>(N)->isTruncatingStore();
-}]>;
+ (unindexedstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
def truncstore_glue : PatFrag<(ops node:$val, node:$ptr),
- (unindexedstore_glue node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->isTruncatingStore();
-}]>;
+ (unindexedstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 1;
+}
def truncstorei8_glue : PatFrag<(ops node:$val, node:$ptr),
- (truncstore_glue node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
+ (truncstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i8;
+}
def truncstorei16_glue : PatFrag<(ops node:$val, node:$ptr),
- (truncstore_glue node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
+ (truncstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i16;
+}
-def store_glue_align8 : Aligned8Bytes <
- (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
->;
+let IsStore = 1, AddressSpaces = StoreAddress_local.AddrSpaces in {
+def store_local_m0 : PatFrag<(ops node:$val, node:$ptr),
+ (store_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+}
-def store_glue_align16 : Aligned16Bytes <
- (ops node:$value, node:$ptr), (store_glue node:$value, node:$ptr)
->;
+def truncstorei8_local_m0 : PatFrag<(ops node:$val, node:$ptr),
+ (unindexedstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i8;
+}
+
+def truncstorei16_local_m0 : PatFrag<(ops node:$val, node:$ptr),
+ (unindexedstore_glue node:$val, node:$ptr)> {
+ let IsStore = 1;
+ let MemoryVT = i16;
+}
+}
+
+def store_align16_local_m0 : PatFrag <
+ (ops node:$value, node:$ptr),
+ (store_local_m0 node:$value, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+ let MinAlignment = 16;
+}
-def store_local_m0 : StoreFrag<store_glue>, LocalAddress;
-def truncstorei8_local_m0 : StoreFrag<truncstorei8_glue>, LocalAddress;
-def truncstorei16_local_m0 : StoreFrag<truncstorei16_glue>, LocalAddress;
-def atomic_store_local_m0 : StoreFrag<AMDGPUatomic_st_glue>, LocalAddress;
+def store_align8_local_m0 : PatFrag <
+ (ops node:$value, node:$ptr),
+ (store_local_m0 node:$value, node:$ptr)> {
+ let IsStore = 1;
+ let IsTruncStore = 0;
+ let MinAlignment = 8;
+}
+
+let AddressSpaces = StoreAddress_local.AddrSpaces in {
+
+def atomic_store_local_32_m0 : PatFrag <
+ (ops node:$value, node:$ptr),
+ (AMDGPUatomic_st_glue node:$value, node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i32;
+}
+def atomic_store_local_64_m0 : PatFrag <
+ (ops node:$value, node:$ptr),
+ (AMDGPUatomic_st_glue node:$value, node:$ptr)> {
+ let IsAtomic = 1;
+ let MemoryVT = i64;
+}
+} // End let AddressSpaces = StoreAddress_local.AddrSpaces
-def store_align8_local_m0 : StoreFrag<store_glue_align8>, LocalAddress;
-def store_align16_local_m0 : StoreFrag<store_glue_align16>, LocalAddress;
def si_setcc_uniform : PatFrag <
(ops node:$lhs, node:$rhs, node:$cond),
@@ -539,16 +636,27 @@ def lshl_rev : PatFrag <
(shl $src0, $src1)
>;
+def add_ctpop : PatFrag <
+ (ops node:$src0, node:$src1),
+ (add (ctpop $src0), $src1)
+>;
+
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
- SDTypeProfile tc = SDTAtomic2> {
+ SDTypeProfile tc = SDTAtomic2,
+ bit IsInt = 1> {
def _glue : SDNode <
!if(is_amdgpu, "AMDGPUISD", "ISD")#"::ATOMIC_"#op_name, tc,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
>;
- def _local_m0 : local_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
- def _region_m0 : region_binary_atomic_op <!cast<SDNode>(NAME#"_glue")>;
+ let AddressSpaces = StoreAddress_local.AddrSpaces in {
+ defm _local_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
+ }
+
+ let AddressSpaces = StoreAddress_region.AddrSpaces in {
+ defm _region_m0 : binary_atomic_op <!cast<SDNode>(NAME#"_glue"), IsInt>;
+ }
}
defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
@@ -563,17 +671,9 @@ defm atomic_load_xor : SIAtomicM0Glue2 <"LOAD_XOR">;
defm atomic_load_umin : SIAtomicM0Glue2 <"LOAD_UMIN">;
defm atomic_load_umax : SIAtomicM0Glue2 <"LOAD_UMAX">;
defm atomic_swap : SIAtomicM0Glue2 <"SWAP">;
-defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32>;
-defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32>;
-defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32>;
-
-def atomic_cmp_swap_glue : SDNode <"ISD::ATOMIC_CMP_SWAP", SDTAtomic3,
- [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand, SDNPInGlue]
->;
-
-def atomic_cmp_swap_local_m0 : AtomicCmpSwapLocal<atomic_cmp_swap_glue>;
-def atomic_cmp_swap_region_m0 : AtomicCmpSwapRegion<atomic_cmp_swap_glue>;
-
+defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>;
+defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>;
+defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>;
def as_i1imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
@@ -591,6 +691,10 @@ def as_i32imm: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
}]>;
+def as_i32timm: SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
def as_i64imm: SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i64);
}]>;
@@ -627,9 +731,13 @@ def SIMM16bit : ImmLeaf <i32,
>;
def UIMM16bit : ImmLeaf <i32,
- [{return isUInt<16>(Imm); }]
+ [{return isUInt<16>(Imm);}]
>;
+def i64imm_32bit : ImmLeaf<i64, [{
+ return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
+}]>;
+
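The i64imm_32bit leaf above accepts a 64-bit immediate only when truncating it to 32 bits loses nothing, i.e. the upper half is zero. A standalone sketch of the same predicate:

#include <cassert>
#include <cstdint>

// Same check as the ImmLeaf above, written over the raw 64-bit value.
static bool isI64Imm32Bit(uint64_t Imm) {
  return (Imm & 0xffffffffULL) == Imm;
}

int main() {
  assert(isI64Imm32Bit(0xdeadbeefULL));    // fits in the low 32 bits
  assert(!isI64Imm32Bit(0x100000000ULL));  // bit 32 set -> rejected
  assert(!isI64Imm32Bit(~0ULL));           // e.g. an i64 -1 is rejected
  return 0;
}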
class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
return isInlineImmediate(N);
}]>;
@@ -763,6 +871,18 @@ def ExpTgtMatchClass : AsmOperandClass {
let RenderMethod = "printExpTgt";
}
+def SWaitMatchClass : AsmOperandClass {
+ let Name = "SWaitCnt";
+ let RenderMethod = "addImmOperands";
+ let ParserMethod = "parseSWaitCntOps";
+}
+
+def VReg32OrOffClass : AsmOperandClass {
+ let Name = "VReg32OrOff";
+ let ParserMethod = "parseVReg32OrOff";
+}
+
+let OperandType = "OPERAND_IMMEDIATE" in {
def SendMsgImm : Operand<i32> {
let PrintMethod = "printSendMsg";
let ParserMatchClass = SendMsgMatchClass;
@@ -778,22 +898,11 @@ def EndpgmImm : Operand<i16> {
let ParserMatchClass = EndpgmMatchClass;
}
-def SWaitMatchClass : AsmOperandClass {
- let Name = "SWaitCnt";
- let RenderMethod = "addImmOperands";
- let ParserMethod = "parseSWaitCntOps";
-}
-
-def VReg32OrOffClass : AsmOperandClass {
- let Name = "VReg32OrOff";
- let ParserMethod = "parseVReg32OrOff";
-}
-
def WAIT_FLAG : Operand <i32> {
let ParserMatchClass = SWaitMatchClass;
let PrintMethod = "printWaitFlag";
- let OperandType = "OPERAND_IMMEDIATE";
}
+} // End OperandType = "OPERAND_IMMEDIATE"
include "SIInstrFormats.td"
include "VIInstrFormats.td"
@@ -929,6 +1038,7 @@ def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>;
def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>;
def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>;
def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>;
+def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>;
def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>;
@@ -1317,18 +1427,6 @@ class getVALUDstForVT<ValueType VT> {
VOPDstS64orS32)))); // else VT == i1
}
-// Returns true if VT is floating point.
-class getIsFP<ValueType VT> {
- bit ret = !if(!eq(VT.Value, f16.Value), 1,
- !if(!eq(VT.Value, v2f16.Value), 1,
- !if(!eq(VT.Value, v4f16.Value), 1,
- !if(!eq(VT.Value, f32.Value), 1,
- !if(!eq(VT.Value, v2f32.Value), 1,
- !if(!eq(VT.Value, f64.Value), 1,
- !if(!eq(VT.Value, v2f64.Value), 1,
- 0)))))));
-}
-
// Returns the register class to use for the destination of VOP[12C]
// instructions with SDWA extension
class getSDWADstForVT<ValueType VT> {
@@ -1340,7 +1438,7 @@ class getSDWADstForVT<ValueType VT> {
// Returns the register class to use for source 0 of VOP[12C]
// instructions for the given VT.
class getVOPSrc0ForVT<ValueType VT> {
- bit isFP = getIsFP<VT>.ret;
+ bit isFP = isFloatType<VT>.ret;
RegisterOperand ret =
!if(isFP,
@@ -1373,11 +1471,14 @@ class getVOPSrc0ForVT<ValueType VT> {
// Returns the vreg register class to use for source operand given VT
class getVregSrcForVT<ValueType VT> {
RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128,
- !if(!eq(VT.Size, 64), VReg_64, VGPR_32));
+ !if(!eq(VT.Size, 96), VReg_96,
+ !if(!eq(VT.Size, 64), VReg_64,
+ !if(!eq(VT.Size, 48), VReg_64,
+ VGPR_32))));
}
class getSDWASrcForVT <ValueType VT> {
- bit isFP = getIsFP<VT>.ret;
+ bit isFP = isFloatType<VT>.ret;
RegisterOperand retFlt = !if(!eq(VT.Size, 16), SDWASrc_f16, SDWASrc_f32);
RegisterOperand retInt = !if(!eq(VT.Size, 16), SDWASrc_i16, SDWASrc_i32);
RegisterOperand ret = !if(isFP, retFlt, retInt);
@@ -1386,7 +1487,7 @@ class getSDWASrcForVT <ValueType VT> {
// Returns the register class to use for sources of VOP3 instructions for the
// given VT.
class getVOP3SrcForVT<ValueType VT> {
- bit isFP = getIsFP<VT>.ret;
+ bit isFP = isFloatType<VT>.ret;
RegisterOperand ret =
!if(!eq(VT.Size, 128),
VSrc_128,
@@ -1433,7 +1534,7 @@ class isModifierType<ValueType SrcVT> {
// Return type of input modifiers operand for specified input operand
class getSrcMod <ValueType VT, bit EnableF32SrcMods> {
- bit isFP = getIsFP<VT>.ret;
+ bit isFP = isFloatType<VT>.ret;
bit isPacked = isPackedType<VT>.ret;
Operand ret = !if(!eq(VT.Size, 64),
!if(isFP, FP64InputMods, Int64InputMods),
@@ -1452,7 +1553,7 @@ class getOpSelMod <ValueType VT> {
// Return type of input modifiers operand specified input operand for DPP
class getSrcModExt <ValueType VT> {
- bit isFP = getIsFP<VT>.ret;
+ bit isFP = isFloatType<VT>.ret;
Operand ret = !if(isFP, FPVRegInputMods, IntVRegInputMods);
}
@@ -2038,6 +2139,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableF32SrcMods = 0,
field int NeedPatGen = PatGenMode.NoPattern;
field bit IsMAI = 0;
+ field bit IsDOT = 0;
field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 70f20bb69370..21984c6ad910 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -43,8 +43,8 @@ multiclass V_INTERP_P1_F32_m : VINTRP_m <
(outs VINTRPDst:$vdst),
(ins VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_p1_f32$vdst, $vsrc, $attr$attrchan",
- [(set f32:$vdst, (AMDGPUinterp_p1 f32:$vsrc, (i32 imm:$attrchan),
- (i32 imm:$attr)))]
+ [(set f32:$vdst, (int_amdgcn_interp_p1 f32:$vsrc,
+ (i32 timm:$attrchan), (i32 timm:$attr), M0))]
>;
let OtherPredicates = [has32BankLDS] in {
@@ -66,8 +66,8 @@ defm V_INTERP_P2_F32 : VINTRP_m <
(outs VINTRPDst:$vdst),
(ins VGPR_32:$src0, VGPR_32:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_p2_f32$vdst, $vsrc, $attr$attrchan",
- [(set f32:$vdst, (AMDGPUinterp_p2 f32:$src0, f32:$vsrc, (i32 imm:$attrchan),
- (i32 imm:$attr)))]>;
+ [(set f32:$vdst, (int_amdgcn_interp_p2 f32:$src0, f32:$vsrc,
+ (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
} // End DisableEncoding = "$src0", Constraints = "$src0 = $vdst"
@@ -76,8 +76,8 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
(outs VINTRPDst:$vdst),
(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
- [(set f32:$vdst, (AMDGPUinterp_mov (i32 imm:$vsrc), (i32 imm:$attrchan),
- (i32 imm:$attr)))]>;
+ [(set f32:$vdst, (int_amdgcn_interp_mov (i32 imm:$vsrc),
+ (i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
} // End Uses = [M0, EXEC]
@@ -92,6 +92,11 @@ def ATOMIC_FENCE : SPseudoInstSI<
let maybeAtomic = 1;
}
+def VOP_I64_I64_DPP : VOPProfile <[i64, i64, untyped, untyped]> {
+ let HasExt = 1;
+ let HasExtDPP = 1;
+}
+
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] in {
// For use in patterns
@@ -107,10 +112,19 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;
+// 64-bit vector move with dpp. Expanded post-RA.
+def V_MOV_B64_DPP_PSEUDO : VOP_DPP_Pseudo <"v_mov_b64_dpp", VOP_I64_I64_DPP> {
+ let Size = 16; // Requires two 8-byte v_mov_b32_dpp to complete.
+}
+
// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+// Pseudoinstruction for @llvm.amdgcn.softwqm. Like @llvm.amdgcn.wqm it is
+// turned into a copy by the WQM pass, but does not seed WQM requirements.
+def SOFT_WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
+
// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
// that the @earlyclobber is respected. The @earlyclobber is to make sure that
// the instruction that defines $src0 (which is run in WWM) doesn't
@@ -345,13 +359,15 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
}
def SI_INIT_EXEC : SPseudoInstSI <
- (outs), (ins i64imm:$src), []> {
+ (outs), (ins i64imm:$src),
+ [(int_amdgcn_init_exec (i64 timm:$src))]> {
let Defs = [EXEC];
let usesCustomInserter = 1;
let isAsCheapAsAMove = 1;
let WaveSizePredicate = isWave64;
}
+// FIXME: Intrinsic should be mangled for wave size.
def SI_INIT_EXEC_LO : SPseudoInstSI <
(outs), (ins i32imm:$src), []> {
let Defs = [EXEC_LO];
@@ -360,12 +376,20 @@ def SI_INIT_EXEC_LO : SPseudoInstSI <
let WaveSizePredicate = isWave32;
}
+// FIXME: Wave32 version
def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
- (outs), (ins SSrc_b32:$input, i32imm:$shift), []> {
+ (outs), (ins SSrc_b32:$input, i32imm:$shift),
+ [(int_amdgcn_init_exec_from_input i32:$input, (i32 timm:$shift))]> {
let Defs = [EXEC];
let usesCustomInserter = 1;
}
+def : GCNPat <
+ (int_amdgcn_init_exec timm:$src),
+ (SI_INIT_EXEC_LO (as_i32imm imm:$src))> {
+ let WaveSizePredicate = isWave32;
+}
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -604,25 +628,6 @@ def : GCNPat <
(SI_PC_ADD_REL_OFFSET $ptr_lo, (i32 0))
>;
-def : GCNPat <
- (AMDGPUinit_exec i64:$src),
- (SI_INIT_EXEC (as_i64imm $src))
-> {
- let WaveSizePredicate = isWave64;
-}
-
-def : GCNPat <
- (AMDGPUinit_exec i64:$src),
- (SI_INIT_EXEC_LO (as_i32imm $src))
-> {
- let WaveSizePredicate = isWave32;
-}
-
-def : GCNPat <
- (AMDGPUinit_exec_from_input i32:$input, i32:$shift),
- (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift))
->;
-
def : GCNPat<
(AMDGPUtrap timm:$trapid),
(S_TRAP $trapid)
@@ -740,22 +745,22 @@ def : GCNPat <
def : GCNPat <
(i32 (fp_to_sint f16:$src)),
- (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 $src))
+ (V_CVT_I32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src))
>;
def : GCNPat <
(i32 (fp_to_uint f16:$src)),
- (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 $src))
+ (V_CVT_U32_F32_e32 (V_CVT_F32_F16_e32 VSrc_b32:$src))
>;
def : GCNPat <
(f16 (sint_to_fp i32:$src)),
- (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 $src))
+ (V_CVT_F16_F32_e32 (V_CVT_F32_I32_e32 VSrc_b32:$src))
>;
def : GCNPat <
(f16 (uint_to_fp i32:$src)),
- (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 $src))
+ (V_CVT_F16_F32_e32 (V_CVT_F32_U32_e32 VSrc_b32:$src))
>;
//===----------------------------------------------------------------------===//
@@ -808,8 +813,14 @@ def : GCNPat <
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
}
+
def : GCNPat <
- (i16 (add (i16 (trunc (getDivergentFrag<ctpop>.ret i32:$popcnt))), i16:$val)),
+ (i32 (ctpop i32:$popcnt)),
+ (V_BCNT_U32_B32_e64 VSrc_b32:$popcnt, (i32 0))
+>;
+
+def : GCNPat <
+ (i16 (add (i16 (trunc (i32 (getDivergentFrag<ctpop>.ret i32:$popcnt)))), i16:$val)),
(V_BCNT_U32_B32_e64 $popcnt, $val)
>;
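The patterns above map scalar population counts onto V_BCNT_U32_B32, which computes popcount(src0) + src1; a plain ctpop is therefore a bcnt with a zero addend, and an add of a ctpop folds the addend in directly. A standalone sketch of that semantics:

#include <bitset>
#include <cassert>
#include <cstdint>

// Reference model of V_BCNT_U32_B32: population count of src0 plus src1.
static uint32_t bcnt(uint32_t Src0, uint32_t Src1) {
  return static_cast<uint32_t>(std::bitset<32>(Src0).count()) + Src1;
}

int main() {
  assert(bcnt(0xF0F0F0F0u, 0) == 16);  // (ctpop x)          -> bcnt(x, 0)
  assert(bcnt(0xFFu, 5) == 13);        // (add (ctpop x), y) -> bcnt(x, y)
  return 0;
}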
@@ -1076,53 +1087,158 @@ def : GCNPat <
/********** ================================ **********/
// Prevent expanding both fneg and fabs.
+// TODO: Add IgnoredBySelectionDAG bit?
+let AddedComplexity = 1 in { // Prefer SALU to VALU patterns for DAG
def : GCNPat <
- (fneg (fabs f32:$src)),
- (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit
+ (fneg (fabs (f32 SReg_32:$src))),
+ (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000))) // Set sign bit
>;
-// FIXME: Should use S_OR_B32
def : GCNPat <
- (fneg (fabs f64:$src)),
- (REG_SEQUENCE VReg_64,
- (i32 (EXTRACT_SUBREG f64:$src, sub0)),
- sub0,
- (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
- (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
- sub1)
+ (fabs (f32 SReg_32:$src)),
+ (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fffffff)))
+>;
+
+def : GCNPat <
+ (fneg (f32 SReg_32:$src)),
+ (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000)))
+>;
+
+def : GCNPat <
+ (fneg (f16 SReg_32:$src)),
+ (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000)))
+>;
+
+def : GCNPat <
+ (fneg (f16 VGPR_32:$src)),
+ (V_XOR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fabs (f16 SReg_32:$src)),
+ (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00007fff)))
+>;
+
+def : GCNPat <
+ (fneg (fabs (f16 SReg_32:$src))),
+ (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
+>;
+
+def : GCNPat <
+ (fneg (fabs (f16 VGPR_32:$src))),
+ (V_OR_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src) // Set sign bit
+>;
+
+def : GCNPat <
+ (fneg (v2f16 SReg_32:$src)),
+ (S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000)))
+>;
+
+def : GCNPat <
+ (fabs (v2f16 SReg_32:$src)),
+ (S_AND_B32 SReg_32:$src, (S_MOV_B32 (i32 0x7fff7fff)))
+>;
+
+// This is really (fneg (fabs v2f16:$src))
+//
+// fabs is not reported as free because there is a modifier for it in
+// VOP3P instructions, so it is turned into the bit op.
+def : GCNPat <
+ (fneg (v2f16 (bitconvert (and_oneuse (i32 SReg_32:$src), 0x7fff7fff)))),
+ (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
def : GCNPat <
- (fabs f32:$src),
- (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fffffff)))
+ (fneg (v2f16 (fabs SReg_32:$src))),
+ (S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
+>;
+
+// FIXME: The implicit-def of scc from S_[X]OR_B32 is mishandled
+// def : GCNPat <
+// (fneg (f64 SReg_64:$src)),
+// (REG_SEQUENCE SReg_64,
+// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
+// sub0,
+// (S_XOR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
+// (i32 (S_MOV_B32 (i32 0x80000000)))),
+// sub1)
+// >;
+
+// def : GCNPat <
+// (fneg (fabs (f64 SReg_64:$src))),
+// (REG_SEQUENCE SReg_64,
+// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
+// sub0,
+// (S_OR_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
+// (S_MOV_B32 (i32 0x80000000))), // Set sign bit.
+// sub1)
+// >;
+
+} // End let AddedComplexity = 1
+
+def : GCNPat <
+ (fabs (f32 VGPR_32:$src)),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fffffff)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fneg (f32 VGPR_32:$src)),
+ (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80000000)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fabs (f16 VGPR_32:$src)),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x00007fff)), VGPR_32:$src)
>;
def : GCNPat <
- (fneg f32:$src),
- (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
+ (fneg (v2f16 VGPR_32:$src)),
+ (V_XOR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src)
>;
def : GCNPat <
- (fabs f64:$src),
+ (fabs (v2f16 VGPR_32:$src)),
+ (V_AND_B32_e32 (S_MOV_B32 (i32 0x7fff7fff)), VGPR_32:$src)
+>;
+
+def : GCNPat <
+ (fneg (v2f16 (fabs VGPR_32:$src))),
+ (V_OR_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src) // Set sign bit
+>;
+
+def : GCNPat <
+ (fabs (f64 VReg_64:$src)),
(REG_SEQUENCE VReg_64,
- (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
sub0,
- (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+ (V_AND_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
(V_MOV_B32_e32 (i32 0x7fffffff))), // Set sign bit.
sub1)
>;
+// TODO: Use SGPR for constant
def : GCNPat <
- (fneg f64:$src),
+ (fneg (f64 VReg_64:$src)),
(REG_SEQUENCE VReg_64,
- (i32 (EXTRACT_SUBREG f64:$src, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
sub0,
- (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+ (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
(i32 (V_MOV_B32_e32 (i32 0x80000000)))),
sub1)
>;
+// TODO: Use SGPR for constant
+def : GCNPat <
+ (fneg (fabs (f64 VReg_64:$src))),
+ (REG_SEQUENCE VReg_64,
+ (i32 (EXTRACT_SUBREG VReg_64:$src, sub0)),
+ sub0,
+ (V_OR_B32_e32 (i32 (EXTRACT_SUBREG VReg_64:$src, sub1)),
+ (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
+ sub1)
+>;
+
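The patterns above implement fneg, fabs and fneg(fabs) as plain bit operations on the sign bit (0x80000000 for f32, 0x8000 for f16, 0x80008000 for packed v2f16), choosing SALU or VALU forms by register bank. A standalone check of the underlying bit tricks for the f32 case:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

static float asFloat(uint32_t Bits) { float F; std::memcpy(&F, &Bits, sizeof F); return F; }
static uint32_t asBits(float F) { uint32_t B; std::memcpy(&B, &F, sizeof B); return B; }

int main() {
  const float Vals[] = {1.5f, -2.25f, 0.0f, -0.0f, 1e30f};
  for (float V : Vals) {
    uint32_t B = asBits(V);
    assert(asFloat(B ^ 0x80000000u) == -V);            // fneg: flip sign bit
    assert(asFloat(B & 0x7fffffffu) == std::fabs(V));  // fabs: clear sign bit
    assert(asFloat(B | 0x80000000u) == -std::fabs(V)); // fneg(fabs): set it
  }
  return 0;
}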
def : GCNPat <
(fcopysign f16:$src0, f16:$src1),
(V_BFI_B32 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -1154,45 +1270,6 @@ def : GCNPat <
(V_LSHRREV_B32_e64 (i32 16), (EXTRACT_SUBREG $src1, sub1)))
>;
-def : GCNPat <
- (fneg f16:$src),
- (S_XOR_B32 $src, (S_MOV_B32 (i32 0x00008000)))
->;
-
-def : GCNPat <
- (fabs f16:$src),
- (S_AND_B32 $src, (S_MOV_B32 (i32 0x00007fff)))
->;
-
-def : GCNPat <
- (fneg (fabs f16:$src)),
- (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
->;
-
-def : GCNPat <
- (fneg v2f16:$src),
- (S_XOR_B32 $src, (S_MOV_B32 (i32 0x80008000)))
->;
-
-def : GCNPat <
- (fabs v2f16:$src),
- (S_AND_B32 $src, (S_MOV_B32 (i32 0x7fff7fff)))
->;
-
-// This is really (fneg (fabs v2f16:$src))
-//
-// fabs is not reported as free because there is modifier for it in
-// VOP3P instructions, so it is turned into the bit op.
-def : GCNPat <
- (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
- (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
->;
-
-def : GCNPat <
- (fneg (v2f16 (fabs v2f16:$src))),
- (S_OR_B32 $src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
->;
-
/********** ================== **********/
/********** Immediate Patterns **********/
/********** ================== **********/
@@ -1544,7 +1621,7 @@ def : GCNPat <
(V_CVT_F16_F32_e32 (
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
- $src))
+ SSrc_i1:$src))
>;
def : GCNPat <
@@ -1552,35 +1629,35 @@ def : GCNPat <
(V_CVT_F16_F32_e32 (
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
- $src))
+ SSrc_i1:$src))
>;
def : GCNPat <
(f32 (sint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
- $src)
+ SSrc_i1:$src)
>;
def : GCNPat <
(f32 (uint_to_fp i1:$src)),
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
- $src)
+ SSrc_i1:$src)
>;
def : GCNPat <
(f64 (sint_to_fp i1:$src)),
(V_CVT_F64_I32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 -1),
- $src))
+ SSrc_i1:$src))
>;
def : GCNPat <
(f64 (uint_to_fp i1:$src)),
(V_CVT_F64_U32_e32 (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
/*src1mod*/(i32 0), /*src1*/(i32 1),
- $src))
+ SSrc_i1:$src))
>;
//===----------------------------------------------------------------------===//
@@ -1788,6 +1865,22 @@ def : GCNPat <
(INSERT_SUBREG (IMPLICIT_DEF), $src0, sub0)
>;
+def : GCNPat <
+ (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
+ timm:$bound_ctrl)),
+ (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl),
+ (as_i32imm $row_mask), (as_i32imm $bank_mask),
+ (as_i1imm $bound_ctrl))
+>;
+
+def : GCNPat <
+ (i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
+ timm:$bank_mask, timm:$bound_ctrl)),
+ (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl),
+ (as_i32imm $row_mask), (as_i32imm $bank_mask),
+ (as_i1imm $bound_ctrl))
+>;
+
//===----------------------------------------------------------------------===//
// Fract Patterns
//===----------------------------------------------------------------------===//
@@ -1915,3 +2008,13 @@ def : FP16Med3Pat<f16, V_MED3_F16>;
defm : Int16Med3Pat<V_MED3_I16, smin, smax, smax_oneuse, smin_oneuse>;
defm : Int16Med3Pat<V_MED3_U16, umin, umax, umax_oneuse, umin_oneuse>;
} // End Predicates = [isGFX9Plus]
+
+class AMDGPUGenericInstruction : GenericInstruction {
+ let Namespace = "AMDGPU";
+}
+
+def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = 0;
+}
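// Reference semantics assumed for G_AMDGPU_FFBH_U32 (find first bit high):
// count the leading zeros of the 32-bit source, returning all ones (-1) for
// a zero input, e.g. in C: src == 0 ? ~0u : (unsigned)__builtin_clz(src).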
diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index ae8b967893a2..20db1c37f354 100644
--- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -42,10 +42,7 @@
//
// Future improvements:
//
-// - This currently relies on the scheduler to place loads and stores next to
-// each other, and then only merges adjacent pairs of instructions. It would
-// be good to be more flexible with interleaved instructions, and possibly run
-// before scheduling. It currently missing stores of constants because loading
+// - This currently misses stores of constants because loading
// the constant into the data register is placed between the stores, although
// this is arguably a scheduling problem.
//
@@ -98,14 +95,9 @@ enum InstClassEnum {
DS_READ,
DS_WRITE,
S_BUFFER_LOAD_IMM,
- BUFFER_LOAD_OFFEN = AMDGPU::BUFFER_LOAD_DWORD_OFFEN,
- BUFFER_LOAD_OFFSET = AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
- BUFFER_STORE_OFFEN = AMDGPU::BUFFER_STORE_DWORD_OFFEN,
- BUFFER_STORE_OFFSET = AMDGPU::BUFFER_STORE_DWORD_OFFSET,
- BUFFER_LOAD_OFFEN_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact,
- BUFFER_LOAD_OFFSET_exact = AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact,
- BUFFER_STORE_OFFEN_exact = AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact,
- BUFFER_STORE_OFFSET_exact = AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact,
+ BUFFER_LOAD,
+ BUFFER_STORE,
+ MIMG,
};
enum RegisterEnum {
@@ -114,6 +106,7 @@ enum RegisterEnum {
SOFFSET = 0x4,
VADDR = 0x8,
ADDR = 0x10,
+ SSAMP = 0x20,
};
class SILoadStoreOptimizer : public MachineFunctionPass {
@@ -126,6 +119,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
unsigned Width0;
unsigned Width1;
unsigned BaseOff;
+ unsigned DMask0;
+ unsigned DMask1;
InstClassEnum InstClass;
bool GLC0;
bool GLC1;
@@ -135,6 +130,60 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
bool DLC1;
bool UseST64;
SmallVector<MachineInstr *, 8> InstsToMove;
+ int AddrIdx[5];
+ const MachineOperand *AddrReg[5];
+ unsigned NumAddresses;
+
+ bool hasSameBaseAddress(const MachineInstr &MI) {
+ for (unsigned i = 0; i < NumAddresses; i++) {
+ const MachineOperand &AddrRegNext = MI.getOperand(AddrIdx[i]);
+
+ if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
+ if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
+ AddrReg[i]->getImm() != AddrRegNext.getImm()) {
+ return false;
+ }
+ continue;
+ }
+
+ // Check same base pointer. Be careful of subregisters, which can occur
+ // with vectors of pointers.
+ if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
+ AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ bool hasMergeableAddress(const MachineRegisterInfo &MRI) {
+ for (unsigned i = 0; i < NumAddresses; ++i) {
+ const MachineOperand *AddrOp = AddrReg[i];
+ // Immediates are always OK.
+ if (AddrOp->isImm())
+ continue;
+
+      // Don't try to merge addresses that are neither immediates nor registers.
+      // TODO: Should be possible to merge FrameIndexes and maybe some other
+      // non-register operands.
+ if (!AddrOp->isReg())
+ return false;
+
+      // TODO: We should be able to merge physical reg addresses.
+ if (Register::isPhysicalRegister(AddrOp->getReg()))
+ return false;
+
+      // If an address has only one use then there will be no other
+      // instructions with the same address, so we can't merge this one.
+ if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
+ return false;
+ }
+ return true;
+ }
+
+ void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
+ const GCNSubtarget &STM);
+ void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
};
struct BaseRegisters {
@@ -160,14 +209,12 @@ private:
AliasAnalysis *AA = nullptr;
bool OptimizeAgain;
+ static bool dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII);
static bool offsetsCanBeCombined(CombineInfo &CI);
static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI);
static unsigned getNewOpcode(const CombineInfo &CI);
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI);
const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI);
- unsigned getOpcodeWidth(const MachineInstr &MI);
- InstClassEnum getInstClass(unsigned Opc);
- unsigned getRegs(unsigned Opc);
bool findMatchingInst(CombineInfo &CI);
@@ -178,22 +225,27 @@ private:
unsigned write2Opcode(unsigned EltSize) const;
unsigned write2ST64Opcode(unsigned EltSize) const;
MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI);
+ MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI);
MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI);
MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI);
MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI);
void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
- int32_t NewOffset);
- unsigned computeBase(MachineInstr &MI, const MemAddress &Addr);
- MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI);
- Optional<int32_t> extractConstOffset(const MachineOperand &Op);
- void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr);
+ int32_t NewOffset) const;
+ unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
+ MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
+ Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
+ void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
/// Promotes constant offset to the immediate by adjusting the base. It
/// tries to use a base from the nearby instructions that allows it to have
/// a 13-bit constant offset which gets promoted to the immediate.
bool promoteConstantOffsetToImm(MachineInstr &CI,
MemInfoMap &Visited,
- SmallPtrSet<MachineInstr *, 4> &Promoted);
+ SmallPtrSet<MachineInstr *, 4> &Promoted) const;
+ void addInstToMergeableList(const CombineInfo &CI,
+ std::list<std::list<CombineInfo> > &MergeableInsts) const;
+ bool collectMergeableInsts(MachineBasicBlock &MBB,
+ std::list<std::list<CombineInfo> > &MergeableInsts) const;
public:
static char ID;
@@ -202,7 +254,11 @@ public:
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
}
- bool optimizeBlock(MachineBasicBlock &MBB);
+ void removeCombinedInst(std::list<CombineInfo> &MergeList,
+ const MachineInstr &MI);
+ bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
+ bool &OptimizeListAgain);
+ bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
bool runOnMachineFunction(MachineFunction &MF) override;
@@ -216,6 +272,264 @@ public:
}
};
+static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
+ const unsigned Opc = MI.getOpcode();
+
+ if (TII.isMUBUF(Opc)) {
+ // FIXME: Handle d16 correctly
+ return AMDGPU::getMUBUFElements(Opc);
+ }
+ if (TII.isMIMG(MI)) {
+ uint64_t DMaskImm =
+ TII.getNamedOperand(MI, AMDGPU::OpName::dmask)->getImm();
+ return countPopulation(DMaskImm);
+ }
+
+ switch (Opc) {
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ return 1;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ return 2;
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return 4;
+ default:
+ return 0;
+ }
+}
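// For MIMG the width is the number of enabled dmask channels, so a
// (hypothetical) dmask of 0b1011 reads or writes three dwords and the
// function above returns 3.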
+
+/// Maps instruction opcode to enum InstClassEnum.
+static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
+ switch (Opc) {
+ default:
+ if (TII.isMUBUF(Opc)) {
+ switch (AMDGPU::getMUBUFBaseOpcode(Opc)) {
+ default:
+ return UNKNOWN;
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
+ case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
+ return BUFFER_LOAD;
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
+ case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
+ case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
+ return BUFFER_STORE;
+ }
+ }
+ if (TII.isMIMG(Opc)) {
+ // Ignore instructions encoded without vaddr.
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
+ return UNKNOWN;
+ // TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
+ if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() || TII.isGather4(Opc))
+ return UNKNOWN;
+ return MIMG;
+ }
+ return UNKNOWN;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return S_BUFFER_LOAD_IMM;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B64_gfx9:
+ return DS_READ;
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return DS_WRITE;
+ }
+}
+
+/// Determines instruction subclass from opcode. Only instructions
+/// of the same subclass can be merged together.
+static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
+ switch (Opc) {
+ default:
+ if (TII.isMUBUF(Opc))
+ return AMDGPU::getMUBUFBaseOpcode(Opc);
+ if (TII.isMIMG(Opc)) {
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
+ assert(Info);
+ return Info->BaseOpcode;
+ }
+ return -1;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B64_gfx9:
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return Opc;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return AMDGPU::S_BUFFER_LOAD_DWORD_IMM;
+ }
+}
+
+static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
+ if (TII.isMUBUF(Opc)) {
+ unsigned result = 0;
+
+ if (AMDGPU::getMUBUFHasVAddr(Opc)) {
+ result |= VADDR;
+ }
+
+ if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
+ result |= SRSRC;
+ }
+
+ if (AMDGPU::getMUBUFHasSoffset(Opc)) {
+ result |= SOFFSET;
+ }
+
+ return result;
+ }
+
+ if (TII.isMIMG(Opc)) {
+ unsigned result = VADDR | SRSRC;
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
+ if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
+ result |= SSAMP;
+ return result;
+ }
+
+ switch (Opc) {
+ default:
+ return 0;
+ case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
+ case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
+ return SBASE;
+ case AMDGPU::DS_READ_B32:
+ case AMDGPU::DS_READ_B64:
+ case AMDGPU::DS_READ_B32_gfx9:
+ case AMDGPU::DS_READ_B64_gfx9:
+ case AMDGPU::DS_WRITE_B32:
+ case AMDGPU::DS_WRITE_B64:
+ case AMDGPU::DS_WRITE_B32_gfx9:
+ case AMDGPU::DS_WRITE_B64_gfx9:
+ return ADDR;
+ }
+}
+
+
+void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
+ const SIInstrInfo &TII,
+ const GCNSubtarget &STM) {
+ I = MI;
+ unsigned Opc = MI->getOpcode();
+ InstClass = getInstClass(Opc, TII);
+
+ if (InstClass == UNKNOWN)
+ return;
+
+ switch (InstClass) {
+ case DS_READ:
+ EltSize =
+ (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
+ : 4;
+ break;
+ case DS_WRITE:
+ EltSize =
+ (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
+ : 4;
+ break;
+ case S_BUFFER_LOAD_IMM:
+ EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
+ break;
+ default:
+ EltSize = 4;
+ break;
+ }
+
+ if (InstClass == MIMG) {
+ DMask0 = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
+ } else {
+ int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
+ Offset0 = I->getOperand(OffsetIdx).getImm();
+ }
+
+ Width0 = getOpcodeWidth(*I, TII);
+
+ if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
+ Offset0 &= 0xffff;
+ } else if (InstClass != MIMG) {
+ GLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::glc)->getImm();
+ if (InstClass != S_BUFFER_LOAD_IMM) {
+ SLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::slc)->getImm();
+ }
+ DLC0 = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
+ }
+
+ unsigned AddrOpName[5] = {0};
+ NumAddresses = 0;
+ const unsigned Regs = getRegs(I->getOpcode(), TII);
+
+ if (Regs & ADDR) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
+ }
+
+ if (Regs & SBASE) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
+ }
+
+ if (Regs & SRSRC) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
+ }
+
+ if (Regs & SOFFSET) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
+ }
+
+ if (Regs & VADDR) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
+ }
+
+ if (Regs & SSAMP) {
+ AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
+ }
+
+ for (unsigned i = 0; i < NumAddresses; i++) {
+ AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
+ AddrReg[i] = &I->getOperand(AddrIdx[i]);
+ }
+
+ InstsToMove.clear();
+}
+
+void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
+ const SIInstrInfo &TII) {
+ Paired = MI;
+ assert(InstClass == getInstClass(Paired->getOpcode(), TII));
+
+ if (InstClass == MIMG) {
+ DMask1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dmask)->getImm();
+ } else {
+ int OffsetIdx =
+ AMDGPU::getNamedOperandIdx(I->getOpcode(), AMDGPU::OpName::offset);
+ Offset1 = Paired->getOperand(OffsetIdx).getImm();
+ }
+
+ Width1 = getOpcodeWidth(*Paired, TII);
+ if ((InstClass == DS_READ) || (InstClass == DS_WRITE)) {
+ Offset1 &= 0xffff;
+ } else if (InstClass != MIMG) {
+ GLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::glc)->getImm();
+ if (InstClass != S_BUFFER_LOAD_IMM) {
+ SLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::slc)->getImm();
+ }
+ DLC1 = TII.getNamedOperand(*Paired, AMDGPU::OpName::dlc)->getImm();
+ }
+}
+
+
} // end anonymous namespace.
INITIALIZE_PASS_BEGIN(SILoadStoreOptimizer, DEBUG_TYPE,
@@ -249,8 +563,7 @@ static void addDefsUsesToList(const MachineInstr &MI,
if (Op.isReg()) {
if (Op.isDef())
RegDefs.insert(Op.getReg());
- else if (Op.readsReg() &&
- TargetRegisterInfo::isPhysicalRegister(Op.getReg()))
+ else if (Op.readsReg() && Register::isPhysicalRegister(Op.getReg()))
PhysRegUses.insert(Op.getReg());
}
}
@@ -282,7 +595,7 @@ static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
if (Use.isReg() &&
((Use.readsReg() && RegDefs.count(Use.getReg())) ||
(Use.isDef() && RegDefs.count(Use.getReg())) ||
- (Use.isDef() && TargetRegisterInfo::isPhysicalRegister(Use.getReg()) &&
+ (Use.isDef() && Register::isPhysicalRegister(Use.getReg()) &&
PhysRegUses.count(Use.getReg())))) {
Insts.push_back(&MI);
addDefsUsesToList(MI, RegDefs, PhysRegUses);
@@ -307,7 +620,59 @@ static bool canMoveInstsAcrossMemOp(MachineInstr &MemOp,
return true;
}
+// This function assumes that \p A and \p B are identical except for
+// size and offset, and that they reference adjacent memory.
+static MachineMemOperand *combineKnownAdjacentMMOs(MachineFunction &MF,
+ const MachineMemOperand *A,
+ const MachineMemOperand *B) {
+ unsigned MinOffset = std::min(A->getOffset(), B->getOffset());
+ unsigned Size = A->getSize() + B->getSize();
+ // This function adds the offset parameter to the existing offset for A,
+ // so we pass 0 here as the offset and then manually set it to the correct
+ // value after the call.
+ MachineMemOperand *MMO = MF.getMachineMemOperand(A, 0, Size);
+ MMO->setOffset(MinOffset);
+ return MMO;
+}
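// Worked example (offsets and sizes hypothetical): merging a memoperand at
// offset 16 with size 4 and one at offset 20 with size 4 yields a single
// memoperand at offset min(16, 20) = 16 with size 4 + 4 = 8, matching the
// range the merged instruction will access.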
+
+bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI, const SIInstrInfo &TII) {
+ assert(CI.InstClass == MIMG);
+
+ // Ignore instructions with tfe/lwe set.
+ const auto *TFEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::tfe);
+ const auto *LWEOp = TII.getNamedOperand(*CI.I, AMDGPU::OpName::lwe);
+
+ if ((TFEOp && TFEOp->getImm()) || (LWEOp && LWEOp->getImm()))
+ return false;
+
+ // Check other optional immediate operands for equality.
+ unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
+ AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
+ AMDGPU::OpName::da, AMDGPU::OpName::r128};
+
+ for (auto op : OperandsToMatch) {
+ int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
+ if (AMDGPU::getNamedOperandIdx(CI.Paired->getOpcode(), op) != Idx)
+ return false;
+ if (Idx != -1 &&
+ CI.I->getOperand(Idx).getImm() != CI.Paired->getOperand(Idx).getImm())
+ return false;
+ }
+
+ // Check DMask for overlaps.
+ unsigned MaxMask = std::max(CI.DMask0, CI.DMask1);
+ unsigned MinMask = std::min(CI.DMask0, CI.DMask1);
+
+ unsigned AllowedBitsForMin = llvm::countTrailingZeros(MaxMask);
+ if ((1u << AllowedBitsForMin) <= MinMask)
+ return false;
+
+ return true;
+}
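// A standalone sketch of the dmask test above, assuming both masks are
// nonzero (all mask values here are hypothetical):
//   DMask0 = 0b0011, DMask1 = 0b1100: 1u << countTrailingZeros(0b1100) ==
//   0b0100 > 0b0011, so the channel sets are disjoint and the pair can merge.
//   DMask0 = 0b0101, DMask1 = 0b1010: 1u << 1 == 0b0010 <= 0b0101, rejected.
static bool dmasksAreCompatible(unsigned DMask0, unsigned DMask1) {
  unsigned MaxMask = std::max(DMask0, DMask1);
  unsigned MinMask = std::min(DMask0, DMask1);
  // All channels of the smaller mask must lie below the lowest channel of
  // the larger mask.
  return (1u << llvm::countTrailingZeros(MaxMask)) > MinMask;
}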
+
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI) {
+ assert(CI.InstClass != MIMG);
+
// XXX - Would the same offset be OK? Is there any reason this would happen or
// be useful?
if (CI.Offset0 == CI.Offset1)
@@ -384,164 +749,24 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
}
}
-unsigned SILoadStoreOptimizer::getOpcodeWidth(const MachineInstr &MI) {
- const unsigned Opc = MI.getOpcode();
-
- if (TII->isMUBUF(MI)) {
- return AMDGPU::getMUBUFDwords(Opc);
- }
-
- switch (Opc) {
- default:
- return 0;
- case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
- return 1;
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
- return 2;
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
- return 4;
- }
-}
-
-InstClassEnum SILoadStoreOptimizer::getInstClass(unsigned Opc) {
- if (TII->isMUBUF(Opc)) {
- const int baseOpcode = AMDGPU::getMUBUFBaseOpcode(Opc);
-
- // If we couldn't identify the opcode, bail out.
- if (baseOpcode == -1) {
- return UNKNOWN;
- }
-
- switch (baseOpcode) {
- default:
- return UNKNOWN;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFEN:
- return BUFFER_LOAD_OFFEN;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFSET:
- return BUFFER_LOAD_OFFSET;
- case AMDGPU::BUFFER_STORE_DWORD_OFFEN:
- return BUFFER_STORE_OFFEN;
- case AMDGPU::BUFFER_STORE_DWORD_OFFSET:
- return BUFFER_STORE_OFFSET;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFEN_exact:
- return BUFFER_LOAD_OFFEN_exact;
- case AMDGPU::BUFFER_LOAD_DWORD_OFFSET_exact:
- return BUFFER_LOAD_OFFSET_exact;
- case AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact:
- return BUFFER_STORE_OFFEN_exact;
- case AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact:
- return BUFFER_STORE_OFFSET_exact;
- }
- }
-
- switch (Opc) {
- default:
- return UNKNOWN;
- case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
- return S_BUFFER_LOAD_IMM;
- case AMDGPU::DS_READ_B32:
- case AMDGPU::DS_READ_B64:
- case AMDGPU::DS_READ_B32_gfx9:
- case AMDGPU::DS_READ_B64_gfx9:
- return DS_READ;
- case AMDGPU::DS_WRITE_B32:
- case AMDGPU::DS_WRITE_B64:
- case AMDGPU::DS_WRITE_B32_gfx9:
- case AMDGPU::DS_WRITE_B64_gfx9:
- return DS_WRITE;
- }
-}
-
-unsigned SILoadStoreOptimizer::getRegs(unsigned Opc) {
- if (TII->isMUBUF(Opc)) {
- unsigned result = 0;
-
- if (AMDGPU::getMUBUFHasVAddr(Opc)) {
- result |= VADDR;
- }
-
- if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
- result |= SRSRC;
- }
-
- if (AMDGPU::getMUBUFHasSoffset(Opc)) {
- result |= SOFFSET;
- }
-
- return result;
- }
-
- switch (Opc) {
- default:
- return 0;
- case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
- case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
- return SBASE;
- case AMDGPU::DS_READ_B32:
- case AMDGPU::DS_READ_B64:
- case AMDGPU::DS_READ_B32_gfx9:
- case AMDGPU::DS_READ_B64_gfx9:
- case AMDGPU::DS_WRITE_B32:
- case AMDGPU::DS_WRITE_B64:
- case AMDGPU::DS_WRITE_B32_gfx9:
- case AMDGPU::DS_WRITE_B64_gfx9:
- return ADDR;
- }
-}
-
bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
MachineBasicBlock *MBB = CI.I->getParent();
MachineBasicBlock::iterator E = MBB->end();
MachineBasicBlock::iterator MBBI = CI.I;
const unsigned Opc = CI.I->getOpcode();
- const InstClassEnum InstClass = getInstClass(Opc);
+ const InstClassEnum InstClass = getInstClass(Opc, *TII);
if (InstClass == UNKNOWN) {
return false;
}
+ const unsigned InstSubclass = getInstSubclass(Opc, *TII);
- const unsigned Regs = getRegs(Opc);
-
- unsigned AddrOpName[5] = {0};
- int AddrIdx[5];
- const MachineOperand *AddrReg[5];
- unsigned NumAddresses = 0;
-
- if (Regs & ADDR) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
- }
-
- if (Regs & SBASE) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
- }
-
- if (Regs & SRSRC) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
- }
-
- if (Regs & SOFFSET) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
- }
-
- if (Regs & VADDR) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
- }
-
- for (unsigned i = 0; i < NumAddresses; i++) {
- AddrIdx[i] = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AddrOpName[i]);
- AddrReg[i] = &CI.I->getOperand(AddrIdx[i]);
-
- // We only ever merge operations with the same base address register, so
- // don't bother scanning forward if there are no other uses.
- if (AddrReg[i]->isReg() &&
- (TargetRegisterInfo::isPhysicalRegister(AddrReg[i]->getReg()) ||
- MRI->hasOneNonDBGUse(AddrReg[i]->getReg())))
- return false;
- }
+ // Do not merge VMEM buffer instructions with "swizzled" bit set.
+ int Swizzled =
+ AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz);
+ if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
+ return false;
++MBBI;
@@ -550,11 +775,10 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
for (; MBBI != E; ++MBBI) {
- const bool IsDS = (InstClass == DS_READ) || (InstClass == DS_WRITE);
- if ((getInstClass(MBBI->getOpcode()) != InstClass) ||
- (IsDS && (MBBI->getOpcode() != Opc))) {
- // This is not a matching DS instruction, but we can keep looking as
+ if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
+ (getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
+ // This is not a matching instruction, but we can keep looking as
// long as one of these conditions are met:
// 1. It is safe to move I down past MBBI.
// 2. It is safe to move MBBI down past the instruction that I will
@@ -599,58 +823,23 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
CI.InstsToMove))
continue;
- bool Match = true;
- for (unsigned i = 0; i < NumAddresses; i++) {
- const MachineOperand &AddrRegNext = MBBI->getOperand(AddrIdx[i]);
-
- if (AddrReg[i]->isImm() || AddrRegNext.isImm()) {
- if (AddrReg[i]->isImm() != AddrRegNext.isImm() ||
- AddrReg[i]->getImm() != AddrRegNext.getImm()) {
- Match = false;
- break;
- }
- continue;
- }
-
- // Check same base pointer. Be careful of subregisters, which can occur
- // with vectors of pointers.
- if (AddrReg[i]->getReg() != AddrRegNext.getReg() ||
- AddrReg[i]->getSubReg() != AddrRegNext.getSubReg()) {
- Match = false;
- break;
- }
- }
+ bool Match = CI.hasSameBaseAddress(*MBBI);
if (Match) {
- int OffsetIdx =
- AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::offset);
- CI.Offset0 = CI.I->getOperand(OffsetIdx).getImm();
- CI.Width0 = getOpcodeWidth(*CI.I);
- CI.Offset1 = MBBI->getOperand(OffsetIdx).getImm();
- CI.Width1 = getOpcodeWidth(*MBBI);
- CI.Paired = MBBI;
-
- if ((CI.InstClass == DS_READ) || (CI.InstClass == DS_WRITE)) {
- CI.Offset0 &= 0xffff;
- CI.Offset1 &= 0xffff;
- } else {
- CI.GLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::glc)->getImm();
- CI.GLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::glc)->getImm();
- if (CI.InstClass != S_BUFFER_LOAD_IMM) {
- CI.SLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::slc)->getImm();
- CI.SLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::slc)->getImm();
- }
- CI.DLC0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::dlc)->getImm();
- CI.DLC1 = TII->getNamedOperand(*MBBI, AMDGPU::OpName::dlc)->getImm();
- }
+ CI.setPaired(MBBI, *TII);
+
+ // Check both offsets (or masks for MIMG) can be combined and fit in the
+ // reduced range.
+ bool canBeCombined =
+ CI.InstClass == MIMG
+ ? dmasksCanBeCombined(CI, *TII)
+ : widthsFit(*STM, CI) && offsetsCanBeCombined(CI);
- // Check both offsets fit in the reduced range.
// We also need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
- if (widthsFit(*STM, CI) && offsetsCanBeCombined(CI))
- if (canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
- return true;
+ if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+ return true;
}
// We've found a load/store that we couldn't merge for some reason.
@@ -711,15 +900,15 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
const TargetRegisterClass *SuperRC =
(CI.EltSize == 4) ? &AMDGPU::VReg_64RegClass : &AMDGPU::VReg_128RegClass;
- unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ Register DestReg = MRI->createVirtualRegister(SuperRC);
DebugLoc DL = CI.I->getDebugLoc();
- unsigned BaseReg = AddrReg->getReg();
+ Register BaseReg = AddrReg->getReg();
unsigned BaseSubReg = AddrReg->getSubReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
- unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);
@@ -755,12 +944,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) {
moveInstsAfter(Copy1, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(CI.I);
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n');
- return Next;
+ return Read2;
}
unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const {
@@ -809,11 +997,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
const MCInstrDesc &Write2Desc = TII->get(Opc);
DebugLoc DL = CI.I->getDebugLoc();
- unsigned BaseReg = AddrReg->getReg();
+ Register BaseReg = AddrReg->getReg();
unsigned BaseSubReg = AddrReg->getSubReg();
unsigned BaseRegFlags = 0;
if (CI.BaseOff) {
- unsigned ImmReg = MRI->createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ Register ImmReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, CI.Paired, DL, TII->get(AMDGPU::S_MOV_B32), ImmReg)
.addImm(CI.BaseOff);
@@ -839,12 +1027,65 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) {
moveInstsAfter(Write2, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(CI.I);
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n');
- return Next;
+ return Write2;
+}
+
+MachineBasicBlock::iterator
+SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI) {
+ MachineBasicBlock *MBB = CI.I->getParent();
+ DebugLoc DL = CI.I->getDebugLoc();
+ const unsigned Opcode = getNewOpcode(CI);
+
+ const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
+
+ Register DestReg = MRI->createVirtualRegister(SuperRC);
+ unsigned MergedDMask = CI.DMask0 | CI.DMask1;
+ unsigned DMaskIdx =
+ AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::dmask);
+
+ auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
+ for (unsigned I = 1, E = (*CI.I).getNumOperands(); I != E; ++I) {
+ if (I == DMaskIdx)
+ MIB.addImm(MergedDMask);
+ else
+ MIB.add((*CI.I).getOperand(I));
+ }
+
+ // It shouldn't be possible to get this far if the two instructions
+ // don't have a single memoperand, because MachineInstr::mayAlias()
+ // will return true if this is the case.
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+ const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+ const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+ MachineInstr *New = MIB.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+
+ std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
+ const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
+ const unsigned SubRegIdx1 = std::get<1>(SubRegIdx);
+
+ // Copy to the old destination registers.
+ const MCInstrDesc &CopyDesc = TII->get(TargetOpcode::COPY);
+ const auto *Dest0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
+ const auto *Dest1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
+
+ BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest0) // Copy to same destination including flags and sub reg.
+ .addReg(DestReg, 0, SubRegIdx0);
+ MachineInstr *Copy1 = BuildMI(*MBB, CI.Paired, DL, CopyDesc)
+ .add(*Dest1)
+ .addReg(DestReg, RegState::Kill, SubRegIdx1);
+
+ moveInstsAfter(Copy1, CI.InstsToMove);
+
+ CI.I->eraseFromParent();
+ CI.Paired->eraseFromParent();
+ return New;
}
MachineBasicBlock::iterator
@@ -855,15 +1096,24 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
- unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ Register DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
- BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
- .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
- .addImm(MergedOffset) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.DLC0) // dlc
- .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+ // It shouldn't be possible to get this far if the two instructions
+ // don't have a single memoperand, because MachineInstr::mayAlias()
+ // will return true if this is the case.
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+ const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+ const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+ MachineInstr *New =
+ BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg)
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.DLC0) // dlc
+ .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -883,10 +1133,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) {
moveInstsAfter(Copy1, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(CI.I);
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- return Next;
+ return New;
}
MachineBasicBlock::iterator
@@ -899,24 +1148,34 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
// Copy to the new source register.
- unsigned DestReg = MRI->createVirtualRegister(SuperRC);
+ Register DestReg = MRI->createVirtualRegister(SuperRC);
unsigned MergedOffset = std::min(CI.Offset0, CI.Offset1);
auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg);
- const unsigned Regs = getRegs(Opcode);
+ const unsigned Regs = getRegs(Opcode, *TII);
if (Regs & VADDR)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
- .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
- .addImm(MergedOffset) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
- .addImm(0) // tfe
- .addImm(CI.DLC0) // dlc
- .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+ // It shouldn't be possible to get this far if the two instructions
+ // don't have a single memoperand, because MachineInstr::mayAlias()
+ // will return true if this is the case.
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+ const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+ const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+ MachineInstr *New =
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(MergedOffset) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .addImm(CI.DLC0) // dlc
+ .addImm(0) // swz
+ .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
std::pair<unsigned, unsigned> SubRegIdx = getSubRegIdxs(CI);
const unsigned SubRegIdx0 = std::get<0>(SubRegIdx);
@@ -936,10 +1195,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) {
moveInstsAfter(Copy1, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(CI.I);
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- return Next;
+ return New;
}
unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
@@ -947,7 +1205,10 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
switch (CI.InstClass) {
default:
- return AMDGPU::getMUBUFOpcode(CI.InstClass, Width);
+ assert(CI.InstClass == BUFFER_LOAD || CI.InstClass == BUFFER_STORE);
+ // FIXME: Handle d16 correctly
+ return AMDGPU::getMUBUFOpcode(AMDGPU::getMUBUFBaseOpcode(CI.I->getOpcode()),
+ Width);
case UNKNOWN:
llvm_unreachable("Unknown instruction class");
case S_BUFFER_LOAD_IMM:
@@ -959,76 +1220,47 @@ unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) {
case 4:
return AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM;
}
+ case MIMG:
+    assert((countPopulation(CI.DMask0 | CI.DMask1) == Width) && "No overlaps");
+ return AMDGPU::getMaskedMIMGOp(CI.I->getOpcode(), Width);
}
}
std::pair<unsigned, unsigned>
SILoadStoreOptimizer::getSubRegIdxs(const CombineInfo &CI) {
- if (CI.Offset0 > CI.Offset1) {
- switch (CI.Width0) {
- default:
- return std::make_pair(0, 0);
- case 1:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub1, AMDGPU::sub0);
- case 2:
- return std::make_pair(AMDGPU::sub2, AMDGPU::sub0_sub1);
- case 3:
- return std::make_pair(AMDGPU::sub3, AMDGPU::sub0_sub1_sub2);
- }
- case 2:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub1_sub2, AMDGPU::sub0);
- case 2:
- return std::make_pair(AMDGPU::sub2_sub3, AMDGPU::sub0_sub1);
- }
- case 3:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub1_sub2_sub3, AMDGPU::sub0);
- }
- }
+
+  if (CI.Width0 == 0 || CI.Width1 == 0 || CI.Width0 + CI.Width1 > 4)
+ return std::make_pair(0, 0);
+
+ bool ReverseOrder;
+ if (CI.InstClass == MIMG) {
+ assert((countPopulation(CI.DMask0 | CI.DMask1) == CI.Width0 + CI.Width1) &&
+ "No overlaps");
+ ReverseOrder = CI.DMask0 > CI.DMask1;
+ } else
+ ReverseOrder = CI.Offset0 > CI.Offset1;
+
+ static const unsigned Idxs[4][4] = {
+ {AMDGPU::sub0, AMDGPU::sub0_sub1, AMDGPU::sub0_sub1_sub2, AMDGPU::sub0_sub1_sub2_sub3},
+ {AMDGPU::sub1, AMDGPU::sub1_sub2, AMDGPU::sub1_sub2_sub3, 0},
+ {AMDGPU::sub2, AMDGPU::sub2_sub3, 0, 0},
+ {AMDGPU::sub3, 0, 0, 0},
+ };
+ unsigned Idx0;
+ unsigned Idx1;
+
+ assert(CI.Width0 >= 1 && CI.Width0 <= 3);
+ assert(CI.Width1 >= 1 && CI.Width1 <= 3);
+
+ if (ReverseOrder) {
+ Idx1 = Idxs[0][CI.Width1 - 1];
+ Idx0 = Idxs[CI.Width1][CI.Width0 - 1];
} else {
- switch (CI.Width0) {
- default:
- return std::make_pair(0, 0);
- case 1:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub0, AMDGPU::sub1);
- case 2:
- return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2);
- case 3:
- return std::make_pair(AMDGPU::sub0, AMDGPU::sub1_sub2_sub3);
- }
- case 2:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2);
- case 2:
- return std::make_pair(AMDGPU::sub0_sub1, AMDGPU::sub2_sub3);
- }
- case 3:
- switch (CI.Width1) {
- default:
- return std::make_pair(0, 0);
- case 1:
- return std::make_pair(AMDGPU::sub0_sub1_sub2, AMDGPU::sub3);
- }
- }
+ Idx0 = Idxs[0][CI.Width0 - 1];
+ Idx1 = Idxs[CI.Width0][CI.Width1 - 1];
}
+
+ return std::make_pair(Idx0, Idx1);
}
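// Worked example of the index table above (widths hypothetical): with
// Width0 = 1, Width1 = 2 and offsets in order, Idx0 = Idxs[0][0] = sub0 and
// Idx1 = Idxs[1][1] = sub1_sub2, so the first dword of the merged result
// replaces the 1-dword def and the next two dwords replace the 2-dword def.
// With ReverseOrder the pair becomes Idx1 = sub0_sub1 and Idx0 = sub2.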
const TargetRegisterClass *
@@ -1040,7 +1272,7 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI) {
case 2:
return &AMDGPU::SReg_64_XEXECRegClass;
case 4:
- return &AMDGPU::SReg_128RegClass;
+ return &AMDGPU::SGPR_128RegClass;
case 8:
return &AMDGPU::SReg_256RegClass;
case 16:
@@ -1073,7 +1305,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
// Copy to the new source register.
const TargetRegisterClass *SuperRC = getTargetRegisterClass(CI);
- unsigned SrcReg = MRI->createVirtualRegister(SuperRC);
+ Register SrcReg = MRI->createVirtualRegister(SuperRC);
const auto *Src0 = TII->getNamedOperand(*CI.I, AMDGPU::OpName::vdata);
const auto *Src1 = TII->getNamedOperand(*CI.Paired, AMDGPU::OpName::vdata);
@@ -1087,35 +1319,45 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
auto MIB = BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
- const unsigned Regs = getRegs(Opcode);
+ const unsigned Regs = getRegs(Opcode, *TII);
if (Regs & VADDR)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
- MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
- .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
- .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
- .addImm(CI.GLC0) // glc
- .addImm(CI.SLC0) // slc
- .addImm(0) // tfe
- .addImm(CI.DLC0) // dlc
- .cloneMergedMemRefs({&*CI.I, &*CI.Paired});
+
+ // It shouldn't be possible to get this far if the two instructions
+ // don't have a single memoperand, because MachineInstr::mayAlias()
+ // will return true if this is the case.
+ assert(CI.I->hasOneMemOperand() && CI.Paired->hasOneMemOperand());
+
+ const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
+ const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
+
+ MachineInstr *New =
+ MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+ .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+ .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+ .addImm(CI.GLC0) // glc
+ .addImm(CI.SLC0) // slc
+ .addImm(0) // tfe
+ .addImm(CI.DLC0) // dlc
+ .addImm(0) // swz
+ .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
moveInstsAfter(MIB, CI.InstsToMove);
- MachineBasicBlock::iterator Next = std::next(CI.I);
CI.I->eraseFromParent();
CI.Paired->eraseFromParent();
- return Next;
+ return New;
}
MachineOperand
-SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
+SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
APInt V(32, Val, true);
if (TII->isInlineConstant(V))
return MachineOperand::CreateImm(Val);
- unsigned Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register Reg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
MachineInstr *Mov =
BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::S_MOV_B32), Reg)
@@ -1127,7 +1369,7 @@ SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) {
// Compute base address using Addr and return the final register.
unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
- const MemAddress &Addr) {
+ const MemAddress &Addr) const {
MachineBasicBlock *MBB = MI.getParent();
MachineBasicBlock::iterator MBBI = MI.getIterator();
DebugLoc DL = MI.getDebugLoc();
@@ -1146,11 +1388,11 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
createRegOrImm(static_cast<int32_t>(Addr.Offset >> 32), MI);
const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
- unsigned CarryReg = MRI->createVirtualRegister(CarryRC);
- unsigned DeadCarryReg = MRI->createVirtualRegister(CarryRC);
+ Register CarryReg = MRI->createVirtualRegister(CarryRC);
+ Register DeadCarryReg = MRI->createVirtualRegister(CarryRC);
- unsigned DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- unsigned DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub0 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
MachineInstr *LoHalf =
BuildMI(*MBB, MBBI, DL, TII->get(AMDGPU::V_ADD_I32_e64), DestSub0)
.addReg(CarryReg, RegState::Define)
@@ -1170,7 +1412,7 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
(void)HiHalf;
LLVM_DEBUG(dbgs() << " "; HiHalf->dump(););
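  // Scalar model of the split 64-bit add built here (a sketch, names
  // hypothetical): Lo = LoA + LoB with the carry-out captured in CarryReg,
  // then Hi = HiA + HiB + carry via the carry-consuming add, and the
  // REG_SEQUENCE below glues Lo and Hi back into one 64-bit base register.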
- unsigned FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
+ Register FullDestReg = MRI->createVirtualRegister(&AMDGPU::VReg_64RegClass);
MachineInstr *FullBase =
BuildMI(*MBB, MBBI, DL, TII->get(TargetOpcode::REG_SEQUENCE), FullDestReg)
.addReg(DestSub0)
@@ -1186,13 +1428,13 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
unsigned NewBase,
- int32_t NewOffset) {
+ int32_t NewOffset) const {
TII->getNamedOperand(MI, AMDGPU::OpName::vaddr)->setReg(NewBase);
TII->getNamedOperand(MI, AMDGPU::OpName::offset)->setImm(NewOffset);
}
Optional<int32_t>
-SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
+SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) const {
if (Op.isImm())
return Op.getImm();
@@ -1218,7 +1460,7 @@ SILoadStoreOptimizer::extractConstOffset(const MachineOperand &Op) {
// %Base:vreg_64 =
// REG_SEQUENCE %LO:vgpr_32, %subreg.sub0, %HI:vgpr_32, %subreg.sub1
void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base,
- MemAddress &Addr) {
+ MemAddress &Addr) const {
if (!Base.isReg())
return;
@@ -1273,15 +1515,16 @@ void SILoadStoreOptimizer::processBaseWithConstOffset(const MachineOperand &Base
bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
MachineInstr &MI,
MemInfoMap &Visited,
- SmallPtrSet<MachineInstr *, 4> &AnchorList) {
+ SmallPtrSet<MachineInstr *, 4> &AnchorList) const {
+
+ if (!(MI.mayLoad() ^ MI.mayStore()))
+ return false;
// TODO: Support flat and scratch.
- if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0 ||
- TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
+ if (AMDGPU::getGlobalSaddrOp(MI.getOpcode()) < 0)
return false;
- // TODO: Support Store.
- if (!MI.mayLoad())
+ if (MI.mayLoad() && TII->getNamedOperand(MI, AMDGPU::OpName::vdata) != NULL)
return false;
if (AnchorList.count(&MI))
@@ -1418,100 +1661,166 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
return false;
}
-// Scan through looking for adjacent LDS operations with constant offsets from
-// the same base register. We rely on the scheduler to do the hard work of
-// clustering nearby loads, and assume these are all adjacent.
-bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
- bool Modified = false;
+void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
+ std::list<std::list<CombineInfo> > &MergeableInsts) const {
+ for (std::list<CombineInfo> &AddrList : MergeableInsts) {
+ if (AddrList.front().hasSameBaseAddress(*CI.I) &&
+ AddrList.front().InstClass == CI.InstClass) {
+ AddrList.emplace_back(CI);
+ return;
+ }
+ }
+
+ // Base address not found, so add a new list.
+ MergeableInsts.emplace_back(1, CI);
+}
+bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
+ std::list<std::list<CombineInfo> > &MergeableInsts) const {
+ bool Modified = false;
  // Maps visited instructions to their decomposed base address and offset.
MemInfoMap Visited;
// Contains the list of instructions for which constant offsets are being
// promoted to the IMM.
SmallPtrSet<MachineInstr *, 4> AnchorList;
- for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
- MachineInstr &MI = *I;
-
+  // Sort potentially mergeable instructions into lists. One list per base address.
+ for (MachineInstr &MI : MBB.instrs()) {
+ // We run this before checking if an address is mergeable, because it can produce
+ // better code even if the instructions aren't mergeable.
if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
Modified = true;
+ const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
+ if (InstClass == UNKNOWN)
+ continue;
+
// Don't combine if volatile.
- if (MI.hasOrderedMemoryRef()) {
- ++I;
+ if (MI.hasOrderedMemoryRef())
+ continue;
+
+ CombineInfo CI;
+ CI.setMI(MI, *TII, *STM);
+
+ if (!CI.hasMergeableAddress(*MRI))
+ continue;
+
+ addInstToMergeableList(CI, MergeableInsts);
+ }
+ return Modified;
+}
+
+// Scan through looking for adjacent LDS operations with constant offsets from
+// the same base register. We rely on the scheduler to do the hard work of
+// clustering nearby loads, and assume these are all adjacent.
+bool SILoadStoreOptimizer::optimizeBlock(
+ std::list<std::list<CombineInfo> > &MergeableInsts) {
+ bool Modified = false;
+
+ for (std::list<CombineInfo> &MergeList : MergeableInsts) {
+ if (MergeList.size() < 2)
+ continue;
+
+ bool OptimizeListAgain = false;
+ if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
+ // We weren't able to make any changes, so clear the list so we don't
+ // process the same instructions the next time we try to optimize this
+ // block.
+ MergeList.clear();
continue;
}
- const unsigned Opc = MI.getOpcode();
+ // We made changes, but also determined that there were no more optimization
+    // opportunities, so we don't need to reprocess the list.
+ if (!OptimizeListAgain)
+ MergeList.clear();
- CombineInfo CI;
- CI.I = I;
- CI.InstClass = getInstClass(Opc);
+ OptimizeAgain |= OptimizeListAgain;
+ Modified = true;
+ }
+ return Modified;
+}
+
+void
+SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
+ const MachineInstr &MI) {
+
+ for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
+ if (&*CI->I == &MI) {
+ MergeList.erase(CI);
+ return;
+ }
+ }
+}
+
+bool
+SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
+ std::list<CombineInfo> &MergeList,
+ bool &OptimizeListAgain) {
+ bool Modified = false;
+ for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
+ CombineInfo &CI = *I;
switch (CI.InstClass) {
default:
break;
case DS_READ:
- CI.EltSize =
- (Opc == AMDGPU::DS_READ_B64 || Opc == AMDGPU::DS_READ_B64_gfx9) ? 8
- : 4;
if (findMatchingInst(CI)) {
Modified = true;
- I = mergeRead2Pair(CI);
- } else {
- ++I;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
+ CI.setMI(NewMI, *TII, *STM);
}
- continue;
+ break;
case DS_WRITE:
- CI.EltSize =
- (Opc == AMDGPU::DS_WRITE_B64 || Opc == AMDGPU::DS_WRITE_B64_gfx9) ? 8
- : 4;
if (findMatchingInst(CI)) {
Modified = true;
- I = mergeWrite2Pair(CI);
- } else {
- ++I;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
+ CI.setMI(NewMI, *TII, *STM);
}
- continue;
+ break;
case S_BUFFER_LOAD_IMM:
- CI.EltSize = AMDGPU::getSMRDEncodedOffset(*STM, 4);
if (findMatchingInst(CI)) {
Modified = true;
- I = mergeSBufferLoadImmPair(CI);
- OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
- } else {
- ++I;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
+ CI.setMI(NewMI, *TII, *STM);
+ OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
}
- continue;
- case BUFFER_LOAD_OFFEN:
- case BUFFER_LOAD_OFFSET:
- case BUFFER_LOAD_OFFEN_exact:
- case BUFFER_LOAD_OFFSET_exact:
- CI.EltSize = 4;
+ break;
+ case BUFFER_LOAD:
if (findMatchingInst(CI)) {
Modified = true;
- I = mergeBufferLoadPair(CI);
- OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
- } else {
- ++I;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
+ CI.setMI(NewMI, *TII, *STM);
+ OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
}
- continue;
- case BUFFER_STORE_OFFEN:
- case BUFFER_STORE_OFFSET:
- case BUFFER_STORE_OFFEN_exact:
- case BUFFER_STORE_OFFSET_exact:
- CI.EltSize = 4;
+ break;
+ case BUFFER_STORE:
if (findMatchingInst(CI)) {
Modified = true;
- I = mergeBufferStorePair(CI);
- OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
- } else {
- ++I;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
+ CI.setMI(NewMI, *TII, *STM);
+ OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
}
- continue;
+ break;
+ case MIMG:
+ if (findMatchingInst(CI)) {
+ Modified = true;
+ removeCombinedInst(MergeList, *CI.Paired);
+ MachineBasicBlock::iterator NewMI = mergeImagePair(CI);
+ CI.setMI(NewMI, *TII, *STM);
+ OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
+ }
+ break;
}
-
- ++I;
+ // Clear the InstsToMove after we have finished searching so we don't have
+ // stale values left over if we search for this CI again in another pass
+ // over the block.
+ CI.InstsToMove.clear();
}
return Modified;
@@ -1537,10 +1846,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
bool Modified = false;
+
for (MachineBasicBlock &MBB : MF) {
+ std::list<std::list<CombineInfo> > MergeableInsts;
+ // First pass: Collect list of all instructions we know how to merge.
+ Modified |= collectMergeableInsts(MBB, MergeableInsts);
do {
OptimizeAgain = false;
- Modified |= optimizeBlock(MBB);
+ Modified |= optimizeBlock(MergeableInsts);
} while (OptimizeAgain);
}
diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 78f409cd9555..6f9abd3a8d9b 100644
--- a/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -98,6 +98,8 @@ private:
void emitLoop(MachineInstr &MI);
void emitEndCf(MachineInstr &MI);
+ Register getSaveExec(MachineInstr* MI);
+
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
@@ -144,7 +146,7 @@ char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
const SIInstrInfo *TII) {
- unsigned SaveExecReg = MI.getOperand(0).getReg();
+ Register SaveExecReg = MI.getOperand(0).getReg();
auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
if (U == MRI->use_instr_nodbg_end() ||
@@ -175,17 +177,31 @@ static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
return true;
}
+Register SILowerControlFlow::getSaveExec(MachineInstr *MI) {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineOperand &SaveExec = MI->getOperand(0);
+ assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister);
+
+ Register SaveExecReg = SaveExec.getReg();
+ unsigned FalseTermOpc =
+ TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
+ MachineBasicBlock::iterator I = (MI);
+ MachineBasicBlock::iterator J = std::next(I);
+ if (J != MBB->end() && J->getOpcode() == FalseTermOpc &&
+ J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) {
+ SaveExecReg = J->getOperand(0).getReg();
+ J->eraseFromParent();
+ }
+ return SaveExecReg;
+}
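// The shape being folded is roughly (register names hypothetical):
//   %save = SI_IF %cond, ...            ; MI, operand 0 defines %save
//   %dst  = S_MOV_B64_term %save        ; J, the terminator copy
// getSaveExec() returns %dst in place of %save and erases the copy, so the
// lowered control-flow sequence defines the terminator's destination
// directly (S_MOV_B32_term in the wave32 case).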
+
void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
-
- MachineOperand &SaveExec = MI.getOperand(0);
- MachineOperand &Cond = MI.getOperand(1);
- assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister &&
- Cond.getSubReg() == AMDGPU::NoSubRegister);
-
- Register SaveExecReg = SaveExec.getReg();
+ Register SaveExecReg = getSaveExec(&MI);
+ MachineOperand& Cond = MI.getOperand(1);
+ assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
MachineOperand &ImpDefSCC = MI.getOperand(4);
assert(ImpDefSCC.getReg() == AMDGPU::SCC && ImpDefSCC.isDef());
@@ -204,7 +220,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
.addReg(Exec)
.addReg(Exec, RegState::ImplicitDefine);
- unsigned Tmp = MRI->createVirtualRegister(BoolRC);
+ Register Tmp = MRI->createVirtualRegister(BoolRC);
MachineInstr *And =
BuildMI(MBB, I, DL, TII->get(AndOpc), Tmp)
@@ -266,8 +282,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- Register DstReg = MI.getOperand(0).getReg();
- assert(MI.getOperand(0).getSubReg() == AMDGPU::NoSubRegister);
+ Register DstReg = getSaveExec(&MI);
bool ExecModified = MI.getOperand(3).getImm() != 0;
MachineBasicBlock::iterator Start = MBB.begin();
@@ -339,7 +354,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- auto Dst = MI.getOperand(0).getReg();
+ auto Dst = getSaveExec(&MI);
// Skip ANDing with exec if the break condition is already masked by exec
// because it is a V_CMP in the same basic block. (We know the break
@@ -400,13 +415,17 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ unsigned CFMask = MI.getOperand(0).getReg();
+ MachineInstr *Def = MRI.getUniqueVRegDef(CFMask);
const DebugLoc &DL = MI.getDebugLoc();
- MachineBasicBlock::iterator InsPt = MBB.begin();
- MachineInstr *NewMI =
- BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)
- .addReg(Exec)
- .add(MI.getOperand(0));
+ MachineBasicBlock::iterator InsPt =
+ Def && Def->getParent() == &MBB ? std::next(MachineBasicBlock::iterator(Def))
+ : MBB.begin();
+ MachineInstr *NewMI = BuildMI(MBB, InsPt, DL, TII->get(OrOpc), Exec)
+ .addReg(Exec)
+ .add(MI.getOperand(0));
if (LIS)
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
@@ -422,7 +441,7 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const {
MachineOperand &Op = MI.getOperand(OpNo);
- if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) {
+ if (!Op.isReg() || !Register::isVirtualRegister(Op.getReg())) {
Src.push_back(Op);
return;
}
@@ -442,8 +461,7 @@ void SILowerControlFlow::findMaskOperands(MachineInstr &MI, unsigned OpNo,
for (const auto &SrcOp : Def->explicit_operands())
if (SrcOp.isReg() && SrcOp.isUse() &&
- (TargetRegisterInfo::isVirtualRegister(SrcOp.getReg()) ||
- SrcOp.getReg() == Exec))
+ (Register::isVirtualRegister(SrcOp.getReg()) || SrcOp.getReg() == Exec))
Src.push_back(SrcOp);
}
@@ -466,7 +484,7 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
else if (Ops[1].isIdenticalTo(Ops[2])) UniqueOpndIdx = 1;
else return;
- unsigned Reg = MI.getOperand(OpToReplace).getReg();
+ Register Reg = MI.getOperand(OpToReplace).getReg();
MI.RemoveOperand(OpToReplace);
MI.addOperand(Ops[UniqueOpndIdx]);
if (MRI->use_empty(Reg))
diff --git a/lib/Target/AMDGPU/SILowerI1Copies.cpp b/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 1c0f836f07e6..b45412536356 100644
--- a/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -96,7 +96,7 @@ private:
getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;
bool isVreg1(unsigned Reg) const {
- return TargetRegisterInfo::isVirtualRegister(Reg) &&
+ return Register::isVirtualRegister(Reg) &&
MRI->getRegClass(Reg) == &AMDGPU::VReg_1RegClass;
}
@@ -489,6 +489,15 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
return true;
}
+#ifndef NDEBUG
+static bool isVRegCompatibleReg(const SIRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI,
+ Register Reg) {
+ unsigned Size = TRI.getRegSizeInBits(Reg, MRI);
+ return Size == 1 || Size == 32;
+}
+#endif
+
void SILowerI1Copies::lowerCopiesFromI1() {
SmallVector<MachineInstr *, 4> DeadCopies;
@@ -497,8 +506,8 @@ void SILowerI1Copies::lowerCopiesFromI1() {
if (MI.getOpcode() != AMDGPU::COPY)
continue;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
if (!isVreg1(SrcReg))
continue;
@@ -509,7 +518,7 @@ void SILowerI1Copies::lowerCopiesFromI1() {
LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
DebugLoc DL = MI.getDebugLoc();
- assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
+ assert(isVRegCompatibleReg(TII->getRegisterInfo(), *MRI, DstReg));
assert(!MI.getOperand(0).getSubReg());
ConstrainRegs.insert(SrcReg);
@@ -544,7 +553,7 @@ void SILowerI1Copies::lowerPhis() {
LF.initialize(MBB);
for (MachineInstr &MI : MBB.phis()) {
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
if (!isVreg1(DstReg))
continue;
@@ -556,7 +565,7 @@ void SILowerI1Copies::lowerPhis() {
// Collect incoming values.
for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
assert(i + 1 < MI.getNumOperands());
- unsigned IncomingReg = MI.getOperand(i).getReg();
+ Register IncomingReg = MI.getOperand(i).getReg();
MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);
@@ -580,12 +589,12 @@ void SILowerI1Copies::lowerPhis() {
// Phis in a loop that are observed outside the loop receive a simple but
// conservatively correct treatment.
- MachineBasicBlock *PostDomBound = &MBB;
- for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
- PostDomBound =
- PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
- }
+ std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
+ for (MachineInstr &Use : MRI->use_instructions(DstReg))
+ DomBlocks.push_back(Use.getParent());
+ MachineBasicBlock *PostDomBound =
+ PDT->findNearestCommonDominator(DomBlocks);
unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
SSAUpdater.Initialize(DstReg);
@@ -669,7 +678,7 @@ void SILowerI1Copies::lowerCopiesToI1() {
MI.getOpcode() != AMDGPU::COPY)
continue;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
if (!isVreg1(DstReg))
continue;
@@ -686,10 +695,10 @@ void SILowerI1Copies::lowerCopiesToI1() {
continue;
DebugLoc DL = MI.getDebugLoc();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
assert(!MI.getOperand(1).getSubReg());
- if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ if (!Register::isVirtualRegister(SrcReg) ||
(!isLaneMaskReg(SrcReg) && !isVreg1(SrcReg))) {
assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
unsigned TmpReg = createLaneMaskReg(*MF);
@@ -702,12 +711,12 @@ void SILowerI1Copies::lowerCopiesToI1() {
// Defs in a loop that are observed outside the loop must be transformed
// into appropriate bit manipulation.
- MachineBasicBlock *PostDomBound = &MBB;
- for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
- PostDomBound =
- PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
- }
+ std::vector<MachineBasicBlock *> DomBlocks = {&MBB};
+ for (MachineInstr &Use : MRI->use_instructions(DstReg))
+ DomBlocks.push_back(Use.getParent());
+ MachineBasicBlock *PostDomBound =
+ PDT->findNearestCommonDominator(DomBlocks);
unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
if (FoundLoopLevel) {
SSAUpdater.Initialize(DstReg);
@@ -734,7 +743,7 @@ bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
break;
Reg = MI->getOperand(1).getReg();
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
return false;
if (!isLaneMaskReg(Reg))
return false;
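
The lowerPhis() and lowerCopiesToI1() hunks above replace the pairwise post-dominator folding with a single query over all use blocks. A minimal sketch of that pattern, assuming the ArrayRef overload of MachinePostDominatorTree::findNearestCommonDominator that the new code relies on; the helper name computePostDomBound is hypothetical:

#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include <vector>
using namespace llvm;

// Collect the defining block plus every block that uses DstReg, then ask the
// post-dominator tree for their nearest common post-dominator in one call.
static MachineBasicBlock *computePostDomBound(MachinePostDominatorTree &PDT,
                                              MachineRegisterInfo &MRI,
                                              Register DstReg,
                                              MachineBasicBlock &DefBlock) {
  std::vector<MachineBasicBlock *> DomBlocks = {&DefBlock};
  for (MachineInstr &Use : MRI.use_instructions(DstReg))
    DomBlocks.push_back(Use.getParent());
  return PDT.findNearestCommonDominator(DomBlocks);
}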
diff --git a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index a82047473370..714d403a3e8f 100644
--- a/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -278,8 +278,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
AMDGPU::OpName::vaddr);
int FI = MI.getOperand(FIOp).getIndex();
- unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)
- ->getReg();
+ Register VReg =
+ TII->getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI,
TRI->isAGPR(MRI, VReg))) {
TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr);
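
Most hunks in this patch also replace raw unsigned register numbers with the llvm::Register wrapper and its static predicates. A small illustrative sketch of that idiom (classify() is a made-up name, not part of the patch):

#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/Register.h"
using namespace llvm;

// Register converts implicitly to and from unsigned; the static checks used
// throughout this diff are also available as member predicates.
static void classify(const MachineOperand &MO) {
  if (!MO.isReg())
    return;
  Register Reg = MO.getReg();
  if (Reg.isVirtual()) {
    // Equivalent to Register::isVirtualRegister(Reg): a MachineRegisterInfo
    // virtual register that still needs allocation.
  } else if (Reg.isPhysical()) {
    // Equivalent to Register::isPhysicalRegister(Reg): a target MCPhysReg.
  }
}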
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 46da974a2f45..7dd0f11c95de 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -53,8 +53,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
- Occupancy = getMaxWavesPerEU();
- limitOccupancy(MF);
+ Occupancy = ST.computeOccupancy(MF, getLDSSize());
CallingConv::ID CC = F.getCallingConv();
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
@@ -190,7 +189,7 @@ unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
const SIRegisterInfo &TRI) {
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
- getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_128RegClass));
+ getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SGPR_128RegClass));
NumUserSGPRs += 4;
return ArgInfo.PrivateSegmentBuffer.getRegister();
}
@@ -487,6 +486,7 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
NoSignedZerosFPMath(MFI.hasNoSignedZerosFPMath()),
MemoryBound(MFI.isMemoryBound()),
WaveLimiter(MFI.needsWaveLimiter()),
+ HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
@@ -501,8 +501,9 @@ void yaml::SIMachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
bool SIMachineFunctionInfo::initializeBaseYamlFields(
const yaml::SIMachineFunctionInfo &YamlMFI) {
ExplicitKernArgSize = YamlMFI.ExplicitKernArgSize;
- MaxKernArgAlign = YamlMFI.MaxKernArgAlign;
+ MaxKernArgAlign = assumeAligned(YamlMFI.MaxKernArgAlign);
LDSSize = YamlMFI.LDSSize;
+ HighBitsOf32BitAddress = YamlMFI.HighBitsOf32BitAddress;
IsEntryFunction = YamlMFI.IsEntryFunction;
NoSignedZerosFPMath = YamlMFI.NoSignedZerosFPMath;
MemoryBound = YamlMFI.MemoryBound;
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index f19b20ceb5da..7d70c786b594 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -265,6 +265,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
bool NoSignedZerosFPMath = false;
bool MemoryBound = false;
bool WaveLimiter = false;
+ uint32_t HighBitsOf32BitAddress = 0;
StringValue ScratchRSrcReg = "$private_rsrc_reg";
StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg";
@@ -302,6 +303,8 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
StringValue("$sp_reg"));
YamlIO.mapOptional("argumentInfo", MFI.ArgInfo);
YamlIO.mapOptional("mode", MFI.Mode, SIMode());
+ YamlIO.mapOptional("highBitsOf32BitAddress",
+ MFI.HighBitsOf32BitAddress, 0u);
}
};
@@ -670,7 +673,7 @@ public:
return GITPtrHigh;
}
- unsigned get32BitAddressHighBits() const {
+ uint32_t get32BitAddressHighBits() const {
return HighBitsOf32BitAddress;
}
@@ -873,7 +876,7 @@ public:
assert(BufferRsrc);
auto PSV = BufferPSVs.try_emplace(
BufferRsrc,
- llvm::make_unique<AMDGPUBufferPseudoSourceValue>(TII));
+ std::make_unique<AMDGPUBufferPseudoSourceValue>(TII));
return PSV.first->second.get();
}
@@ -882,14 +885,14 @@ public:
assert(ImgRsrc);
auto PSV = ImagePSVs.try_emplace(
ImgRsrc,
- llvm::make_unique<AMDGPUImagePseudoSourceValue>(TII));
+ std::make_unique<AMDGPUImagePseudoSourceValue>(TII));
return PSV.first->second.get();
}
const AMDGPUGWSResourcePseudoSourceValue *getGWSPSV(const SIInstrInfo &TII) {
if (!GWSResourcePSV) {
GWSResourcePSV =
- llvm::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
+ std::make_unique<AMDGPUGWSResourcePseudoSourceValue>(TII);
}
return GWSResourcePSV.get();
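
The getBufferPSV()/getImagePSV() hunks above combine the llvm::make_unique to std::make_unique migration with DenseMap::try_emplace caching. A rough, generic restatement of that idiom (getOrCreatePSV and its template parameters are assumptions, not code from the patch):

#include "llvm/ADT/DenseMap.h"
#include <memory>
#include <utility>
using namespace llvm;

// try_emplace leaves an existing entry untouched, so each key ends up owning
// at most one value; a temporary built for an already-present key is simply
// discarded.
template <typename PSVTy, typename... ArgTs>
static const PSVTy *
getOrCreatePSV(DenseMap<unsigned, std::unique_ptr<PSVTy>> &Cache,
               unsigned RsrcReg, ArgTs &&... Args) {
  auto Result = Cache.try_emplace(
      RsrcReg, std::make_unique<PSVTy>(std::forward<ArgTs>(Args)...));
  return Result.first->second.get();
}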
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index ebbdf80f9567..c072ba6b2d1c 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -348,7 +348,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// Do not Track Physical Registers, because it messes up.
for (const auto &RegMaskPair : RPTracker.getPressure().LiveInRegs) {
- if (TargetRegisterInfo::isVirtualRegister(RegMaskPair.RegUnit))
+ if (Register::isVirtualRegister(RegMaskPair.RegUnit))
LiveInRegs.insert(RegMaskPair.RegUnit);
}
LiveOutRegs.clear();
@@ -376,7 +376,7 @@ void SIScheduleBlock::initRegPressure(MachineBasicBlock::iterator BeginBlock,
// The use of findDefBetween removes the case 4.
for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
unsigned Reg = RegMaskPair.RegUnit;
- if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+ if (Register::isVirtualRegister(Reg) &&
isDefBetween(Reg, LIS->getInstructionIndex(*BeginBlock).getRegSlot(),
LIS->getInstructionIndex(*EndBlock).getRegSlot(), MRI,
LIS)) {
@@ -1228,7 +1228,7 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
unsigned Color = CurrentColoring[SU->NodeNum];
if (RealID.find(Color) == RealID.end()) {
int ID = CurrentBlocks.size();
- BlockPtrs.push_back(llvm::make_unique<SIScheduleBlock>(DAG, this, ID));
+ BlockPtrs.push_back(std::make_unique<SIScheduleBlock>(DAG, this, ID));
CurrentBlocks.push_back(BlockPtrs.rbegin()->get());
RealID[Color] = ID;
}
@@ -1690,7 +1690,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
void SIScheduleBlockScheduler::addLiveRegs(std::set<unsigned> &Regs) {
for (unsigned Reg : Regs) {
// For now only track virtual registers.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
continue;
// If not already in the live set, then add it.
(void) LiveRegs.insert(Reg);
@@ -1750,7 +1750,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
for (unsigned Reg : InRegs) {
// For now only track virtual registers.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
continue;
if (LiveRegsConsumers[Reg] > 1)
continue;
@@ -1762,7 +1762,7 @@ SIScheduleBlockScheduler::checkRegUsageImpact(std::set<unsigned> &InRegs,
for (unsigned Reg : OutRegs) {
// For now only track virtual registers.
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
continue;
PSetIterator PSetI = DAG->getMRI()->getPressureSets(Reg);
for (; PSetI.isValid(); ++PSetI) {
@@ -1801,7 +1801,7 @@ SIScheduler::scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
// SIScheduleDAGMI //
SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
- ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C)) {
+ ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
SITII = static_cast<const SIInstrInfo*>(TII);
SITRI = static_cast<const SIRegisterInfo*>(TRI);
@@ -1913,7 +1913,7 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
for (_Iterator RegI = First; RegI != End; ++RegI) {
unsigned Reg = *RegI;
// For now only track virtual registers
- if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ if (!Register::isVirtualRegister(Reg))
continue;
PSetIterator PSetI = MRI.getPressureSets(Reg);
for (; PSetI.isValid(); ++PSetI) {
diff --git a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index 4320e6c957a0..e914573306ae 100644
--- a/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -656,10 +656,10 @@ SICacheControl::SICacheControl(const GCNSubtarget &ST) {
std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
GCNSubtarget::Generation Generation = ST.getGeneration();
if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
- return make_unique<SIGfx6CacheControl>(ST);
+ return std::make_unique<SIGfx6CacheControl>(ST);
if (Generation < AMDGPUSubtarget::GFX10)
- return make_unique<SIGfx7CacheControl>(ST);
- return make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
+ return std::make_unique<SIGfx7CacheControl>(ST);
+ return std::make_unique<SIGfx10CacheControl>(ST, ST.isCuModeEnabled());
}
bool SIGfx6CacheControl::enableLoadCacheBypass(
diff --git a/lib/Target/AMDGPU/SIModeRegister.cpp b/lib/Target/AMDGPU/SIModeRegister.cpp
index a5edd7b3554a..52989a280e80 100644
--- a/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -226,7 +226,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
// - on exit we have set the Require, Change, and initial Exit modes.
void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
const SIInstrInfo *TII) {
- auto NewInfo = llvm::make_unique<BlockData>();
+ auto NewInfo = std::make_unique<BlockData>();
MachineInstr *InsertionPoint = nullptr;
// RequirePending is used to indicate whether we are collecting the initial
// requirements for the block, and need to defer the first InsertionPoint to
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
index 3227bff20513..cc9b46a75582 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp
@@ -322,7 +322,7 @@ bool SIOptimizeExecMasking::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- unsigned CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
+ Register CopyFromExec = CopyFromExecInst->getOperand(0).getReg();
MachineInstr *SaveExecInst = nullptr;
SmallVector<MachineInstr *, 4> OtherUseInsts;
diff --git a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 7e10316eab92..fdd30db6a7cb 100644
--- a/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -211,7 +211,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
return AMDGPU::NoRegister;
MachineOperand *AndCC = &And->getOperand(1);
- unsigned CmpReg = AndCC->getReg();
+ Register CmpReg = AndCC->getReg();
unsigned CmpSubReg = AndCC->getSubReg();
if (CmpReg == ExecReg) {
AndCC = &And->getOperand(2);
@@ -234,7 +234,7 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
if (!Op1->isReg() || !Op2->isImm() || Op2->getImm() != 1)
return AMDGPU::NoRegister;
- unsigned SelReg = Op1->getReg();
+ Register SelReg = Op1->getReg();
auto *Sel = TRI->findReachingDef(SelReg, Op1->getSubReg(), *Cmp, MRI, LIS);
if (!Sel || Sel->getOpcode() != AMDGPU::V_CNDMASK_B32_e64)
return AMDGPU::NoRegister;
@@ -250,15 +250,16 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
Op1->getImm() != 0 || Op2->getImm() != 1)
return AMDGPU::NoRegister;
- LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t'
- << *Cmp << '\t' << *And);
+ LLVM_DEBUG(dbgs() << "Folding sequence:\n\t" << *Sel << '\t' << *Cmp << '\t'
+ << *And);
- unsigned CCReg = CC->getReg();
+ Register CCReg = CC->getReg();
LIS->RemoveMachineInstrFromMaps(*And);
- MachineInstr *Andn2 = BuildMI(MBB, *And, And->getDebugLoc(),
- TII->get(Andn2Opc), And->getOperand(0).getReg())
- .addReg(ExecReg)
- .addReg(CCReg, 0, CC->getSubReg());
+ MachineInstr *Andn2 =
+ BuildMI(MBB, *And, And->getDebugLoc(), TII->get(Andn2Opc),
+ And->getOperand(0).getReg())
+ .addReg(ExecReg)
+ .addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg());
And->eraseFromParent();
LIS->InsertMachineInstrInMaps(*Andn2);
@@ -266,20 +267,19 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
// Try to remove the compare. The Cmp value should not be used between the cmp
// and the s_and_b64 if it is VCC, or should simply be unused for any other register.
- if ((TargetRegisterInfo::isVirtualRegister(CmpReg) &&
- MRI.use_nodbg_empty(CmpReg)) ||
+ if ((Register::isVirtualRegister(CmpReg) && MRI.use_nodbg_empty(CmpReg)) ||
(CmpReg == CondReg &&
std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
[&](const MachineInstr &MI) {
- return MI.readsRegister(CondReg, TRI); }))) {
+ return MI.readsRegister(CondReg, TRI);
+ }))) {
LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
LIS->RemoveMachineInstrFromMaps(*Cmp);
Cmp->eraseFromParent();
// Try to remove v_cndmask_b32.
- if (TargetRegisterInfo::isVirtualRegister(SelReg) &&
- MRI.use_nodbg_empty(SelReg)) {
+ if (Register::isVirtualRegister(SelReg) && MRI.use_nodbg_empty(SelReg)) {
LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
LIS->RemoveMachineInstrFromMaps(*Sel);
@@ -413,7 +413,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (!SaveExec || !SaveExec->isFullCopy())
continue;
- unsigned SavedExec = SaveExec->getOperand(0).getReg();
+ Register SavedExec = SaveExec->getOperand(0).getReg();
bool SafeToReplace = true;
for (auto& U : MRI.use_nodbg_instructions(SavedExec)) {
if (U.getParent() != SaveExec->getParent()) {
@@ -434,7 +434,7 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
if (Changed) {
for (auto Reg : RecalcRegs) {
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
LIS->removeInterval(Reg);
if (!MRI.reg_empty(Reg))
LIS->createAndComputeVirtRegInterval(Reg);
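
Besides the Register conversions, the optimizeVcndVcmpPair() hunk above now forwards the undef state of the condition operand when the S_ANDN2 is rebuilt. An illustrative sketch of forwarding operand flags through MachineInstrBuilder; copyRegUse() is not part of the patch:

#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// When an existing register use is re-added to a newly built instruction, its
// undef/kill state should travel with it; dropping the undef flag can make
// the machine verifier report a read of an undefined register.
static void copyRegUse(MachineInstrBuilder &MIB, const MachineOperand &MO) {
  MIB.addReg(MO.getReg(),
             getUndefRegState(MO.isUndef()) | getKillRegState(MO.isKill()),
             MO.getSubReg());
}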
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 2d71abc0612a..9b3b2436475c 100644
--- a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -574,16 +574,16 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
+ if (Register::isPhysicalRegister(Src1->getReg()) ||
+ Register::isPhysicalRegister(Dst->getReg()))
break;
if (Opcode == AMDGPU::V_LSHLREV_B32_e32 ||
Opcode == AMDGPU::V_LSHLREV_B32_e64) {
- return make_unique<SDWADstOperand>(
+ return std::make_unique<SDWADstOperand>(
Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
} else {
- return make_unique<SDWASrcOperand>(
+ return std::make_unique<SDWASrcOperand>(
Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
Opcode != AMDGPU::V_LSHRREV_B32_e32 &&
Opcode != AMDGPU::V_LSHRREV_B32_e64);
@@ -613,15 +613,15 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src1->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
+ if (Register::isPhysicalRegister(Src1->getReg()) ||
+ Register::isPhysicalRegister(Dst->getReg()))
break;
if (Opcode == AMDGPU::V_LSHLREV_B16_e32 ||
Opcode == AMDGPU::V_LSHLREV_B16_e64) {
- return make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+ return std::make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
} else {
- return make_unique<SDWASrcOperand>(
+ return std::make_unique<SDWASrcOperand>(
Src1, Dst, BYTE_1, false, false,
Opcode != AMDGPU::V_LSHRREV_B16_e32 &&
Opcode != AMDGPU::V_LSHRREV_B16_e64);
@@ -677,11 +677,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(Src0->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
+ if (Register::isPhysicalRegister(Src0->getReg()) ||
+ Register::isPhysicalRegister(Dst->getReg()))
break;
- return make_unique<SDWASrcOperand>(
+ return std::make_unique<SDWASrcOperand>(
Src0, Dst, SrcSel, false, false, Opcode != AMDGPU::V_BFE_U32);
}
@@ -706,11 +706,11 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
- if (TRI->isPhysicalRegister(ValSrc->getReg()) ||
- TRI->isPhysicalRegister(Dst->getReg()))
+ if (Register::isPhysicalRegister(ValSrc->getReg()) ||
+ Register::isPhysicalRegister(Dst->getReg()))
break;
- return make_unique<SDWASrcOperand>(
+ return std::make_unique<SDWASrcOperand>(
ValSrc, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
}
@@ -840,7 +840,7 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
MachineOperand *OrDst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
assert(OrDst && OrDst->isReg());
- return make_unique<SDWADstPreserveOperand>(
+ return std::make_unique<SDWADstPreserveOperand>(
OrDst, OrSDWADef, OrOtherDef, DstSel);
}
@@ -1189,7 +1189,7 @@ void SIPeepholeSDWA::legalizeScalarOperands(MachineInstr &MI,
continue;
}
- unsigned VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register VGPR = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
auto Copy = BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
TII->get(AMDGPU::V_MOV_B32_e32), VGPR);
if (Op.isImm())
diff --git a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
index f9bfe96f65cb..6cdd12d0e7bd 100644
--- a/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
+++ b/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp
@@ -90,12 +90,12 @@ bool SIPreAllocateWWMRegs::processDef(MachineOperand &MO) {
if (!MO.isReg())
return false;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
if (!TRI->isVGPR(*MRI, Reg))
return false;
- if (TRI->isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
return false;
if (VRM->hasPhys(Reg))
@@ -124,14 +124,14 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
if (!MO.isReg())
continue;
- const unsigned VirtReg = MO.getReg();
- if (TRI->isPhysicalRegister(VirtReg))
+ const Register VirtReg = MO.getReg();
+ if (Register::isPhysicalRegister(VirtReg))
continue;
if (!VRM->hasPhys(VirtReg))
continue;
- unsigned PhysReg = VRM->getPhys(VirtReg);
+ Register PhysReg = VRM->getPhys(VirtReg);
const unsigned SubReg = MO.getSubReg();
if (SubReg != 0) {
PhysReg = TRI->getSubReg(PhysReg, SubReg);
@@ -149,7 +149,7 @@ void SIPreAllocateWWMRegs::rewriteRegs(MachineFunction &MF) {
for (unsigned Reg : RegsToRewrite) {
LIS->removeInterval(Reg);
- const unsigned PhysReg = VRM->getPhys(Reg);
+ const Register PhysReg = VRM->getPhys(Reg);
assert(PhysReg != 0);
MFI->ReserveWWMRegister(PhysReg);
}
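
The rewriteRegs() hunk above maps each virtual-register operand to the physical register assigned by the VirtRegMap, narrowing it by any subregister index on the operand. A condensed sketch of that step (rewriteOperand() is a hypothetical helper; error handling is elided):

#include "SIRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
using namespace llvm;

// Replace a virtual-register operand with its assigned physical register,
// applying the operand's subregister index to pick the matching physical
// subregister.
static void rewriteOperand(MachineOperand &MO, const VirtRegMap &VRM,
                           const SIRegisterInfo &TRI) {
  if (!MO.isReg())
    return;
  const Register VirtReg = MO.getReg();
  if (VirtReg.isPhysical() || !VRM.hasPhys(VirtReg))
    return;
  Register PhysReg = VRM.getPhys(VirtReg);
  if (unsigned SubReg = MO.getSubReg()) {
    PhysReg = TRI.getSubReg(PhysReg, SubReg);
    MO.setSubReg(0);
  }
  MO.setReg(PhysReg);
}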
diff --git a/lib/Target/AMDGPU/SIProgramInfo.h b/lib/Target/AMDGPU/SIProgramInfo.h
index 168f05f8fdd6..7c039a54b57f 100644
--- a/lib/Target/AMDGPU/SIProgramInfo.h
+++ b/lib/Target/AMDGPU/SIProgramInfo.h
@@ -41,6 +41,8 @@ struct SIProgramInfo {
uint64_t ComputePGMRSrc2 = 0;
uint32_t NumVGPR = 0;
+ uint32_t NumArchVGPR = 0;
+ uint32_t NumAccVGPR = 0;
uint32_t NumSGPR = 0;
uint32_t LDSSize = 0;
bool FlatUsed = false;
@@ -51,6 +53,9 @@ struct SIProgramInfo {
// Number of VGPRs that meets number of waves per execution unit request.
uint32_t NumVGPRsForWavesPerEU = 0;
+ // Final occupancy.
+ uint32_t Occupancy = 0;
+
// Whether there is recursion, dynamic allocas, indirect calls or some other
// reason there may be statically unknown stack usage.
bool DynamicCallStack = false;
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f152deb28004..f58bc3060c42 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -48,11 +48,6 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
}
}
-static cl::opt<bool> EnableSpillSGPRToSMEM(
- "amdgpu-spill-sgpr-to-smem",
- cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
- cl::init(false));
-
static cl::opt<bool> EnableSpillSGPRToVGPR(
"amdgpu-spill-sgpr-to-vgpr",
cl::desc("Enable spilling VGPRs to SGPRs"),
@@ -61,17 +56,12 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
AMDGPURegisterInfo(),
+ ST(ST),
SGPRPressureSets(getNumRegPressureSets()),
VGPRPressureSets(getNumRegPressureSets()),
AGPRPressureSets(getNumRegPressureSets()),
- SpillSGPRToVGPR(false),
- SpillSGPRToSMEM(false),
+ SpillSGPRToVGPR(EnableSpillSGPRToVGPR),
isWave32(ST.isWave32()) {
- if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
- SpillSGPRToSMEM = true;
- else if (EnableSpillSGPRToVGPR)
- SpillSGPRToVGPR = true;
-
unsigned NumRegPressureSets = getNumRegPressureSets();
SGPRSetID = NumRegPressureSets;
@@ -118,11 +108,9 @@ SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
- return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}
static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
@@ -144,7 +132,6 @@ static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
const MachineFunction &MF) const {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
return AMDGPU::SGPR_32RegClass.getRegister(Reg);
}
@@ -202,8 +189,6 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
Reserved.set(AMDGPU::VCC_HI);
}
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-
unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
@@ -220,6 +205,14 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}
+ // Reserve all remaining AGPRs if there are no instructions to use them.
+ if (!ST.hasMAIInsts()) {
+ for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
+ unsigned Reg = AMDGPU::AGPR_32RegClass.getRegister(i);
+ reserveRegisterTuples(Reserved, Reg);
+ }
+ }
+
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
@@ -293,32 +286,17 @@ bool SIRegisterInfo::requiresRegisterScavenging(const MachineFunction &Fn) const
bool SIRegisterInfo::requiresFrameIndexScavenging(
const MachineFunction &MF) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.hasStackObjects())
- return true;
-
- // May need to deal with callee saved registers.
- const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- return !Info->isEntryFunction();
+ // Do not use frame virtual registers. They used to be used for SGPRs, but
+ // once we reach PrologEpilogInserter, we can no longer spill SGPRs. If the
+ // scavenger fails, we can increment/decrement the necessary SGPRs to avoid a
+ // spill.
+ return false;
}
bool SIRegisterInfo::requiresFrameIndexReplacementScavenging(
const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (!MFI.hasStackObjects())
- return false;
-
- // The scavenger is used for large frames which may require finding a free
- // register for large offsets.
- if (!isUInt<12>(MFI.getStackSize()))
- return true;
-
- // If using scalar stores, for spills, m0 is needed for the scalar store
- // offset (pre-GFX9). m0 is unallocatable, so we can't create a virtual
- // register for it during frame index elimination, so the scavenger is
- // directly needed.
- return MF.getSubtarget<GCNSubtarget>().hasScalarStores() &&
- MF.getInfo<SIMachineFunctionInfo>()->hasSpilledSGPRs();
+ return MFI.hasStackObjects();
}
bool SIRegisterInfo::requiresVirtualBaseRegisters(
@@ -372,8 +350,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
DL = Ins->getDebugLoc();
MachineFunction *MF = MBB->getParent();
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = Subtarget.getInstrInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
if (Offset == 0) {
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::V_MOV_B32_e32), BaseReg)
@@ -382,9 +359,9 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
}
MachineRegisterInfo &MRI = MF->getRegInfo();
- unsigned OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register OffsetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- unsigned FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register FIReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
BuildMI(*MBB, Ins, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(Offset);
@@ -399,11 +376,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
int64_t Offset) const {
-
- MachineBasicBlock *MBB = MI.getParent();
- MachineFunction *MF = MBB->getParent();
- const GCNSubtarget &Subtarget = MF->getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = Subtarget.getInstrInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
#ifndef NDEBUG
// FIXME: Is it possible to be storing a frame index to itself?
@@ -419,12 +392,15 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
#endif
MachineOperand *FIOp = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
+#ifndef NDEBUG
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
+#endif
assert(FIOp && FIOp->isFI() && "frame index must be address operand");
assert(TII->isMUBUF(MI));
assert(TII->getNamedOperand(MI, AMDGPU::OpName::soffset)->getReg() ==
- MF->getInfo<SIMachineFunctionInfo>()->getFrameOffsetReg() &&
- "should only be seeing frame offset relative FrameIndex");
-
+ MF->getInfo<SIMachineFunctionInfo>()->getStackPtrOffsetReg() &&
+ "should only be seeing stack pointer offset relative FrameIndex");
MachineOperand *OffsetOp = TII->getNamedOperand(MI, AMDGPU::OpName::offset);
int64_t NewOffset = OffsetOp->getImm() + Offset;
@@ -564,7 +540,8 @@ static int getOffsetMUBUFLoad(unsigned Opc) {
}
}
-static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
+static MachineInstrBuilder spillVGPRtoAGPR(const GCNSubtarget &ST,
+ MachineBasicBlock::iterator MI,
int Index,
unsigned Lane,
unsigned ValueReg,
@@ -572,7 +549,6 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
@@ -595,11 +571,12 @@ static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
// This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
// need to handle the case where an SGPR may need to be spilled while spilling.
-static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
+static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
MachineFrameInfo &MFI,
MachineBasicBlock::iterator MI,
int Index,
int64_t Offset) {
+ const SIInstrInfo *TII = ST.getInstrInfo();
MachineBasicBlock *MBB = MI->getParent();
const DebugLoc &DL = MI->getDebugLoc();
bool IsStore = MI->mayStore();
@@ -611,7 +588,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
return false;
const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
- if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
+ if (spillVGPRtoAGPR(ST, MI, Index, 0, Reg->getReg(), false).getInstr())
return true;
MachineInstrBuilder NewMI =
@@ -624,6 +601,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.cloneMemRefs(*MI);
const MachineOperand *VDataIn = TII->getNamedOperand(*MI,
@@ -645,7 +623,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
RegScavenger *RS) const {
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MI->getParent()->getParent();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -707,8 +684,9 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
}
for (unsigned i = 0, e = NumSubRegs; i != e; ++i, Offset += EltSize) {
- unsigned SubReg = NumSubRegs == 1 ?
- ValueReg : getSubReg(ValueReg, getSubRegFromChannel(i));
+ Register SubReg = NumSubRegs == 1
+ ? Register(ValueReg)
+ : getSubReg(ValueReg, getSubRegFromChannel(i));
unsigned SOffsetRegState = 0;
unsigned SrcDstRegState = getDefRegState(!IsStore);
@@ -718,7 +696,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
SrcDstRegState |= getKillRegState(IsKill);
}
- auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
+ auto MIB = spillVGPRtoAGPR(ST, MI, Index, i, SubReg, IsKill);
if (!MIB.getInstr()) {
unsigned FinalReg = SubReg;
@@ -743,6 +721,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
.addImm(0) // slc
.addImm(0) // tfe
.addImm(0) // dlc
+ .addImm(0) // swz
.addMemOperand(NewMMO);
if (!IsStore && TmpReg != AMDGPU::NoRegister)
@@ -763,22 +742,6 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
}
}
-static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
- bool Store) {
- if (SuperRegSize % 16 == 0) {
- return { 16, Store ? AMDGPU::S_BUFFER_STORE_DWORDX4_SGPR :
- AMDGPU::S_BUFFER_LOAD_DWORDX4_SGPR };
- }
-
- if (SuperRegSize % 8 == 0) {
- return { 8, Store ? AMDGPU::S_BUFFER_STORE_DWORDX2_SGPR :
- AMDGPU::S_BUFFER_LOAD_DWORDX2_SGPR };
- }
-
- return { 4, Store ? AMDGPU::S_BUFFER_STORE_DWORD_SGPR :
- AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
-}
-
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
@@ -794,98 +757,37 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
if (OnlyToVGPR && !SpillToVGPR)
return false;
- MachineRegisterInfo &MRI = MF->getRegInfo();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- unsigned SuperReg = MI->getOperand(0).getReg();
+ Register SuperReg = MI->getOperand(0).getReg();
bool IsKill = MI->getOperand(0).isKill();
const DebugLoc &DL = MI->getDebugLoc();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- bool SpillToSMEM = spillSGPRToSMEM();
- if (SpillToSMEM && OnlyToVGPR)
- return false;
-
- Register FrameReg = getFrameRegister(*MF);
-
assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
SuperReg != MFI->getFrameOffsetReg() &&
SuperReg != MFI->getScratchWaveOffsetReg()));
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
- unsigned OffsetReg = AMDGPU::M0;
unsigned M0CopyReg = AMDGPU::NoRegister;
- if (SpillToSMEM) {
- if (RS->isRegUsed(AMDGPU::M0)) {
- M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
- .addReg(AMDGPU::M0);
- }
- }
-
- unsigned ScalarStoreOp;
unsigned EltSize = 4;
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
- if (SpillToSMEM && isSGPRClass(RC)) {
- // XXX - if private_element_size is larger than 4 it might be useful to be
- // able to spill wider vmem spills.
- std::tie(EltSize, ScalarStoreOp) =
- getSpillEltSize(getRegSizeInBits(*RC) / 8, true);
- }
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+ // Scavenged temporary VGPR to use. It must be scavenged once for any number
+ // of spilled subregs.
+ Register TmpVGPR;
+
// SubReg carries the "Kill" flag when SubReg == SuperReg.
unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- unsigned SubReg = NumSubRegs == 1 ?
- SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
- if (SpillToSMEM) {
- int64_t FrOffset = FrameInfo.getObjectOffset(Index);
-
- // The allocated memory size is really the wavefront size * the frame
- // index size. The widest register class is 64 bytes, so a 4-byte scratch
- // allocation is enough to spill this in a single stack object.
- //
- // FIXME: Frame size/offsets are computed earlier than this, so the extra
- // space is still unnecessarily allocated.
-
- unsigned Align = FrameInfo.getObjectAlignment(Index);
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- EltSize, MinAlign(Align, EltSize * i));
-
- // SMEM instructions only support a single offset, so increment the wave
- // offset.
-
- int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
- if (Offset != 0) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(FrameReg)
- .addImm(Offset);
- } else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(FrameReg);
- }
-
- BuildMI(*MBB, MI, DL, TII->get(ScalarStoreOp))
- .addReg(SubReg, getKillRegState(IsKill)) // sdata
- .addReg(MFI->getScratchRSrcReg()) // sbase
- .addReg(OffsetReg, RegState::Kill) // soff
- .addImm(0) // glc
- .addImm(0) // dlc
- .addMemOperand(MMO);
-
- continue;
- }
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
@@ -915,15 +817,13 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
return false;
// Spill SGPR to a frame index.
- // TODO: Should VI try to spill to VGPR and then spill to SMEM?
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- // TODO: Should VI try to spill to VGPR and then spill to SMEM?
+ if (!TmpVGPR.isValid())
+ TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
MachineInstrBuilder Mov
- = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
+ = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
.addReg(SubReg, SubKillState);
-
// There could be undef components of a spilled super register.
// TODO: Can we detect this and skip the spill?
if (NumSubRegs > 1) {
@@ -941,7 +841,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
= MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
EltSize, MinAlign(Align, EltSize * i));
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpReg, RegState::Kill) // src
+ .addReg(TmpVGPR, RegState::Kill) // src
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchRSrcReg()) // srrsrc
.addReg(MFI->getStackPtrOffsetReg()) // soffset
@@ -965,7 +865,6 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
RegScavenger *RS,
bool OnlyToVGPR) const {
MachineFunction *MF = MI->getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
@@ -976,84 +875,27 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
return false;
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
- unsigned SuperReg = MI->getOperand(0).getReg();
- bool SpillToSMEM = spillSGPRToSMEM();
- if (SpillToSMEM && OnlyToVGPR)
- return false;
+ Register SuperReg = MI->getOperand(0).getReg();
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
- unsigned OffsetReg = AMDGPU::M0;
unsigned M0CopyReg = AMDGPU::NoRegister;
- if (SpillToSMEM) {
- if (RS->isRegUsed(AMDGPU::M0)) {
- M0CopyReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), M0CopyReg)
- .addReg(AMDGPU::M0);
- }
- }
-
unsigned EltSize = 4;
- unsigned ScalarLoadOp;
-
- Register FrameReg = getFrameRegister(*MF);
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
- if (SpillToSMEM && isSGPRClass(RC)) {
- // XXX - if private_element_size is larger than 4 it might be useful to be
- // able to spill wider vmem spills.
- std::tie(EltSize, ScalarLoadOp) =
- getSpillEltSize(getRegSizeInBits(*RC) / 8, false);
- }
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
- // SubReg carries the "Kill" flag when SubReg == SuperReg.
- int64_t FrOffset = FrameInfo.getObjectOffset(Index);
+ Register TmpVGPR;
for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- unsigned SubReg = NumSubRegs == 1 ?
- SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
- if (SpillToSMEM) {
- // FIXME: Size may be > 4 but extra bytes wasted.
- unsigned Align = FrameInfo.getObjectAlignment(Index);
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
- EltSize, MinAlign(Align, EltSize * i));
-
- // Add i * 4 offset
- int64_t Offset = (ST.getWavefrontSize() * FrOffset) + (EltSize * i);
- if (Offset != 0) {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), OffsetReg)
- .addReg(FrameReg)
- .addImm(Offset);
- } else {
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), OffsetReg)
- .addReg(FrameReg);
- }
-
- auto MIB =
- BuildMI(*MBB, MI, DL, TII->get(ScalarLoadOp), SubReg)
- .addReg(MFI->getScratchRSrcReg()) // sbase
- .addReg(OffsetReg, RegState::Kill) // soff
- .addImm(0) // glc
- .addImm(0) // dlc
- .addMemOperand(MMO);
-
- if (NumSubRegs > 1 && i == 0)
- MIB.addReg(SuperReg, RegState::ImplicitDefine);
-
- continue;
- }
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
@@ -1071,7 +913,8 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
// Restore SGPR from a stack slot.
// FIXME: We should use S_LOAD_DWORD here for VI.
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ if (!TmpVGPR.isValid())
+ TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
unsigned Align = FrameInfo.getObjectAlignment(Index);
MachinePointerInfo PtrInfo
@@ -1081,7 +924,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
MachineMemOperand::MOLoad, EltSize,
MinAlign(Align, EltSize * i));
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpReg)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
.addFrameIndex(Index) // vaddr
.addReg(MFI->getScratchRSrcReg()) // srsrc
.addReg(MFI->getStackPtrOffsetReg()) // soffset
@@ -1090,7 +933,7 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
auto MIB =
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
- .addReg(TmpReg, RegState::Kill);
+ .addReg(TmpVGPR, RegState::Kill);
if (NumSubRegs > 1)
MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
@@ -1141,11 +984,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
MachineFunction *MF = MI->getParent()->getParent();
- MachineRegisterInfo &MRI = MF->getRegInfo();
MachineBasicBlock *MBB = MI->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
DebugLoc DL = MI->getDebugLoc();
@@ -1255,13 +1096,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// In an entry function/kernel the offset is already the absolute
// address relative to the frame register.
- unsigned DiffReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register TmpDiffReg =
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+
+ // If there's no free SGPR, modify the FP in place.
+ Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
Register ResultReg = IsCopy ?
MI->getOperand(0).getReg() :
- MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
.addReg(FrameReg)
@@ -1271,35 +1115,80 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
if (Offset == 0) {
// XXX - This never happens because of emergency scavenging slot at 0?
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
- .addImm(Log2_32(ST.getWavefrontSize()))
+ .addImm(ST.getWavefrontSizeLog2())
.addReg(DiffReg);
} else {
- unsigned ScaledReg
- = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ScaledReg)
- .addImm(Log2_32(ST.getWavefrontSize()))
- .addReg(DiffReg, RegState::Kill);
-
- // TODO: Fold if use instruction is another add of a constant.
- if (AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
- TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
- .addImm(Offset)
- .addReg(ScaledReg, RegState::Kill)
- .addImm(0); // clamp bit
+ if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
+ Register ScaledReg =
+ RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0);
+
+ BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
+ ScaledReg)
+ .addImm(ST.getWavefrontSizeLog2())
+ .addReg(DiffReg, RegState::Kill);
+
+ const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
+
+ // TODO: Fold if use instruction is another add of a constant.
+ if (IsVOP2 || AMDGPU::isInlinableLiteral32(Offset, ST.hasInv2PiInlineImm())) {
+ // FIXME: This can fail
+ MIB.addImm(Offset);
+ MIB.addReg(ScaledReg, RegState::Kill);
+ if (!IsVOP2)
+ MIB.addImm(0); // clamp bit
+ } else {
+ Register ConstOffsetReg =
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MIB, 0, false);
+
+ // This should always be able to use the unused carry out.
+ assert(ConstOffsetReg && "this scavenge should not be able to fail");
+
+ BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
+ .addImm(Offset);
+ MIB.addReg(ConstOffsetReg, RegState::Kill);
+ MIB.addReg(ScaledReg, RegState::Kill);
+ MIB.addImm(0); // clamp bit
+ }
} else {
- unsigned ConstOffsetReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), ConstOffsetReg)
- .addImm(Offset);
- TII->getAddNoCarry(*MBB, MI, DL, ResultReg)
- .addReg(ConstOffsetReg, RegState::Kill)
+ // We have to produce a carry out, and there isn't a free SGPR
+ // pair for it. We can keep the whole computation on the SALU to
+ // avoid clobbering an additional register at the cost of an extra
+ // mov.
+
+ // We may have 1 free scratch SGPR even though a carry out is
+ // unavailable. Only one additional mov is needed.
+ Register TmpScaledReg =
+ RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
+ Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
+
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
+ .addReg(DiffReg, RegState::Kill)
+ .addImm(ST.getWavefrontSizeLog2());
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
.addReg(ScaledReg, RegState::Kill)
- .addImm(0); // clamp bit
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::COPY), ResultReg)
+ .addReg(ScaledReg, RegState::Kill);
+
+ // If there were truly no free SGPRs, we need to undo everything.
+ if (!TmpScaledReg.isValid()) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScaledReg)
+ .addReg(ScaledReg, RegState::Kill)
+ .addImm(Offset);
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
+ .addReg(DiffReg, RegState::Kill)
+ .addImm(ST.getWavefrontSizeLog2());
+ }
}
}
+ if (!TmpDiffReg.isValid()) {
+ // Restore the FP.
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
+ .addReg(FrameReg)
+ .addReg(MFI->getScratchWaveOffsetReg());
+ }
+
// Don't introduce an extra copy if we're just materializing in a mov.
if (IsCopy)
MI->eraseFromParent();
@@ -1325,7 +1214,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int64_t NewOffset = OldImm + Offset;
if (isUInt<12>(NewOffset) &&
- buildMUBUFOffsetLoadStore(TII, FrameInfo, MI, Index, NewOffset)) {
+ buildMUBUFOffsetLoadStore(ST, FrameInfo, MI, Index, NewOffset)) {
MI->eraseFromParent();
return;
}
@@ -1337,7 +1226,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
int64_t Offset = FrameInfo.getObjectOffset(Index);
FIOp.ChangeToImmediate(Offset);
if (!TII->isImmOperandLegal(*MI, FIOperandNum, FIOp)) {
- unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register TmpReg = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpReg)
.addImm(Offset);
FIOp.ChangeToRegister(TmpReg, false, false, true);
@@ -1347,27 +1236,13 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
- const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
- unsigned Size = getRegSizeInBits(*RC);
- unsigned AltName = AMDGPU::NoRegAltName;
-
- switch (Size) {
- case 32: AltName = AMDGPU::Reg32; break;
- case 64: AltName = AMDGPU::Reg64; break;
- case 96: AltName = AMDGPU::Reg96; break;
- case 128: AltName = AMDGPU::Reg128; break;
- case 160: AltName = AMDGPU::Reg160; break;
- case 256: AltName = AMDGPU::Reg256; break;
- case 512: AltName = AMDGPU::Reg512; break;
- case 1024: AltName = AMDGPU::Reg1024; break;
- }
- return AMDGPUInstPrinter::getRegisterName(Reg, AltName);
+ return AMDGPUInstPrinter::getRegisterName(Reg);
}
// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
- assert(!TargetRegisterInfo::isVirtualRegister(Reg));
+ assert(!Register::isVirtualRegister(Reg));
static const TargetRegisterClass *const BaseClasses[] = {
&AMDGPU::VGPR_32RegClass,
@@ -1408,8 +1283,6 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
unsigned Size = getRegSizeInBits(*RC);
- if (Size < 32)
- return false;
switch (Size) {
case 32:
return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
@@ -1427,8 +1300,11 @@ bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
case 1024:
return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
+ case 1:
+ return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr;
default:
- llvm_unreachable("Invalid register class size");
+ assert(Size < 32 && "Invalid register class size");
+ return false;
}
}
@@ -1476,6 +1352,8 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
return &AMDGPU::VReg_512RegClass;
case 1024:
return &AMDGPU::VReg_1024RegClass;
+ case 1:
+ return &AMDGPU::VReg_1RegClass;
default:
llvm_unreachable("Invalid register class size");
}
@@ -1509,7 +1387,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
case 96:
return &AMDGPU::SReg_96RegClass;
case 128:
- return &AMDGPU::SReg_128RegClass;
+ return &AMDGPU::SGPR_128RegClass;
case 160:
return &AMDGPU::SReg_160RegClass;
case 256:
@@ -1539,7 +1417,7 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
case 3:
return &AMDGPU::SReg_96RegClass;
case 4:
- return &AMDGPU::SReg_128RegClass;
+ return &AMDGPU::SGPR_128RegClass;
case 5:
return &AMDGPU::SReg_160RegClass;
case 8:
@@ -1587,6 +1465,15 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
}
}
+bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
+ if (OpType >= AMDGPU::OPERAND_REG_INLINE_AC_FIRST &&
+ OpType <= AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+ return !ST.hasMFMAInlineLiteralBug();
+
+ return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
+ OpType <= AMDGPU::OPERAND_SRC_LAST;
+}
+
bool SIRegisterInfo::shouldRewriteCopySrc(
const TargetRegisterClass *DefRC,
unsigned DefSubReg,
@@ -1802,7 +1689,7 @@ ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC
const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
unsigned Reg) const {
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
return MRI.getRegClass(Reg);
return getPhysRegClass(Reg);
@@ -1845,8 +1732,6 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const {
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
@@ -1900,18 +1785,22 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
return isWave32 ?
&AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
case AMDGPU::SGPRRegBankID:
- return &AMDGPU::SReg_32_XM0RegClass;
+ return &AMDGPU::SReg_32RegClass;
case AMDGPU::SCCRegBankID:
// This needs to return an allocatable class, so don't bother returning
// the dummy SCC class.
- return &AMDGPU::SReg_32_XM0RegClass;
+ //
+ // FIXME: This is a grotesque hack. We use SGPR_32 as an indication this
+ // was not a VCC bank value since we use the larger class SReg_32 for
+ // other values. These should all use SReg_32.
+ return &AMDGPU::SGPR_32RegClass;
default:
llvm_unreachable("unknown register bank");
}
}
case 32:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32_XM0RegClass;
+ &AMDGPU::SReg_32RegClass;
case 64:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
&AMDGPU::SReg_64_XEXECRegClass;
@@ -1920,7 +1809,7 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
&AMDGPU::SReg_96RegClass;
case 128:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
- &AMDGPU::SReg_128RegClass;
+ &AMDGPU::SGPR_128RegClass;
case 160:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
&AMDGPU::SReg_160RegClass;
@@ -1930,10 +1819,13 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
case 512:
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
&AMDGPU::SReg_512RegClass;
+ case 1024:
+ return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass :
+ &AMDGPU::SReg_1024RegClass;
default:
if (Size < 32)
return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32_XM0RegClass;
+ &AMDGPU::SReg_32RegClass;
return nullptr;
}
}
@@ -1941,9 +1833,12 @@ SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
const TargetRegisterClass *
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
const MachineRegisterInfo &MRI) const {
- if (const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg()))
+ const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
+ if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB, MRI);
- return nullptr;
+
+ const TargetRegisterClass *RC = RCOrRB.get<const TargetRegisterClass*>();
+ return getAllocatableClass(RC);
}
unsigned SIRegisterInfo::getVCC() const {
@@ -1974,7 +1869,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
SlotIndex UseIdx = LIS->getInstructionIndex(Use);
SlotIndex DefIdx;
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (Register::isVirtualRegister(Reg)) {
if (!LIS->hasInterval(Reg))
return nullptr;
LiveInterval &LI = LIS->getInterval(Reg);
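
A recurring theme in the SIRegisterInfo.cpp hunks is that frame-index elimination no longer creates virtual registers this late in the pipeline; temporaries come from the RegScavenger, and the SGPR case must tolerate a failed scavenge. A compressed sketch using the same scavengeRegister() calls shown in the diff (the surrounding control flow is simplified, and the AMDGPU register classes come from the target's generated headers):

#include "SIRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
using namespace llvm;

// The VGPR temporary may be spilled by the scavenger if nothing is free; the
// SGPR scavenge passes AllowSpill=false, so an invalid result must be handled
// (the diff then modifies the frame register in place and restores it later).
static void scavengeTemps(RegScavenger &RS, MachineBasicBlock::iterator MI,
                          Register &TmpVGPR, Register &TmpSGPR) {
  TmpVGPR = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, /*SPAdj=*/0);
  TmpSGPR = RS.scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, /*SPAdj=*/0,
                                /*AllowSpill=*/false);
  if (!TmpSGPR.isValid()) {
    // Fall back to in-place arithmetic on the frame register.
  }
}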
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 34487c96e72e..ac3dea1a1a28 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -27,6 +27,7 @@ class SIMachineFunctionInfo;
class SIRegisterInfo final : public AMDGPURegisterInfo {
private:
+ const GCNSubtarget &ST;
unsigned SGPRSetID;
unsigned VGPRSetID;
unsigned AGPRSetID;
@@ -34,7 +35,6 @@ private:
BitVector VGPRPressureSets;
BitVector AGPRPressureSets;
bool SpillSGPRToVGPR;
- bool SpillSGPRToSMEM;
bool isWave32;
void classifyPressureSet(unsigned PSetID, unsigned Reg,
@@ -46,10 +46,6 @@ public:
return SpillSGPRToVGPR;
}
- bool spillSGPRToSMEM() const {
- return SpillSGPRToSMEM;
- }
-
/// Return the end register initially reserved for the scratch buffer in case
/// spilling is needed.
unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
@@ -141,7 +137,7 @@ public:
bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
const TargetRegisterClass *RC;
- if (TargetRegisterInfo::isVirtualRegister(Reg))
+ if (Register::isVirtualRegister(Reg))
RC = MRI.getRegClass(Reg);
else
RC = getPhysRegClass(Reg);
@@ -193,10 +189,7 @@ public:
/// \returns True if operands defined with this operand type can accept
/// an inline constant. i.e. An integer value in the range (-16, 64) or
/// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
- bool opCanUseInlineConstant(unsigned OpType) const {
- return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
- OpType <= AMDGPU::OPERAND_SRC_LAST;
- }
+ bool opCanUseInlineConstant(unsigned OpType) const;
unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
const TargetRegisterClass *RC,
@@ -270,7 +263,7 @@ public:
const MachineRegisterInfo &MRI) const override;
const TargetRegisterClass *getBoolRC() const {
- return isWave32 ? &AMDGPU::SReg_32_XM0RegClass
+ return isWave32 ? &AMDGPU::SReg_32RegClass
: &AMDGPU::SReg_64RegClass;
}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index d5948a7862cc..82219cbdf3b2 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -37,50 +37,52 @@ class getSubRegs<int size> {
!if(!eq(size, 16), ret16, ret32))))));
}
-let Namespace = "AMDGPU" in {
-defset list<RegAltNameIndex> AllRegAltNameIndices = {
- def Reg32 : RegAltNameIndex;
- def Reg64 : RegAltNameIndex;
- def Reg96 : RegAltNameIndex;
- def Reg128 : RegAltNameIndex;
- def Reg160 : RegAltNameIndex;
- def Reg256 : RegAltNameIndex;
- def Reg512 : RegAltNameIndex;
- def Reg1024 : RegAltNameIndex;
-}
-}
+// Generates a list of sequential register tuple names.
+// E.g. RegSeqNames<3,2,2,"s">.ret -> [ "s[0:1]", "s[2:3]" ]
+class RegSeqNames<int last_reg, int stride, int size, string prefix,
+ int start = 0> {
+ int next = !add(start, stride);
+ int end_reg = !add(!add(start, size), -1);
+ list<string> ret =
+ !if(!le(end_reg, last_reg),
+ !listconcat([prefix # "[" # start # ":" # end_reg # "]"],
+ RegSeqNames<last_reg, stride, size, prefix, next>.ret),
+ []);
+}
+
+// Generates a list of dags for register tuples.
+class RegSeqDags<RegisterClass RC, int last_reg, int stride, int size,
+ int start = 0> {
+ dag trunc_rc = (trunc RC,
+ !if(!and(!eq(stride, 1), !eq(start, 0)),
+ !add(!add(last_reg, 2), !mul(size, -1)),
+ !add(last_reg, 1)));
+ list<dag> ret =
+ !if(!lt(start, size),
+ !listconcat([(add (decimate (shl trunc_rc, start), stride))],
+ RegSeqDags<RC, last_reg, stride, size, !add(start, 1)>.ret),
+ []);
+}
+
+class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC,
+ int last_reg, int stride, int size, string prefix> :
+ RegisterTuples<Indices,
+ RegSeqDags<RC, last_reg, stride, size>.ret,
+ RegSeqNames<last_reg, stride, size, prefix>.ret>;
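// As a rough illustration of how the two helpers compose: with the classes above,
//   def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">;
// produces the same tuples that the explicit RegisterTuples removed below spelled out,
//   [(add (decimate SGPR_32, 2)), (add (decimate (shl SGPR_32, 1), 2))]
// under the names "s[0:1]", "s[2:3]", ..., "s[104:105]".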
//===----------------------------------------------------------------------===//
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
-class SIReg <string n, bits<16> regIdx = 0, string prefix = "",
- int regNo = !cast<int>(regIdx)> :
- Register<n, !if(!eq(prefix, ""),
- [ n, n, n, n, n, n, n, n ],
- [ prefix # regNo,
- prefix # "[" # regNo # ":" # !and(!add(regNo, 1), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 2), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 3), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 4), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 7), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 15), 255) # "]",
- prefix # "[" # regNo # ":" # !and(!add(regNo, 31), 255) # "]",
- ])>,
+class SIReg <string n, bits<16> regIdx = 0> :
+ Register<n>,
DwarfRegNum<[!cast<int>(HWEncoding)]> {
let Namespace = "AMDGPU";
- let RegAltNameIndices = AllRegAltNameIndices;
// This is not yet the complete register encoding. An additional
// bit is set for VGPRs.
let HWEncoding = regIdx;
}
-class SIRegisterWithSubRegs<string n, list<Register> subregs> :
- RegisterWithSubRegs<n, subregs> {
- let RegAltNameIndices = AllRegAltNameIndices;
- let AltNames = [ n, n, n, n, n, n, n, n ];
-}
-
// Special Registers
def VCC_LO : SIReg<"vcc_lo", 106>;
def VCC_HI : SIReg<"vcc_hi", 107>;
@@ -93,7 +95,7 @@ def SP_REG : SIReg<"sp", 0>;
def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>;
// VCC for 64-bit instructions
-def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
+def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
DwarfRegAlias<VCC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -103,7 +105,7 @@ def VCC : SIRegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
def EXEC_LO : SIReg<"exec_lo", 126>;
def EXEC_HI : SIReg<"exec_hi", 127>;
-def EXEC : SIRegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
+def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
DwarfRegAlias<EXEC_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -134,7 +136,7 @@ def LDS_DIRECT : SIReg <"src_lds_direct", 254>;
def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>;
def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>;
-def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
+def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
DwarfRegAlias<XNACK_MASK_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -145,7 +147,7 @@ def XNACK_MASK : SIRegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_
def TBA_LO : SIReg<"tba_lo", 108>;
def TBA_HI : SIReg<"tba_hi", 109>;
-def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
+def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
DwarfRegAlias<TBA_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -155,7 +157,7 @@ def TBA : SIRegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
def TMA_LO : SIReg<"tma_lo", 110>;
def TMA_HI : SIReg<"tma_hi", 111>;
-def TMA : SIRegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
+def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
DwarfRegAlias<TMA_LO> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -175,7 +177,7 @@ multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
}
class FlatReg <Register lo, Register hi, bits<16> encoding> :
- SIRegisterWithSubRegs<"flat_scratch", [lo, hi]>,
+ RegisterWithSubRegs<"flat_scratch", [lo, hi]>,
DwarfRegAlias<lo> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
@@ -191,19 +193,19 @@ def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
// SGPR registers
foreach Index = 0-105 in {
- def SGPR#Index : SIReg <"SGPR"#Index, Index, "s">;
+ def SGPR#Index : SIReg <"s"#Index, Index>;
}
// VGPR registers
foreach Index = 0-255 in {
- def VGPR#Index : SIReg <"VGPR"#Index, Index, "v"> {
+ def VGPR#Index : SIReg <"v"#Index, Index> {
let HWEncoding{8} = 1;
}
}
// AccVGPR registers
foreach Index = 0-255 in {
- def AGPR#Index : SIReg <"AGPR"#Index, Index, "a"> {
+ def AGPR#Index : SIReg <"a"#Index, Index> {
let HWEncoding{8} = 1;
}
}
@@ -226,102 +228,32 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add (sequence "SGPR%u", 0, 105)), Reg32> {
+ (add (sequence "SGPR%u", 0, 105))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
let AllocationPriority = 9;
}
// SGPR 64-bit registers
-def SGPR_64Regs : RegisterTuples<getSubRegs<2>.ret,
- [(add (decimate SGPR_32, 2)),
- (add (decimate (shl SGPR_32, 1), 2))]>;
+def SGPR_64Regs : SIRegisterTuples<getSubRegs<2>.ret, SGPR_32, 105, 2, 2, "s">;
// SGPR 96-bit registers. No operations use these, but for symmetry with 96-bit VGPRs.
-def SGPR_96Regs : RegisterTuples<getSubRegs<3>.ret,
- [(add (decimate SGPR_32, 3)),
- (add (decimate (shl SGPR_32, 1), 3)),
- (add (decimate (shl SGPR_32, 2), 3))]>;
+def SGPR_96Regs : SIRegisterTuples<getSubRegs<3>.ret, SGPR_32, 105, 3, 3, "s">;
// SGPR 128-bit registers
-def SGPR_128Regs : RegisterTuples<getSubRegs<4>.ret,
- [(add (decimate SGPR_32, 4)),
- (add (decimate (shl SGPR_32, 1), 4)),
- (add (decimate (shl SGPR_32, 2), 4)),
- (add (decimate (shl SGPR_32, 3), 4))]>;
+def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">;
// SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
-def SGPR_160Regs : RegisterTuples<getSubRegs<5>.ret,
- [(add (decimate SGPR_32, 4)),
- (add (decimate (shl SGPR_32, 1), 4)),
- (add (decimate (shl SGPR_32, 2), 4)),
- (add (decimate (shl SGPR_32, 3), 4)),
- (add (decimate (shl SGPR_32, 4), 4))]>;
+def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;
// SGPR 256-bit registers
-def SGPR_256Regs : RegisterTuples<getSubRegs<8>.ret,
- [(add (decimate SGPR_32, 4)),
- (add (decimate (shl SGPR_32, 1), 4)),
- (add (decimate (shl SGPR_32, 2), 4)),
- (add (decimate (shl SGPR_32, 3), 4)),
- (add (decimate (shl SGPR_32, 4), 4)),
- (add (decimate (shl SGPR_32, 5), 4)),
- (add (decimate (shl SGPR_32, 6), 4)),
- (add (decimate (shl SGPR_32, 7), 4))]>;
+def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
// SGPR 512-bit registers
-def SGPR_512Regs : RegisterTuples<getSubRegs<16>.ret,
- [(add (decimate SGPR_32, 4)),
- (add (decimate (shl SGPR_32, 1), 4)),
- (add (decimate (shl SGPR_32, 2), 4)),
- (add (decimate (shl SGPR_32, 3), 4)),
- (add (decimate (shl SGPR_32, 4), 4)),
- (add (decimate (shl SGPR_32, 5), 4)),
- (add (decimate (shl SGPR_32, 6), 4)),
- (add (decimate (shl SGPR_32, 7), 4)),
- (add (decimate (shl SGPR_32, 8), 4)),
- (add (decimate (shl SGPR_32, 9), 4)),
- (add (decimate (shl SGPR_32, 10), 4)),
- (add (decimate (shl SGPR_32, 11), 4)),
- (add (decimate (shl SGPR_32, 12), 4)),
- (add (decimate (shl SGPR_32, 13), 4)),
- (add (decimate (shl SGPR_32, 14), 4)),
- (add (decimate (shl SGPR_32, 15), 4))]>;
+def SGPR_512Regs : SIRegisterTuples<getSubRegs<16>.ret, SGPR_32, 105, 4, 16, "s">;
// SGPR 1024-bit registers
-def SGPR_1024Regs : RegisterTuples<getSubRegs<32>.ret,
- [(add (decimate SGPR_32, 4)),
- (add (decimate (shl SGPR_32, 1), 4)),
- (add (decimate (shl SGPR_32, 2), 4)),
- (add (decimate (shl SGPR_32, 3), 4)),
- (add (decimate (shl SGPR_32, 4), 4)),
- (add (decimate (shl SGPR_32, 5), 4)),
- (add (decimate (shl SGPR_32, 6), 4)),
- (add (decimate (shl SGPR_32, 7), 4)),
- (add (decimate (shl SGPR_32, 8), 4)),
- (add (decimate (shl SGPR_32, 9), 4)),
- (add (decimate (shl SGPR_32, 10), 4)),
- (add (decimate (shl SGPR_32, 11), 4)),
- (add (decimate (shl SGPR_32, 12), 4)),
- (add (decimate (shl SGPR_32, 13), 4)),
- (add (decimate (shl SGPR_32, 14), 4)),
- (add (decimate (shl SGPR_32, 15), 4)),
- (add (decimate (shl SGPR_32, 16), 4)),
- (add (decimate (shl SGPR_32, 17), 4)),
- (add (decimate (shl SGPR_32, 18), 4)),
- (add (decimate (shl SGPR_32, 19), 4)),
- (add (decimate (shl SGPR_32, 20), 4)),
- (add (decimate (shl SGPR_32, 21), 4)),
- (add (decimate (shl SGPR_32, 22), 4)),
- (add (decimate (shl SGPR_32, 23), 4)),
- (add (decimate (shl SGPR_32, 24), 4)),
- (add (decimate (shl SGPR_32, 25), 4)),
- (add (decimate (shl SGPR_32, 26), 4)),
- (add (decimate (shl SGPR_32, 27), 4)),
- (add (decimate (shl SGPR_32, 28), 4)),
- (add (decimate (shl SGPR_32, 29), 4)),
- (add (decimate (shl SGPR_32, 30), 4)),
- (add (decimate (shl SGPR_32, 31), 4))]>;
+def SGPR_1024Regs : SIRegisterTuples<getSubRegs<32>.ret, SGPR_32, 105, 4, 32, "s">;
// Trap handler TMP 32-bit registers
def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
@@ -330,51 +262,21 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
}
// Trap handler TMP 64-bit registers
-def TTMP_64Regs : RegisterTuples<getSubRegs<2>.ret,
- [(add (decimate TTMP_32, 2)),
- (add (decimate (shl TTMP_32, 1), 2))]>;
+def TTMP_64Regs : SIRegisterTuples<getSubRegs<2>.ret, TTMP_32, 15, 2, 2, "ttmp">;
// Trap handler TMP 128-bit registers
-def TTMP_128Regs : RegisterTuples<getSubRegs<4>.ret,
- [(add (decimate TTMP_32, 4)),
- (add (decimate (shl TTMP_32, 1), 4)),
- (add (decimate (shl TTMP_32, 2), 4)),
- (add (decimate (shl TTMP_32, 3), 4))]>;
-
-def TTMP_256Regs : RegisterTuples<getSubRegs<8>.ret,
- [(add (decimate TTMP_32, 4)),
- (add (decimate (shl TTMP_32, 1), 4)),
- (add (decimate (shl TTMP_32, 2), 4)),
- (add (decimate (shl TTMP_32, 3), 4)),
- (add (decimate (shl TTMP_32, 4), 4)),
- (add (decimate (shl TTMP_32, 5), 4)),
- (add (decimate (shl TTMP_32, 6), 4)),
- (add (decimate (shl TTMP_32, 7), 4))]>;
-
-def TTMP_512Regs : RegisterTuples<getSubRegs<16>.ret,
- [(add (decimate TTMP_32, 4)),
- (add (decimate (shl TTMP_32, 1), 4)),
- (add (decimate (shl TTMP_32, 2), 4)),
- (add (decimate (shl TTMP_32, 3), 4)),
- (add (decimate (shl TTMP_32, 4), 4)),
- (add (decimate (shl TTMP_32, 5), 4)),
- (add (decimate (shl TTMP_32, 6), 4)),
- (add (decimate (shl TTMP_32, 7), 4)),
- (add (decimate (shl TTMP_32, 8), 4)),
- (add (decimate (shl TTMP_32, 9), 4)),
- (add (decimate (shl TTMP_32, 10), 4)),
- (add (decimate (shl TTMP_32, 11), 4)),
- (add (decimate (shl TTMP_32, 12), 4)),
- (add (decimate (shl TTMP_32, 13), 4)),
- (add (decimate (shl TTMP_32, 14), 4)),
- (add (decimate (shl TTMP_32, 15), 4))]>;
+def TTMP_128Regs : SIRegisterTuples<getSubRegs<4>.ret, TTMP_32, 15, 4, 4, "ttmp">;
+
+def TTMP_256Regs : SIRegisterTuples<getSubRegs<8>.ret, TTMP_32, 15, 4, 8, "ttmp">;
+
+def TTMP_512Regs : SIRegisterTuples<getSubRegs<16>.ret, TTMP_32, 15, 4, 16, "ttmp">;
class TmpRegTuplesBase<int index, int size,
list<Register> subRegs,
list<SubRegIndex> indices = getSubRegs<size>.ret,
int index1 = !add(index, !add(size, -1)),
string name = "ttmp["#index#":"#index1#"]"> :
- SIRegisterWithSubRegs<name, subRegs> {
+ RegisterWithSubRegs<name, subRegs> {
let HWEncoding = subRegs[0].HWEncoding;
let SubRegIndices = indices;
}
@@ -448,196 +350,80 @@ def TTMP0_TTMP1_TTMP2_TTMP3_TTMP4_TTMP5_TTMP6_TTMP7_TTMP8_TTMP9_TTMP10_TTMP11_TT
TTMP8_gfx9_gfx10, TTMP9_gfx9_gfx10, TTMP10_gfx9_gfx10, TTMP11_gfx9_gfx10,
TTMP12_gfx9_gfx10, TTMP13_gfx9_gfx10, TTMP14_gfx9_gfx10, TTMP15_gfx9_gfx10]>;
+class RegisterTypes<list<ValueType> reg_types> {
+ list<ValueType> types = reg_types;
+}
+
+def Reg16Types : RegisterTypes<[i16, f16]>;
+def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
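// The pointer types here are taken to follow the AMDGPU address-space numbering:
// p2 (region), p3 (local), p5 (private) and p6 (32-bit constant) are the 32-bit
// pointers; the 64-bit ones (p0 flat, p1 global, p4 constant) are added to VReg_64
// further down.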
+
+
// VGPR 32-bit registers
// i16/f16 only on VI+
-def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add (sequence "VGPR%u", 0, 255)), Reg32> {
+def VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.types), 32,
+ (add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
}
// VGPR 64-bit registers
-def VGPR_64 : RegisterTuples<getSubRegs<2>.ret,
- [(add (trunc VGPR_32, 255)),
- (add (shl VGPR_32, 1))]>;
+def VGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, VGPR_32, 255, 1, 2, "v">;
// VGPR 96-bit registers
-def VGPR_96 : RegisterTuples<getSubRegs<3>.ret,
- [(add (trunc VGPR_32, 254)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2))]>;
+def VGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, VGPR_32, 255, 1, 3, "v">;
// VGPR 128-bit registers
-def VGPR_128 : RegisterTuples<getSubRegs<4>.ret,
- [(add (trunc VGPR_32, 253)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2)),
- (add (shl VGPR_32, 3))]>;
+def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 255, 1, 4, "v">;
// VGPR 160-bit registers
-def VGPR_160 : RegisterTuples<getSubRegs<5>.ret,
- [(add (trunc VGPR_32, 252)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2)),
- (add (shl VGPR_32, 3)),
- (add (shl VGPR_32, 4))]>;
+def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
// VGPR 256-bit registers
-def VGPR_256 : RegisterTuples<getSubRegs<8>.ret,
- [(add (trunc VGPR_32, 249)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2)),
- (add (shl VGPR_32, 3)),
- (add (shl VGPR_32, 4)),
- (add (shl VGPR_32, 5)),
- (add (shl VGPR_32, 6)),
- (add (shl VGPR_32, 7))]>;
+def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
// VGPR 512-bit registers
-def VGPR_512 : RegisterTuples<getSubRegs<16>.ret,
- [(add (trunc VGPR_32, 241)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2)),
- (add (shl VGPR_32, 3)),
- (add (shl VGPR_32, 4)),
- (add (shl VGPR_32, 5)),
- (add (shl VGPR_32, 6)),
- (add (shl VGPR_32, 7)),
- (add (shl VGPR_32, 8)),
- (add (shl VGPR_32, 9)),
- (add (shl VGPR_32, 10)),
- (add (shl VGPR_32, 11)),
- (add (shl VGPR_32, 12)),
- (add (shl VGPR_32, 13)),
- (add (shl VGPR_32, 14)),
- (add (shl VGPR_32, 15))]>;
+def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">;
// VGPR 1024-bit registers
-def VGPR_1024 : RegisterTuples<getSubRegs<32>.ret,
- [(add (trunc VGPR_32, 225)),
- (add (shl VGPR_32, 1)),
- (add (shl VGPR_32, 2)),
- (add (shl VGPR_32, 3)),
- (add (shl VGPR_32, 4)),
- (add (shl VGPR_32, 5)),
- (add (shl VGPR_32, 6)),
- (add (shl VGPR_32, 7)),
- (add (shl VGPR_32, 8)),
- (add (shl VGPR_32, 9)),
- (add (shl VGPR_32, 10)),
- (add (shl VGPR_32, 11)),
- (add (shl VGPR_32, 12)),
- (add (shl VGPR_32, 13)),
- (add (shl VGPR_32, 14)),
- (add (shl VGPR_32, 15)),
- (add (shl VGPR_32, 16)),
- (add (shl VGPR_32, 17)),
- (add (shl VGPR_32, 18)),
- (add (shl VGPR_32, 19)),
- (add (shl VGPR_32, 20)),
- (add (shl VGPR_32, 21)),
- (add (shl VGPR_32, 22)),
- (add (shl VGPR_32, 23)),
- (add (shl VGPR_32, 24)),
- (add (shl VGPR_32, 25)),
- (add (shl VGPR_32, 26)),
- (add (shl VGPR_32, 27)),
- (add (shl VGPR_32, 28)),
- (add (shl VGPR_32, 29)),
- (add (shl VGPR_32, 30)),
- (add (shl VGPR_32, 31))]>;
+def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">;
// AccVGPR 32-bit registers
def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add (sequence "AGPR%u", 0, 255)), Reg32> {
+ (add (sequence "AGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
}
// AGPR 64-bit registers
-def AGPR_64 : RegisterTuples<getSubRegs<2>.ret,
- [(add (trunc AGPR_32, 255)),
- (add (shl AGPR_32, 1))]>;
+def AGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, AGPR_32, 255, 1, 2, "a">;
// AGPR 128-bit registers
-def AGPR_128 : RegisterTuples<getSubRegs<4>.ret,
- [(add (trunc AGPR_32, 253)),
- (add (shl AGPR_32, 1)),
- (add (shl AGPR_32, 2)),
- (add (shl AGPR_32, 3))]>;
+def AGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, AGPR_32, 255, 1, 4, "a">;
// AGPR 512-bit registers
-def AGPR_512 : RegisterTuples<getSubRegs<16>.ret,
- [(add (trunc AGPR_32, 241)),
- (add (shl AGPR_32, 1)),
- (add (shl AGPR_32, 2)),
- (add (shl AGPR_32, 3)),
- (add (shl AGPR_32, 4)),
- (add (shl AGPR_32, 5)),
- (add (shl AGPR_32, 6)),
- (add (shl AGPR_32, 7)),
- (add (shl AGPR_32, 8)),
- (add (shl AGPR_32, 9)),
- (add (shl AGPR_32, 10)),
- (add (shl AGPR_32, 11)),
- (add (shl AGPR_32, 12)),
- (add (shl AGPR_32, 13)),
- (add (shl AGPR_32, 14)),
- (add (shl AGPR_32, 15))]>;
+def AGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, AGPR_32, 255, 1, 16, "a">;
// AGPR 1024-bit registers
-def AGPR_1024 : RegisterTuples<getSubRegs<32>.ret,
- [(add (trunc AGPR_32, 225)),
- (add (shl AGPR_32, 1)),
- (add (shl AGPR_32, 2)),
- (add (shl AGPR_32, 3)),
- (add (shl AGPR_32, 4)),
- (add (shl AGPR_32, 5)),
- (add (shl AGPR_32, 6)),
- (add (shl AGPR_32, 7)),
- (add (shl AGPR_32, 8)),
- (add (shl AGPR_32, 9)),
- (add (shl AGPR_32, 10)),
- (add (shl AGPR_32, 11)),
- (add (shl AGPR_32, 12)),
- (add (shl AGPR_32, 13)),
- (add (shl AGPR_32, 14)),
- (add (shl AGPR_32, 15)),
- (add (shl AGPR_32, 16)),
- (add (shl AGPR_32, 17)),
- (add (shl AGPR_32, 18)),
- (add (shl AGPR_32, 19)),
- (add (shl AGPR_32, 20)),
- (add (shl AGPR_32, 21)),
- (add (shl AGPR_32, 22)),
- (add (shl AGPR_32, 23)),
- (add (shl AGPR_32, 24)),
- (add (shl AGPR_32, 25)),
- (add (shl AGPR_32, 26)),
- (add (shl AGPR_32, 27)),
- (add (shl AGPR_32, 28)),
- (add (shl AGPR_32, 29)),
- (add (shl AGPR_32, 30)),
- (add (shl AGPR_32, 31))]>;
+def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
//===----------------------------------------------------------------------===//
// Register classes used as source and destination
//===----------------------------------------------------------------------===//
def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG), Reg32> {
+ (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
}
def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
- (add PRIVATE_RSRC_REG), Reg128> {
+ (add PRIVATE_RSRC_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
}
def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add LDS_DIRECT), Reg32> {
+ (add LDS_DIRECT)> {
let isAllocatable = 0;
let CopyCost = -1;
}
@@ -648,41 +434,40 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1
(add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, XNACK_MASK_LO, XNACK_MASK_HI,
SGPR_NULL, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, SRC_POPS_EXITING_WAVE_ID,
- SRC_VCCZ, SRC_EXECZ, SRC_SCC), Reg32> {
+ SRC_VCCZ, SRC_EXECZ, SRC_SCC)> {
let AllocationPriority = 10;
}
def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
- (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS), Reg32> {
+ (add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
let AllocationPriority = 10;
}
def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
- (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI), Reg32> {
+ (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 10;
}
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
- (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI), Reg32> {
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
let AllocationPriority = 10;
}
def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
- (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS),
- Reg32> {
+ (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
}
def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16], 32,
- (add SGPR_64Regs), Reg64> {
+ (add SGPR_64Regs)> {
let CopyCost = 1;
let AllocationPriority = 11;
}
// CCR (call clobbered registers) SGPR 64-bit registers
def CCR_SGPR_64 : RegisterClass<"AMDGPU", SGPR_64.RegTypes, 32,
- (add (trunc SGPR_64, 16)), Reg64> {
+ (add (trunc SGPR_64, 16))> {
let CopyCost = SGPR_64.CopyCost;
let AllocationPriority = SGPR_64.AllocationPriority;
}
@@ -693,13 +478,13 @@ def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32,
}
def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
- (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA), Reg64> {
+ (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
let CopyCost = 1;
let AllocationPriority = 13;
}
def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, i1, v4i16, v4f16], 32,
- (add SReg_64_XEXEC, EXEC), Reg64> {
+ (add SReg_64_XEXEC, EXEC)> {
let CopyCost = 1;
let AllocationPriority = 13;
}
@@ -722,17 +507,17 @@ let CopyCost = 2 in {
// There are no 3-component scalar instructions, but this is needed
// for symmetry with VGPRs.
def SGPR_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
- (add SGPR_96Regs), Reg96> {
+ (add SGPR_96Regs)> {
let AllocationPriority = 14;
}
def SReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32,
- (add SGPR_96), Reg96> {
+ (add SGPR_96)> {
let AllocationPriority = 14;
}
def SGPR_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
- (add SGPR_128Regs), Reg128> {
+ (add SGPR_128Regs)> {
let AllocationPriority = 15;
}
@@ -742,8 +527,9 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
}
def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add SGPR_128, TTMP_128), Reg128> {
+ (add SGPR_128, TTMP_128)> {
let AllocationPriority = 15;
+ let isAllocatable = 0;
}
} // End CopyCost = 2
@@ -751,17 +537,16 @@ def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
// There are no 5-component scalar instructions, but this is needed
// for symmetry with VGPRs.
def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add SGPR_160Regs), Reg160> {
+ (add SGPR_160Regs)> {
let AllocationPriority = 16;
}
def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add SGPR_160), Reg160> {
+ (add SGPR_160)> {
let AllocationPriority = 16;
}
-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs),
- Reg256> {
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
let AllocationPriority = 17;
}
@@ -770,14 +555,14 @@ def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
}
def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
- (add SGPR_256, TTMP_256), Reg256> {
+ (add SGPR_256, TTMP_256)> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
let AllocationPriority = 17;
}
def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add SGPR_512Regs), Reg512> {
+ (add SGPR_512Regs)> {
let AllocationPriority = 18;
}
@@ -787,31 +572,31 @@ def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
}
def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add SGPR_512, TTMP_512), Reg512> {
+ (add SGPR_512, TTMP_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
let AllocationPriority = 18;
}
def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add VGPR_32, LDS_DIRECT_CLASS), Reg32> {
+ (add VGPR_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
}
def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add SGPR_1024Regs), Reg1024> {
+ (add SGPR_1024Regs)> {
let AllocationPriority = 19;
}
def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add SGPR_1024), Reg1024> {
+ (add SGPR_1024)> {
let CopyCost = 16;
let AllocationPriority = 19;
}
// Register class for all vector registers (VGPRs + Interpolation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
- (add VGPR_64), Reg64> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], 32,
+ (add VGPR_64)> {
let Size = 64;
// Requires 2 v_mov_b32 to copy
@@ -819,7 +604,7 @@ def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32
let AllocationPriority = 2;
}
-def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96> {
+def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> {
let Size = 96;
// Requires 3 v_mov_b32 to copy
@@ -828,7 +613,7 @@ def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96), Reg96>
}
def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add VGPR_128), Reg128> {
+ (add VGPR_128)> {
let Size = 128;
// Requires 4 v_mov_b32 to copy
@@ -837,7 +622,7 @@ def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
}
def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add VGPR_160), Reg160> {
+ (add VGPR_160)> {
let Size = 160;
// Requires 5 v_mov_b32 to copy
@@ -846,28 +631,28 @@ def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
}
def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
- (add VGPR_256), Reg256> {
+ (add VGPR_256)> {
let Size = 256;
let CopyCost = 8;
let AllocationPriority = 6;
}
def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add VGPR_512), Reg512> {
+ (add VGPR_512)> {
let Size = 512;
let CopyCost = 16;
let AllocationPriority = 7;
}
def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add VGPR_1024), Reg1024> {
+ (add VGPR_1024)> {
let Size = 1024;
let CopyCost = 32;
let AllocationPriority = 8;
}
def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
- (add AGPR_64), Reg64> {
+ (add AGPR_64)> {
let Size = 64;
let CopyCost = 5;
@@ -875,7 +660,7 @@ def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32
}
def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add AGPR_128), Reg128> {
+ (add AGPR_128)> {
let Size = 128;
// Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr
@@ -884,40 +669,39 @@ def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
}
def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add AGPR_512), Reg512> {
+ (add AGPR_512)> {
let Size = 512;
let CopyCost = 33;
let AllocationPriority = 7;
}
def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add AGPR_1024), Reg1024> {
+ (add AGPR_1024)> {
let Size = 1024;
let CopyCost = 65;
let AllocationPriority = 8;
}
-def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32), Reg32> {
- let Size = 32;
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
+ let Size = 1;
}
def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add VGPR_32, SReg_32, LDS_DIRECT_CLASS), Reg32> {
+ (add VGPR_32, SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
}
-def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64),
- Reg64> {
+def VS_64 : RegisterClass<"AMDGPU", [i64, f64], 32, (add VReg_64, SReg_64)> {
let isAllocatable = 0;
}
def AV_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add AGPR_32, VGPR_32), Reg32> {
+ (add AGPR_32, VGPR_32)> {
let isAllocatable = 0;
}
def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
- (add AReg_64, VReg_64), Reg64> {
+ (add AReg_64, VReg_64)> {
let isAllocatable = 0;
}
diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 7ee178149c7a..8afca2cdc325 100644
--- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -77,8 +77,8 @@ static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
// Try to fold Src0
MachineOperand &Src0 = MI.getOperand(Src0Idx);
if (Src0.isReg()) {
- unsigned Reg = Src0.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
+ Register Reg = Src0.getReg();
+ if (Register::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) {
MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
if (Def && Def->isMoveImmediate()) {
MachineOperand &MovSrc = Def->getOperand(1);
@@ -360,8 +360,7 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
}
if (NewImm != 0) {
- if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
- SrcReg->isReg()) {
+ if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) {
MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
return true;
@@ -394,12 +393,11 @@ static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
if (!MO.isReg())
continue;
- if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
- TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (Register::isPhysicalRegister(Reg) &&
+ Register::isPhysicalRegister(MO.getReg())) {
if (TRI.regsOverlap(Reg, MO.getReg()))
return true;
- } else if (MO.getReg() == Reg &&
- TargetRegisterInfo::isVirtualRegister(Reg)) {
+ } else if (MO.getReg() == Reg && Register::isVirtualRegister(Reg)) {
LaneBitmask Overlap = TRI.getSubRegIndexLaneMask(SubReg) &
TRI.getSubRegIndexLaneMask(MO.getSubReg());
if (Overlap.any())
@@ -425,7 +423,7 @@ static TargetInstrInfo::RegSubRegPair
getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
const SIRegisterInfo &TRI, const MachineRegisterInfo &MRI) {
if (TRI.getRegSizeInBits(Reg, MRI) != 32) {
- if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (Register::isPhysicalRegister(Reg)) {
Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
} else {
LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
@@ -459,13 +457,13 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
assert(MovT.getOpcode() == AMDGPU::V_MOV_B32_e32 ||
MovT.getOpcode() == AMDGPU::COPY);
- unsigned T = MovT.getOperand(0).getReg();
+ Register T = MovT.getOperand(0).getReg();
unsigned Tsub = MovT.getOperand(0).getSubReg();
MachineOperand &Xop = MovT.getOperand(1);
if (!Xop.isReg())
return nullptr;
- unsigned X = Xop.getReg();
+ Register X = Xop.getReg();
unsigned Xsub = Xop.getSubReg();
unsigned Size = TII->getOpSize(MovT, 0) / 4;
@@ -484,7 +482,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
MovY.getOperand(1).getSubReg() != Tsub)
continue;
- unsigned Y = MovY.getOperand(0).getReg();
+ Register Y = MovY.getOperand(0).getReg();
unsigned Ysub = MovY.getOperand(0).getSubReg();
if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
@@ -579,7 +577,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// XXX - not exactly a check for post-regalloc run.
MachineOperand &Src = MI.getOperand(1);
if (Src.isImm() &&
- TargetRegisterInfo::isPhysicalRegister(MI.getOperand(0).getReg())) {
+ Register::isPhysicalRegister(MI.getOperand(0).getReg())) {
int32_t ReverseImm;
if (isReverseInlineImm(TII, Src, ReverseImm)) {
MI.setDesc(TII->get(AMDGPU::V_BFREV_B32_e32));
@@ -643,8 +641,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// FIXME: This could work better if hints worked with subregisters. If
// we have a vector add of a constant, we usually don't get the correct
// allocation due to the subregister usage.
- if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
- Src0->isReg()) {
+ if (Register::isVirtualRegister(Dest->getReg()) && Src0->isReg()) {
MRI.setRegAllocationHint(Dest->getReg(), 0, Src0->getReg());
MRI.setRegAllocationHint(Src0->getReg(), 0, Dest->getReg());
continue;
@@ -672,8 +669,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
const MachineOperand &Dst = MI.getOperand(0);
MachineOperand &Src = MI.getOperand(1);
- if (Src.isImm() &&
- TargetRegisterInfo::isPhysicalRegister(Dst.getReg())) {
+ if (Src.isImm() && Register::isPhysicalRegister(Dst.getReg())) {
int32_t ReverseImm;
if (isKImmOperand(TII, Src))
MI.setDesc(TII->get(AMDGPU::S_MOVK_I32));
@@ -721,8 +717,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
int Op32 = AMDGPU::getVOPe32(MI.getOpcode());
if (TII->isVOPC(Op32)) {
- unsigned DstReg = MI.getOperand(0).getReg();
- if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ Register DstReg = MI.getOperand(0).getReg();
+ if (Register::isVirtualRegister(DstReg)) {
// VOPC instructions can only write to the VCC register. We can't
// force them to use VCC here, because this is only one register and
// cannot deal with sequences which would require multiple copies of
@@ -745,8 +741,8 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (!Src2->isReg())
continue;
- unsigned SReg = Src2->getReg();
- if (TargetRegisterInfo::isVirtualRegister(SReg)) {
+ Register SReg = Src2->getReg();
+ if (Register::isVirtualRegister(SReg)) {
MRI.setRegAllocationHint(SReg, 0, VCCReg);
continue;
}
@@ -766,7 +762,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
bool Next = false;
if (SDst->getReg() != VCCReg) {
- if (TargetRegisterInfo::isVirtualRegister(SDst->getReg()))
+ if (Register::isVirtualRegister(SDst->getReg()))
MRI.setRegAllocationHint(SDst->getReg(), 0, VCCReg);
Next = true;
}
@@ -774,7 +770,7 @@ bool SIShrinkInstructions::runOnMachineFunction(MachineFunction &MF) {
// All of the instructions with carry outs also have an SGPR input in
// src2.
if (Src2 && Src2->getReg() != VCCReg) {
- if (TargetRegisterInfo::isVirtualRegister(Src2->getReg()))
+ if (Register::isVirtualRegister(Src2->getReg()))
MRI.setRegAllocationHint(Src2->getReg(), 0, VCCReg);
Next = true;
}
diff --git a/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 4e07efff55d8..cb4cf68d709a 100644
--- a/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -273,12 +273,12 @@ void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
if (!Use.isReg() || !Use.isUse())
continue;
- unsigned Reg = Use.getReg();
+ Register Reg = Use.getReg();
// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.
- if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!Register::isVirtualRegister(Reg)) {
if (Reg == AMDGPU::EXEC || Reg == AMDGPU::EXEC_LO)
continue;
@@ -312,6 +312,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
SmallVector<MachineInstr *, 4> SetInactiveInstrs;
+ SmallVector<MachineInstr *, 4> SoftWQMInstrs;
// We need to visit the basic blocks in reverse post-order so that we visit
// defs before uses, in particular so that we don't accidentally mark an
@@ -340,6 +341,10 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// correct, so we need it to be in WQM.
Flags = StateWQM;
LowerToCopyInstrs.push_back(&MI);
+ } else if (Opcode == AMDGPU::SOFT_WQM) {
+ LowerToCopyInstrs.push_back(&MI);
+ SoftWQMInstrs.push_back(&MI);
+ continue;
} else if (Opcode == AMDGPU::WWM) {
// The WWM intrinsic doesn't make the same guarantee, and plus it needs
// to be executed in WQM or Exact so that its copy doesn't clobber
@@ -356,8 +361,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
if (Inactive.isUndef()) {
LowerToCopyInstrs.push_back(&MI);
} else {
- unsigned Reg = Inactive.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ Register Reg = Inactive.getReg();
+ if (Register::isVirtualRegister(Reg)) {
for (MachineInstr &DefMI : MRI->def_instructions(Reg))
markInstruction(DefMI, StateWWM, Worklist);
}
@@ -385,9 +390,9 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
- if (!TRI->isVirtualRegister(Reg) &&
+ if (!Register::isVirtualRegister(Reg) &&
TRI->hasVectorRegisters(TRI->getPhysRegClass(Reg))) {
Flags = StateWQM;
break;
@@ -407,9 +412,12 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
// Make sure that any SET_INACTIVE instructions are computed in WQM if WQM is
// ever used anywhere in the function. This implements the corresponding
// semantics of @llvm.amdgcn.set.inactive.
+ // Similarly for SOFT_WQM instructions, implementing @llvm.amdgcn.softwqm.
if (GlobalFlags & StateWQM) {
for (MachineInstr *MI : SetInactiveInstrs)
markInstruction(*MI, StateWQM, Worklist);
+ for (MachineInstr *MI : SoftWQMInstrs)
+ markInstruction(*MI, StateWQM, Worklist);
}
return GlobalFlags;
@@ -548,7 +556,7 @@ bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before) {
- unsigned SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ Register SaveReg = MRI->createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
MachineInstr *Save =
BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::COPY), SaveReg)
@@ -832,7 +840,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();
- unsigned Dest = MI->getOperand(0).getReg();
+ Register Dest = MI->getOperand(0).getReg();
MachineInstr *Copy =
BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
.addReg(LiveMaskReg);
@@ -847,13 +855,12 @@ void SIWholeQuadMode::lowerCopyInstrs() {
for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
MI->RemoveOperand(i);
- const unsigned Reg = MI->getOperand(0).getReg();
+ const Register Reg = MI->getOperand(0).getReg();
if (TRI->isVGPR(*MRI, Reg)) {
- const TargetRegisterClass *regClass =
- TargetRegisterInfo::isVirtualRegister(Reg)
- ? MRI->getRegClass(Reg)
- : TRI->getPhysRegClass(Reg);
+ const TargetRegisterClass *regClass = Register::isVirtualRegister(Reg)
+ ? MRI->getRegClass(Reg)
+ : TRI->getPhysRegClass(Reg);
const unsigned MovOp = TII->getMovOpcode(regClass);
MI->setDesc(TII->get(MovOp));
@@ -885,7 +892,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(Exec);
- if (!(GlobalFlags & StateWWM))
+ if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
return !LiveMaskQueries.empty();
} else {
// Store a copy of the original live mask when required
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 1b410b6b5912..1a74ebbf8165 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -793,9 +793,18 @@ multiclass SMLoad_Pattern <string Instr, ValueType vt> {
// selector to prefer those.
let AddedComplexity = 100 in {
-defm : SMRD_Pattern <"S_LOAD_DWORD", i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX2", v2i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX4", v4i32>;
+foreach vt = Reg32Types.types in {
+defm : SMRD_Pattern <"S_LOAD_DWORD", vt>;
+}
+
+foreach vt = SReg_64.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX2", vt>;
+}
+
+foreach vt = SReg_128.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>;
+}
+
defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index dfafdccc05a3..d31a49f428ee 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -181,7 +181,9 @@ def S_BCNT0_I32_B64 : SOP1_32_64 <"s_bcnt0_i32_b64">;
def S_BCNT1_I32_B32 : SOP1_32 <"s_bcnt1_i32_b32",
[(set i32:$sdst, (ctpop i32:$src0))]
>;
-def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64">;
+def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64",
+ [(set i32:$sdst, (ctpop i64:$src0))]
+>;
} // End Defs = [SCC]
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
@@ -417,16 +419,16 @@ def S_SUBB_U32 : SOP2_32 <"s_subb_u32",
let isCommutable = 1 in {
def S_MIN_I32 : SOP2_32 <"s_min_i32",
- [(set i32:$sdst, (UniformBinFrag<smin> i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (smin i32:$src0, i32:$src1))]
>;
def S_MIN_U32 : SOP2_32 <"s_min_u32",
- [(set i32:$sdst, (UniformBinFrag<umin> i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (umin i32:$src0, i32:$src1))]
>;
def S_MAX_I32 : SOP2_32 <"s_max_i32",
- [(set i32:$sdst, (UniformBinFrag<smax> i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (smax i32:$src0, i32:$src1))]
>;
def S_MAX_U32 : SOP2_32 <"s_max_u32",
- [(set i32:$sdst, (UniformBinFrag<umax> i32:$src0, i32:$src1))]
+ [(set i32:$sdst, (umax i32:$src0, i32:$src1))]
>;
} // End isCommutable = 1
} // End Defs = [SCC]
@@ -853,13 +855,13 @@ class SOPC_Base <bits<7> op, RegisterOperand rc0, RegisterOperand rc1,
let Defs = [SCC];
}
class SOPC_Helper <bits<7> op, RegisterOperand rc, ValueType vt,
- string opName, PatLeaf cond> : SOPC_Base <
+ string opName, SDPatternOperator cond> : SOPC_Base <
op, rc, rc, opName,
[(set SCC, (si_setcc_uniform vt:$src0, vt:$src1, cond))] > {
}
class SOPC_CMP_32<bits<7> op, string opName,
- PatLeaf cond = COND_NULL, string revOp = opName>
+ SDPatternOperator cond = COND_NULL, string revOp = opName>
: SOPC_Helper<op, SSrc_b32, i32, opName, cond>,
Commutable_REV<revOp, !eq(revOp, opName)>,
SOPKInstTable<0, opName> {
@@ -868,7 +870,7 @@ class SOPC_CMP_32<bits<7> op, string opName,
}
class SOPC_CMP_64<bits<7> op, string opName,
- PatLeaf cond = COND_NULL, string revOp = opName>
+ SDPatternOperator cond = COND_NULL, string revOp = opName>
: SOPC_Helper<op, SSrc_b64, i64, opName, cond>,
Commutable_REV<revOp, !eq(revOp, opName)> {
let isCompare = 1;
@@ -1076,8 +1078,6 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
[(int_amdgcn_s_barrier)]> {
let SchedRW = [WriteBarrier];
let simm16 = 0;
- let mayLoad = 1;
- let mayStore = 1;
let isConvergent = 1;
}
@@ -1090,7 +1090,7 @@ def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16",
- [(int_amdgcn_s_waitcnt UIMM16bit:$simm16)]>;
+ [(int_amdgcn_s_waitcnt timm:$simm16)]>;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
@@ -1099,7 +1099,7 @@ def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
// maximum reported is 960 cycles, so 960 / 64 = 15 max, so is the
// maximum really 15 on VI?
def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
- "s_sleep $simm16", [(int_amdgcn_s_sleep SIMM16bit:$simm16)]> {
+ "s_sleep $simm16", [(int_amdgcn_s_sleep timm:$simm16)]> {
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
@@ -1110,12 +1110,11 @@ def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
let Uses = [EXEC, M0] in {
// FIXME: Should this be mayLoad+mayStore?
def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16), "s_sendmsg $simm16",
- [(AMDGPUsendmsg (i32 imm:$simm16))]
->;
+ [(int_amdgcn_s_sendmsg (i32 timm:$simm16), M0)]>;
def S_SENDMSGHALT : SOPP <0x00000011, (ins SendMsgImm:$simm16), "s_sendmsghalt $simm16",
- [(AMDGPUsendmsghalt (i32 imm:$simm16))]
->;
+ [(int_amdgcn_s_sendmsghalt (i32 timm:$simm16), M0)]>;
+
} // End Uses = [EXEC, M0]
def S_TRAP : SOPP <0x00000012, (ins i16imm:$simm16), "s_trap $simm16"> {
@@ -1126,13 +1125,13 @@ def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
let simm16 = 0;
}
def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16",
- [(int_amdgcn_s_incperflevel SIMM16bit:$simm16)]> {
+ [(int_amdgcn_s_incperflevel timm:$simm16)]> {
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
}
def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16",
- [(int_amdgcn_s_decperflevel SIMM16bit:$simm16)]> {
+ [(int_amdgcn_s_decperflevel timm:$simm16)]> {
let hasSideEffects = 1;
let mayLoad = 1;
let mayStore = 1;
@@ -1169,7 +1168,10 @@ let SubtargetPredicate = isGFX10Plus in {
def S_ROUND_MODE :
SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
def S_DENORM_MODE :
- SOPP<0x025, (ins s16imm:$simm16), "s_denorm_mode $simm16">;
+ SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16",
+ [(SIdenorm_mode (i32 timm:$simm16))]> {
+ let hasSideEffects = 1;
+ }
def S_TTRACEDATA_IMM :
SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">;
} // End SubtargetPredicate = isGFX10Plus
@@ -1178,7 +1180,7 @@ let SubtargetPredicate = isGFX10Plus in {
// S_GETREG_B32 Intrinsic Pattern.
//===----------------------------------------------------------------------===//
def : GCNPat <
- (int_amdgcn_s_getreg imm:$simm16),
+ (int_amdgcn_s_getreg timm:$simm16),
(S_GETREG_B32 (as_i16imm $simm16))
>;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index e90f40e6abea..afb2fd987afd 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -131,29 +131,70 @@ int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels) {
struct MUBUFInfo {
uint16_t Opcode;
uint16_t BaseOpcode;
- uint8_t dwords;
+ uint8_t elements;
bool has_vaddr;
bool has_srsrc;
bool has_soffset;
};
+struct MTBUFInfo {
+ uint16_t Opcode;
+ uint16_t BaseOpcode;
+ uint8_t elements;
+ bool has_vaddr;
+ bool has_srsrc;
+ bool has_soffset;
+};
+
+#define GET_MTBUFInfoTable_DECL
+#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
+int getMTBUFBaseOpcode(unsigned Opc) {
+ const MTBUFInfo *Info = getMTBUFInfoFromOpcode(Opc);
+ return Info ? Info->BaseOpcode : -1;
+}
+
+int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements) {
+ const MTBUFInfo *Info = getMTBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements);
+ return Info ? Info->Opcode : -1;
+}
+
+int getMTBUFElements(unsigned Opc) {
+ const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+ return Info ? Info->elements : 0;
+}
+
+bool getMTBUFHasVAddr(unsigned Opc) {
+ const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+ return Info ? Info->has_vaddr : false;
+}
+
+bool getMTBUFHasSrsrc(unsigned Opc) {
+ const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+ return Info ? Info->has_srsrc : false;
+}
+
+bool getMTBUFHasSoffset(unsigned Opc) {
+ const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc);
+ return Info ? Info->has_soffset : false;
+}
+
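// Presumed usage mirrors the existing MUBUF helpers below (illustrative only, the
// variable names are not from this patch):
//   int NewOpc = getMTBUFOpcode(getMTBUFBaseOpcode(MI.getOpcode()), MergedElements);
// which yields -1 when no variant with the requested element count exists.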
int getMUBUFBaseOpcode(unsigned Opc) {
const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc);
return Info ? Info->BaseOpcode : -1;
}
-int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords) {
- const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndDwords(BaseOpc, Dwords);
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements) {
+ const MUBUFInfo *Info = getMUBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements);
return Info ? Info->Opcode : -1;
}
-int getMUBUFDwords(unsigned Opc) {
+int getMUBUFElements(unsigned Opc) {
const MUBUFInfo *Info = getMUBUFOpcodeHelper(Opc);
- return Info ? Info->dwords : 0;
+ return Info ? Info->elements : 0;
}
bool getMUBUFHasVAddr(unsigned Opc) {
@@ -241,7 +282,7 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
}
unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) {
- return getMaxWavesPerEU() * getEUsPerCU(STI);
+ return getMaxWavesPerEU(STI) * getEUsPerCU(STI);
}
unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
@@ -253,9 +294,11 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
return 1;
}
-unsigned getMaxWavesPerEU() {
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
// FIXME: Need to take scratch memory into account.
- return 10;
+ if (!isGFX10(*STI))
+ return 10;
+ return 20;
}
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
@@ -317,7 +360,7 @@ unsigned getMinNumSGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
if (Version.Major >= 10)
return 0;
- if (WavesPerEU >= getMaxWavesPerEU())
+ if (WavesPerEU >= getMaxWavesPerEU(STI))
return 0;
unsigned MinNumSGPRs = getTotalNumSGPRs(STI) / (WavesPerEU + 1);
@@ -394,17 +437,19 @@ unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
}
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
- return 256;
+ if (!isGFX10(*STI))
+ return 256;
+ return STI->getFeatureBits().test(FeatureWavefrontSize32) ? 1024 : 512;
}
unsigned getAddressableNumVGPRs(const MCSubtargetInfo *STI) {
- return getTotalNumVGPRs(STI);
+ return 256;
}
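// Rough reading of the two changes above: getTotalNumVGPRs() now reports the VGPRs
// available across a SIMD in wave-sized units (1024 in wave32 mode, 512 in wave64
// mode on GFX10), while getAddressableNumVGPRs() stays at 256, the most a single
// wave can encode in an instruction.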
unsigned getMinNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU) {
assert(WavesPerEU != 0);
- if (WavesPerEU >= getMaxWavesPerEU())
+ if (WavesPerEU >= getMaxWavesPerEU(STI))
return 0;
unsigned MinNumVGPRs =
alignDown(getTotalNumVGPRs(STI) / (WavesPerEU + 1),
@@ -510,7 +555,7 @@ bool isReadOnlySegment(const GlobalValue *GV) {
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
- return TT.getOS() != Triple::AMDHSA;
+ return TT.getOS() == Triple::AMDPAL;
}
int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 209ef7eef749..f78dadd447ff 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -94,7 +94,7 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);
/// \returns Maximum number of waves per execution unit for given subtarget \p
/// STI without any kind of limitation.
-unsigned getMaxWavesPerEU();
+unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI);
/// \returns Maximum number of waves per execution unit for given subtarget \p
/// STI and limited by given \p FlatWorkGroupSize.
@@ -264,13 +264,31 @@ LLVM_READONLY
const MIMGInfo *getMIMGInfo(unsigned Opc);
LLVM_READONLY
+int getMTBUFBaseOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements);
+
+LLVM_READONLY
+int getMTBUFElements(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasVAddr(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasSrsrc(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasSoffset(unsigned Opc);
+
+LLVM_READONLY
int getMUBUFBaseOpcode(unsigned Opc);
LLVM_READONLY
-int getMUBUFOpcode(unsigned BaseOpc, unsigned Dwords);
+int getMUBUFOpcode(unsigned BaseOpc, unsigned Elements);
LLVM_READONLY
-int getMUBUFDwords(unsigned Opc);
+int getMUBUFElements(unsigned Opc);
LLVM_READONLY
bool getMUBUFHasVAddr(unsigned Opc);
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index db20d5ccf5f9..207e4232e829 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -21,6 +21,8 @@
#include "SIDefines.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/AMDGPUMetadata.h"
#include "llvm/Support/EndianStream.h"
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index 6bc416ed7d4b..f1cdc3097dc0 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -104,9 +104,21 @@ multiclass VOP1Inst <string opName, VOPProfile P,
SDPatternOperator node = null_frag> {
def _e32 : VOP1_Pseudo <opName, P>;
def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
- def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
+
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP1_DPP_Pseudo <opName, P>;
+
+ def : MnemonicAlias<opName#"_e32", opName>, LetDummies;
+ def : MnemonicAlias<opName#"_e64", opName>, LetDummies;
+
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ def : MnemonicAlias<opName#"_sdwa", opName>, LetDummies;
+
+ foreach _ = BoolToList<P.HasExtDPP>.ret in
+ def : MnemonicAlias<opName#"_dpp", opName>, LetDummies;
}
// Special profile for instructions which have clamp
@@ -227,10 +239,10 @@ defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
} // End SchedRW = [WriteQuarterRate32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
-defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32>;
-defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32>;
+defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>;
+defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>;
defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
-defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32>;
+defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
let SchedRW = [WriteDoubleAdd] in {
defm V_FREXP_EXP_I32_F64 : VOP1Inst <"v_frexp_exp_i32_f64", VOP_I32_F64, int_amdgcn_frexp_exp>;
@@ -434,7 +446,7 @@ let SubtargetPredicate = isGFX10Plus in {
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
-class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> :
+class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 = 0> :
VOP_DPP<ps.OpName, p, isDPP16> {
let hasSideEffects = ps.hasSideEffects;
let Defs = ps.Defs;
@@ -448,8 +460,9 @@ class VOP1_DPP<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP16 =
let Inst{31-25} = 0x3f;
}
-class VOP1_DPP16<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
- VOP1_DPP<op, ps, p, 1> {
+class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> :
+ VOP1_DPP<op, ps, p, 1>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10> {
let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
let SubtargetPredicate = HasDPP16;
}
@@ -492,6 +505,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
VOP3e_gfx10<{0, 1, 1, op{6-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP1_Real_sdwa_gfx10<bits<9> op> {
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae<op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
@@ -499,11 +513,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP1_Real_dpp_gfx10<bits<9> op> {
- def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx10 : VOP1_DPP16<op{7-0}, !cast<VOP1_DPP_Pseudo>(NAME#"_dpp")> {
let DecoderNamespace = "SDWA10";
}
}
multiclass VOP1_Real_dpp8_gfx10<bits<9> op> {
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_gfx10 : VOP1_DPP8<op{7-0}, !cast<VOP1_Pseudo>(NAME#"_e32")> {
let DecoderNamespace = "DPP8";
}
@@ -704,10 +720,12 @@ multiclass VOP1_Real_e32e64_vi <bits<10> op> {
multiclass VOP1_Real_vi <bits<10> op> {
defm NAME : VOP1_Real_e32e64_vi <op>;
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWAe <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -831,25 +849,25 @@ def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
-let OtherPredicates = [isGFX8GFX9] in {
+let OtherPredicates = [isGFX8Plus] in {
def : GCNPat <
- (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
- imm:$bound_ctrl)),
+ (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
+ timm:$bound_ctrl)),
(V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl),
(as_i32imm $row_mask), (as_i32imm $bank_mask),
(as_i1imm $bound_ctrl))
>;
def : GCNPat <
- (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask,
- imm:$bank_mask, imm:$bound_ctrl)),
+ (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, timm:$row_mask,
+ timm:$bank_mask, timm:$bound_ctrl)),
(V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl),
(as_i32imm $row_mask), (as_i32imm $bank_mask),
(as_i1imm $bound_ctrl))
>;
-} // End OtherPredicates = [isGFX8GFX9]
+} // End OtherPredicates = [isGFX8Plus]
let OtherPredicates = [isGFX8Plus] in {
def : GCNPat<
@@ -885,6 +903,7 @@ multiclass VOP1_Real_gfx9 <bits<10> op> {
defm NAME : VOP1_Real_e32e64_vi <op>;
}
+ foreach _ = BoolToList<!cast<VOP1_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP1_SDWA9Ae <op{7-0}, !cast<VOP1_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -904,23 +923,7 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
let OtherPredicates = [isGFX10Plus] in {
def : GCNPat <
- (i32 (int_amdgcn_mov_dpp8 i32:$src, imm:$dpp8)),
+ (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
(V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0))
>;
-
-def : GCNPat <
- (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
- imm:$bound_ctrl)),
- (V_MOV_B32_dpp_gfx10 $src, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl), (i32 0))
->;
-
-def : GCNPat <
- (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, imm:$dpp_ctrl, imm:$row_mask,
- imm:$bank_mask, imm:$bound_ctrl)),
- (V_MOV_B32_dpp_gfx10 $old, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl), (i32 0))
->;
} // End OtherPredicates = [isGFX10Plus]
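// The hunks above and below lean on the same conditional-definition idiom:
// "foreach _ = BoolToList<...HasExtSDWA/HasExtDPP>.ret in def ..." instantiates
// a real encoding only when the pseudo's profile actually has that extension.
// BoolToList itself lives elsewhere in the AMDGPU backend; the sketch below is
// a paraphrase for illustration, not a line from this patch.
class BoolToList<bit Value> {
  list<int> ret = !if(Value, [1]<int>, []<int>); // one element when set, empty when clear
}
// foreach over the empty list expands to nothing, so a guarded def is simply
// skipped when the corresponding _sdwa/_dpp pseudo was never created.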
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index 1b30cd2ed516..1ab0fc1ab58d 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -147,7 +147,8 @@ multiclass VOP2Inst_sdwa<string opName,
string revOp = opName,
bit GFX9Renamed = 0> {
let renamedInGFX9 = GFX9Renamed in {
- def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
} // End renamedInGFX9 = GFX9Renamed
}
@@ -179,9 +180,10 @@ multiclass VOP2bInst <string opName,
let usesCustomInserter = !eq(P.NumSrcArgs, 2);
}
- def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
- let AsmMatchConverter = "cvtSdwaVOP2b";
- }
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2b";
+ }
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP2_DPP_Pseudo <opName, P>;
}
@@ -220,9 +222,10 @@ multiclass VOP2eInst <string opName,
def _e32 : VOP2_Pseudo <opName, P>,
Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
- def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
- let AsmMatchConverter = "cvtSdwaVOP2b";
- }
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
+ def _sdwa : VOP2_SDWA_Pseudo <opName, P> {
+ let AsmMatchConverter = "cvtSdwaVOP2e";
+ }
foreach _ = BoolToList<P.HasExtDPP>.ret in
def _dpp : VOP2_DPP_Pseudo <opName, P>;
@@ -251,7 +254,9 @@ multiclass VOP2eInstAliases<VOP2_Pseudo ps, VOP2_Real inst> {
class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
- field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
+ field dag Ins32 = !if(!eq(vt.Size, 32),
+ (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm),
+ (ins VCSrc_f16:$src0, VGPR_32:$src1, ImmOpType:$imm));
field bit HasExt = 0;
// Hack to stop printing _e64
@@ -519,7 +524,7 @@ def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE,
} // End isConvergent = 1
defm V_BFM_B32 : VOP2Inst <"v_bfm_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
-defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>>;
+defm V_BCNT_U32_B32 : VOP2Inst <"v_bcnt_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, add_ctpop>;
defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_lo>;
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
@@ -539,9 +544,9 @@ defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfma
let SubtargetPredicate = isGFX6GFX7GFX10 in {
let isCommutable = 1 in {
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>;
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>;
} // End isCommutable = 1
} // End SubtargetPredicate = isGFX6GFX7GFX10
@@ -606,9 +611,9 @@ def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">;
defm V_LDEXP_F16 : VOP2Inst <"v_ldexp_f16", VOP_F16_F16_I32, AMDGPUldexp>;
} // End FPDPRounding = 1
-defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>;
-defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>;
-defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>;
+defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16, lshl_rev>;
+defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16, lshr_rev>;
+defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>;
let isCommutable = 1 in {
let FPDPRounding = 1 in {
@@ -618,16 +623,16 @@ defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub
defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
} // End FPDPRounding = 1
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>;
+defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>;
+defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>;
defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
-defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16>;
+defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
-defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16>;
-defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16>;
-defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16>;
-defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16>;
+defm V_MAX_U16 : VOP2Inst <"v_max_u16", VOP_I16_I16_I16, umax>;
+defm V_MAX_I16 : VOP2Inst <"v_max_i16", VOP_I16_I16_I16, smax>;
+defm V_MIN_U16 : VOP2Inst <"v_min_u16", VOP_I16_I16_I16, umin>;
+defm V_MIN_I16 : VOP2Inst <"v_min_i16", VOP_I16_I16_I16, smin>;
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
@@ -653,16 +658,17 @@ defm V_FMAC_F32 : VOP2Inst <"v_fmac_f32", VOP_MAC_F32>;
let Constraints = "$vdst = $src2",
DisableEncoding="$src2",
isConvertibleToThreeAddress = 1,
- isCommutable = 1 in {
+ isCommutable = 1,
+ IsDOT = 1 in {
let SubtargetPredicate = HasDot5Insts in
- defm V_DOT2C_F32_F16 : VOP2Inst_e32<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>;
+ defm V_DOT2C_F32_F16 : VOP2Inst<"v_dot2c_f32_f16", VOP_DOT_ACC_F32_V2F16>;
let SubtargetPredicate = HasDot6Insts in
- defm V_DOT4C_I32_I8 : VOP2Inst_e32<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>;
+ defm V_DOT4C_I32_I8 : VOP2Inst<"v_dot4c_i32_i8", VOP_DOT_ACC_I32_I32>;
let SubtargetPredicate = HasDot4Insts in
- defm V_DOT2C_I32_I16 : VOP2Inst_e32<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
+ defm V_DOT2C_I32_I16 : VOP2Inst<"v_dot2c_i32_i16", VOP_DOT_ACC_I32_I32>;
let SubtargetPredicate = HasDot3Insts in
- defm V_DOT8C_I32_I4 : VOP2Inst_e32<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;
+ defm V_DOT8C_I32_I4 : VOP2Inst<"v_dot8c_i32_i4", VOP_DOT_ACC_I32_I32>;
}
let AddedComplexity = 30 in {
@@ -719,50 +725,17 @@ defm V_PK_FMAC_F16 : VOP2Inst<"v_pk_fmac_f16", VOP_V2F16_V2F16_V2F16>;
// Note: 16-bit instructions produce a 0 result in the high 16-bits
// on GFX8 and GFX9 and preserve high 16 bits on GFX10+
-def ClearHI16 : OutPatFrag<(ops node:$op),
- (V_AND_B32_e64 $op, (V_MOV_B32_e32 (i32 0xffff)))>;
-
-multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst,
- bit PreservesHI16 = 0> {
-
-def : GCNPat<
- (op i16:$src0, i16:$src1),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
->;
-
-def : GCNPat<
- (i32 (zext (op i16:$src0, i16:$src1))),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1))
->;
-
-def : GCNPat<
- (i64 (zext (op i16:$src0, i16:$src1))),
- (REG_SEQUENCE VReg_64,
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src0, $src1)), (inst $src0, $src1)),
- sub0,
- (V_MOV_B32_e32 (i32 0)), sub1)
->;
-}
-
-multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst,
- bit PreservesHI16 = 0> {
-
-def : GCNPat<
- (op i16:$src0, i16:$src1),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0))
->;
+multiclass Arithmetic_i16_0Hi_Pats <SDPatternOperator op, Instruction inst> {
def : GCNPat<
(i32 (zext (op i16:$src0, i16:$src1))),
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0))
+ (inst $src0, $src1)
>;
-
def : GCNPat<
(i64 (zext (op i16:$src0, i16:$src1))),
(REG_SEQUENCE VReg_64,
- !if(!eq(PreservesHI16,1), (ClearHI16 (inst $src1, $src0)), (inst $src1, $src0)),
- sub0,
+ (inst $src0, $src1), sub0,
(V_MOV_B32_e32 (i32 0)), sub1)
>;
}
@@ -774,53 +747,36 @@ class ZExt_i16_i1_Pat <SDNode ext> : GCNPat <
$src)
>;
-let Predicates = [Has16BitInsts] in {
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64>;
-defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64>;
-defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64>;
-defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64>;
-defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64>;
-defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64>;
-defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64>;
-}
-
-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-defm : Arithmetic_i16_Pats<add, V_ADD_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e64, 1>;
-defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e64, 1>;
-defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e64, 1>;
-defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e64, 1>;
-}
-
+foreach vt = [i16, v2i16] in {
def : GCNPat <
- (and i16:$src0, i16:$src1),
- (V_AND_B32_e64 $src0, $src1)
+ (and vt:$src0, vt:$src1),
+ (V_AND_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
>;
def : GCNPat <
- (or i16:$src0, i16:$src1),
- (V_OR_B32_e64 $src0, $src1)
+ (or vt:$src0, vt:$src1),
+ (V_OR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
>;
def : GCNPat <
- (xor i16:$src0, i16:$src1),
- (V_XOR_B32_e64 $src0, $src1)
+ (xor vt:$src0, vt:$src1),
+ (V_XOR_B32_e64 VSrc_b32:$src0, VSrc_b32:$src1)
>;
-
-let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
-defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64>;
-defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64>;
-defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64>;
}
-let Predicates = [Has16BitInsts, isGFX10Plus] in {
-defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e64, 1>;
-defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e64, 1>;
-defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_I16_e64, 1>;
+let Predicates = [Has16BitInsts] in {
+
+let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
+defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<mul, V_MUL_LO_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<sub, V_SUB_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<smin, V_MIN_I16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<smax, V_MAX_I16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<umin, V_MIN_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<umax, V_MAX_U16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<lshl_rev, V_LSHLREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<lshr_rev, V_LSHRREV_B16_e64>;
+defm : Arithmetic_i16_0Hi_Pats<ashr_rev, V_ASHRREV_I16_e64>;
}
def : ZExt_i16_i1_Pat<zext>;
@@ -847,7 +803,7 @@ def : GCNPat<
// Target-specific instruction encodings.
//===----------------------------------------------------------------------===//
-class VOP2_DPP<bits<6> op, VOP2_Pseudo ps,
+class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl,
bit IsDPP16 = 0> :
VOP_DPP<opName, p, IsDPP16> {
@@ -865,13 +821,18 @@ class VOP2_DPP<bits<6> op, VOP2_Pseudo ps,
let Inst{31} = 0x0;
}
-class VOP2_DPP16<bits<6> op, VOP2_Pseudo ps,
+class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl> :
VOP2_DPP<op, ps, opName, p, 1> {
let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
let SubtargetPredicate = HasDPP16;
}
+class VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
+ string opName = ps.OpName, VOPProfile p = ps.Pfl> :
+ Base_VOP2_DPP16<op, ps, opName, p>,
+ SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10>;
+
class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl> :
VOP_DPP8<ps.OpName, p> {
@@ -924,6 +885,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
VOP3e_gfx10<{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl>;
}
multiclass VOP2_Real_sdwa_gfx10<bits<6> op> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
@@ -931,11 +893,13 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
}
multiclass VOP2_Real_dpp_gfx10<bits<6> op> {
- def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")> {
let DecoderNamespace = "SDWA10";
}
}
multiclass VOP2_Real_dpp8_gfx10<bits<6> op> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(NAME#"_e32")> {
let DecoderNamespace = "DPP8";
}
@@ -964,6 +928,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
let DecoderNamespace = "SDWA10" in {
multiclass VOP2_Real_sdwa_gfx10_with_name<bits<6> op, string opName,
string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
@@ -973,13 +938,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
multiclass VOP2_Real_dpp_gfx10_with_name<bits<6> op, string opName,
string asmName> {
- def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_gfx10 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp")> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP16;
}
}
multiclass VOP2_Real_dpp8_gfx10_with_name<bits<6> op, string opName,
string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_gfx10 : VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32")> {
VOP2_Pseudo ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # ps.Pfl.AsmDPP8;
@@ -989,13 +956,15 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
} // End DecoderNamespace = "SDWA10"
//===------------------------------ VOP2be ------------------------------===//
- multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> {
+ multiclass VOP2be_Real_e32_gfx10<bits<6> op, string opName, string asmName> {
def _e32_gfx10 :
VOP2_Real<!cast<VOP2_Pseudo>(opName#"_e32"), SIEncodingFamily.GFX10>,
VOP2e<op{5-0}, !cast<VOP2_Pseudo>(opName#"_e32").Pfl> {
VOP2_Pseudo Ps = !cast<VOP2_Pseudo>(opName#"_e32");
let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands);
}
+ }
+ multiclass VOP2be_Real_e64_gfx10<bits<6> op, string opName, string asmName> {
def _e64_gfx10 :
VOP3_Real<!cast<VOP3_Pseudo>(opName#"_e64"), SIEncodingFamily.GFX10>,
VOP3be_gfx10<{0, 1, 0, 0, op{5-0}},
@@ -1003,6 +972,9 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
VOP3_Pseudo Ps = !cast<VOP3_Pseudo>(opName#"_e64");
let AsmString = asmName # Ps.AsmOperands;
}
+ }
+ multiclass VOP2be_Real_sdwa_gfx10<bits<6> op, string opName, string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
@@ -1010,64 +982,76 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
let AsmString = asmName # !subst(", vcc", "", Ps.AsmOperands);
let DecoderNamespace = "SDWA10";
}
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_w32_gfx10 :
+ Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands);
+ let isAsmParserOnly = 1;
+ let DecoderNamespace = "SDWA10";
+ let WaveSizePredicate = isWave32;
+ }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_w64_gfx10 :
+ Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
+ VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
+ let AsmString = asmName # Ps.AsmOperands;
+ let isAsmParserOnly = 1;
+ let DecoderNamespace = "SDWA10";
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ multiclass VOP2be_Real_dpp_gfx10<bits<6> op, string opName, string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx10 :
- VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
let AsmString = asmName # !subst(", vcc", "", AsmDPP);
let DecoderNamespace = "SDWA10";
}
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_w32_gfx10 :
+ Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP);
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp_w64_gfx10 :
+ Base_VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(opName#"_dpp"), asmName> {
+ string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
+ let AsmString = asmName # AsmDPP;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
+ }
+ multiclass VOP2be_Real_dpp8_gfx10<bits<6> op, string opName, string asmName> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp8_gfx10 :
VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
let AsmString = asmName # !subst(", vcc", "", AsmDPP8);
let DecoderNamespace = "DPP8";
}
-
- let WaveSizePredicate = isWave32 in {
- def _sdwa_w32_gfx10 :
- Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
- VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
- VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
- let AsmString = asmName # !subst("vcc", "vcc_lo", Ps.AsmOperands);
- let isAsmParserOnly = 1;
- let DecoderNamespace = "SDWA10";
- }
- def _dpp_w32_gfx10 :
- VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
- string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
- let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP);
- let isAsmParserOnly = 1;
- }
- def _dpp8_w32_gfx10 :
- VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
- string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
- let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8);
- let isAsmParserOnly = 1;
- }
- } // End WaveSizePredicate = isWave32
-
- let WaveSizePredicate = isWave64 in {
- def _sdwa_w64_gfx10 :
- Base_VOP_SDWA10_Real<!cast<VOP2_SDWA_Pseudo>(opName#"_sdwa")>,
- VOP2_SDWA9Ae<op{5-0}, !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa").Pfl> {
- VOP2_SDWA_Pseudo Ps = !cast<VOP2_SDWA_Pseudo>(opName#"_sdwa");
- let AsmString = asmName # Ps.AsmOperands;
- let isAsmParserOnly = 1;
- let DecoderNamespace = "SDWA10";
- }
- def _dpp_w64_gfx10 :
- VOP2_DPP16<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
- string AsmDPP = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP16;
- let AsmString = asmName # AsmDPP;
- let isAsmParserOnly = 1;
- }
- def _dpp8_w64_gfx10 :
- VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
- string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
- let AsmString = asmName # AsmDPP8;
- let isAsmParserOnly = 1;
- }
- } // End WaveSizePredicate = isWave64
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp8_w32_gfx10 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # !subst("vcc", "vcc_lo", AsmDPP8);
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave32;
+ }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(opName#"_e32").Pfl.HasExtDPP>.ret in
+ def _dpp8_w64_gfx10 :
+ VOP2_DPP8<op, !cast<VOP2_Pseudo>(opName#"_e32"), asmName> {
+ string AsmDPP8 = !cast<VOP2_Pseudo>(opName#"_e32").Pfl.AsmDPP8;
+ let AsmString = asmName # AsmDPP8;
+ let isAsmParserOnly = 1;
+ let WaveSizePredicate = isWave64;
+ }
}
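// The _w32/_w64 SDWA and DPP variants above are asm-parser-only forms of the
// same encodings, split by WaveSizePredicate so that wave32 assembly accepts
// the carry operands spelled "vcc_lo". The rewriting relies on !subst replacing
// every occurrence of the substring; a minimal, hypothetical illustration
// (names invented, not from this patch):
class Wave32Asm<string asm> {
  string ret = !subst("vcc", "vcc_lo", asm);
}
// Wave32Asm<"v_foo_sdwa $vdst, vcc, $src0, $src1, vcc">.ret
//   yields "v_foo_sdwa $vdst, vcc_lo, $src0, $src1, vcc_lo".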
//===----------------------------- VOP3Only -----------------------------===//
@@ -1088,8 +1072,19 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
}
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
-multiclass Base_VOP2_Real_gfx10<bits<6> op> :
- VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>;
+multiclass VOP2be_Real_gfx10<bits<6> op, string opName, string asmName> :
+ VOP2be_Real_e32_gfx10<op, opName, asmName>,
+ VOP2be_Real_e64_gfx10<op, opName, asmName>,
+ VOP2be_Real_sdwa_gfx10<op, opName, asmName>,
+ VOP2be_Real_dpp_gfx10<op, opName, asmName>,
+ VOP2be_Real_dpp8_gfx10<op, opName, asmName>;
+
+multiclass VOP2e_Real_gfx10<bits<6> op, string opName, string asmName> :
+ VOP2_Real_e32_gfx10<op>,
+ VOP2_Real_e64_gfx10<op>,
+ VOP2be_Real_sdwa_gfx10<op, opName, asmName>,
+ VOP2be_Real_dpp_gfx10<op, opName, asmName>,
+ VOP2be_Real_dpp8_gfx10<op, opName, asmName>;
multiclass VOP2_Real_gfx10<bits<6> op> :
VOP2_Real_e32_gfx10<op>, VOP2_Real_e64_gfx10<op>,
@@ -1103,7 +1098,6 @@ multiclass VOP2_Real_gfx10_with_name<bits<6> op, string opName,
VOP2_Real_dpp_gfx10_with_name<op, opName, asmName>,
VOP2_Real_dpp8_gfx10_with_name<op, opName, asmName>;
-defm V_CNDMASK_B32 : Base_VOP2_Real_gfx10<0x001>;
defm V_XNOR_B32 : VOP2_Real_gfx10<0x01e>;
defm V_FMAC_F32 : VOP2_Real_gfx10<0x02b>;
defm V_FMAMK_F32 : VOP2Only_Real_MADK_gfx10<0x02c>;
@@ -1136,6 +1130,9 @@ defm V_SUB_CO_CI_U32 :
defm V_SUBREV_CO_CI_U32 :
VOP2be_Real_gfx10<0x02a, "V_SUBBREV_U32", "v_subrev_co_ci_u32">;
+defm V_CNDMASK_B32 :
+ VOP2e_Real_gfx10<0x001, "V_CNDMASK_B32", "v_cndmask_b32">;
+
// VOP3 only.
defm V_BFM_B32 : VOP3Only_Real_gfx10<0x363>;
defm V_BCNT_U32_B32 : VOP3Only_Real_gfx10<0x364>;
@@ -1322,12 +1319,14 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
} // End AssemblerPredicates = [isGFX8GFX9], DecoderNamespace = "GFX8"
multiclass VOP2_SDWA_Real <bits<6> op> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
}
multiclass VOP2_SDWA9_Real <bits<6> op> {
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -1350,12 +1349,13 @@ multiclass VOP2be_Real_e32e64_vi_only <bits<6> op, string OpName, string AsmName
let AsmString = AsmName # ps.AsmOperands;
let DecoderNamespace = "GFX8";
}
- def _sdwa_vi :
- VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
- VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
- VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
- let AsmString = AsmName # ps.AsmOperands;
- }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA>.ret in
+ def _sdwa_vi :
+ VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
+ VOP2_SDWAe <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
+ let AsmString = AsmName # ps.AsmOperands;
+ }
foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_vi :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.VI>,
@@ -1383,12 +1383,13 @@ multiclass VOP2be_Real_e32e64_gfx9 <bits<6> op, string OpName, string AsmName> {
let AsmString = AsmName # ps.AsmOperands;
let DecoderNamespace = "GFX9";
}
- def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
- VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
- VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
- let AsmString = AsmName # ps.AsmOperands;
- }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa")>,
+ VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa").Pfl> {
+ VOP2_SDWA_Pseudo ps = !cast<VOP2_SDWA_Pseudo>(OpName#"_sdwa");
+ let AsmString = AsmName # ps.AsmOperands;
+ }
foreach _ = BoolToList<!cast<VOP2_Pseudo>(OpName#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(OpName#"_dpp"), SIEncodingFamily.GFX9>,
@@ -1410,10 +1411,11 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
VOP3e_vi <{0, 1, 0, 0, op{5-0}}, !cast<VOP3_Pseudo>(NAME#"_e64").Pfl> {
let DecoderNamespace = "GFX9";
}
- def _sdwa_gfx9 :
- VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
- VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
- }
+ foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
+ def _sdwa_gfx9 :
+ VOP_SDWA9_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
+ VOP2_SDWA9Ae <op{5-0}, !cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa").Pfl> {
+ }
foreach _ = BoolToList<!cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP>.ret in
def _dpp_gfx9 :
VOP_DPP_Real<!cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>,
@@ -1554,7 +1556,7 @@ defm V_XNOR_B32 : VOP2_Real_e32e64_vi <0x3d>;
} // End SubtargetPredicate = HasDLInsts
multiclass VOP2_Real_DOT_ACC_gfx9<bits<6> op> : VOP2_Real_e32_vi<op> {
- def _dpp : VOP2_DPP<op, !cast<VOP2_Pseudo>(NAME#"_e32")>;
+ def _dpp_vi : VOP2_DPP<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
multiclass VOP2_Real_DOT_ACC_gfx10<bits<6> op> :
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index 21dbef9240e1..605425972b1c 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -112,7 +112,7 @@ class getVOP3ClampPat<VOPProfile P, SDPatternOperator node> {
class getVOP3MAIPat<VOPProfile P, SDPatternOperator node> {
list<dag> ret = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2,
- imm:$cbsz, imm:$abid, imm:$blgp))];
+ timm:$cbsz, timm:$abid, timm:$blgp))];
}
class VOP3Inst<string OpName, VOPProfile P, SDPatternOperator node = null_frag, bit VOP3Only = 0> :
@@ -385,12 +385,12 @@ def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I3
}
let SchedRW = [Write64Bit] in {
-let SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10] in {
-def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, shl>;
-def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, srl>;
-def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_PAT_GEN<VOP_I64_I64_I32>>, sra>;
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
+def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
+def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isGFX6GFX7GFX10, Predicates = [isGFX6GFX7GFX10]
+} // End SubtargetPredicate = isGFX6GFX7GFX10
let SubtargetPredicate = isGFX8Plus in {
def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
@@ -399,21 +399,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, as
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
-let Predicates = [isGFX8Plus] in {
-def : GCNPat <
- (getDivergentFrag<shl>.ret i64:$x, i32:$y),
- (V_LSHLREV_B64 $y, $x)
->;
-def : AMDGPUPat <
- (getDivergentFrag<srl>.ret i64:$x, i32:$y),
- (V_LSHRREV_B64 $y, $x)
->;
-def : AMDGPUPat <
- (getDivergentFrag<sra>.ret i64:$x, i32:$y),
- (V_ASHRREV_I64 $y, $x)
->;
-}
-
let SchedRW = [Write32Bit] in {
let SubtargetPredicate = isGFX8Plus in {
@@ -468,13 +453,13 @@ let FPDPRounding = 1 in {
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
let Uses = [M0, EXEC] in {
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
- [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 imm:$attrchan),
- (i32 imm:$attr),
- (i32 imm:$src0_modifiers),
+ [(set f16:$vdst, (AMDGPUinterp_p2_f16 f32:$src0, (i32 timm:$attrchan),
+ (i32 timm:$attr),
+ (i32 timm:$src0_modifiers),
(f32 VRegSrc_32:$src2),
- (i32 imm:$src2_modifiers),
- (i1 imm:$high),
- (i1 imm:$clamp)))]>;
+ (i32 timm:$src2_modifiers),
+ (i1 timm:$high),
+ (i1 timm:$clamp)))]>;
} // End Uses = [M0, EXEC]
} // End FPDPRounding = 1
} // End renamedInGFX9 = 1
@@ -493,21 +478,21 @@ def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f1
let Uses = [M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
- [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 imm:$attrchan),
- (i32 imm:$attr),
- (i32 imm:$src0_modifiers),
- (i1 imm:$high),
- (i1 imm:$clamp),
- (i32 imm:$omod)))]>;
+ [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 timm:$attrchan),
+ (i32 timm:$attr),
+ (i32 timm:$src0_modifiers),
+ (i1 timm:$high),
+ (i1 timm:$clamp),
+ (i32 timm:$omod)))]>;
def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
- [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 imm:$attrchan),
- (i32 imm:$attr),
- (i32 imm:$src0_modifiers),
+ [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 timm:$attrchan),
+ (i32 timm:$attr),
+ (i32 timm:$src0_modifiers),
(f32 VRegSrc_32:$src2),
- (i32 imm:$src2_modifiers),
- (i1 imm:$high),
- (i1 imm:$clamp),
- (i32 imm:$omod)))]>;
+ (i32 timm:$src2_modifiers),
+ (i1 timm:$high),
+ (i1 timm:$clamp),
+ (i32 timm:$omod)))]>;
} // End Uses = [M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
@@ -657,11 +642,11 @@ let SubtargetPredicate = isGFX10Plus in {
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
def : GCNPat<
- (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
(V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
>;
def : GCNPat<
- (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, imm:$fi, imm:$bc),
+ (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
(V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
>;
} // End SubtargetPredicate = isGFX10Plus
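// Several hunks in this file and in VOP1Instructions.td switch intrinsic
// selection patterns from imm to timm. Stated here for context, not introduced
// by this patch: imm matches ISD::Constant, while timm matches
// ISD::TargetConstant, which is how immediate intrinsic operands reach
// instruction selection. The dpp8 pattern from the VOP1 hunk above, repeated
// as a concrete example of the updated form:
def : GCNPat <
  (i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
  (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0))
>;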
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
index 55ee5f6577cf..0c13f39fec02 100644
--- a/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -261,6 +261,7 @@ class SDot2Pat<Instruction Inst> : GCNPat <
let SubtargetPredicate = !cast<VOP_Pseudo>(Inst).SubtargetPredicate;
}
+let IsDOT = 1 in {
let SubtargetPredicate = HasDot2Insts in {
def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
@@ -277,6 +278,7 @@ def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32
def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
} // End SubtargetPredicate = HasDot1Insts
+} // End let IsDOT = 1
multiclass DotPats<SDPatternOperator dot_op,
VOP3PInst dot_inst> {
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index b3513e383d10..8ef0ec7b71f4 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -183,7 +183,7 @@ multiclass VOPCXInstAliases <string OpName, string Arch> {
}
-class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies {
+class getVOPCPat64 <SDPatternOperator cond, VOPProfile P> : LetDummies {
list<dag> ret = !if(P.HasModifiers,
[(set i1:$sdst,
(setcc (P.Src0VT
@@ -202,7 +202,7 @@ class VCMPXNoSDstTable <bit has_sdst, string Name> {
multiclass VOPC_Pseudos <string opName,
VOPC_Profile P,
- PatLeaf cond = COND_NULL,
+ SDPatternOperator cond = COND_NULL,
string revOp = opName,
bit DefExec = 0> {
@@ -225,6 +225,7 @@ multiclass VOPC_Pseudos <string opName,
let isCommutable = 1;
}
+ foreach _ = BoolToList<P.HasExtSDWA>.ret in
def _sdwa : VOPC_SDWA_Pseudo <opName, P> {
let Defs = !if(DefExec, [VCC, EXEC], [VCC]);
let SchedRW = P.Schedule;
@@ -236,7 +237,7 @@ multiclass VOPC_Pseudos <string opName,
let SubtargetPredicate = HasSdstCMPX in {
multiclass VOPCX_Pseudos <string opName,
VOPC_Profile P, VOPC_Profile P_NoSDst,
- PatLeaf cond = COND_NULL,
+ SDPatternOperator cond = COND_NULL,
string revOp = opName> :
VOPC_Pseudos <opName, P, cond, revOp, 1> {
@@ -261,6 +262,7 @@ multiclass VOPCX_Pseudos <string opName,
let SubtargetPredicate = HasNoSdstCMPX;
}
+ foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
@@ -285,22 +287,23 @@ def VOPC_I16_I16 : VOPC_NoSdst_Profile<[Write32Bit], i16>;
def VOPC_I32_I32 : VOPC_NoSdst_Profile<[Write32Bit], i32>;
def VOPC_I64_I64 : VOPC_NoSdst_Profile<[Write64Bit], i64>;
-multiclass VOPC_F16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_F16 <string opName, SDPatternOperator cond = COND_NULL,
+ string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_F16_F16, cond, revOp, 0>;
-multiclass VOPC_F32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_F32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_F32_F32, cond, revOp, 0>;
-multiclass VOPC_F64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_F64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_F64_F64, cond, revOp, 0>;
-multiclass VOPC_I16 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_I16 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_I16_I16, cond, revOp, 0>;
-multiclass VOPC_I32 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_I32 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_I32_I32, cond, revOp, 0>;
-multiclass VOPC_I64 <string opName, PatLeaf cond = COND_NULL, string revOp = opName> :
+multiclass VOPC_I64 <string opName, SDPatternOperator cond = COND_NULL, string revOp = opName> :
VOPC_Pseudos <opName, VOPC_I1_I64_I64, cond, revOp, 0>;
multiclass VOPCX_F16 <string opName, string revOp = opName> :
@@ -669,6 +672,7 @@ multiclass VOPC_Class_Pseudos <string opName, VOPC_Profile p, bit DefExec,
let SchedRW = p.Schedule;
}
+ foreach _ = BoolToList<p.HasExtSDWA>.ret in
def _sdwa : VOPC_SDWA_Pseudo <opName, p> {
let Defs = !if(DefExec, !if(DefVcc, [VCC, EXEC], [EXEC]),
!if(DefVcc, [VCC], []));
@@ -698,6 +702,7 @@ multiclass VOPCX_Class_Pseudos <string opName,
let SubtargetPredicate = HasNoSdstCMPX;
}
+ foreach _ = BoolToList<P_NoSDst.HasExtSDWA>.ret in
def _nosdst_sdwa : VOPC_SDWA_Pseudo <opName#"_nosdst", P_NoSDst> {
let Defs = [EXEC];
let SchedRW = P_NoSDst.Schedule;
@@ -737,8 +742,11 @@ defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">;
defm V_CMPX_CLASS_F64 : VOPCX_CLASS_F64 <"v_cmpx_class_f64">;
+
+let SubtargetPredicate = Has16BitInsts in {
defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">;
defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
+}
//===----------------------------------------------------------------------===//
// V_ICMPIntrinsic Pattern.
@@ -878,6 +886,7 @@ let AssemblerPredicate = isGFX10Plus in {
}
} // End DecoderNamespace = "GFX10"
+ foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
@@ -903,6 +912,7 @@ let AssemblerPredicate = isGFX10Plus in {
}
} // End DecoderNamespace = "GFX10"
+ foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_nosdst_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx10 :
VOP_SDWA10_Real<!cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa")>,
VOPC_SDWA9e<op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_nosdst_sdwa").Pfl> {
@@ -1223,10 +1233,12 @@ multiclass VOPC_Real_vi <bits<10> op> {
}
}
+ foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA>.ret in
def _sdwa_vi :
VOP_SDWA_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWAe <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
+ foreach _ = BoolToList<!cast<VOPC_Pseudo>(NAME#"_e32").Pfl.HasExtSDWA9>.ret in
def _sdwa_gfx9 :
VOP_SDWA9_Real <!cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa")>,
VOPC_SDWA9e <op{7-0}, !cast<VOPC_SDWA_Pseudo>(NAME#"_sdwa").Pfl>;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 677095a354be..f208a1134a5a 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -14,6 +14,7 @@ class LetDummies {
bit isReMaterializable;
bit isAsCheapAsAMove;
bit VOPAsmPrefer32Bit;
+ bit FPDPRounding;
Predicate SubtargetPredicate;
string Constraints;
string DisableEncoding;
@@ -41,9 +42,7 @@ class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
string asm, list<dag> pattern> :
InstSI <outs, ins, asm, pattern>,
VOP <opName>,
- SIMCInstr <opName#suffix, SIEncodingFamily.NONE>,
- MnemonicAlias<opName#suffix, opName> {
-
+ SIMCInstr <opName#suffix, SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let UseNamedOperandTable = 1;
@@ -148,6 +147,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
+ let OtherPredicates = ps.OtherPredicates;
let AsmMatchConverter = ps.AsmMatchConverter;
let AsmVariantName = ps.AsmVariantName;
let Constraints = ps.Constraints;
@@ -473,8 +473,7 @@ class VOP_SDWA9Be<VOPProfile P> : VOP_SDWA9e<P> {
class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
InstSI <P.OutsSDWA, P.InsSDWA, "", pattern>,
VOP <opName>,
- SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>,
- MnemonicAlias <opName#"_sdwa", opName> {
+ SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -595,8 +594,7 @@ class VOP_DPPe<VOPProfile P, bit IsDPP16=0> : Enc64 {
class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
InstSI <P.OutsDPP, P.InsDPP, OpName#P.AsmDPP, pattern>,
VOP <OpName>,
- SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE>,
- MnemonicAlias <OpName#"_dpp", OpName> {
+ SIMCInstr <OpName#"_dpp", SIEncodingFamily.NONE> {
let isPseudo = 1;
let isCodeGenOnly = 1;