aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp')
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp78
1 files changed, 62 insertions, 16 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 743ac64b8f10..f2d903c8e7b1 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -229,7 +229,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
// alignment.
Streamer.EmitValueToAlignment(64, 0, 1, 0);
if (ReadOnlySection.getAlignment() < 64)
- ReadOnlySection.setAlignment(64);
+ ReadOnlySection.setAlignment(Align(64));
const MCSubtargetInfo &STI = MF->getSubtarget();
@@ -273,7 +273,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
AsmPrinter::EmitFunctionEntryLabel();
}
-void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
// Write a line for the basic block label if it is not only fallthrough.
DisasmLines.push_back(
@@ -342,6 +342,8 @@ bool AMDGPUAsmPrinter::doFinalization(Module &M) {
// Print comments that apply to both callable functions and entry points.
void AMDGPUAsmPrinter::emitCommonFunctionComments(
uint32_t NumVGPR,
+ Optional<uint32_t> NumAGPR,
+ uint32_t TotalNumVGPR,
uint32_t NumSGPR,
uint64_t ScratchSize,
uint64_t CodeSize,
@@ -349,6 +351,11 @@ void AMDGPUAsmPrinter::emitCommonFunctionComments(
OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
+ if (NumAGPR) {
+ OutStreamer->emitRawComment(" NumAgprs: " + Twine(*NumAGPR), false);
+ OutStreamer->emitRawComment(" TotalNumVgprs: " + Twine(TotalNumVGPR),
+ false);
+ }
OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
OutStreamer->emitRawComment(" MemoryBound: " + Twine(MFI->isMemoryBound()),
false);
@@ -417,7 +424,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// The starting address of all shader programs must be 256 bytes aligned.
// Regular functions just need the basic required instruction alignment.
- MF.setAlignment(MFI->isEntryFunction() ? 8 : 2);
+ MF.setAlignment(MFI->isEntryFunction() ? Align(256) : Align(4));
SetupMachineFunction(MF);
@@ -474,6 +481,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SIFunctionResourceInfo &Info = CallGraphResourceInfo[&MF.getFunction()];
emitCommonFunctionComments(
Info.NumVGPR,
+ STM.hasMAIInsts() ? Info.NumAGPR : Optional<uint32_t>(),
+ Info.getTotalNumVGPRs(STM),
Info.getTotalNumSGPRs(MF.getSubtarget<GCNSubtarget>()),
Info.PrivateSegmentSize,
getFunctionCodeSize(MF), MFI);
@@ -481,7 +490,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
OutStreamer->emitRawComment(" Kernel info:", false);
- emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+ emitCommonFunctionComments(CurrentProgramInfo.NumArchVGPR,
+ STM.hasMAIInsts()
+ ? CurrentProgramInfo.NumAccVGPR
+ : Optional<uint32_t>(),
+ CurrentProgramInfo.NumVGPR,
CurrentProgramInfo.NumSGPR,
CurrentProgramInfo.ScratchSize,
getFunctionCodeSize(MF), MFI);
@@ -507,6 +520,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
OutStreamer->emitRawComment(
+ " Occupancy: " +
+ Twine(CurrentProgramInfo.Occupancy), false);
+
+ OutStreamer->emitRawComment(
" WaveLimiterHint : " + Twine(MFI->needsWaveLimiter()), false);
OutStreamer->emitRawComment(
@@ -588,6 +605,11 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
UsesVCC, UsesFlatScratch);
}
+int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
+ const GCNSubtarget &ST) const {
+ return std::max(NumVGPR, NumAGPR);
+}
+
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
const MachineFunction &MF) const {
SIFunctionResourceInfo Info;
@@ -634,11 +656,18 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
HighestVGPRReg = Reg;
break;
}
- MCPhysReg AReg = AMDGPU::AGPR0 + TRI.getHWRegIndex(Reg);
- if (MRI.isPhysRegUsed(AReg)) {
- HighestVGPRReg = AReg;
- break;
+ }
+
+ if (ST.hasMAIInsts()) {
+ MCPhysReg HighestAGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestAGPRReg = Reg;
+ break;
+ }
}
+ Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestAGPRReg) + 1;
}
MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
@@ -660,6 +689,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
}
int32_t MaxVGPR = -1;
+ int32_t MaxAGPR = -1;
int32_t MaxSGPR = -1;
uint64_t CalleeFrameSize = 0;
@@ -669,11 +699,12 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
for (const MachineOperand &MO : MI.operands()) {
unsigned Width = 0;
bool IsSGPR = false;
+ bool IsAGPR = false;
if (!MO.isReg())
continue;
- unsigned Reg = MO.getReg();
+ Register Reg = MO.getReg();
switch (Reg) {
case AMDGPU::EXEC:
case AMDGPU::EXEC_LO:
@@ -744,6 +775,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 1;
} else if (AMDGPU::AGPR_32RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 1;
} else if (AMDGPU::SReg_64RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_64RegClass.contains(Reg) &&
@@ -755,6 +787,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 2;
} else if (AMDGPU::AReg_64RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 2;
} else if (AMDGPU::VReg_96RegClass.contains(Reg)) {
IsSGPR = false;
@@ -771,6 +804,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 4;
} else if (AMDGPU::AReg_128RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 4;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
@@ -790,6 +824,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 16;
} else if (AMDGPU::AReg_512RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 16;
} else if (AMDGPU::SReg_1024RegClass.contains(Reg)) {
IsSGPR = true;
@@ -799,6 +834,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Width = 32;
} else if (AMDGPU::AReg_1024RegClass.contains(Reg)) {
IsSGPR = false;
+ IsAGPR = true;
Width = 32;
} else {
llvm_unreachable("Unknown register class");
@@ -807,6 +843,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
int MaxUsed = HWReg + Width - 1;
if (IsSGPR) {
MaxSGPR = MaxUsed > MaxSGPR ? MaxUsed : MaxSGPR;
+ } else if (IsAGPR) {
+ MaxAGPR = MaxUsed > MaxAGPR ? MaxUsed : MaxAGPR;
} else {
MaxVGPR = MaxUsed > MaxVGPR ? MaxUsed : MaxVGPR;
}
@@ -828,6 +866,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
47 - IsaInfo::getNumExtraSGPRs(&ST, true, ST.hasFlatAddressSpace());
MaxSGPR = std::max(MaxSGPR, MaxSGPRGuess);
MaxVGPR = std::max(MaxVGPR, 23);
+ MaxAGPR = std::max(MaxAGPR, 23);
CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
Info.UsesVCC = true;
@@ -852,6 +891,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
MaxSGPR = std::max(I->second.NumExplicitSGPR - 1, MaxSGPR);
MaxVGPR = std::max(I->second.NumVGPR - 1, MaxVGPR);
+ MaxAGPR = std::max(I->second.NumAGPR - 1, MaxAGPR);
CalleeFrameSize
= std::max(I->second.PrivateSegmentSize, CalleeFrameSize);
Info.UsesVCC |= I->second.UsesVCC;
@@ -868,6 +908,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.NumExplicitSGPR = MaxSGPR + 1;
Info.NumVGPR = MaxVGPR + 1;
+ Info.NumAGPR = MaxAGPR + 1;
Info.PrivateSegmentSize += CalleeFrameSize;
return Info;
@@ -876,8 +917,11 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
const MachineFunction &MF) {
SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+ const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
- ProgInfo.NumVGPR = Info.NumVGPR;
+ ProgInfo.NumArchVGPR = Info.NumVGPR;
+ ProgInfo.NumAccVGPR = Info.NumAGPR;
+ ProgInfo.NumVGPR = Info.getTotalNumVGPRs(STM);
ProgInfo.NumSGPR = Info.NumExplicitSGPR;
ProgInfo.ScratchSize = Info.PrivateSegmentSize;
ProgInfo.VCCUsed = Info.UsesVCC;
@@ -890,7 +934,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
MF.getFunction().getContext().diagnose(DiagStackSize);
}
- const GCNSubtarget &STM = MF.getSubtarget<GCNSubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// TODO(scott.linder): The calculations related to SGPR/VGPR blocks are
@@ -1057,6 +1100,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// For AMDHSA, LDS_SIZE must be zero, as it is populated by the CP.
S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
S_00B84C_EXCP_EN(0);
+
+ ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize,
+ ProgInfo.NumSGPRsForWavesPerEU,
+ ProgInfo.NumVGPRsForWavesPerEU);
}
static unsigned getRsrcReg(CallingConv::ID CallConv) {
@@ -1214,17 +1261,16 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (STM.isXNACKEnabled())
Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
- unsigned MaxKernArgAlign;
+ Align MaxKernArgAlign;
Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(F, MaxKernArgAlign);
Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
- // These alignment values are specified in powers of two, so alignment =
- // 2^n. The minimum alignment is 2^4 = 16.
- Out.kernarg_segment_alignment = std::max<size_t>(4,
- countTrailingZeros(MaxKernArgAlign));
+ // kernarg_segment_alignment is specified as log of the alignment.
+ // The minimum alignment is 16.
+ Out.kernarg_segment_alignment = Log2(std::max(Align(16), MaxKernArgAlign));
}
bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,