summaryrefslogtreecommitdiff
path: root/lib/Target/AMDGPU
diff options
context:
space:
mode:
authorDimitry Andric <dim@FreeBSD.org>2017-05-02 18:30:13 +0000
committerDimitry Andric <dim@FreeBSD.org>2017-05-02 18:30:13 +0000
commita303c417bbdb53703c2c17398b08486bde78f1f6 (patch)
tree98366d6b93d863cefdc53f16c66c0c5ae7fb2261 /lib/Target/AMDGPU
parent12f3ca4cdb95b193af905a00e722a4dcb40b3de3 (diff)
Notes
Diffstat (limited to 'lib/Target/AMDGPU')
-rw-r--r--lib/Target/AMDGPU/AMDGPU.td2
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp339
-rw-r--r--lib/Target/AMDGPU/AMDGPUAsmPrinter.h33
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp4
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.cpp31
-rw-r--r--lib/Target/AMDGPU/AMDGPUISelLowering.h10
-rw-r--r--lib/Target/AMDGPU/AMDGPUInstrInfo.td9
-rw-r--r--lib/Target/AMDGPU/AMDGPUMachineFunction.cpp1
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetMachine.cpp1
-rw-r--r--lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp1
-rw-r--r--lib/Target/AMDGPU/GCNSchedStrategy.cpp3
-rw-r--r--lib/Target/AMDGPU/R600Intrinsics.td2
-rw-r--r--lib/Target/AMDGPU/SIAnnotateControlFlow.cpp18
-rw-r--r--lib/Target/AMDGPU/SIDefines.h1
-rw-r--r--lib/Target/AMDGPU/SIFixSGPRCopies.cpp13
-rw-r--r--lib/Target/AMDGPU/SIISelLowering.cpp87
-rw-r--r--lib/Target/AMDGPU/SIInstructions.td23
-rw-r--r--lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp1
18 files changed, 404 insertions, 175 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 0f331486d0f8..2e5b78bbf7ef 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -407,7 +407,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,
- FeatureFastFMAF32
+ FeatureFastFMAF32, FeatureDPP
]
>;
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index a81bcb56dfdc..2ce23dbf08e6 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -149,11 +149,9 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
return;
const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
- SIProgramInfo KernelInfo;
amd_kernel_code_t KernelCode;
if (STM.isAmdCodeObjectV2(*MF)) {
- getSIProgramInfo(KernelInfo, *MF);
- getAmdKernelCode(KernelCode, KernelInfo, *MF);
+ getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
@@ -187,7 +185,26 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
AsmPrinter::EmitGlobalVariable(GV);
}
+bool AMDGPUAsmPrinter::doFinalization(Module &M) {
+ CallGraphResourceInfo.clear();
+ return AsmPrinter::doFinalization(M);
+}
+
+// Print comments that apply to both callable functions and entry points.
+void AMDGPUAsmPrinter::emitCommonFunctionComments(
+ uint32_t NumVGPR,
+ uint32_t NumSGPR,
+ uint32_t ScratchSize,
+ uint64_t CodeSize) {
+ OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);
+ OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);
+ OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);
+ OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);
+}
+
bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ CurrentProgramInfo = SIProgramInfo();
+
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
// The starting address of all shader programs must be 256 bytes aligned.
@@ -204,11 +221,19 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->SwitchSection(ConfigSection);
}
- SIProgramInfo KernelInfo;
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- getSIProgramInfo(KernelInfo, MF);
+ if (MFI->isEntryFunction()) {
+ getSIProgramInfo(CurrentProgramInfo, MF);
+ } else {
+ auto I = CallGraphResourceInfo.insert(
+ std::make_pair(MF.getFunction(), SIFunctionResourceInfo()));
+ SIFunctionResourceInfo &Info = I.first->second;
+ assert(I.second && "should only be called once per function");
+ Info = analyzeResourceUsage(MF);
+ }
+
if (!STM.isAmdHsaOS()) {
- EmitProgramInfoSI(MF, KernelInfo);
+ EmitProgramInfoSI(MF, CurrentProgramInfo);
}
} else {
EmitProgramInfoR600(MF);
@@ -226,72 +251,87 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
OutStreamer->SwitchSection(CommentSection);
if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- if (MFI->isEntryFunction()) {
- OutStreamer->emitRawComment(" Kernel info:", false);
- } else {
+ if (!MFI->isEntryFunction()) {
OutStreamer->emitRawComment(" Function info:", false);
+ SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()];
+ emitCommonFunctionComments(
+ Info.NumVGPR,
+ Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),
+ Info.PrivateSegmentSize,
+ getFunctionCodeSize(MF));
+ return false;
}
+ OutStreamer->emitRawComment(" Kernel info:", false);
+ emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,
+ CurrentProgramInfo.NumSGPR,
+ CurrentProgramInfo.ScratchSize,
+ getFunctionCodeSize(MF));
+
OutStreamer->emitRawComment(" codeLenInByte = " +
Twine(getFunctionCodeSize(MF)), false);
- OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
- false);
- OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
- false);
-
- OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),
- false);
- OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),
- false);
- OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),
- false);
- OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +
- " bytes/workgroup (compile time only)", false);
+ OutStreamer->emitRawComment(
+ " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false);
+ OutStreamer->emitRawComment(
+ " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false);
- if (!MFI->isEntryFunction())
- return false;
+ OutStreamer->emitRawComment(
+ " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);
+ OutStreamer->emitRawComment(
+ " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);
+ OutStreamer->emitRawComment(
+ " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false);
+ OutStreamer->emitRawComment(
+ " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +
+ " bytes/workgroup (compile time only)", false);
- OutStreamer->emitRawComment(" SGPRBlocks: " +
- Twine(KernelInfo.SGPRBlocks), false);
- OutStreamer->emitRawComment(" VGPRBlocks: " +
- Twine(KernelInfo.VGPRBlocks), false);
+ OutStreamer->emitRawComment(
+ " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);
+ OutStreamer->emitRawComment(
+ " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);
- OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +
- Twine(KernelInfo.NumSGPRsForWavesPerEU), false);
- OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +
- Twine(KernelInfo.NumVGPRsForWavesPerEU), false);
+ OutStreamer->emitRawComment(
+ " NumSGPRsForWavesPerEU: " +
+ Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);
+ OutStreamer->emitRawComment(
+ " NumVGPRsForWavesPerEU: " +
+ Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);
- OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),
- false);
- OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),
- false);
+ OutStreamer->emitRawComment(
+ " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),
+ false);
+ OutStreamer->emitRawComment(
+ " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),
+ false);
if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {
- OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
- Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
- OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +
- Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);
+ OutStreamer->emitRawComment(
+ " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +
+ Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);
+ OutStreamer->emitRawComment(
+ " DebuggerPrivateSegmentBufferSGPR: s" +
+ Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);
}
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
- Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
- false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
- Twine(G_00B84C_TRAP_HANDLER(KernelInfo.ComputePGMRSrc2)),
- false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
- Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
- false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
- Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),
- false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
- Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),
- false);
- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
- Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),
- false);
-
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:USER_SGPR: " +
+ Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+ Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_X_EN: " +
+ Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +
+ Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +
+ Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);
+ OutStreamer->emitRawComment(
+ " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +
+ Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),
+ false);
} else {
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
OutStreamer->emitRawComment(
@@ -407,71 +447,117 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,
return false;
}
-void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
- const MachineFunction &MF) const {
- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- const SIInstrInfo *TII = STM.getInstrInfo();
- const SIRegisterInfo *RI = &TII->getRegisterInfo();
+static unsigned getNumExtraSGPRs(const SISubtarget &ST,
+ bool VCCUsed,
+ bool FlatScrUsed) {
+ unsigned ExtraSGPRs = 0;
+ if (VCCUsed)
+ ExtraSGPRs = 2;
+ if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
+ if (FlatScrUsed)
+ ExtraSGPRs = 4;
+ } else {
+ if (ST.isXNACKEnabled())
+ ExtraSGPRs = 4;
- MCPhysReg NumVGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- NumVGPRReg = Reg;
- break;
- }
+ if (FlatScrUsed)
+ ExtraSGPRs = 6;
}
- MCPhysReg NumSGPRReg = AMDGPU::NoRegister;
- for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
- if (MRI.isPhysRegUsed(Reg)) {
- NumSGPRReg = Reg;
- break;
- }
- }
+ return ExtraSGPRs;
+}
- // We found the maximum register index. They start at 0, so add one to get the
- // number of registers.
- ProgInfo.NumVGPR = NumVGPRReg == AMDGPU::NoRegister ? 0 :
- RI->getHWRegIndex(NumVGPRReg) + 1;
- ProgInfo.NumSGPR = NumSGPRReg == AMDGPU::NoRegister ? 0 :
- RI->getHWRegIndex(NumSGPRReg) + 1;
- unsigned ExtraSGPRs = 0;
+int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(
+ const SISubtarget &ST) const {
+ return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);
+}
- ProgInfo.VCCUsed = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
- MRI.isPhysRegUsed(AMDGPU::VCC_HI);
- if (ProgInfo.VCCUsed)
- ExtraSGPRs = 2;
+AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
+ const MachineFunction &MF) const {
+ SIFunctionResourceInfo Info;
- ProgInfo.FlatUsed = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
- MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+
+ Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||
+ MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);
// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat
- // instructions aren't used to access the scratch buffer. Inline assembly
- // may need it though.
+ // instructions aren't used to access the scratch buffer. Inline assembly may
+ // need it though.
//
// If we only have implicit uses of flat_scr on flat instructions, it is not
// really needed.
- if (ProgInfo.FlatUsed && !MFI->hasFlatScratchInit() &&
+ if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&
(!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&
!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {
- ProgInfo.FlatUsed = false;
+ Info.UsesFlatScratch = false;
}
- if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {
- if (ProgInfo.FlatUsed)
- ExtraSGPRs = 4;
- } else {
- if (STM.isXNACKEnabled())
- ExtraSGPRs = 4;
+ Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+ Info.PrivateSegmentSize = FrameInfo.getStackSize();
- if (ProgInfo.FlatUsed)
- ExtraSGPRs = 6;
+ if (!FrameInfo.hasCalls()) {
+ Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
+ MRI.isPhysRegUsed(AMDGPU::VCC_HI);
+
+ // If there are no calls, MachineRegisterInfo can tell us the used register
+ // count easily.
+
+ MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestVGPRReg = Reg;
+ break;
+ }
+ }
+
+ MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;
+ for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ HighestSGPRReg = Reg;
+ break;
+ }
+ }
+
+ // We found the maximum register index. They start at 0, so add one to get the
+ // number of registers.
+ Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestVGPRReg) + 1;
+ Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :
+ TRI.getHWRegIndex(HighestSGPRReg) + 1;
+
+ return Info;
}
+ llvm_unreachable("calls not implemented");
+}
+
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
+ const MachineFunction &MF) {
+ SIFunctionResourceInfo Info = analyzeResourceUsage(MF);
+
+ ProgInfo.NumVGPR = Info.NumVGPR;
+ ProgInfo.NumSGPR = Info.NumExplicitSGPR;
+ ProgInfo.ScratchSize = Info.PrivateSegmentSize;
+ ProgInfo.VCCUsed = Info.UsesVCC;
+ ProgInfo.FlatUsed = Info.UsesFlatScratch;
+ ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;
+
+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const SIInstrInfo *TII = STM.getInstrInfo();
+ const SIRegisterInfo *RI = &TII->getRegisterInfo();
+
+ unsigned ExtraSGPRs = getNumExtraSGPRs(STM,
+ ProgInfo.VCCUsed,
+ ProgInfo.FlatUsed);
unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);
// Check the addressable register limit before we add ExtraSGPRs.
@@ -574,9 +660,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
// Make clamp modifier on NaN input returns 0.
ProgInfo.DX10Clamp = STM.enableDX10Clamp();
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- ProgInfo.ScratchSize = FrameInfo.getStackSize();
-
unsigned LDSAlignShift;
if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {
// LDS is allocated in 64 dword blocks.
@@ -638,6 +721,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
switch (CallConv) {
default: LLVM_FALLTHROUGH;
case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;
+ case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;
case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;
case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;
case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;
@@ -645,7 +729,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {
}
void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
- const SIProgramInfo &KernelInfo) {
+ const SIProgramInfo &CurrentProgramInfo) {
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());
@@ -653,29 +737,29 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {
OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
- OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);
+ OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);
OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
- OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);
+ OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);
OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
- OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+ OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
} else {
OutStreamer->EmitIntValue(RsrcReg, 4);
- OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |
- S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);
+ OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
+ S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {
OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
- OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);
+ OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
}
}
if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {
OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
- OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
+ OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
@@ -703,7 +787,7 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
}
void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
- const SIProgramInfo &KernelInfo,
+ const SIProgramInfo &CurrentProgramInfo,
const MachineFunction &MF) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
@@ -711,10 +795,13 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
Out.compute_pgm_resource_registers =
- KernelInfo.ComputePGMRSrc1 |
- (KernelInfo.ComputePGMRSrc2 << 32);
+ CurrentProgramInfo.ComputePGMRSrc1 |
+ (CurrentProgramInfo.ComputePGMRSrc2 << 32);
Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
+ if (CurrentProgramInfo.DynamicCallStack)
+ Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;
+
AMD_HSA_BITS_SET(Out.code_properties,
AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
getElementByteSizeValue(STM.getMaxPrivateElementSize()));
@@ -766,12 +853,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
// FIXME: Should use getKernArgSize
Out.kernarg_segment_byte_size =
STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
- Out.wavefront_sgpr_count = KernelInfo.NumSGPR;
- Out.workitem_vgpr_count = KernelInfo.NumVGPR;
- Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
- Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
- Out.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
- Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+ Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;
+ Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;
+ Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;
+ Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;
+ Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;
+ Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;
// These alignment values are specified in powers of two, so alignment =
// 2^n. The minimum alignment is 2^4 = 16.
@@ -780,9 +867,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
if (STM.debuggerEmitPrologue()) {
Out.debug_wavefront_private_segment_offset_sgpr =
- KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
+ CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
Out.debug_private_segment_buffer_sgpr =
- KernelInfo.DebuggerPrivateSegmentBufferSGPR;
+ CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 8c86dea4b885..e5adeeb465e1 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -30,9 +30,26 @@ namespace llvm {
class AMDGPUTargetStreamer;
class MCOperand;
+class SISubtarget;
class AMDGPUAsmPrinter final : public AsmPrinter {
private:
+ // Track resource usage for callee functions.
+ struct SIFunctionResourceInfo {
+ // Track the number of explicitly used VGPRs. Special registers reserved at
+ // the end are tracked separately.
+ int32_t NumVGPR = 0;
+ int32_t NumExplicitSGPR = 0;
+ uint32_t PrivateSegmentSize = 0;
+ bool UsesVCC = false;
+ bool UsesFlatScratch = false;
+ bool HasDynamicallySizedStack = false;
+ bool HasRecursion = false;
+
+ int32_t getTotalNumSGPRs(const SISubtarget &ST) const;
+ };
+
+ // Track resource usage for kernels / entry functions.
struct SIProgramInfo {
// Fields set in PGM_RSRC1 pm4 packet.
uint32_t VGPRBlocks = 0;
@@ -83,14 +100,23 @@ private:
uint16_t DebuggerPrivateSegmentBufferSGPR =
std::numeric_limits<uint16_t>::max();
+ // Whether there is recursion, dynamic allocas, indirect calls or some other
+ // reason there may be statically unknown stack usage.
+ bool DynamicCallStack = false;
+
// Bonus information for debugging.
bool VCCUsed = false;
SIProgramInfo() = default;
};
+ SIProgramInfo CurrentProgramInfo;
+ DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;
+
uint64_t getFunctionCodeSize(const MachineFunction &MF) const;
- void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+ SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;
+
+ void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);
void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
const MachineFunction &MF) const;
void findNumUsedRegistersSI(const MachineFunction &MF,
@@ -101,6 +127,10 @@ private:
/// can correctly setup the GPU state.
void EmitProgramInfoR600(const MachineFunction &MF);
void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
+ void emitCommonFunctionComments(uint32_t NumVGPR,
+ uint32_t NumSGPR,
+ uint32_t ScratchSize,
+ uint64_t CodeSize);
public:
explicit AMDGPUAsmPrinter(TargetMachine &TM,
@@ -112,6 +142,7 @@ public:
AMDGPUTargetStreamer& getTargetStreamer() const;
+ bool doFinalization(Module &M) override;
bool runOnMachineFunction(MachineFunction &MF) override;
/// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index f5110857da84..ccae36ced1f8 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -207,8 +207,8 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
return true;
// TODO: Move into isKnownNeverNaN
- if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(N))
- return BO->Flags.hasNoNaNs();
+ if (N->getFlags().isDefined())
+ return N->getFlags().hasNoNaNs();
return CurDAG->isKnownNeverNaN(N);
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e21775e61dd4..64e1b8f0d7f0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/Support/KnownBits.h"
#include "SIInstrInfo.h"
using namespace llvm;
@@ -895,6 +896,7 @@ CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::SPIR_KERNEL:
return CC_AMDGPU_Kernel;
case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
@@ -2293,11 +2295,11 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
//===----------------------------------------------------------------------===//
static bool isU24(SDValue Op, SelectionDAG &DAG) {
- APInt KnownZero, KnownOne;
+ KnownBits Known;
EVT VT = Op.getValueType();
- DAG.computeKnownBits(Op, KnownZero, KnownOne);
+ DAG.computeKnownBits(Op, Known);
- return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
+ return (VT.getSizeInBits() - Known.Zero.countLeadingOnes()) <= 24;
}
static bool isI24(SDValue Op, SelectionDAG &DAG) {
@@ -3358,13 +3360,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
OffsetVal,
OffsetVal + WidthVal);
- APInt KnownZero, KnownOne;
+ KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
- TLI.SimplifyDemandedBits(BitsFrom, Demanded,
- KnownZero, KnownOne, TLO)) {
+ TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
}
}
@@ -3516,6 +3517,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
+ NODE_NAME_CASE(INIT_EXEC)
+ NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)
NODE_NAME_CASE(SENDMSG)
NODE_NAME_CASE(SENDMSGHALT)
NODE_NAME_CASE(INTERP_MOV)
@@ -3574,14 +3577,12 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
}
void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
- const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+ const SDValue Op, KnownBits &Known,
const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
- unsigned BitWidth = KnownZero.getBitWidth();
- KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
+ Known.Zero.clearAllBits(); Known.One.clearAllBits(); // Don't know anything.
- APInt KnownZero2;
- APInt KnownOne2;
+ KnownBits Known2;
unsigned Opc = Op.getOpcode();
switch (Opc) {
@@ -3589,7 +3590,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
break;
case AMDGPUISD::CARRY:
case AMDGPUISD::BORROW: {
- KnownZero = APInt::getHighBitsSet(32, 31);
+ Known.Zero = APInt::getHighBitsSet(32, 31);
break;
}
@@ -3602,16 +3603,16 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
uint32_t Width = CWidth->getZExtValue() & 0x1f;
if (Opc == AMDGPUISD::BFE_U32)
- KnownZero = APInt::getHighBitsSet(32, 32 - Width);
+ Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
break;
}
case AMDGPUISD::FP_TO_FP16:
case AMDGPUISD::FP16_ZEXT: {
- unsigned BitWidth = KnownZero.getBitWidth();
+ unsigned BitWidth = Known.getBitWidth();
// High bits are zero.
- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+ Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
break;
}
}
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 13cbfe267932..e1a5a2072418 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -125,8 +125,9 @@ public:
if (getTargetMachine().Options.NoSignedZerosFPMath)
return true;
- if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(Op))
- return BO->Flags.hasNoSignedZeros();
+ const auto Flags = Op.getNode()->getFlags();
+ if (Flags.isDefined())
+ return Flags.hasNoSignedZeros();
return false;
}
@@ -199,8 +200,7 @@ public:
/// either zero or one and return them in the \p KnownZero and \p KnownOne
/// bitsets.
void computeKnownBitsForTargetNode(const SDValue Op,
- APInt &KnownZero,
- APInt &KnownOne,
+ KnownBits &Known,
const APInt &DemandedElts,
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
@@ -370,6 +370,8 @@ enum NodeType : unsigned {
BUILD_VERTICAL_VECTOR,
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
+ INIT_EXEC,
+ INIT_EXEC_FROM_INPUT,
SENDMSG,
SENDMSGHALT,
INTERP_MOV,
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c1706d12a2ea..353cc5742791 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -299,6 +299,15 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
+def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",
+ SDTypeProfile<0, 1, [SDTCisInt<0>]>,
+ [SDNPHasChain, SDNPInGlue]>;
+
+def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT",
+ SDTypeProfile<0, 2,
+ [SDTCisInt<0>, SDTCisInt<1>]>,
+ [SDNPHasChain, SDNPInGlue]>;
+
def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",
SDTypeProfile<0, 1, [SDTCisInt<0>]>,
[SDNPHasChain, SDNPInGlue]>;
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 27fe639e3d4b..fe7283ccf7d9 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -17,6 +17,7 @@ static bool isEntryFunctionCC(CallingConv::ID CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0202220b8011..cd5bad04d0b3 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -309,6 +309,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
default:
return false;
case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 6edd3e923ba1..c9482c37ec80 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -432,6 +432,7 @@ static bool isArgPassedInSGPR(const Argument *A) {
case CallingConv::SPIR_KERNEL:
return true;
case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ea305a92fc60..630442625aa3 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -422,8 +422,9 @@ void GCNScheduleDAGMILive::discoverLiveIns() {
unsigned SGPRs = 0;
unsigned VGPRs = 0;
+ auto &MI = *begin()->getParent()->getFirstNonDebugInstr();
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
- SlotIndex SI = LIS->getInstructionIndex(*begin()).getBaseIndex();
+ SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex();
assert (SI.isValid());
DEBUG(dbgs() << "Region live-ins:");
diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td
index a5310e9fd6d0..4c9e1e8a5434 100644
--- a/lib/Target/AMDGPU/R600Intrinsics.td
+++ b/lib/Target/AMDGPU/R600Intrinsics.td
@@ -61,7 +61,7 @@ def int_r600_ddx : TextureIntrinsicFloatInput;
def int_r600_ddy : TextureIntrinsicFloatInput;
def int_r600_dot4 : Intrinsic<[llvm_float_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]
+ [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]
>;
} // End TargetPrefix = "r600", isTarget = 1
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index b7e62075244b..d8cb98fe1b19 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -77,9 +77,10 @@ class SIAnnotateControlFlow : public FunctionPass {
void insertElse(BranchInst *Term);
- Value *handleLoopCondition(Value *Cond, PHINode *Broken,
- llvm::Loop *L, BranchInst *Term,
- SmallVectorImpl<WeakVH> &LoopPhiConditions);
+ Value *
+ handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L,
+ BranchInst *Term,
+ SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions);
void handleLoop(BranchInst *Term);
@@ -212,9 +213,8 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
/// \brief Recursively handle the condition leading to a loop
Value *SIAnnotateControlFlow::handleLoopCondition(
- Value *Cond, PHINode *Broken,
- llvm::Loop *L, BranchInst *Term,
- SmallVectorImpl<WeakVH> &LoopPhiConditions) {
+ Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,
+ SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {
// Only search through PHI nodes which are inside the loop. If we try this
// with PHI nodes that are outside of the loop, we end up inserting new PHI
@@ -281,7 +281,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(
NewPhi->setIncomingValue(i, PhiArg);
}
- LoopPhiConditions.push_back(WeakVH(Phi));
+ LoopPhiConditions.push_back(WeakTrackingVH(Phi));
return Ret;
}
@@ -323,7 +323,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
BasicBlock *Target = Term->getSuccessor(1);
PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
- SmallVector<WeakVH, 8> LoopPhiConditions;
+ SmallVector<WeakTrackingVH, 8> LoopPhiConditions;
Value *Cond = Term->getCondition();
Term->setCondition(BoolTrue);
Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions);
@@ -333,7 +333,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
- for (WeakVH Val : reverse(LoopPhiConditions)) {
+ for (WeakTrackingVH Val : reverse(LoopPhiConditions)) {
if (PHINode *Cond = cast_or_null<PHINode>(Val))
eraseIfUnused(Cond);
}
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 3dd372b32866..a01330cb9171 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -302,6 +302,7 @@ enum DstUnused {
#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8)
#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128
#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228
+#define R_00B428_SPI_SHADER_PGM_RSRC1_HS 0x00B428
#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848
#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)
#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index b0f0bf04a891..3cca815d8773 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -278,8 +278,7 @@ static bool phiHasBreakDef(const MachineInstr &PHI,
Visited.insert(Reg);
- MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);
- assert(DefInstr);
+ MachineInstr *DefInstr = MRI.getVRegDef(Reg);
switch (DefInstr->getOpcode()) {
default:
break;
@@ -346,7 +345,7 @@ bool searchPredecessors(const MachineBasicBlock *MBB,
return false;
DenseSet<const MachineBasicBlock*> Visited;
- SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),
+ SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),
MBB->pred_end());
while (!Worklist.empty()) {
@@ -546,7 +545,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
const TargetRegisterClass *SrcRC, *DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
- MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg());
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ TII->moveToVALU(MI);
+ break;
+ }
+
+ MachineInstr *DefMI = MRI.getVRegDef(SrcReg);
unsigned SMovOp;
int64_t Imm;
// If we are just copying an immediate, we can replace the copy with
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index ce74a7cd8b04..853c8737b464 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -68,6 +68,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetCallingConv.h"
#include "llvm/Target/TargetOptions.h"
@@ -1956,6 +1957,63 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
+ case AMDGPU::SI_INIT_EXEC:
+ // This should be before all vector instructions.
+ BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+ AMDGPU::EXEC)
+ .addImm(MI.getOperand(0).getImm());
+ MI.eraseFromParent();
+ return BB;
+
+ case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
+ // Extract the thread count from an SGPR input and set EXEC accordingly.
+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.
+ //
+ // S_BFE_U32 count, input, {shift, 7}
+ // S_BFM_B64 exec, count, 0
+ // S_CMP_EQ_U32 count, 64
+ // S_CMOV_B64 exec, -1
+ MachineInstr *FirstMI = &*BB->begin();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned InputReg = MI.getOperand(0).getReg();
+ unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+ bool Found = false;
+
+ // Move the COPY of the input reg to the beginning, so that we can use it.
+ for (auto I = BB->begin(); I != &MI; I++) {
+ if (I->getOpcode() != TargetOpcode::COPY ||
+ I->getOperand(0).getReg() != InputReg)
+ continue;
+
+ if (I == FirstMI) {
+ FirstMI = &*++BB->begin();
+ } else {
+ I->removeFromParent();
+ BB->insert(FirstMI, &*I);
+ }
+ Found = true;
+ break;
+ }
+ assert(Found);
+
+ // This should be before all vector instructions.
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
+ .addReg(InputReg)
+ .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
+ AMDGPU::EXEC)
+ .addReg(CountReg)
+ .addImm(0);
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
+ .addReg(CountReg, RegState::Kill)
+ .addImm(64);
+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
+ AMDGPU::EXEC)
+ .addImm(-1);
+ MI.eraseFromParent();
+ return BB;
+ }
+
case AMDGPU::GET_GROUPSTATICSIZE: {
DebugLoc DL = MI.getDebugLoc();
BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
@@ -3223,6 +3281,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
Op.getOperand(2), Glue);
}
+ case Intrinsic::amdgcn_init_exec: {
+ return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
+ Op.getOperand(2));
+ }
+ case Intrinsic::amdgcn_init_exec_from_input: {
+ return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
+ Op.getOperand(2), Op.getOperand(3));
+ }
case AMDGPUIntrinsic::SI_tbuffer_store: {
SDValue Ops[] = {
Chain,
@@ -3455,15 +3521,15 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}
- const SDNodeFlags *Flags = Op->getFlags();
+ const SDNodeFlags Flags = Op->getFlags();
- if (Unsafe || Flags->hasAllowReciprocal()) {
+ if (Unsafe || Flags.hasAllowReciprocal()) {
// Turn into multiply by the reciprocal.
// x / y -> x * (1.0 / y)
- SDNodeFlags Flags;
- Flags.setUnsafeAlgebra(true);
+ SDNodeFlags NewFlags;
+ NewFlags.setUnsafeAlgebra(true);
SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);
+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, NewFlags);
}
return SDValue();
@@ -4542,10 +4608,9 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
return ISD::FMAD;
const TargetOptions &Options = DAG.getTarget().Options;
- if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
- Options.UnsafeFPMath ||
- (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() &&
- cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) &&
+ if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
+ (N0->getFlags().hasUnsafeAlgebra() &&
+ N1->getFlags().hasUnsafeAlgebra())) &&
isFMAFasterThanFMulAndFAdd(VT)) {
return ISD::FMA;
}
@@ -4706,12 +4771,12 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
- APInt KnownZero, KnownOne;
+ KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||
- TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {
+ TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
DCI.CommitTargetLoweringOpt(TLO);
}
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 3f6ddec70479..7ccb54f54e34 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -286,6 +286,19 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
let isReMaterializable = 1;
}
+def SI_INIT_EXEC : SPseudoInstSI <
+ (outs), (ins i64imm:$src), []> {
+ let Defs = [EXEC];
+ let usesCustomInserter = 1;
+ let isAsCheapAsAMove = 1;
+}
+
+def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
+ (outs), (ins SSrc_b32:$input, i32imm:$shift), []> {
+ let Defs = [EXEC];
+ let usesCustomInserter = 1;
+}
+
// Return for returning shaders to a shader variant epilog.
def SI_RETURN_TO_EPILOG : SPseudoInstSI <
(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
@@ -399,6 +412,16 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
} // End SubtargetPredicate = isGCN
let Predicates = [isGCN] in {
+def : Pat <
+ (AMDGPUinit_exec i64:$src),
+ (SI_INIT_EXEC (as_i64imm $src))
+>;
+
+def : Pat <
+ (AMDGPUinit_exec_from_input i32:$input, i32:$shift),
+ (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift))
+>;
+
def : Pat<
(AMDGPUtrap timm:$trapid),
(S_TRAP $trapid)
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5a3242bed1d0..d565c84bfeda 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -503,6 +503,7 @@ unsigned getInitialPSInputAddr(const Function &F) {
bool isShader(CallingConv::ID cc) {
switch(cc) {
case CallingConv::AMDGPU_VS:
+ case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS: