diff options
| author | Dimitry Andric <dim@FreeBSD.org> | 2017-05-02 18:30:13 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2017-05-02 18:30:13 +0000 |
| commit | a303c417bbdb53703c2c17398b08486bde78f1f6 (patch) | |
| tree | 98366d6b93d863cefdc53f16c66c0c5ae7fb2261 /lib/Target/AMDGPU | |
| parent | 12f3ca4cdb95b193af905a00e722a4dcb40b3de3 (diff) | |
Notes
Diffstat (limited to 'lib/Target/AMDGPU')
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPU.td | 2 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 339 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUAsmPrinter.h | 33 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 4 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 31 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUISelLowering.h | 10 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUInstrInfo.td | 9 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUMachineFunction.cpp | 1 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 1 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 1 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/GCNSchedStrategy.cpp | 3 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/R600Intrinsics.td | 2 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 18 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/SIDefines.h | 1 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 13 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/SIISelLowering.cpp | 87 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/SIInstructions.td | 23 | ||||
| -rw-r--r-- | lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 1 |
18 files changed, 404 insertions, 175 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 0f331486d0f8..2e5b78bbf7ef 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -407,7 +407,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32 + FeatureFastFMAF32, FeatureDPP ] >; diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index a81bcb56dfdc..2ce23dbf08e6 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -149,11 +149,9 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() { return; const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>(); - SIProgramInfo KernelInfo; amd_kernel_code_t KernelCode; if (STM.isAmdCodeObjectV2(*MF)) { - getSIProgramInfo(KernelInfo, *MF); - getAmdKernelCode(KernelCode, KernelInfo, *MF); + getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF); OutStreamer->SwitchSection(getObjFileLowering().getTextSection()); getTargetStreamer().EmitAMDKernelCodeT(KernelCode); @@ -187,7 +185,26 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { AsmPrinter::EmitGlobalVariable(GV); } +bool AMDGPUAsmPrinter::doFinalization(Module &M) { + CallGraphResourceInfo.clear(); + return AsmPrinter::doFinalization(M); +} + +// Print comments that apply to both callable functions and entry points. +void AMDGPUAsmPrinter::emitCommonFunctionComments( + uint32_t NumVGPR, + uint32_t NumSGPR, + uint32_t ScratchSize, + uint64_t CodeSize) { + OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false); + OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false); + OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false); + OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false); +} + bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + CurrentProgramInfo = SIProgramInfo(); + const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>(); // The starting address of all shader programs must be 256 bytes aligned. @@ -204,11 +221,19 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->SwitchSection(ConfigSection); } - SIProgramInfo KernelInfo; if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - getSIProgramInfo(KernelInfo, MF); + if (MFI->isEntryFunction()) { + getSIProgramInfo(CurrentProgramInfo, MF); + } else { + auto I = CallGraphResourceInfo.insert( + std::make_pair(MF.getFunction(), SIFunctionResourceInfo())); + SIFunctionResourceInfo &Info = I.first->second; + assert(I.second && "should only be called once per function"); + Info = analyzeResourceUsage(MF); + } + if (!STM.isAmdHsaOS()) { - EmitProgramInfoSI(MF, KernelInfo); + EmitProgramInfoSI(MF, CurrentProgramInfo); } } else { EmitProgramInfoR600(MF); @@ -226,72 +251,87 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { OutStreamer->SwitchSection(CommentSection); if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) { - if (MFI->isEntryFunction()) { - OutStreamer->emitRawComment(" Kernel info:", false); - } else { + if (!MFI->isEntryFunction()) { OutStreamer->emitRawComment(" Function info:", false); + SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()]; + emitCommonFunctionComments( + Info.NumVGPR, + Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()), + Info.PrivateSegmentSize, + getFunctionCodeSize(MF)); + return false; } + OutStreamer->emitRawComment(" Kernel info:", false); + emitCommonFunctionComments(CurrentProgramInfo.NumVGPR, + CurrentProgramInfo.NumSGPR, + CurrentProgramInfo.ScratchSize, + getFunctionCodeSize(MF)); + OutStreamer->emitRawComment(" codeLenInByte = " + Twine(getFunctionCodeSize(MF)), false); - OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR), - false); - OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR), - false); - - OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode), - false); - OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode), - false); - OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize), - false); - OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) + - " bytes/workgroup (compile time only)", false); + OutStreamer->emitRawComment( + " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false); + OutStreamer->emitRawComment( + " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false); - if (!MFI->isEntryFunction()) - return false; + OutStreamer->emitRawComment( + " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); + OutStreamer->emitRawComment( + " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); + OutStreamer->emitRawComment( + " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false); + OutStreamer->emitRawComment( + " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + + " bytes/workgroup (compile time only)", false); - OutStreamer->emitRawComment(" SGPRBlocks: " + - Twine(KernelInfo.SGPRBlocks), false); - OutStreamer->emitRawComment(" VGPRBlocks: " + - Twine(KernelInfo.VGPRBlocks), false); + OutStreamer->emitRawComment( + " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false); + OutStreamer->emitRawComment( + " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false); - OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " + - Twine(KernelInfo.NumSGPRsForWavesPerEU), false); - OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " + - Twine(KernelInfo.NumVGPRsForWavesPerEU), false); + OutStreamer->emitRawComment( + " NumSGPRsForWavesPerEU: " + + Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false); + OutStreamer->emitRawComment( + " NumVGPRsForWavesPerEU: " + + Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false); - OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst), - false); - OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount), - false); + OutStreamer->emitRawComment( + " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst), + false); + OutStreamer->emitRawComment( + " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount), + false); if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) { - OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + - Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); - OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" + - Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false); + OutStreamer->emitRawComment( + " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" + + Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false); + OutStreamer->emitRawComment( + " DebuggerPrivateSegmentBufferSGPR: s" + + Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false); } - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " + - Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)), - false); - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + - Twine(G_00B84C_TRAP_HANDLER(KernelInfo.ComputePGMRSrc2)), - false); - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " + - Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)), - false); - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " + - Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)), - false); - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " + - Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)), - false); - OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + - Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)), - false); - + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:USER_SGPR: " + + Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " + + Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_X_EN: " + + Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_Y_EN: " + + Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TGID_Z_EN: " + + Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false); + OutStreamer->emitRawComment( + " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " + + Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)), + false); } else { R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); OutStreamer->emitRawComment( @@ -407,71 +447,117 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI, return false; } -void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, - const MachineFunction &MF) const { - const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - const MachineRegisterInfo &MRI = MF.getRegInfo(); - const SIInstrInfo *TII = STM.getInstrInfo(); - const SIRegisterInfo *RI = &TII->getRegisterInfo(); +static unsigned getNumExtraSGPRs(const SISubtarget &ST, + bool VCCUsed, + bool FlatScrUsed) { + unsigned ExtraSGPRs = 0; + if (VCCUsed) + ExtraSGPRs = 2; + if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { + if (FlatScrUsed) + ExtraSGPRs = 4; + } else { + if (ST.isXNACKEnabled()) + ExtraSGPRs = 4; - MCPhysReg NumVGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - NumVGPRReg = Reg; - break; - } + if (FlatScrUsed) + ExtraSGPRs = 6; } - MCPhysReg NumSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - NumSGPRReg = Reg; - break; - } - } + return ExtraSGPRs; +} - // We found the maximum register index. They start at 0, so add one to get the - // number of registers. - ProgInfo.NumVGPR = NumVGPRReg == AMDGPU::NoRegister ? 0 : - RI->getHWRegIndex(NumVGPRReg) + 1; - ProgInfo.NumSGPR = NumSGPRReg == AMDGPU::NoRegister ? 0 : - RI->getHWRegIndex(NumSGPRReg) + 1; - unsigned ExtraSGPRs = 0; +int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs( + const SISubtarget &ST) const { + return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch); +} - ProgInfo.VCCUsed = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || - MRI.isPhysRegUsed(AMDGPU::VCC_HI); - if (ProgInfo.VCCUsed) - ExtraSGPRs = 2; +AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage( + const MachineFunction &MF) const { + SIFunctionResourceInfo Info; - ProgInfo.FlatUsed = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || - MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + const MachineRegisterInfo &MRI = MF.getRegInfo(); + const SIInstrInfo *TII = ST.getInstrInfo(); + const SIRegisterInfo &TRI = TII->getRegisterInfo(); + + Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) || + MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI); // Even if FLAT_SCRATCH is implicitly used, it has no effect if flat - // instructions aren't used to access the scratch buffer. Inline assembly - // may need it though. + // instructions aren't used to access the scratch buffer. Inline assembly may + // need it though. // // If we only have implicit uses of flat_scr on flat instructions, it is not // really needed. - if (ProgInfo.FlatUsed && !MFI->hasFlatScratchInit() && + if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() && (!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) && !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) && !hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) { - ProgInfo.FlatUsed = false; + Info.UsesFlatScratch = false; } - if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) { - if (ProgInfo.FlatUsed) - ExtraSGPRs = 4; - } else { - if (STM.isXNACKEnabled()) - ExtraSGPRs = 4; + Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects(); + Info.PrivateSegmentSize = FrameInfo.getStackSize(); - if (ProgInfo.FlatUsed) - ExtraSGPRs = 6; + if (!FrameInfo.hasCalls()) { + Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) || + MRI.isPhysRegUsed(AMDGPU::VCC_HI); + + // If there are no calls, MachineRegisterInfo can tell us the used register + // count easily. + + MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestVGPRReg = Reg; + break; + } + } + + MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; + for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { + if (MRI.isPhysRegUsed(Reg)) { + HighestSGPRReg = Reg; + break; + } + } + + // We found the maximum register index. They start at 0, so add one to get the + // number of registers. + Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestVGPRReg) + 1; + Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 : + TRI.getHWRegIndex(HighestSGPRReg) + 1; + + return Info; } + llvm_unreachable("calls not implemented"); +} + +void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, + const MachineFunction &MF) { + SIFunctionResourceInfo Info = analyzeResourceUsage(MF); + + ProgInfo.NumVGPR = Info.NumVGPR; + ProgInfo.NumSGPR = Info.NumExplicitSGPR; + ProgInfo.ScratchSize = Info.PrivateSegmentSize; + ProgInfo.VCCUsed = Info.UsesVCC; + ProgInfo.FlatUsed = Info.UsesFlatScratch; + ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion; + + const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); + const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); + const SIInstrInfo *TII = STM.getInstrInfo(); + const SIRegisterInfo *RI = &TII->getRegisterInfo(); + + unsigned ExtraSGPRs = getNumExtraSGPRs(STM, + ProgInfo.VCCUsed, + ProgInfo.FlatUsed); unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF); // Check the addressable register limit before we add ExtraSGPRs. @@ -574,9 +660,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo, // Make clamp modifier on NaN input returns 0. ProgInfo.DX10Clamp = STM.enableDX10Clamp(); - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - ProgInfo.ScratchSize = FrameInfo.getStackSize(); - unsigned LDSAlignShift; if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) { // LDS is allocated in 64 dword blocks. @@ -638,6 +721,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { switch (CallConv) { default: LLVM_FALLTHROUGH; case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1; + case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS; case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS; case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS; case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS; @@ -645,7 +729,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) { } void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, - const SIProgramInfo &KernelInfo) { + const SIProgramInfo &CurrentProgramInfo) { const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv()); @@ -653,29 +737,29 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF, if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) { OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4); - OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4); + OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4); OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4); - OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4); + OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4); OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4); + OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); // TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 = // 0" comment but I don't see a corresponding field in the register spec. } else { OutStreamer->EmitIntValue(RsrcReg, 4); - OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) | - S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4); + OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) | + S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4); if (STM.isVGPRSpillingEnabled(*MF.getFunction())) { OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4); - OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4); + OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4); } } if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) { OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4); - OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4); + OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4); OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4); OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4); OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4); @@ -703,7 +787,7 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) { } void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, - const SIProgramInfo &KernelInfo, + const SIProgramInfo &CurrentProgramInfo, const MachineFunction &MF) const { const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const SISubtarget &STM = MF.getSubtarget<SISubtarget>(); @@ -711,10 +795,13 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits()); Out.compute_pgm_resource_registers = - KernelInfo.ComputePGMRSrc1 | - (KernelInfo.ComputePGMRSrc2 << 32); + CurrentProgramInfo.ComputePGMRSrc1 | + (CurrentProgramInfo.ComputePGMRSrc2 << 32); Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64; + if (CurrentProgramInfo.DynamicCallStack) + Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK; + AMD_HSA_BITS_SET(Out.code_properties, AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE, getElementByteSizeValue(STM.getMaxPrivateElementSize())); @@ -766,12 +853,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, // FIXME: Should use getKernArgSize Out.kernarg_segment_byte_size = STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset()); - Out.wavefront_sgpr_count = KernelInfo.NumSGPR; - Out.workitem_vgpr_count = KernelInfo.NumVGPR; - Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize; - Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize; - Out.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst; - Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount; + Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR; + Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR; + Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize; + Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize; + Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst; + Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount; // These alignment values are specified in powers of two, so alignment = // 2^n. The minimum alignment is 2^4 = 16. @@ -780,9 +867,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out, if (STM.debuggerEmitPrologue()) { Out.debug_wavefront_private_segment_offset_sgpr = - KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; + CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR; Out.debug_private_segment_buffer_sgpr = - KernelInfo.DebuggerPrivateSegmentBufferSGPR; + CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR; } } diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h index 8c86dea4b885..e5adeeb465e1 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h @@ -30,9 +30,26 @@ namespace llvm { class AMDGPUTargetStreamer; class MCOperand; +class SISubtarget; class AMDGPUAsmPrinter final : public AsmPrinter { private: + // Track resource usage for callee functions. + struct SIFunctionResourceInfo { + // Track the number of explicitly used VGPRs. Special registers reserved at + // the end are tracked separately. + int32_t NumVGPR = 0; + int32_t NumExplicitSGPR = 0; + uint32_t PrivateSegmentSize = 0; + bool UsesVCC = false; + bool UsesFlatScratch = false; + bool HasDynamicallySizedStack = false; + bool HasRecursion = false; + + int32_t getTotalNumSGPRs(const SISubtarget &ST) const; + }; + + // Track resource usage for kernels / entry functions. struct SIProgramInfo { // Fields set in PGM_RSRC1 pm4 packet. uint32_t VGPRBlocks = 0; @@ -83,14 +100,23 @@ private: uint16_t DebuggerPrivateSegmentBufferSGPR = std::numeric_limits<uint16_t>::max(); + // Whether there is recursion, dynamic allocas, indirect calls or some other + // reason there may be statically unknown stack usage. + bool DynamicCallStack = false; + // Bonus information for debugging. bool VCCUsed = false; SIProgramInfo() = default; }; + SIProgramInfo CurrentProgramInfo; + DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo; + uint64_t getFunctionCodeSize(const MachineFunction &MF) const; - void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const; + SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const; + + void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF); void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo, const MachineFunction &MF) const; void findNumUsedRegistersSI(const MachineFunction &MF, @@ -101,6 +127,10 @@ private: /// can correctly setup the GPU state. void EmitProgramInfoR600(const MachineFunction &MF); void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo); + void emitCommonFunctionComments(uint32_t NumVGPR, + uint32_t NumSGPR, + uint32_t ScratchSize, + uint64_t CodeSize); public: explicit AMDGPUAsmPrinter(TargetMachine &TM, @@ -112,6 +142,7 @@ public: AMDGPUTargetStreamer& getTargetStreamer() const; + bool doFinalization(Module &M) override; bool runOnMachineFunction(MachineFunction &MF) override; /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index f5110857da84..ccae36ced1f8 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -207,8 +207,8 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const { return true; // TODO: Move into isKnownNeverNaN - if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(N)) - return BO->Flags.hasNoNaNs(); + if (N->getFlags().isDefined()) + return N->getFlags().hasNoNaNs(); return CurDAG->isKnownNeverNaN(N); } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index e21775e61dd4..64e1b8f0d7f0 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -29,6 +29,7 @@ #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DiagnosticInfo.h" +#include "llvm/Support/KnownBits.h" #include "SIInstrInfo.h" using namespace llvm; @@ -895,6 +896,7 @@ CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::SPIR_KERNEL: return CC_AMDGPU_Kernel; case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_HS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: @@ -2293,11 +2295,11 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, //===----------------------------------------------------------------------===// static bool isU24(SDValue Op, SelectionDAG &DAG) { - APInt KnownZero, KnownOne; + KnownBits Known; EVT VT = Op.getValueType(); - DAG.computeKnownBits(Op, KnownZero, KnownOne); + DAG.computeKnownBits(Op, Known); - return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24; + return (VT.getSizeInBits() - Known.Zero.countLeadingOnes()) <= 24; } static bool isI24(SDValue Op, SelectionDAG &DAG) { @@ -3358,13 +3360,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, OffsetVal, OffsetVal + WidthVal); - APInt KnownZero, KnownOne; + KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) || - TLI.SimplifyDemandedBits(BitsFrom, Demanded, - KnownZero, KnownOne, TLO)) { + TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) { DCI.CommitTargetLoweringOpt(TLO); } } @@ -3516,6 +3517,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(KILL) NODE_NAME_CASE(DUMMY_CHAIN) case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break; + NODE_NAME_CASE(INIT_EXEC) + NODE_NAME_CASE(INIT_EXEC_FROM_INPUT) NODE_NAME_CASE(SENDMSG) NODE_NAME_CASE(SENDMSGHALT) NODE_NAME_CASE(INTERP_MOV) @@ -3574,14 +3577,12 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand, } void AMDGPUTargetLowering::computeKnownBitsForTargetNode( - const SDValue Op, APInt &KnownZero, APInt &KnownOne, + const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const { - unsigned BitWidth = KnownZero.getBitWidth(); - KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything. + Known.Zero.clearAllBits(); Known.One.clearAllBits(); // Don't know anything. - APInt KnownZero2; - APInt KnownOne2; + KnownBits Known2; unsigned Opc = Op.getOpcode(); switch (Opc) { @@ -3589,7 +3590,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( break; case AMDGPUISD::CARRY: case AMDGPUISD::BORROW: { - KnownZero = APInt::getHighBitsSet(32, 31); + Known.Zero = APInt::getHighBitsSet(32, 31); break; } @@ -3602,16 +3603,16 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode( uint32_t Width = CWidth->getZExtValue() & 0x1f; if (Opc == AMDGPUISD::BFE_U32) - KnownZero = APInt::getHighBitsSet(32, 32 - Width); + Known.Zero = APInt::getHighBitsSet(32, 32 - Width); break; } case AMDGPUISD::FP_TO_FP16: case AMDGPUISD::FP16_ZEXT: { - unsigned BitWidth = KnownZero.getBitWidth(); + unsigned BitWidth = Known.getBitWidth(); // High bits are zero. - KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 16); + Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16); break; } } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index 13cbfe267932..e1a5a2072418 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -125,8 +125,9 @@ public: if (getTargetMachine().Options.NoSignedZerosFPMath) return true; - if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(Op)) - return BO->Flags.hasNoSignedZeros(); + const auto Flags = Op.getNode()->getFlags(); + if (Flags.isDefined()) + return Flags.hasNoSignedZeros(); return false; } @@ -199,8 +200,7 @@ public: /// either zero or one and return them in the \p KnownZero and \p KnownOne /// bitsets. void computeKnownBitsForTargetNode(const SDValue Op, - APInt &KnownZero, - APInt &KnownOne, + KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth = 0) const override; @@ -370,6 +370,8 @@ enum NodeType : unsigned { BUILD_VERTICAL_VECTOR, /// Pointer to the start of the shader's constant data. CONST_DATA_PTR, + INIT_EXEC, + INIT_EXEC_FROM_INPUT, SENDMSG, SENDMSGHALT, INTERP_MOV, diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td index c1706d12a2ea..353cc5742791 100644 --- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td +++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td @@ -299,6 +299,15 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; +def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC", + SDTypeProfile<0, 1, [SDTCisInt<0>]>, + [SDNPHasChain, SDNPInGlue]>; + +def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT", + SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisInt<1>]>, + [SDNPHasChain, SDNPInGlue]>; + def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPInGlue]>; diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp index 27fe639e3d4b..fe7283ccf7d9 100644 --- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp +++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp @@ -17,6 +17,7 @@ static bool isEntryFunctionCC(CallingConv::ID CC) { case CallingConv::AMDGPU_KERNEL: case CallingConv::SPIR_KERNEL: case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_HS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 0202220b8011..cd5bad04d0b3 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -309,6 +309,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) { default: return false; case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_HS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 6edd3e923ba1..c9482c37ec80 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -432,6 +432,7 @@ static bool isArgPassedInSGPR(const Argument *A) { case CallingConv::SPIR_KERNEL: return true; case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_HS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index ea305a92fc60..630442625aa3 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -422,8 +422,9 @@ void GCNScheduleDAGMILive::discoverLiveIns() { unsigned SGPRs = 0; unsigned VGPRs = 0; + auto &MI = *begin()->getParent()->getFirstNonDebugInstr(); const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI); - SlotIndex SI = LIS->getInstructionIndex(*begin()).getBaseIndex(); + SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex(); assert (SI.isValid()); DEBUG(dbgs() << "Region live-ins:"); diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td index a5310e9fd6d0..4c9e1e8a5434 100644 --- a/lib/Target/AMDGPU/R600Intrinsics.td +++ b/lib/Target/AMDGPU/R600Intrinsics.td @@ -61,7 +61,7 @@ def int_r600_ddx : TextureIntrinsicFloatInput; def int_r600_ddy : TextureIntrinsicFloatInput; def int_r600_dot4 : Intrinsic<[llvm_float_ty], - [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem] + [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable] >; } // End TargetPrefix = "r600", isTarget = 1 diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index b7e62075244b..d8cb98fe1b19 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -77,9 +77,10 @@ class SIAnnotateControlFlow : public FunctionPass { void insertElse(BranchInst *Term); - Value *handleLoopCondition(Value *Cond, PHINode *Broken, - llvm::Loop *L, BranchInst *Term, - SmallVectorImpl<WeakVH> &LoopPhiConditions); + Value * + handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L, + BranchInst *Term, + SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions); void handleLoop(BranchInst *Term); @@ -212,9 +213,8 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) { /// \brief Recursively handle the condition leading to a loop Value *SIAnnotateControlFlow::handleLoopCondition( - Value *Cond, PHINode *Broken, - llvm::Loop *L, BranchInst *Term, - SmallVectorImpl<WeakVH> &LoopPhiConditions) { + Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term, + SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) { // Only search through PHI nodes which are inside the loop. If we try this // with PHI nodes that are outside of the loop, we end up inserting new PHI @@ -281,7 +281,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition( NewPhi->setIncomingValue(i, PhiArg); } - LoopPhiConditions.push_back(WeakVH(Phi)); + LoopPhiConditions.push_back(WeakTrackingVH(Phi)); return Ret; } @@ -323,7 +323,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { BasicBlock *Target = Term->getSuccessor(1); PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front()); - SmallVector<WeakVH, 8> LoopPhiConditions; + SmallVector<WeakTrackingVH, 8> LoopPhiConditions; Value *Cond = Term->getCondition(); Term->setCondition(BoolTrue); Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions); @@ -333,7 +333,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); - for (WeakVH Val : reverse(LoopPhiConditions)) { + for (WeakTrackingVH Val : reverse(LoopPhiConditions)) { if (PHINode *Cond = cast_or_null<PHINode>(Val)) eraseIfUnused(Cond); } diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h index 3dd372b32866..a01330cb9171 100644 --- a/lib/Target/AMDGPU/SIDefines.h +++ b/lib/Target/AMDGPU/SIDefines.h @@ -302,6 +302,7 @@ enum DstUnused { #define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8) #define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128 #define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228 +#define R_00B428_SPI_SHADER_PGM_RSRC1_HS 0x00B428 #define R_00B848_COMPUTE_PGM_RSRC1 0x00B848 #define S_00B028_VGPRS(x) (((x) & 0x3F) << 0) #define S_00B028_SGPRS(x) (((x) & 0x0F) << 6) diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index b0f0bf04a891..3cca815d8773 100644 --- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -278,8 +278,7 @@ static bool phiHasBreakDef(const MachineInstr &PHI, Visited.insert(Reg); - MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg); - assert(DefInstr); + MachineInstr *DefInstr = MRI.getVRegDef(Reg); switch (DefInstr->getOpcode()) { default: break; @@ -346,7 +345,7 @@ bool searchPredecessors(const MachineBasicBlock *MBB, return false; DenseSet<const MachineBasicBlock*> Visited; - SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(), + SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(), MBB->pred_end()); while (!Worklist.empty()) { @@ -546,7 +545,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) { const TargetRegisterClass *SrcRC, *DstRC; std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI); if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) { - MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg()); + unsigned SrcReg = MI.getOperand(1).getReg(); + if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) { + TII->moveToVALU(MI); + break; + } + + MachineInstr *DefMI = MRI.getVRegDef(SrcReg); unsigned SMovOp; int64_t Imm; // If we are just copying an immediate, we can replace the copy with diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index ce74a7cd8b04..853c8737b464 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -68,6 +68,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetCallingConv.h" #include "llvm/Target/TargetOptions.h" @@ -1956,6 +1957,63 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MI.eraseFromParent(); return BB; + case AMDGPU::SI_INIT_EXEC: + // This should be before all vector instructions. + BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64), + AMDGPU::EXEC) + .addImm(MI.getOperand(0).getImm()); + MI.eraseFromParent(); + return BB; + + case AMDGPU::SI_INIT_EXEC_FROM_INPUT: { + // Extract the thread count from an SGPR input and set EXEC accordingly. + // Since BFM can't shift by 64, handle that case with CMP + CMOV. + // + // S_BFE_U32 count, input, {shift, 7} + // S_BFM_B64 exec, count, 0 + // S_CMP_EQ_U32 count, 64 + // S_CMOV_B64 exec, -1 + MachineInstr *FirstMI = &*BB->begin(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + unsigned InputReg = MI.getOperand(0).getReg(); + unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + bool Found = false; + + // Move the COPY of the input reg to the beginning, so that we can use it. + for (auto I = BB->begin(); I != &MI; I++) { + if (I->getOpcode() != TargetOpcode::COPY || + I->getOperand(0).getReg() != InputReg) + continue; + + if (I == FirstMI) { + FirstMI = &*++BB->begin(); + } else { + I->removeFromParent(); + BB->insert(FirstMI, &*I); + } + Found = true; + break; + } + assert(Found); + + // This should be before all vector instructions. + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) + .addReg(InputReg) + .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000); + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64), + AMDGPU::EXEC) + .addReg(CountReg) + .addImm(0); + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32)) + .addReg(CountReg, RegState::Kill) + .addImm(64); + BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64), + AMDGPU::EXEC) + .addImm(-1); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::GET_GROUPSTATICSIZE: { DebugLoc DL = MI.getDebugLoc(); BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32)) @@ -3223,6 +3281,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return DAG.getNode(NodeOp, DL, MVT::Other, Chain, Op.getOperand(2), Glue); } + case Intrinsic::amdgcn_init_exec: { + return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, + Op.getOperand(2)); + } + case Intrinsic::amdgcn_init_exec_from_input: { + return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, + Op.getOperand(2), Op.getOperand(3)); + } case AMDGPUIntrinsic::SI_tbuffer_store: { SDValue Ops[] = { Chain, @@ -3455,15 +3521,15 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, } } - const SDNodeFlags *Flags = Op->getFlags(); + const SDNodeFlags Flags = Op->getFlags(); - if (Unsafe || Flags->hasAllowReciprocal()) { + if (Unsafe || Flags.hasAllowReciprocal()) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) - SDNodeFlags Flags; - Flags.setUnsafeAlgebra(true); + SDNodeFlags NewFlags; + NewFlags.setUnsafeAlgebra(true); SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, NewFlags); } return SDValue(); @@ -4542,10 +4608,9 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; - if ((Options.AllowFPOpFusion == FPOpFusion::Fast || - Options.UnsafeFPMath || - (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() && - cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) && + if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath || + (N0->getFlags().hasUnsafeAlgebra() && + N1->getFlags().hasUnsafeAlgebra())) && isFMAFasterThanFMulAndFAdd(VT)) { return ISD::FMA; } @@ -4706,12 +4771,12 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); - APInt KnownZero, KnownOne; + KnownBits Known; TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) || - TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) { + TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) { DCI.CommitTargetLoweringOpt(TLO); } diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 3f6ddec70479..7ccb54f54e34 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -286,6 +286,19 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> { let isReMaterializable = 1; } +def SI_INIT_EXEC : SPseudoInstSI < + (outs), (ins i64imm:$src), []> { + let Defs = [EXEC]; + let usesCustomInserter = 1; + let isAsCheapAsAMove = 1; +} + +def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI < + (outs), (ins SSrc_b32:$input, i32imm:$shift), []> { + let Defs = [EXEC]; + let usesCustomInserter = 1; +} + // Return for returning shaders to a shader variant epilog. def SI_RETURN_TO_EPILOG : SPseudoInstSI < (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> { @@ -399,6 +412,16 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI < } // End SubtargetPredicate = isGCN let Predicates = [isGCN] in { +def : Pat < + (AMDGPUinit_exec i64:$src), + (SI_INIT_EXEC (as_i64imm $src)) +>; + +def : Pat < + (AMDGPUinit_exec_from_input i32:$input, i32:$shift), + (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift)) +>; + def : Pat< (AMDGPUtrap timm:$trapid), (S_TRAP $trapid) diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 5a3242bed1d0..d565c84bfeda 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -503,6 +503,7 @@ unsigned getInitialPSInputAddr(const Function &F) { bool isShader(CallingConv::ID cc) { switch(cc) { case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_HS: case CallingConv::AMDGPU_GS: case CallingConv::AMDGPU_PS: case CallingConv::AMDGPU_CS: |
