Vendor import of llvm trunk r301939: - src

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2017-05-02 18:30:13 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2017-05-02 18:30:13 +0000
commit	a303c417bbdb53703c2c17398b08486bde78f1f6 (patch)
tree	98366d6b93d863cefdc53f16c66c0c5ae7fb2261 /lib/Target/AMDGPU
parent	12f3ca4cdb95b193af905a00e722a4dcb40b3de3 (diff)

vendor/llvm/llvm-trunk-r301939

Notes

Diffstat (limited to 'lib/Target/AMDGPU')

-rw-r--r--

lib/Target/AMDGPU/AMDGPU.td

-rw-r--r--

lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

339

-rw-r--r--

lib/Target/AMDGPU/AMDGPUAsmPrinter.h

-rw-r--r--

lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

-rw-r--r--

lib/Target/AMDGPU/AMDGPUISelLowering.cpp

-rw-r--r--

lib/Target/AMDGPU/AMDGPUISelLowering.h

-rw-r--r--

lib/Target/AMDGPU/AMDGPUInstrInfo.td

-rw-r--r--

lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

-rw-r--r--

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

-rw-r--r--

lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

-rw-r--r--

lib/Target/AMDGPU/GCNSchedStrategy.cpp

-rw-r--r--

lib/Target/AMDGPU/R600Intrinsics.td

-rw-r--r--

lib/Target/AMDGPU/SIAnnotateControlFlow.cpp

-rw-r--r--

lib/Target/AMDGPU/SIDefines.h

-rw-r--r--

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

-rw-r--r--

lib/Target/AMDGPU/SIISelLowering.cpp

-rw-r--r--

lib/Target/AMDGPU/SIInstructions.td

-rw-r--r--

lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

18 files changed, 404 insertions, 175 deletions

diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index 0f331486d0f8..2e5b78bbf7ef 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td

@@ -407,7 +407,7 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",

FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,

FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,

FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode,

- FeatureFastFMAF32

+ FeatureFastFMAF32, FeatureDPP

]

diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index a81bcb56dfdc..2ce23dbf08e6 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp

@@ -149,11 +149,9 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {

return;

const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();

- SIProgramInfo KernelInfo;

amd_kernel_code_t KernelCode;

if (STM.isAmdCodeObjectV2(*MF)) {

- getSIProgramInfo(KernelInfo, *MF);

- getAmdKernelCode(KernelCode, KernelInfo, *MF);

+ getAmdKernelCode(KernelCode, CurrentProgramInfo, *MF);

OutStreamer->SwitchSection(getObjFileLowering().getTextSection());

getTargetStreamer().EmitAMDKernelCodeT(KernelCode);

@@ -187,7 +185,26 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {

AsmPrinter::EmitGlobalVariable(GV);

}

+bool AMDGPUAsmPrinter::doFinalization(Module &M) {

+ CallGraphResourceInfo.clear();

+ return AsmPrinter::doFinalization(M);

+// Print comments that apply to both callable functions and entry points.

+void AMDGPUAsmPrinter::emitCommonFunctionComments(

+ uint32_t NumVGPR,

+ uint32_t NumSGPR,

+ uint32_t ScratchSize,

+ uint64_t CodeSize) {

+ OutStreamer->emitRawComment(" codeLenInByte = " + Twine(CodeSize), false);

+ OutStreamer->emitRawComment(" NumSgprs: " + Twine(NumSGPR), false);

+ OutStreamer->emitRawComment(" NumVgprs: " + Twine(NumVGPR), false);

+ OutStreamer->emitRawComment(" ScratchSize: " + Twine(ScratchSize), false);

bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {

+ CurrentProgramInfo = SIProgramInfo();

const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

// The starting address of all shader programs must be 256 bytes aligned.

@@ -204,11 +221,19 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {

OutStreamer->SwitchSection(ConfigSection);

}

- SIProgramInfo KernelInfo;

if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {

- getSIProgramInfo(KernelInfo, MF);

+ if (MFI->isEntryFunction()) {

+ getSIProgramInfo(CurrentProgramInfo, MF);

+ } else {

+ auto I = CallGraphResourceInfo.insert(

+ std::make_pair(MF.getFunction(), SIFunctionResourceInfo()));

+ SIFunctionResourceInfo &Info = I.first->second;

+ assert(I.second && "should only be called once per function");

+ Info = analyzeResourceUsage(MF);

+ }

if (!STM.isAmdHsaOS()) {

- EmitProgramInfoSI(MF, KernelInfo);

+ EmitProgramInfoSI(MF, CurrentProgramInfo);

}

} else {

EmitProgramInfoR600(MF);

@@ -226,72 +251,87 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {

OutStreamer->SwitchSection(CommentSection);

if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {

- if (MFI->isEntryFunction()) {

- OutStreamer->emitRawComment(" Kernel info:", false);

- } else {

+ if (!MFI->isEntryFunction()) {

OutStreamer->emitRawComment(" Function info:", false);

+ SIFunctionResourceInfo &Info = CallGraphResourceInfo[MF.getFunction()];

+ emitCommonFunctionComments(

+ Info.NumVGPR,

+ Info.getTotalNumSGPRs(MF.getSubtarget<SISubtarget>()),

+ Info.PrivateSegmentSize,

+ getFunctionCodeSize(MF));

+ return false;

}

+ OutStreamer->emitRawComment(" Kernel info:", false);

+ emitCommonFunctionComments(CurrentProgramInfo.NumVGPR,

+ CurrentProgramInfo.NumSGPR,

+ CurrentProgramInfo.ScratchSize,

+ getFunctionCodeSize(MF));

OutStreamer->emitRawComment(" codeLenInByte = " +

Twine(getFunctionCodeSize(MF)), false);

- OutStreamer->emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),

- false);

- OutStreamer->emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),

- false);

- OutStreamer->emitRawComment(" FloatMode: " + Twine(KernelInfo.FloatMode),

- false);

- OutStreamer->emitRawComment(" IeeeMode: " + Twine(KernelInfo.IEEEMode),

- false);

- OutStreamer->emitRawComment(" ScratchSize: " + Twine(KernelInfo.ScratchSize),

- false);

- OutStreamer->emitRawComment(" LDSByteSize: " + Twine(KernelInfo.LDSSize) +

- " bytes/workgroup (compile time only)", false);

+ OutStreamer->emitRawComment(

+ " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false);

+ OutStreamer->emitRawComment(

+ " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false);

- if (!MFI->isEntryFunction())

- return false;

+ OutStreamer->emitRawComment(

+ " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false);

+ OutStreamer->emitRawComment(

+ " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false);

+ OutStreamer->emitRawComment(

+ " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false);

+ OutStreamer->emitRawComment(

+ " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) +

+ " bytes/workgroup (compile time only)", false);

- OutStreamer->emitRawComment(" SGPRBlocks: " +

- Twine(KernelInfo.SGPRBlocks), false);

- OutStreamer->emitRawComment(" VGPRBlocks: " +

- Twine(KernelInfo.VGPRBlocks), false);

+ OutStreamer->emitRawComment(

+ " SGPRBlocks: " + Twine(CurrentProgramInfo.SGPRBlocks), false);

+ OutStreamer->emitRawComment(

+ " VGPRBlocks: " + Twine(CurrentProgramInfo.VGPRBlocks), false);

- OutStreamer->emitRawComment(" NumSGPRsForWavesPerEU: " +

- Twine(KernelInfo.NumSGPRsForWavesPerEU), false);

- OutStreamer->emitRawComment(" NumVGPRsForWavesPerEU: " +

- Twine(KernelInfo.NumVGPRsForWavesPerEU), false);

+ OutStreamer->emitRawComment(

+ " NumSGPRsForWavesPerEU: " +

+ Twine(CurrentProgramInfo.NumSGPRsForWavesPerEU), false);

+ OutStreamer->emitRawComment(

+ " NumVGPRsForWavesPerEU: " +

+ Twine(CurrentProgramInfo.NumVGPRsForWavesPerEU), false);

- OutStreamer->emitRawComment(" ReservedVGPRFirst: " + Twine(KernelInfo.ReservedVGPRFirst),

- false);

- OutStreamer->emitRawComment(" ReservedVGPRCount: " + Twine(KernelInfo.ReservedVGPRCount),

- false);

+ OutStreamer->emitRawComment(

+ " ReservedVGPRFirst: " + Twine(CurrentProgramInfo.ReservedVGPRFirst),

+ false);

+ OutStreamer->emitRawComment(

+ " ReservedVGPRCount: " + Twine(CurrentProgramInfo.ReservedVGPRCount),

+ false);

if (MF.getSubtarget<SISubtarget>().debuggerEmitPrologue()) {

- OutStreamer->emitRawComment(" DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +

- Twine(KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);

- OutStreamer->emitRawComment(" DebuggerPrivateSegmentBufferSGPR: s" +

- Twine(KernelInfo.DebuggerPrivateSegmentBufferSGPR), false);

+ OutStreamer->emitRawComment(

+ " DebuggerWavefrontPrivateSegmentOffsetSGPR: s" +

+ Twine(CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR), false);

+ OutStreamer->emitRawComment(

+ " DebuggerPrivateSegmentBufferSGPR: s" +

+ Twine(CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR), false);

}

- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +

- Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),

- false);

- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +

- Twine(G_00B84C_TRAP_HANDLER(KernelInfo.ComputePGMRSrc2)),

- false);

- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +

- Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),

- false);

- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Y_EN: " +

- Twine(G_00B84C_TGID_Y_EN(KernelInfo.ComputePGMRSrc2)),

- false);

- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_Z_EN: " +

- Twine(G_00B84C_TGID_Z_EN(KernelInfo.ComputePGMRSrc2)),

- false);

- OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +

- Twine(G_00B84C_TIDIG_COMP_CNT(KernelInfo.ComputePGMRSrc2)),

- false);

+ OutStreamer->emitRawComment(

+ " COMPUTE_PGM_RSRC2:USER_SGPR: " +

+ Twine(G_00B84C_USER_SGPR(CurrentProgramInfo.ComputePGMRSrc2)), false);

+ OutStreamer->emitRawComment(

+ " COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +

+ Twine(G_00B84C_TRAP_HANDLER(CurrentProgramInfo.ComputePGMRSrc2)), false);

+ OutStreamer->emitRawComment(

+ " COMPUTE_PGM_RSRC2:TGID_X_EN: " +

+ Twine(G_00B84C_TGID_X_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);

+ OutStreamer->emitRawComment(

+ " COMPUTE_PGM_RSRC2:TGID_Y_EN: " +

+ Twine(G_00B84C_TGID_Y_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);

+ OutStreamer->emitRawComment(

+ " COMPUTE_PGM_RSRC2:TGID_Z_EN: " +

+ Twine(G_00B84C_TGID_Z_EN(CurrentProgramInfo.ComputePGMRSrc2)), false);

+ OutStreamer->emitRawComment(

+ " COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: " +

+ Twine(G_00B84C_TIDIG_COMP_CNT(CurrentProgramInfo.ComputePGMRSrc2)),

+ false);

} else {

R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();

OutStreamer->emitRawComment(

@@ -407,71 +447,117 @@ static bool hasAnyNonFlatUseOfReg(const MachineRegisterInfo &MRI,

return false;

}

-void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,

- const MachineFunction &MF) const {

- const SISubtarget &STM = MF.getSubtarget<SISubtarget>();

- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

- const MachineRegisterInfo &MRI = MF.getRegInfo();

- const SIInstrInfo *TII = STM.getInstrInfo();

- const SIRegisterInfo *RI = &TII->getRegisterInfo();

+static unsigned getNumExtraSGPRs(const SISubtarget &ST,

+ bool VCCUsed,

+ bool FlatScrUsed) {

+ unsigned ExtraSGPRs = 0;

+ if (VCCUsed)

+ ExtraSGPRs = 2;

+ if (ST.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {

+ if (FlatScrUsed)

+ ExtraSGPRs = 4;

+ } else {

+ if (ST.isXNACKEnabled())

+ ExtraSGPRs = 4;

- MCPhysReg NumVGPRReg = AMDGPU::NoRegister;

- for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {

- if (MRI.isPhysRegUsed(Reg)) {

- NumVGPRReg = Reg;

- break;

- }

+ if (FlatScrUsed)

+ ExtraSGPRs = 6;

}

- MCPhysReg NumSGPRReg = AMDGPU::NoRegister;

- for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {

- if (MRI.isPhysRegUsed(Reg)) {

- NumSGPRReg = Reg;

- break;

- }

+ return ExtraSGPRs;

- // We found the maximum register index. They start at 0, so add one to get the

- // number of registers.

- ProgInfo.NumVGPR = NumVGPRReg == AMDGPU::NoRegister ? 0 :

- RI->getHWRegIndex(NumVGPRReg) + 1;

- ProgInfo.NumSGPR = NumSGPRReg == AMDGPU::NoRegister ? 0 :

- RI->getHWRegIndex(NumSGPRReg) + 1;

- unsigned ExtraSGPRs = 0;

+int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumSGPRs(

+ const SISubtarget &ST) const {

+ return NumExplicitSGPR + getNumExtraSGPRs(ST, UsesVCC, UsesFlatScratch);

- ProgInfo.VCCUsed = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||

- MRI.isPhysRegUsed(AMDGPU::VCC_HI);

- if (ProgInfo.VCCUsed)

- ExtraSGPRs = 2;

+AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(

+ const MachineFunction &MF) const {

+ SIFunctionResourceInfo Info;

- ProgInfo.FlatUsed = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||

- MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);

+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

+ const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

+ const MachineFrameInfo &FrameInfo = MF.getFrameInfo();

+ const MachineRegisterInfo &MRI = MF.getRegInfo();

+ const SIInstrInfo *TII = ST.getInstrInfo();

+ const SIRegisterInfo &TRI = TII->getRegisterInfo();

+ Info.UsesFlatScratch = MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_LO) ||

+ MRI.isPhysRegUsed(AMDGPU::FLAT_SCR_HI);

// Even if FLAT_SCRATCH is implicitly used, it has no effect if flat

- // instructions aren't used to access the scratch buffer. Inline assembly

- // may need it though.

+ // instructions aren't used to access the scratch buffer. Inline assembly may

+ // need it though.

// If we only have implicit uses of flat_scr on flat instructions, it is not

// really needed.

- if (ProgInfo.FlatUsed && !MFI->hasFlatScratchInit() &&

+ if (Info.UsesFlatScratch && !MFI->hasFlatScratchInit() &&

(!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR) &&

!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_LO) &&

!hasAnyNonFlatUseOfReg(MRI, *TII, AMDGPU::FLAT_SCR_HI))) {

- ProgInfo.FlatUsed = false;

+ Info.UsesFlatScratch = false;

}

- if (STM.getGeneration() < SISubtarget::VOLCANIC_ISLANDS) {

- if (ProgInfo.FlatUsed)

- ExtraSGPRs = 4;

- } else {

- if (STM.isXNACKEnabled())

- ExtraSGPRs = 4;

+ Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();

+ Info.PrivateSegmentSize = FrameInfo.getStackSize();

- if (ProgInfo.FlatUsed)

- ExtraSGPRs = 6;

+ if (!FrameInfo.hasCalls()) {

+ Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||

+ MRI.isPhysRegUsed(AMDGPU::VCC_HI);

+ // If there are no calls, MachineRegisterInfo can tell us the used register

+ // count easily.

+ MCPhysReg HighestVGPRReg = AMDGPU::NoRegister;

+ for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) {

+ if (MRI.isPhysRegUsed(Reg)) {

+ HighestVGPRReg = Reg;

+ break;

+ }

+ MCPhysReg HighestSGPRReg = AMDGPU::NoRegister;

+ for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) {

+ if (MRI.isPhysRegUsed(Reg)) {

+ HighestSGPRReg = Reg;

+ break;

+ }

+ // We found the maximum register index. They start at 0, so add one to get the

+ // number of registers.

+ Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister ? 0 :

+ TRI.getHWRegIndex(HighestVGPRReg) + 1;

+ Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister ? 0 :

+ TRI.getHWRegIndex(HighestSGPRReg) + 1;

+ return Info;

}

+ llvm_unreachable("calls not implemented");

+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,

+ const MachineFunction &MF) {

+ SIFunctionResourceInfo Info = analyzeResourceUsage(MF);

+ ProgInfo.NumVGPR = Info.NumVGPR;

+ ProgInfo.NumSGPR = Info.NumExplicitSGPR;

+ ProgInfo.ScratchSize = Info.PrivateSegmentSize;

+ ProgInfo.VCCUsed = Info.UsesVCC;

+ ProgInfo.FlatUsed = Info.UsesFlatScratch;

+ ProgInfo.DynamicCallStack = Info.HasDynamicallySizedStack || Info.HasRecursion;

+ const SISubtarget &STM = MF.getSubtarget<SISubtarget>();

+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

+ const SIInstrInfo *TII = STM.getInstrInfo();

+ const SIRegisterInfo *RI = &TII->getRegisterInfo();

+ unsigned ExtraSGPRs = getNumExtraSGPRs(STM,

+ ProgInfo.VCCUsed,

+ ProgInfo.FlatUsed);

unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);

// Check the addressable register limit before we add ExtraSGPRs.

@@ -574,9 +660,6 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,

// Make clamp modifier on NaN input returns 0.

ProgInfo.DX10Clamp = STM.enableDX10Clamp();

- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();

- ProgInfo.ScratchSize = FrameInfo.getStackSize();

unsigned LDSAlignShift;

if (STM.getGeneration() < SISubtarget::SEA_ISLANDS) {

// LDS is allocated in 64 dword blocks.

@@ -638,6 +721,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {

switch (CallConv) {

default: LLVM_FALLTHROUGH;

case CallingConv::AMDGPU_CS: return R_00B848_COMPUTE_PGM_RSRC1;

+ case CallingConv::AMDGPU_HS: return R_00B428_SPI_SHADER_PGM_RSRC1_HS;

case CallingConv::AMDGPU_GS: return R_00B228_SPI_SHADER_PGM_RSRC1_GS;

case CallingConv::AMDGPU_PS: return R_00B028_SPI_SHADER_PGM_RSRC1_PS;

case CallingConv::AMDGPU_VS: return R_00B128_SPI_SHADER_PGM_RSRC1_VS;

@@ -645,7 +729,7 @@ static unsigned getRsrcReg(CallingConv::ID CallConv) {

}

void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,

- const SIProgramInfo &KernelInfo) {

+ const SIProgramInfo &CurrentProgramInfo) {

const SISubtarget &STM = MF.getSubtarget<SISubtarget>();

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

unsigned RsrcReg = getRsrcReg(MF.getFunction()->getCallingConv());

@@ -653,29 +737,29 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,

if (AMDGPU::isCompute(MF.getFunction()->getCallingConv())) {

OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);

- OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc1, 4);

+ OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);

OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);

- OutStreamer->EmitIntValue(KernelInfo.ComputePGMRSrc2, 4);

+ OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);

OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);

- OutStreamer->EmitIntValue(S_00B860_WAVESIZE(KernelInfo.ScratchBlocks), 4);

+ OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);

// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =

// 0" comment but I don't see a corresponding field in the register spec.

} else {

OutStreamer->EmitIntValue(RsrcReg, 4);

- OutStreamer->EmitIntValue(S_00B028_VGPRS(KernelInfo.VGPRBlocks) |

- S_00B028_SGPRS(KernelInfo.SGPRBlocks), 4);

+ OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |

+ S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);

if (STM.isVGPRSpillingEnabled(*MF.getFunction())) {

OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);

- OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(KernelInfo.ScratchBlocks), 4);

+ OutStreamer->EmitIntValue(S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);

}

if (MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_PS) {

OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);

- OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);

+ OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);

OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);

OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);

OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);

@@ -703,7 +787,7 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {

}

void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,

- const SIProgramInfo &KernelInfo,

+ const SIProgramInfo &CurrentProgramInfo,

const MachineFunction &MF) const {

const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();

const SISubtarget &STM = MF.getSubtarget<SISubtarget>();

@@ -711,10 +795,13 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,

AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());

Out.compute_pgm_resource_registers =

- KernelInfo.ComputePGMRSrc1 |

- (KernelInfo.ComputePGMRSrc2 << 32);

+ CurrentProgramInfo.ComputePGMRSrc1 |

+ (CurrentProgramInfo.ComputePGMRSrc2 << 32);

Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;

+ if (CurrentProgramInfo.DynamicCallStack)

+ Out.code_properties |= AMD_CODE_PROPERTY_IS_DYNAMIC_CALLSTACK;

AMD_HSA_BITS_SET(Out.code_properties,

AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,

getElementByteSizeValue(STM.getMaxPrivateElementSize()));

@@ -766,12 +853,12 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,

// FIXME: Should use getKernArgSize

Out.kernarg_segment_byte_size =

STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());

- Out.wavefront_sgpr_count = KernelInfo.NumSGPR;

- Out.workitem_vgpr_count = KernelInfo.NumVGPR;

- Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize;

- Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize;

- Out.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;

- Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;

+ Out.wavefront_sgpr_count = CurrentProgramInfo.NumSGPR;

+ Out.workitem_vgpr_count = CurrentProgramInfo.NumVGPR;

+ Out.workitem_private_segment_byte_size = CurrentProgramInfo.ScratchSize;

+ Out.workgroup_group_segment_byte_size = CurrentProgramInfo.LDSSize;

+ Out.reserved_vgpr_first = CurrentProgramInfo.ReservedVGPRFirst;

+ Out.reserved_vgpr_count = CurrentProgramInfo.ReservedVGPRCount;

// These alignment values are specified in powers of two, so alignment =

// 2^n. The minimum alignment is 2^4 = 16.

@@ -780,9 +867,9 @@ void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,

if (STM.debuggerEmitPrologue()) {

Out.debug_wavefront_private_segment_offset_sgpr =

- KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;

+ CurrentProgramInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;

Out.debug_private_segment_buffer_sgpr =

- KernelInfo.DebuggerPrivateSegmentBufferSGPR;

+ CurrentProgramInfo.DebuggerPrivateSegmentBufferSGPR;

}

diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 8c86dea4b885..e5adeeb465e1 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h

@@ -30,9 +30,26 @@ namespace llvm {

class AMDGPUTargetStreamer;

class MCOperand;

+class SISubtarget;

class AMDGPUAsmPrinter final : public AsmPrinter {

private:

+ // Track resource usage for callee functions.

+ struct SIFunctionResourceInfo {

+ // Track the number of explicitly used VGPRs. Special registers reserved at

+ // the end are tracked separately.

+ int32_t NumVGPR = 0;

+ int32_t NumExplicitSGPR = 0;

+ uint32_t PrivateSegmentSize = 0;

+ bool UsesVCC = false;

+ bool UsesFlatScratch = false;

+ bool HasDynamicallySizedStack = false;

+ bool HasRecursion = false;

+ int32_t getTotalNumSGPRs(const SISubtarget &ST) const;

+ };

+ // Track resource usage for kernels / entry functions.

struct SIProgramInfo {

// Fields set in PGM_RSRC1 pm4 packet.

uint32_t VGPRBlocks = 0;

@@ -83,14 +100,23 @@ private:

uint16_t DebuggerPrivateSegmentBufferSGPR =

std::numeric_limits<uint16_t>::max();

+ // Whether there is recursion, dynamic allocas, indirect calls or some other

+ // reason there may be statically unknown stack usage.

+ bool DynamicCallStack = false;

// Bonus information for debugging.

bool VCCUsed = false;

SIProgramInfo() = default;

};

+ SIProgramInfo CurrentProgramInfo;

+ DenseMap<const Function *, SIFunctionResourceInfo> CallGraphResourceInfo;

uint64_t getFunctionCodeSize(const MachineFunction &MF) const;

- void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;

+ SIFunctionResourceInfo analyzeResourceUsage(const MachineFunction &MF) const;

+ void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF);

void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,

const MachineFunction &MF) const;

void findNumUsedRegistersSI(const MachineFunction &MF,

@@ -101,6 +127,10 @@ private:

/// can correctly setup the GPU state.

void EmitProgramInfoR600(const MachineFunction &MF);

void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);

+ void emitCommonFunctionComments(uint32_t NumVGPR,

+ uint32_t NumSGPR,

+ uint32_t ScratchSize,

+ uint64_t CodeSize);

public:

explicit AMDGPUAsmPrinter(TargetMachine &TM,

@@ -112,6 +142,7 @@ public:

AMDGPUTargetStreamer& getTargetStreamer() const;

+ bool doFinalization(Module &M) override;

bool runOnMachineFunction(MachineFunction &MF) override;

/// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated

diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index f5110857da84..ccae36ced1f8 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

@@ -207,8 +207,8 @@ bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {

return true;

// TODO: Move into isKnownNeverNaN

- if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(N))

- return BO->Flags.hasNoNaNs();

+ if (N->getFlags().isDefined())

+ return N->getFlags().hasNoNaNs();

return CurDAG->isKnownNeverNaN(N);

}

diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e21775e61dd4..64e1b8f0d7f0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

@@ -29,6 +29,7 @@

#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"

#include "llvm/IR/DataLayout.h"

#include "llvm/IR/DiagnosticInfo.h"

+#include "llvm/Support/KnownBits.h"

#include "SIInstrInfo.h"

using namespace llvm;

@@ -895,6 +896,7 @@ CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,

case CallingConv::SPIR_KERNEL:

return CC_AMDGPU_Kernel;

case CallingConv::AMDGPU_VS:

+ case CallingConv::AMDGPU_HS:

case CallingConv::AMDGPU_GS:

case CallingConv::AMDGPU_PS:

case CallingConv::AMDGPU_CS:

@@ -2293,11 +2295,11 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,

//===----------------------------------------------------------------------===//

static bool isU24(SDValue Op, SelectionDAG &DAG) {

- APInt KnownZero, KnownOne;

+ KnownBits Known;

EVT VT = Op.getValueType();

- DAG.computeKnownBits(Op, KnownZero, KnownOne);

+ DAG.computeKnownBits(Op, Known);

- return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;

+ return (VT.getSizeInBits() - Known.Zero.countLeadingOnes()) <= 24;

}

static bool isI24(SDValue Op, SelectionDAG &DAG) {

@@ -3358,13 +3360,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,

OffsetVal,

OffsetVal + WidthVal);

- APInt KnownZero, KnownOne;

+ KnownBits Known;

TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),

!DCI.isBeforeLegalizeOps());

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||

- TLI.SimplifyDemandedBits(BitsFrom, Demanded,

- KnownZero, KnownOne, TLO)) {

+ TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {

DCI.CommitTargetLoweringOpt(TLO);

}

@@ -3516,6 +3517,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {

NODE_NAME_CASE(KILL)

NODE_NAME_CASE(DUMMY_CHAIN)

case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;

+ NODE_NAME_CASE(INIT_EXEC)

+ NODE_NAME_CASE(INIT_EXEC_FROM_INPUT)

NODE_NAME_CASE(SENDMSG)

NODE_NAME_CASE(SENDMSGHALT)

NODE_NAME_CASE(INTERP_MOV)

@@ -3574,14 +3577,12 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,

}

void AMDGPUTargetLowering::computeKnownBitsForTargetNode(

- const SDValue Op, APInt &KnownZero, APInt &KnownOne,

+ const SDValue Op, KnownBits &Known,

const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {

- unsigned BitWidth = KnownZero.getBitWidth();

- KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.

+ Known.Zero.clearAllBits(); Known.One.clearAllBits(); // Don't know anything.

- APInt KnownZero2;

- APInt KnownOne2;

+ KnownBits Known2;

unsigned Opc = Op.getOpcode();

switch (Opc) {

@@ -3589,7 +3590,7 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(

break;

case AMDGPUISD::CARRY:

case AMDGPUISD::BORROW: {

- KnownZero = APInt::getHighBitsSet(32, 31);

+ Known.Zero = APInt::getHighBitsSet(32, 31);

break;

}

@@ -3602,16 +3603,16 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(

uint32_t Width = CWidth->getZExtValue() & 0x1f;

if (Opc == AMDGPUISD::BFE_U32)

- KnownZero = APInt::getHighBitsSet(32, 32 - Width);

+ Known.Zero = APInt::getHighBitsSet(32, 32 - Width);

break;

}

case AMDGPUISD::FP_TO_FP16:

case AMDGPUISD::FP16_ZEXT: {

- unsigned BitWidth = KnownZero.getBitWidth();

+ unsigned BitWidth = Known.getBitWidth();

// High bits are zero.

- KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);

+ Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);

break;

}

diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 13cbfe267932..e1a5a2072418 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h

@@ -125,8 +125,9 @@ public:

if (getTargetMachine().Options.NoSignedZerosFPMath)

return true;

- if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(Op))

- return BO->Flags.hasNoSignedZeros();

+ const auto Flags = Op.getNode()->getFlags();

+ if (Flags.isDefined())

+ return Flags.hasNoSignedZeros();

return false;

}

@@ -199,8 +200,7 @@ public:

/// either zero or one and return them in the \p KnownZero and \p KnownOne

/// bitsets.

void computeKnownBitsForTargetNode(const SDValue Op,

- APInt &KnownZero,

- APInt &KnownOne,

+ KnownBits &Known,

const APInt &DemandedElts,

const SelectionDAG &DAG,

unsigned Depth = 0) const override;

@@ -370,6 +370,8 @@ enum NodeType : unsigned {

BUILD_VERTICAL_VECTOR,

/// Pointer to the start of the shader's constant data.

CONST_DATA_PTR,

+ INIT_EXEC,

+ INIT_EXEC_FROM_INPUT,

SENDMSG,

SENDMSGHALT,

INTERP_MOV,

diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index c1706d12a2ea..353cc5742791 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td

@@ -299,6 +299,15 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,

def AMDGPUfmed3 : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;

+def AMDGPUinit_exec : SDNode<"AMDGPUISD::INIT_EXEC",

+ SDTypeProfile<0, 1, [SDTCisInt<0>]>,

+ [SDNPHasChain, SDNPInGlue]>;

+def AMDGPUinit_exec_from_input : SDNode<"AMDGPUISD::INIT_EXEC_FROM_INPUT",

+ SDTypeProfile<0, 2,

+ [SDTCisInt<0>, SDTCisInt<1>]>,

+ [SDNPHasChain, SDNPInGlue]>;

def AMDGPUsendmsg : SDNode<"AMDGPUISD::SENDMSG",

SDTypeProfile<0, 1, [SDTCisInt<0>]>,

[SDNPHasChain, SDNPInGlue]>;

diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 27fe639e3d4b..fe7283ccf7d9 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp

@@ -17,6 +17,7 @@ static bool isEntryFunctionCC(CallingConv::ID CC) {

case CallingConv::AMDGPU_KERNEL:

case CallingConv::SPIR_KERNEL:

case CallingConv::AMDGPU_VS:

+ case CallingConv::AMDGPU_HS:

case CallingConv::AMDGPU_GS:

case CallingConv::AMDGPU_PS:

case CallingConv::AMDGPU_CS:

diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 0202220b8011..cd5bad04d0b3 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

@@ -309,6 +309,7 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {

default:

return false;

case CallingConv::AMDGPU_VS:

+ case CallingConv::AMDGPU_HS:

case CallingConv::AMDGPU_GS:

case CallingConv::AMDGPU_PS:

case CallingConv::AMDGPU_CS:

diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 6edd3e923ba1..c9482c37ec80 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

@@ -432,6 +432,7 @@ static bool isArgPassedInSGPR(const Argument *A) {

case CallingConv::SPIR_KERNEL:

return true;

case CallingConv::AMDGPU_VS:

+ case CallingConv::AMDGPU_HS:

case CallingConv::AMDGPU_GS:

case CallingConv::AMDGPU_PS:

case CallingConv::AMDGPU_CS:

diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index ea305a92fc60..630442625aa3 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp

@@ -422,8 +422,9 @@ void GCNScheduleDAGMILive::discoverLiveIns() {

unsigned SGPRs = 0;

unsigned VGPRs = 0;

+ auto &MI = *begin()->getParent()->getFirstNonDebugInstr();

const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);

- SlotIndex SI = LIS->getInstructionIndex(*begin()).getBaseIndex();

+ SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex();

assert (SI.isValid());

DEBUG(dbgs() << "Region live-ins:");

diff --git a/lib/Target/AMDGPU/R600Intrinsics.td b/lib/Target/AMDGPU/R600Intrinsics.td
index a5310e9fd6d0..4c9e1e8a5434 100644
--- a/lib/Target/AMDGPU/R600Intrinsics.td
+++ b/lib/Target/AMDGPU/R600Intrinsics.td

@@ -61,7 +61,7 @@ def int_r600_ddx : TextureIntrinsicFloatInput;

def int_r600_ddy : TextureIntrinsicFloatInput;

def int_r600_dot4 : Intrinsic<[llvm_float_ty],

- [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]

+ [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem, IntrSpeculatable]

>;

} // End TargetPrefix = "r600", isTarget = 1

diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index b7e62075244b..d8cb98fe1b19 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp

@@ -77,9 +77,10 @@ class SIAnnotateControlFlow : public FunctionPass {

void insertElse(BranchInst *Term);

- Value *handleLoopCondition(Value *Cond, PHINode *Broken,

- llvm::Loop *L, BranchInst *Term,

- SmallVectorImpl<WeakVH> &LoopPhiConditions);

+ Value *

+ handleLoopCondition(Value *Cond, PHINode *Broken, llvm::Loop *L,

+ BranchInst *Term,

+ SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions);

void handleLoop(BranchInst *Term);

@@ -212,9 +213,8 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {

/// \brief Recursively handle the condition leading to a loop

Value *SIAnnotateControlFlow::handleLoopCondition(

- Value *Cond, PHINode *Broken,

- llvm::Loop *L, BranchInst *Term,

- SmallVectorImpl<WeakVH> &LoopPhiConditions) {

+ Value *Cond, PHINode *Broken, llvm::Loop *L, BranchInst *Term,

+ SmallVectorImpl<WeakTrackingVH> &LoopPhiConditions) {

// Only search through PHI nodes which are inside the loop. If we try this

// with PHI nodes that are outside of the loop, we end up inserting new PHI

@@ -281,7 +281,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(

NewPhi->setIncomingValue(i, PhiArg);

}

- LoopPhiConditions.push_back(WeakVH(Phi));

+ LoopPhiConditions.push_back(WeakTrackingVH(Phi));

return Ret;

}

@@ -323,7 +323,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {

BasicBlock *Target = Term->getSuccessor(1);

PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());

- SmallVector<WeakVH, 8> LoopPhiConditions;

+ SmallVector<WeakTrackingVH, 8> LoopPhiConditions;

Value *Cond = Term->getCondition();

Term->setCondition(BoolTrue);

Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions);

@@ -333,7 +333,7 @@ void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {

Term->setCondition(CallInst::Create(Loop, Arg, "", Term));

- for (WeakVH Val : reverse(LoopPhiConditions)) {

+ for (WeakTrackingVH Val : reverse(LoopPhiConditions)) {

if (PHINode *Cond = cast_or_null<PHINode>(Val))

eraseIfUnused(Cond);

}

diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index 3dd372b32866..a01330cb9171 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h

@@ -302,6 +302,7 @@ enum DstUnused {

#define S_00B02C_EXTRA_LDS_SIZE(x) (((x) & 0xFF) << 8)

#define R_00B128_SPI_SHADER_PGM_RSRC1_VS 0x00B128

#define R_00B228_SPI_SHADER_PGM_RSRC1_GS 0x00B228

+#define R_00B428_SPI_SHADER_PGM_RSRC1_HS 0x00B428

#define R_00B848_COMPUTE_PGM_RSRC1 0x00B848

#define S_00B028_VGPRS(x) (((x) & 0x3F) << 0)

#define S_00B028_SGPRS(x) (((x) & 0x0F) << 6)

diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index b0f0bf04a891..3cca815d8773 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

@@ -278,8 +278,7 @@ static bool phiHasBreakDef(const MachineInstr &PHI,

Visited.insert(Reg);

- MachineInstr *DefInstr = MRI.getUniqueVRegDef(Reg);

- assert(DefInstr);

+ MachineInstr *DefInstr = MRI.getVRegDef(Reg);

switch (DefInstr->getOpcode()) {

default:

break;

@@ -346,7 +345,7 @@ bool searchPredecessors(const MachineBasicBlock *MBB,

return false;

DenseSet<const MachineBasicBlock*> Visited;

- SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),

+ SmallVector<MachineBasicBlock*, 4> Worklist(MBB->pred_begin(),

MBB->pred_end());

while (!Worklist.empty()) {

@@ -546,7 +545,13 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {

const TargetRegisterClass *SrcRC, *DstRC;

std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);

if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {

- MachineInstr *DefMI = MRI.getVRegDef(MI.getOperand(1).getReg());

+ unsigned SrcReg = MI.getOperand(1).getReg();

+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {

+ TII->moveToVALU(MI);

+ break;

+ }

+ MachineInstr *DefMI = MRI.getVRegDef(SrcReg);

unsigned SMovOp;

int64_t Imm;

// If we are just copying an immediate, we can replace the copy with

diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index ce74a7cd8b04..853c8737b464 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp

@@ -68,6 +68,7 @@

#include "llvm/Support/CommandLine.h"

#include "llvm/Support/Compiler.h"

#include "llvm/Support/ErrorHandling.h"

+#include "llvm/Support/KnownBits.h"

#include "llvm/Support/MathExtras.h"

#include "llvm/Target/TargetCallingConv.h"

#include "llvm/Target/TargetOptions.h"

@@ -1956,6 +1957,63 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(

MI.eraseFromParent();

return BB;

+ case AMDGPU::SI_INIT_EXEC:

+ // This should be before all vector instructions.

+ BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),

+ AMDGPU::EXEC)

+ .addImm(MI.getOperand(0).getImm());

+ MI.eraseFromParent();

+ return BB;

+ case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {

+ // Extract the thread count from an SGPR input and set EXEC accordingly.

+ // Since BFM can't shift by 64, handle that case with CMP + CMOV.

+ //

+ // S_BFE_U32 count, input, {shift, 7}

+ // S_BFM_B64 exec, count, 0

+ // S_CMP_EQ_U32 count, 64

+ // S_CMOV_B64 exec, -1

+ MachineInstr *FirstMI = &*BB->begin();

+ MachineRegisterInfo &MRI = MF->getRegInfo();

+ unsigned InputReg = MI.getOperand(0).getReg();

+ unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);

+ bool Found = false;

+ // Move the COPY of the input reg to the beginning, so that we can use it.

+ for (auto I = BB->begin(); I != &MI; I++) {

+ if (I->getOpcode() != TargetOpcode::COPY ||

+ I->getOperand(0).getReg() != InputReg)

+ continue;

+ if (I == FirstMI) {

+ FirstMI = &*++BB->begin();

+ } else {

+ I->removeFromParent();

+ BB->insert(FirstMI, &*I);

+ }

+ Found = true;

+ break;

+ }

+ assert(Found);

+ // This should be before all vector instructions.

+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)

+ .addReg(InputReg)

+ .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);

+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),

+ AMDGPU::EXEC)

+ .addReg(CountReg)

+ .addImm(0);

+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))

+ .addReg(CountReg, RegState::Kill)

+ .addImm(64);

+ BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),

+ AMDGPU::EXEC)

+ .addImm(-1);

+ MI.eraseFromParent();

+ return BB;

+ }

case AMDGPU::GET_GROUPSTATICSIZE: {

DebugLoc DL = MI.getDebugLoc();

BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))

@@ -3223,6 +3281,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,

return DAG.getNode(NodeOp, DL, MVT::Other, Chain,

Op.getOperand(2), Glue);

}

+ case Intrinsic::amdgcn_init_exec: {

+ return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,

+ Op.getOperand(2));

+ }

+ case Intrinsic::amdgcn_init_exec_from_input: {

+ return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,

+ Op.getOperand(2), Op.getOperand(3));

+ }

case AMDGPUIntrinsic::SI_tbuffer_store: {

SDValue Ops[] = {

Chain,

@@ -3455,15 +3521,15 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,

}

- const SDNodeFlags *Flags = Op->getFlags();

+ const SDNodeFlags Flags = Op->getFlags();

- if (Unsafe || Flags->hasAllowReciprocal()) {

+ if (Unsafe || Flags.hasAllowReciprocal()) {

// Turn into multiply by the reciprocal.

// x / y -> x * (1.0 / y)

- SDNodeFlags Flags;

- Flags.setUnsafeAlgebra(true);

+ SDNodeFlags NewFlags;

+ NewFlags.setUnsafeAlgebra(true);

SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);

- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, &Flags);

+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, NewFlags);

}

return SDValue();

@@ -4542,10 +4608,9 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,

return ISD::FMAD;

const TargetOptions &Options = DAG.getTarget().Options;

- if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||

- Options.UnsafeFPMath ||

- (cast<BinaryWithFlagsSDNode>(N0)->Flags.hasUnsafeAlgebra() &&

- cast<BinaryWithFlagsSDNode>(N1)->Flags.hasUnsafeAlgebra())) &&

+ if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||

+ (N0->getFlags().hasUnsafeAlgebra() &&

+ N1->getFlags().hasUnsafeAlgebra())) &&

isFMAFasterThanFMulAndFAdd(VT)) {

return ISD::FMA;

}

@@ -4706,12 +4771,12 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,

APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);

- APInt KnownZero, KnownOne;

+ KnownBits Known;

TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),

!DCI.isBeforeLegalizeOps());

const TargetLowering &TLI = DAG.getTargetLoweringInfo();

if (TLI.ShrinkDemandedConstant(Src, Demanded, TLO) ||

- TLI.SimplifyDemandedBits(Src, Demanded, KnownZero, KnownOne, TLO)) {

+ TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {

DCI.CommitTargetLoweringOpt(TLO);

}

diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 3f6ddec70479..7ccb54f54e34 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td

@@ -286,6 +286,19 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {

let isReMaterializable = 1;

}

+def SI_INIT_EXEC : SPseudoInstSI <

+ (outs), (ins i64imm:$src), []> {

+ let Defs = [EXEC];

+ let usesCustomInserter = 1;

+ let isAsCheapAsAMove = 1;

+}

+def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <

+ (outs), (ins SSrc_b32:$input, i32imm:$shift), []> {

+ let Defs = [EXEC];

+ let usesCustomInserter = 1;

+}

// Return for returning shaders to a shader variant epilog.

def SI_RETURN_TO_EPILOG : SPseudoInstSI <

(outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {

@@ -399,6 +412,16 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <

} // End SubtargetPredicate = isGCN

let Predicates = [isGCN] in {

+def : Pat <

+ (AMDGPUinit_exec i64:$src),

+ (SI_INIT_EXEC (as_i64imm $src))

+>;

+def : Pat <

+ (AMDGPUinit_exec_from_input i32:$input, i32:$shift),

+ (SI_INIT_EXEC_FROM_INPUT (i32 $input), (as_i32imm $shift))

+>;

def : Pat<

(AMDGPUtrap timm:$trapid),

(S_TRAP $trapid)

diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5a3242bed1d0..d565c84bfeda 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

@@ -503,6 +503,7 @@ unsigned getInitialPSInputAddr(const Function &F) {

bool isShader(CallingConv::ID cc) {

switch(cc) {

case CallingConv::AMDGPU_VS:

+ case CallingConv::AMDGPU_HS:

case CallingConv::AMDGPU_GS:

case CallingConv::AMDGPU_PS:

case CallingConv::AMDGPU_CS: