Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1090 |
1 file changed, 802 insertions, 288 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d53950ca44655..9af8ffedce0f3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -63,6 +63,8 @@ using namespace llvm; +#define DEBUG_TYPE "si-instr-info" + #define GET_INSTRINFO_CTOR_DTOR #include "AMDGPUGenInstrInfo.inc" @@ -83,6 +85,12 @@ static cl::opt<unsigned> BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16), cl::desc("Restrict range of branch instructions (DEBUG)")); +static cl::opt<bool> Fix16BitCopies( + "amdgpu-fix-16-bit-physreg-copies", + cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"), + cl::init(true), + cl::ReallyHidden); + SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST) : AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN), RI(ST), ST(ST) { @@ -136,6 +144,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI, case AMDGPU::V_MOV_B32_e32: case AMDGPU::V_MOV_B32_e64: case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::V_ACCVGPR_READ_B32: + case AMDGPU::V_ACCVGPR_WRITE_B32: // No implicit operands. return MI.getNumOperands() == MI.getDesc().getNumOperands(); default: @@ -258,43 +268,49 @@ static bool isStride64(unsigned Opc) { } } -bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, - const MachineOperand *&BaseOp, - int64_t &Offset, - const TargetRegisterInfo *TRI) const { +bool SIInstrInfo::getMemOperandsWithOffsetWidth( + const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps, + int64_t &Offset, bool &OffsetIsScalable, unsigned &Width, + const TargetRegisterInfo *TRI) const { if (!LdSt.mayLoadOrStore()) return false; unsigned Opc = LdSt.getOpcode(); + OffsetIsScalable = false; + const MachineOperand *BaseOp, *OffsetOp; + int DataOpIdx; if (isDS(LdSt)) { - const MachineOperand *OffsetImm = - getNamedOperand(LdSt, AMDGPU::OpName::offset); - if (OffsetImm) { + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); + OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); + if (OffsetOp) { // Normal, single offset LDS instruction. - BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); - // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to - // report that here? - if (!BaseOp || !BaseOp->isReg()) + if (!BaseOp) { + // DS_CONSUME/DS_APPEND use M0 for the base address. + // TODO: find the implicit use operand for M0 and use that as BaseOp? + return false; + } + BaseOps.push_back(BaseOp); + Offset = OffsetOp->getImm(); + // Get appropriate operand, and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (DataOpIdx == -1) + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + Width = getOpSize(LdSt, DataOpIdx); + } else { + // The 2 offset instructions use offset0 and offset1 instead. We can treat + // these as a load with a single offset if the 2 offsets are consecutive. + // We will use this for some partially aligned loads. + const MachineOperand *Offset0Op = + getNamedOperand(LdSt, AMDGPU::OpName::offset0); + const MachineOperand *Offset1Op = + getNamedOperand(LdSt, AMDGPU::OpName::offset1); + + unsigned Offset0 = Offset0Op->getImm(); + unsigned Offset1 = Offset1Op->getImm(); + if (Offset0 + 1 != Offset1) return false; - Offset = OffsetImm->getImm(); - - return true; - } - - // The 2 offset instructions use offset0 and offset1 instead. We can treat - // these as a load with a single offset if the 2 offsets are consecutive. 
We - // will use this for some partially aligned loads. - const MachineOperand *Offset0Imm = - getNamedOperand(LdSt, AMDGPU::OpName::offset0); - const MachineOperand *Offset1Imm = - getNamedOperand(LdSt, AMDGPU::OpName::offset1); - - uint8_t Offset0 = Offset0Imm->getImm(); - uint8_t Offset1 = Offset1Imm->getImm(); - - if (Offset1 > Offset0 && Offset1 - Offset0 == 1) { // Each of these offsets is in element sized units, so we need to convert // to bytes of the individual reads. @@ -310,16 +326,20 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, if (isStride64(Opc)) EltSize *= 64; - BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr); - if (!BaseOp->isReg()) - return false; - + BaseOps.push_back(BaseOp); Offset = EltSize * Offset0; - - return true; + // Get appropriate operand(s), and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (DataOpIdx == -1) { + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0); + Width = getOpSize(LdSt, DataOpIdx); + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1); + Width += getOpSize(LdSt, DataOpIdx); + } else { + Width = getOpSize(LdSt, DataOpIdx); + } } - - return false; + return true; } if (isMUBUF(LdSt) || isMTBUF(LdSt)) { @@ -339,59 +359,78 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, const MachineOperand *OffsetImm = getNamedOperand(LdSt, AMDGPU::OpName::offset); - BaseOp = SOffset; + BaseOps.push_back(RSrc); + BaseOps.push_back(SOffset); Offset = OffsetImm->getImm(); - return true; - } - - const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); - if (!AddrReg) - return false; + } else { + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc); + if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL + return false; + BaseOps.push_back(BaseOp); - const MachineOperand *OffsetImm = - getNamedOperand(LdSt, AMDGPU::OpName::offset); - BaseOp = AddrReg; - Offset = OffsetImm->getImm(); - if (SOffset) // soffset can be an inline immediate. - Offset += SOffset->getImm(); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + if (BaseOp) + BaseOps.push_back(BaseOp); - if (!BaseOp->isReg()) - return false; + const MachineOperand *OffsetImm = + getNamedOperand(LdSt, AMDGPU::OpName::offset); + Offset = OffsetImm->getImm(); + if (SOffset) // soffset can be an inline immediate. + Offset += SOffset->getImm(); + } + // Get appropriate operand, and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (DataOpIdx == -1) + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + Width = getOpSize(LdSt, DataOpIdx); + return true; + } + if (isMIMG(LdSt)) { + int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc); + BaseOps.push_back(&LdSt.getOperand(SRsrcIdx)); + int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0); + if (VAddr0Idx >= 0) { + // GFX10 possible NSA encoding. + for (int I = VAddr0Idx; I < SRsrcIdx; ++I) + BaseOps.push_back(&LdSt.getOperand(I)); + } else { + BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr)); + } + Offset = 0; + // Get appropriate operand, and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + Width = getOpSize(LdSt, DataOpIdx); return true; } if (isSMRD(LdSt)) { - const MachineOperand *OffsetImm = - getNamedOperand(LdSt, AMDGPU::OpName::offset); - if (!OffsetImm) + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase); + if (!BaseOp) // e.g. 
S_MEMTIME return false; - - const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase); - BaseOp = SBaseReg; - Offset = OffsetImm->getImm(); - if (!BaseOp->isReg()) - return false; - + BaseOps.push_back(BaseOp); + OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset); + Offset = OffsetOp ? OffsetOp->getImm() : 0; + // Get appropriate operand, and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst); + Width = getOpSize(LdSt, DataOpIdx); return true; } if (isFLAT(LdSt)) { - const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); - if (VAddr) { - // Can't analyze 2 offsets. - if (getNamedOperand(LdSt, AMDGPU::OpName::saddr)) - return false; - - BaseOp = VAddr; - } else { - // scratch instructions have either vaddr or saddr. - BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); - } - + // Instructions have either vaddr or saddr or both. + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr); + if (BaseOp) + BaseOps.push_back(BaseOp); + BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr); + if (BaseOp) + BaseOps.push_back(BaseOp); Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm(); - if (!BaseOp->isReg()) - return false; + // Get appropriate operand, and compute width accordingly. + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst); + if (DataOpIdx == -1) + DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata); + Width = getOpSize(LdSt, DataOpIdx); return true; } @@ -399,15 +438,13 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, } static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, - const MachineOperand &BaseOp1, + ArrayRef<const MachineOperand *> BaseOps1, const MachineInstr &MI2, - const MachineOperand &BaseOp2) { - // Support only base operands with base registers. - // Note: this could be extended to support FI operands. - if (!BaseOp1.isReg() || !BaseOp2.isReg()) - return false; - - if (BaseOp1.isIdenticalTo(BaseOp2)) + ArrayRef<const MachineOperand *> BaseOps2) { + // Only examine the first "base" operand of each instruction, on the + // assumption that it represents the real base address of the memory access. + // Other operands are typically offsets or indices from this base address. 
+ if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front())) return true; if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand()) @@ -433,62 +470,31 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, return Base1 == Base2; } -bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, - const MachineOperand &BaseOp2, - unsigned NumLoads) const { - const MachineInstr &FirstLdSt = *BaseOp1.getParent(); - const MachineInstr &SecondLdSt = *BaseOp2.getParent(); - - if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2)) - return false; - - const MachineOperand *FirstDst = nullptr; - const MachineOperand *SecondDst = nullptr; - - if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) || - (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) || - (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) { - const unsigned MaxGlobalLoadCluster = 6; - if (NumLoads > MaxGlobalLoadCluster) - return false; - - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata); - if (!FirstDst) - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata); - if (!SecondDst) - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst); - } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) { - FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst); - SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst); - } - - if (!FirstDst || !SecondDst) +bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, + ArrayRef<const MachineOperand *> BaseOps2, + unsigned NumLoads, + unsigned NumBytes) const { + // If current mem ops pair do not have same base pointer, then they cannot be + // clustered. + assert(!BaseOps1.empty() && !BaseOps2.empty()); + const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent(); + const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent(); + if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2)) return false; - // Try to limit clustering based on the total number of bytes loaded - // rather than the number of instructions. This is done to help reduce - // register pressure. The method used is somewhat inexact, though, - // because it assumes that all loads in the cluster will load the - // same number of bytes as FirstLdSt. - - // The unit of this value is bytes. - // FIXME: This needs finer tuning. - unsigned LoadClusterThreshold = 16; - - const MachineRegisterInfo &MRI = - FirstLdSt.getParent()->getParent()->getRegInfo(); - - const Register Reg = FirstDst->getReg(); - - const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg) - ? MRI.getRegClass(Reg) - : RI.getPhysRegClass(Reg); - - return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold; + // Compute max cluster size based on average number bytes clustered till now, + // and decide based on it, if current mem ops pair can be clustered or not. + assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) && + "Invalid NumLoads/NumBytes values"); + unsigned MaxNumLoads; + if (NumBytes <= 4 * NumLoads) { + // Loads are dword or smaller (on average). + MaxNumLoads = 5; + } else { + // Loads are bigger than a dword (on average). + MaxNumLoads = 4; + } + return NumLoads <= MaxNumLoads; } // FIXME: This behaves strangely. 
If, for example, you have 32 load + stores, @@ -516,11 +522,10 @@ bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, - MCRegister SrcReg, bool KillSrc) { + MCRegister SrcReg, bool KillSrc, + const char *Msg = "illegal SGPR to VGPR copy") { MachineFunction *MF = MBB.getParent(); - DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), - "illegal SGPR to VGPR copy", - DL, DS_Error); + DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error); LLVMContext &C = MF->getFunction().getContext(); C.diagnose(IllegalCopy); @@ -534,6 +539,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, MCRegister SrcReg, bool KillSrc) const { const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg); + // FIXME: This is hack to resolve copies between 16 bit and 32 bit + // registers until all patterns are fixed. + if (Fix16BitCopies && + ((RI.getRegSizeInBits(*RC) == 16) ^ + (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) { + MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; + MCRegister Super = RI.get32BitRegister(RegToFix); + assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); + RegToFix = Super; + + if (DestReg == SrcReg) { + // Insert empty bundle since ExpandPostRA expects an instruction here. + BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); + return; + } + + RC = RI.getPhysRegClass(DestReg); + } + if (RC == &AMDGPU::VGPR_32RegClass) { assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) || AMDGPU::SReg_32RegClass.contains(SrcReg) || @@ -580,6 +604,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (RC == &AMDGPU::SReg_64RegClass) { + if (SrcReg == AMDGPU::SCC) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg) + .addImm(1) + .addImm(0); + return; + } + if (DestReg == AMDGPU::VCC) { if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC) @@ -606,10 +637,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, } if (DestReg == AMDGPU::SCC) { + // Copying 64-bit or 32-bit sources to SCC barely makes sense, + // but SelectionDAG emits such copies for i1 sources. + // TODO: Use S_BITCMP0_B32 instead and only consider the 0th bit. + if (AMDGPU::SReg_64RegClass.contains(SrcReg)) { + SrcReg = RI.getSubReg(SrcReg, AMDGPU::sub0); + } assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); + BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32)) - .addReg(SrcReg, getKillRegState(KillSrc)) - .addImm(0); + .addReg(SrcReg, getKillRegState(KillSrc)) + .addImm(0); + return; } @@ -660,7 +699,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, // Registers in the sequence are allocated contiguously so we can just // use register number to pick one of three round-robin temps. 
unsigned RegNo = DestReg % 3; - unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); + Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0); if (!Tmp) report_fatal_error("Cannot scavenge VGPR to copy to AGPR"); RS.setRegUsed(Tmp); @@ -685,6 +724,72 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (RI.getRegSizeInBits(*RC) == 16) { + assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || + AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || + AMDGPU::SReg_LO16RegClass.contains(SrcReg) || + AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); + + bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg); + bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg); + bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg); + bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg); + bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) || + AMDGPU::SReg_LO16RegClass.contains(DestReg) || + AMDGPU::AGPR_LO16RegClass.contains(DestReg); + bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || + AMDGPU::SReg_LO16RegClass.contains(SrcReg) || + AMDGPU::AGPR_LO16RegClass.contains(SrcReg); + MCRegister NewDestReg = RI.get32BitRegister(DestReg); + MCRegister NewSrcReg = RI.get32BitRegister(SrcReg); + + if (IsSGPRDst) { + if (!IsSGPRSrc) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc); + return; + } + + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg) + .addReg(NewSrcReg, getKillRegState(KillSrc)); + return; + } + + if (IsAGPRDst || IsAGPRSrc) { + if (!DstLow || !SrcLow) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, + "Cannot use hi16 subreg with an AGPR!"); + } + + copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc); + return; + } + + if (IsSGPRSrc && !ST.hasSDWAScalar()) { + if (!DstLow || !SrcLow) { + reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, + "Cannot use hi16 subreg on VI!"); + } + + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg) + .addReg(NewSrcReg, getKillRegState(KillSrc)); + return; + } + + auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg) + .addImm(0) // src0_modifiers + .addReg(NewSrcReg) + .addImm(0) // clamp + .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0 + : AMDGPU::SDWA::SdwaSel::WORD_1) + .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE) + .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0 + : AMDGPU::SDWA::SdwaSel::WORD_1) + .addReg(NewDestReg, RegState::Implicit | RegState::Undef); + // First implicit operand is $exec. + MIB->tieOperands(0, MIB->getNumOperands() - 1); + return; + } + unsigned EltSize = 4; unsigned Opcode = AMDGPU::V_MOV_B32_e32; if (RI.isSGPRClass(RC)) { @@ -806,7 +911,7 @@ void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, int64_t IdxValue = Idx == 0 ? 
Value : 0; MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, - get(Opcode), RI.getSubReg(DestReg, Idx)); + get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx])); Builder.addImm(IdxValue); } } @@ -818,10 +923,10 @@ SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DstReg, + const DebugLoc &DL, Register DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, - unsigned FalseReg) const { + Register TrueReg, + Register FalseReg) const { MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineFunction *MF = MBB.getParent(); const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); @@ -944,10 +1049,10 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, } } -unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, +Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned SrcReg, int Value) const { + Register SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) @@ -957,10 +1062,10 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, return Reg; } -unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, +Register SIInstrInfo::insertNE(MachineBasicBlock *MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned SrcReg, int Value) const { + Register SrcReg, int Value) const { MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); Register Reg = MRI.createVirtualRegister(RI.getBoolRC()); BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) @@ -984,6 +1089,80 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { return AMDGPU::COPY; } +static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) { + if (VecSize <= 32) // 4 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1; + if (VecSize <= 64) // 8 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2; + if (VecSize <= 96) // 12 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3; + if (VecSize <= 128) // 16 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4; + if (VecSize <= 160) // 20 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5; + if (VecSize <= 256) // 32 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8; + if (VecSize <= 512) // 64 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16; + if (VecSize <= 1024) // 128 bytes + return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32; + + llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); +} + +static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) { + if (VecSize <= 32) // 4 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1; + if (VecSize <= 64) // 8 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2; + if (VecSize <= 96) // 12 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3; + if (VecSize <= 128) // 16 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4; + if (VecSize <= 160) // 20 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5; + if (VecSize <= 256) // 32 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8; + if (VecSize <= 512) // 64 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16; + if (VecSize <= 1024) // 128 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32; + + llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); +} + +static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) { + if (VecSize <= 64) // 8 bytes + return 
AMDGPU::S_INDIRECT_REG_WRITE_B64_V1; + if (VecSize <= 128) // 16 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2; + if (VecSize <= 256) // 32 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4; + if (VecSize <= 512) // 64 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8; + if (VecSize <= 1024) // 128 bytes + return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16; + + llvm_unreachable("unsupported size for IndirectRegWrite pseudos"); +} + +const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo( + unsigned VecSize, unsigned EltSize, bool IsSGPR) const { + if (IsSGPR) { + switch (EltSize) { + case 32: + return get(getIndirectSGPRWritePseudo32(VecSize)); + case 64: + return get(getIndirectSGPRWritePseudo64(VecSize)); + default: + llvm_unreachable("invalid reg indexing elt size"); + } + } + + assert(EltSize == 32 && "invalid reg indexing elt size"); + return get(getIndirectVGPRWritePseudoOpc(VecSize)); +} + static unsigned getSGPRSpillSaveOpcode(unsigned Size) { switch (Size) { case 4: @@ -996,6 +1175,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S128_SAVE; case 20: return AMDGPU::SI_SPILL_S160_SAVE; + case 24: + return AMDGPU::SI_SPILL_S192_SAVE; case 32: return AMDGPU::SI_SPILL_S256_SAVE; case 64: @@ -1019,6 +1200,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V128_SAVE; case 20: return AMDGPU::SI_SPILL_V160_SAVE; + case 24: + return AMDGPU::SI_SPILL_V192_SAVE; case 32: return AMDGPU::SI_SPILL_V256_SAVE; case 64: @@ -1049,7 +1232,7 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) { void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, + Register SrcReg, bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { @@ -1058,18 +1241,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); - unsigned Size = FrameInfo.getObjectSize(FrameIndex); - unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); - MachineMemOperand *MMO - = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore, - Size, Align); + MachineMemOperand *MMO = MF->getMachineMemOperand( + PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex), + FrameInfo.getObjectAlign(FrameIndex)); unsigned SpillSize = TRI->getSpillSize(*RC); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled"); + assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI && + SrcReg != AMDGPU::EXEC && "exec should not be spilled"); // We are only allowed to create one new instruction when spilling // registers, so we need to use pseudo instruction for spilling SGPRs. @@ -1079,7 +1262,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, // to make sure we are using the correct register class. 
if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); - MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); + MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); } BuildMI(MBB, MI, DL, OpDesc) @@ -1126,6 +1309,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_S128_RESTORE; case 20: return AMDGPU::SI_SPILL_S160_RESTORE; + case 24: + return AMDGPU::SI_SPILL_S192_RESTORE; case 32: return AMDGPU::SI_SPILL_S256_RESTORE; case 64: @@ -1149,6 +1334,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) { return AMDGPU::SI_SPILL_V128_RESTORE; case 20: return AMDGPU::SI_SPILL_V160_RESTORE; + case 24: + return AMDGPU::SI_SPILL_V192_RESTORE; case 32: return AMDGPU::SI_SPILL_V256_RESTORE; case 64: @@ -1179,33 +1366,34 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) { void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, + Register DestReg, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const { MachineFunction *MF = MBB.getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); MachineFrameInfo &FrameInfo = MF->getFrameInfo(); const DebugLoc &DL = MBB.findDebugLoc(MI); - unsigned Align = FrameInfo.getObjectAlignment(FrameIndex); - unsigned Size = FrameInfo.getObjectSize(FrameIndex); unsigned SpillSize = TRI->getSpillSize(*RC); MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(*MF, FrameIndex); MachineMemOperand *MMO = MF->getMachineMemOperand( - PtrInfo, MachineMemOperand::MOLoad, Size, Align); + PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex), + FrameInfo.getObjectAlign(FrameIndex)); if (RI.isSGPRClass(RC)) { MFI->setHasSpilledSGPRs(); assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into"); + assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI && + DestReg != AMDGPU::EXEC && "exec should not be spilled"); // FIXME: Maybe this should not include a memoperand because it will be // lowered to non-memory instructions. const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize)); - if (Register::isVirtualRegister(DestReg) && SpillSize == 4) { + if (DestReg.isVirtual() && SpillSize == 4) { MachineRegisterInfo &MRI = MF->getRegInfo(); - MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass); + MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); } if (RI.spillSGPRToVGPR()) @@ -1244,7 +1432,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize(); unsigned WavefrontSize = ST.getWavefrontSize(); - unsigned TIDReg = MFI->getTIDReg(); + Register TIDReg = MFI->getTIDReg(); if (!MFI->hasCalculatedTID()) { MachineBasicBlock &Entry = MBB.getParent()->front(); MachineBasicBlock::iterator Insert = Entry.front(); @@ -1272,8 +1460,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress( RS->enterBasicBlock(Entry); // FIXME: Can we scavenge an SReg_64 and access the subregs? 
- unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); - unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + Register STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); + Register STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0); BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0) .addReg(InputPtrReg) .addImm(SI::KernelInputOffsets::NGROUPS_Z); @@ -1482,30 +1670,55 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.eraseFromParent(); break; } - case AMDGPU::V_MOVRELD_B32_V1: - case AMDGPU::V_MOVRELD_B32_V2: - case AMDGPU::V_MOVRELD_B32_V4: - case AMDGPU::V_MOVRELD_B32_V8: - case AMDGPU::V_MOVRELD_B32_V16: { - const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32); + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16: + case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16: + case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8: + case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: { + const TargetRegisterClass *EltRC = getOpRegClass(MI, 2); + + unsigned Opc; + if (RI.hasVGPRs(EltRC)) { + Opc = ST.useVGPRIndexMode() ? + AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32; + } else { + Opc = RI.getRegSizeInBits(*EltRC) == 64 ? + AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32; + } + + const MCInstrDesc &OpDesc = get(Opc); Register VecReg = MI.getOperand(0).getReg(); bool IsUndef = MI.getOperand(1).isUndef(); - unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm(); + unsigned SubReg = MI.getOperand(3).getImm(); assert(VecReg == MI.getOperand(1).getReg()); - MachineInstr *MovRel = - BuildMI(MBB, MI, DL, MovRelDesc) - .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) - .add(MI.getOperand(2)) - .addReg(VecReg, RegState::ImplicitDefine) - .addReg(VecReg, - RegState::Implicit | (IsUndef ? RegState::Undef : 0)); + MachineInstrBuilder MIB = + BuildMI(MBB, MI, DL, OpDesc) + .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef) + .add(MI.getOperand(2)) + .addReg(VecReg, RegState::ImplicitDefine) + .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0)); const int ImpDefIdx = - MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses(); + OpDesc.getNumOperands() + OpDesc.getNumImplicitUses(); const int ImpUseIdx = ImpDefIdx + 1; - MovRel->tieOperands(ImpDefIdx, ImpUseIdx); - + MIB->tieOperands(ImpDefIdx, ImpUseIdx); MI.eraseFromParent(); break; } @@ -1549,22 +1762,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(ST.isWave32() ? 
AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64)); break; } - case TargetOpcode::BUNDLE: { - if (!MI.mayLoad() || MI.hasUnmodeledSideEffects()) - return false; - - // If it is a load it must be a memory clause - for (MachineBasicBlock::instr_iterator I = MI.getIterator(); - I->isBundledWithSucc(); ++I) { - I->unbundleFromSucc(); - for (MachineOperand &MO : I->operands()) - if (MO.isReg()) - MO.setIsInternalRead(false); - } - - MI.eraseFromParent(); - break; - } } return true; } @@ -1662,9 +1859,15 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI, RegOp.ChangeToImmediate(NonRegOp.getImm()); else if (NonRegOp.isFI()) RegOp.ChangeToFrameIndex(NonRegOp.getIndex()); - else + else if (NonRegOp.isGlobal()) { + RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(), + NonRegOp.getTargetFlags()); + } else return nullptr; + // Make sure we don't reinterpret a subreg index in the target flags. + RegOp.setTargetFlags(NonRegOp.getTargetFlags()); + NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug); NonRegOp.setSubReg(SubReg); @@ -2085,6 +2288,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, // Copy the flags onto the implicit condition register operand. preserveCondRegFlags(CondBr->getOperand(1), Cond[1]); + fixImplicitOperands(*CondBr); if (BytesAdded) *BytesAdded = 4; @@ -2125,8 +2329,8 @@ bool SIInstrInfo::reverseBranchCondition( bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg, - int &CondCycles, + Register DstReg, Register TrueReg, + Register FalseReg, int &CondCycles, int &TrueCycles, int &FalseCycles) const { switch (Cond[0].getImm()) { case VCCNZ: @@ -2165,8 +2369,8 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned DstReg, ArrayRef<MachineOperand> Cond, - unsigned TrueReg, unsigned FalseReg) const { + Register DstReg, ArrayRef<MachineOperand> Cond, + Register TrueReg, Register FalseReg) const { BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm()); if (Pred == VCCZ || Pred == SCC_FALSE) { Pred = static_cast<BranchPredicate>(-Pred); @@ -2178,14 +2382,17 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, unsigned DstSize = RI.getRegSizeInBits(*DstRC); if (DstSize == 32) { - unsigned SelOp = Pred == SCC_TRUE ? - AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32; - - // Instruction's operands are backwards from what is expected. - MachineInstr *Select = - BuildMI(MBB, I, DL, get(SelOp), DstReg) - .addReg(FalseReg) - .addReg(TrueReg); + MachineInstr *Select; + if (Pred == SCC_TRUE) { + Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg) + .addReg(TrueReg) + .addReg(FalseReg); + } else { + // Instruction's operands are backwards from what is expected. 
+ Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg) + .addReg(FalseReg) + .addReg(TrueReg); + } preserveCondRegFlags(Select->getOperand(3), Cond[1]); return; @@ -2194,8 +2401,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, if (DstSize == 64 && Pred == SCC_TRUE) { MachineInstr *Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg) - .addReg(FalseReg) - .addReg(TrueReg); + .addReg(TrueReg) + .addReg(FalseReg); preserveCondRegFlags(Select->getOperand(3), Cond[1]); return; @@ -2239,17 +2446,26 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB, I = MIB->getIterator(); - SmallVector<unsigned, 8> Regs; + SmallVector<Register, 8> Regs; for (int Idx = 0; Idx != NElts; ++Idx) { Register DstElt = MRI.createVirtualRegister(EltRC); Regs.push_back(DstElt); unsigned SubIdx = SubIndices[Idx]; - MachineInstr *Select = - BuildMI(MBB, I, DL, get(SelOp), DstElt) - .addReg(FalseReg, 0, SubIdx) - .addReg(TrueReg, 0, SubIdx); + MachineInstr *Select; + if (SelOp == AMDGPU::V_CNDMASK_B32_e32) { + Select = + BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(FalseReg, 0, SubIdx) + .addReg(TrueReg, 0, SubIdx); + } else { + Select = + BuildMI(MBB, I, DL, get(SelOp), DstElt) + .addReg(TrueReg, 0, SubIdx) + .addReg(FalseReg, 0, SubIdx); + } + preserveCondRegFlags(Select->getOperand(3), Cond[1]); fixImplicitOperands(*Select); @@ -2313,7 +2529,7 @@ static void removeModOperands(MachineInstr &MI) { } bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, - unsigned Reg, MachineRegisterInfo *MRI) const { + Register Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) return false; @@ -2339,15 +2555,40 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Opc = UseMI.getOpcode(); if (Opc == AMDGPU::COPY) { - bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg()); + Register DstReg = UseMI.getOperand(0).getReg(); + bool Is16Bit = getOpSize(UseMI, 0) == 2; + bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); unsigned NewOpc = isVGPRCopy ? 
AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; - if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) { - if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32)) + APInt Imm(32, ImmOp->getImm()); + + if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) + Imm = Imm.ashr(16); + + if (RI.isAGPR(*MRI, DstReg)) { + if (!isInlineConstant(Imm)) return false; NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32; } + + if (Is16Bit) { + if (isVGPRCopy) + return false; // Do not clobber vgpr_hi16 + + if (DstReg.isVirtual() && + UseMI.getOperand(0).getSubReg() != AMDGPU::lo16) + return false; + + UseMI.getOperand(0).setSubReg(0); + if (DstReg.isPhysical()) { + DstReg = RI.get32BitRegister(DstReg); + UseMI.getOperand(0).setReg(DstReg); + } + assert(UseMI.getOperand(1).getReg().isVirtual()); + } + UseMI.setDesc(get(NewOpc)); - UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm()); + UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue()); + UseMI.getOperand(1).setTargetFlags(0); UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent()); return true; } @@ -2517,6 +2758,18 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, return false; } +static bool +memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1, + ArrayRef<const MachineOperand *> BaseOps2) { + if (BaseOps1.size() != BaseOps2.size()) + return false; + for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) { + if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I])) + return false; + } + return true; +} + static bool offsetsDoNotOverlap(int WidthA, int OffsetA, int WidthB, int OffsetB) { int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB; @@ -2527,26 +2780,26 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA, bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa, const MachineInstr &MIb) const { - const MachineOperand *BaseOp0, *BaseOp1; + SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1; int64_t Offset0, Offset1; + unsigned Dummy0, Dummy1; + bool Offset0IsScalable, Offset1IsScalable; + if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable, + Dummy0, &RI) || + !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable, + Dummy1, &RI)) + return false; - if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) && - getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) { - if (!BaseOp0->isIdenticalTo(*BaseOp1)) - return false; + if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1)) + return false; - if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { - // FIXME: Handle ds_read2 / ds_write2. - return false; - } - unsigned Width0 = (*MIa.memoperands_begin())->getSize(); - unsigned Width1 = (*MIb.memoperands_begin())->getSize(); - if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) { - return true; - } + if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) { + // FIXME: Handle ds_read2 / ds_write2. 
+ return false; } - - return false; + unsigned Width0 = MIa.memoperands().front()->getSize(); + unsigned Width1 = MIb.memoperands().front()->getSize(); + return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1); } bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, @@ -2586,7 +2839,7 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isSMRD(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa); + return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); } if (isFLAT(MIa)) { @@ -2732,16 +2985,30 @@ static bool changesVGPRIndexingMode(const MachineInstr &MI) { bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, const MachineBasicBlock *MBB, const MachineFunction &MF) const { - // XXX - Do we want the SP check in the base implementation? + // Skipping the check for SP writes in the base implementation. The reason it + // was added was apparently due to compile time concerns. + // + // TODO: Do we really want this barrier? It triggers unnecessary hazard nops + // but is probably avoidable. + + // Copied from base implementation. + // Terminators and labels can't be scheduled around. + if (MI.isTerminator() || MI.isPosition()) + return true; + + // INLINEASM_BR can jump to another block + if (MI.getOpcode() == TargetOpcode::INLINEASM_BR) + return true; // Target-independent instructions do not have an implicit-use of EXEC, even // when they operate on VGPRs. Treating EXEC modifications as scheduling // boundaries prevents incorrect movements of such instructions. - return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) || - MI.modifiesRegister(AMDGPU::EXEC, &RI) || + + // TODO: Don't treat setreg with known constant that only changes MODE as + // barrier. + return MI.modifiesRegister(AMDGPU::EXEC, &RI) || MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 || MI.getOpcode() == AMDGPU::S_SETREG_B32 || - MI.getOpcode() == AMDGPU::S_DENORM_MODE || changesVGPRIndexingMode(MI); } @@ -2755,6 +3022,20 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { Opcode == AMDGPU::DS_GWS_BARRIER; } +bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { + // Skip the full operand and register alias search modifiesRegister + // does. There's only a handful of instructions that touch this, it's only an + // implicit def, and doesn't alias any other registers. + if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) { + for (; ImpDef && *ImpDef; ++ImpDef) { + if (*ImpDef == AMDGPU::MODE) + return true; + } + } + + return false; +} + bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); @@ -2780,6 +3061,10 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const if (MI.isCall() || MI.isInlineAsm()) return true; // conservative assumption + // A mode change is a scalar operation that influences vector instructions. + if (modifiesModeRegister(MI)) + return true; + // These are like SALU instructions in terms of effects, so it's questionable // whether we should return true for those. 
// @@ -2866,10 +3151,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, return AMDGPU::isInlinableLiteral64(MO.getImm(), ST.hasInv2PiInlineImm()); case AMDGPU::OPERAND_REG_IMM_INT16: - case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_INLINE_C_INT16: - case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_INT16: + // We would expect inline immediates to not be concerned with an integer/fp + // distinction. However, in the case of 16-bit integer operations, the + // "floating point" values appear to not work. It seems read the low 16-bits + // of 32-bit immediates, which happens to always work for the integer + // values. + // + // See llvm bugzilla 46302. + // + // TODO: Theoretically we could use op-sel to use the high bits of the + // 32-bit FP values. + return AMDGPU::isInlinableIntLiteral(Imm); + case AMDGPU::OPERAND_REG_IMM_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: + // This suffers the same problem as the scalar 16-bit cases. + return AMDGPU::isInlinableIntLiteralV216(Imm); + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets @@ -2883,11 +3184,8 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, return false; } - case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { uint32_t Trunc = static_cast<uint32_t>(Imm); return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); @@ -3056,7 +3354,8 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI, const MachineOperand &Orig) { for (MachineOperand &Use : MI.implicit_operands()) { - if (Use.isUse() && Use.getReg() == AMDGPU::VCC) { + if (Use.isUse() && + (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) { Use.setIsUndef(Orig.isUndef()); Use.setIsKill(Orig.isKill()); return; @@ -3068,7 +3367,8 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI, unsigned Op32) const { MachineBasicBlock *MBB = MI.getParent();; MachineInstrBuilder Inst32 = - BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)); + BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32)) + .setMIFlags(MI.getFlags()); // Add the dst operand if the 32-bit encoding also has an explicit $vdst. // For VOPC instructions, this is replaced by an implicit def of vcc. @@ -3138,7 +3438,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, } } -static unsigned findImplicitSGPRRead(const MachineInstr &MI) { +static Register findImplicitSGPRRead(const MachineInstr &MI) { for (const MachineOperand &MO : MI.implicit_operands()) { // We only care about reads. if (MO.isDef()) @@ -3239,6 +3539,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return true; } + if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) { + ErrInfo = "missing memory operand from MIMG instruction."; + return false; + } + // Make sure the register classes are correct. 
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) { if (MI.getOperand(i).isFPImm()) { @@ -3446,8 +3751,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) ++ConstantBusCount; - SmallVector<unsigned, 2> SGPRsUsed; - unsigned SGPRUsed = findImplicitSGPRRead(MI); + SmallVector<Register, 2> SGPRsUsed; + Register SGPRUsed = findImplicitSGPRRead(MI); if (SGPRUsed != AMDGPU::NoRegister) { ++ConstantBusCount; SGPRsUsed.push_back(SGPRUsed); @@ -3482,7 +3787,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } if (isVOP3(MI) && LiteralCount) { - if (LiteralCount && !ST.hasVOP3Literal()) { + if (!ST.hasVOP3Literal()) { ErrInfo = "VOP3 instruction uses literal"; return false; } @@ -3665,11 +3970,34 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } + bool IsA16 = false; + if (ST.hasR128A16()) { + const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128); + IsA16 = R128A16->getImm() != 0; + } else if (ST.hasGFX10A16()) { + const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16); + IsA16 = A16->getImm() != 0; + } + + bool PackDerivatives = IsA16 || BaseOpcode->G16; bool IsNSA = SRsrcIdx - VAddr0Idx > 1; - unsigned AddrWords = BaseOpcode->NumExtraArgs + - (BaseOpcode->Gradients ? Dim->NumGradients : 0) + - (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + - (BaseOpcode->LodOrClampOrMip ? 1 : 0); + + unsigned AddrWords = BaseOpcode->NumExtraArgs; + unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) + + (BaseOpcode->LodOrClampOrMip ? 1 : 0); + if (IsA16) + AddrWords += (AddrComponents + 1) / 2; + else + AddrWords += AddrComponents; + + if (BaseOpcode->Gradients) { + if (PackDerivatives) + // There are two gradients per coordinate, we pack them separately. + // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv) + AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2; + else + AddrWords += Dim->NumGradients; + } unsigned VAddrWords; if (IsNSA) { @@ -3681,14 +4009,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, AddrWords = 16; else if (AddrWords > 4) AddrWords = 8; - else if (AddrWords == 3 && VAddrWords == 4) { - // CodeGen uses the V4 variant of instructions for three addresses, - // because the selection DAG does not support non-power-of-two types. + else if (AddrWords == 4) AddrWords = 4; - } + else if (AddrWords == 3) + AddrWords = 3; } if (VAddrWords != AddrWords) { + LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords + << " but got " << VAddrWords << "\n"); ErrInfo = "bad vaddr size"; return false; } @@ -4217,7 +4546,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, } } -unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI, +Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, MachineRegisterInfo &MRI) const { const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg); const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC); @@ -5002,6 +5331,76 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst, splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32); Inst.eraseFromParent(); continue; + + // TODO: remove as soon as everything is ready + // to replace VGPR to SGPR copy with V_READFIRSTLANEs. + // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO + // can only be selected from the uniform SDNode. 
+ case AMDGPU::S_ADD_CO_PSEUDO: + case AMDGPU::S_SUB_CO_PSEUDO: { + unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) + ? AMDGPU::V_ADDC_U32_e64 + : AMDGPU::V_SUBB_U32_e64; + const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + + Register CarryInReg = Inst.getOperand(4).getReg(); + if (!MRI.constrainRegClass(CarryInReg, CarryRC)) { + Register NewCarryReg = MRI.createVirtualRegister(CarryRC); + BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg) + .addReg(CarryInReg); + } + + Register CarryOutReg = Inst.getOperand(1).getReg(); + + Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass( + MRI.getRegClass(Inst.getOperand(0).getReg()))); + MachineInstr *CarryOp = + BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg) + .addReg(CarryOutReg, RegState::Define) + .add(Inst.getOperand(2)) + .add(Inst.getOperand(3)) + .addReg(CarryInReg) + .addImm(0); + legalizeOperands(*CarryOp); + MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg); + addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist); + Inst.eraseFromParent(); + } + continue; + case AMDGPU::S_UADDO_PSEUDO: + case AMDGPU::S_USUBO_PSEUDO: { + const DebugLoc &DL = Inst.getDebugLoc(); + MachineOperand &Dest0 = Inst.getOperand(0); + MachineOperand &Dest1 = Inst.getOperand(1); + MachineOperand &Src0 = Inst.getOperand(2); + MachineOperand &Src1 = Inst.getOperand(3); + + unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO) + ? AMDGPU::V_ADD_I32_e64 + : AMDGPU::V_SUB_I32_e64; + const TargetRegisterClass *NewRC = + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg())); + Register DestReg = MRI.createVirtualRegister(NewRC); + MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg) + .addReg(Dest1.getReg(), RegState::Define) + .add(Src0) + .add(Src1) + .addImm(0); // clamp bit + + legalizeOperands(*NewInstr, MDT); + + MRI.replaceRegWith(Dest0.getReg(), DestReg); + addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI, + Worklist); + Inst.eraseFromParent(); + } + continue; + + case AMDGPU::S_CSELECT_B32: + case AMDGPU::S_CSELECT_B64: + lowerSelect(Worklist, Inst, MDT); + Inst.eraseFromParent(); + continue; } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { @@ -5142,6 +5541,78 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst, return false; } +void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst, + MachineDominatorTree *MDT) const { + + MachineBasicBlock &MBB = *Inst.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + MachineBasicBlock::iterator MII = Inst; + DebugLoc DL = Inst.getDebugLoc(); + + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + MachineOperand &Cond = Inst.getOperand(3); + + Register SCCSource = Cond.getReg(); + // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead. + if (!Cond.isUndef()) { + for (MachineInstr &CandI : + make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), + Inst.getParent()->rend())) { + if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != + -1) { + if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { + SCCSource = CandI.getOperand(1).getReg(); + } + break; + } + } + } + + // If this is a trivial select where the condition is effectively not SCC + // (SCCSource is a source of copy to SCC), then the select is semantically + // equivalent to copying SCCSource. 
Hence, there is no need to create + // V_CNDMASK, we can just use that and bail out. + if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) && + Src1.isImm() && (Src1.getImm() == 0)) { + MRI.replaceRegWith(Dest.getReg(), SCCSource); + return; + } + + const TargetRegisterClass *TC = ST.getWavefrontSize() == 64 + ? &AMDGPU::SReg_64_XEXECRegClass + : &AMDGPU::SReg_32_XM0_XEXECRegClass; + Register CopySCC = MRI.createVirtualRegister(TC); + + if (SCCSource == AMDGPU::SCC) { + // Insert a trivial select instead of creating a copy, because a copy from + // SCC would semantically mean just copying a single bit, but we may need + // the result to be a vector condition mask that needs preserving. + unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64 + : AMDGPU::S_CSELECT_B32; + auto NewSelect = + BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0); + NewSelect->getOperand(3).setIsUndef(Cond.isUndef()); + } else { + BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource); + } + + Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + auto UpdatedInst = + BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg) + .addImm(0) + .add(Src1) // False + .addImm(0) + .add(Src0) // True + .addReg(CopySCC); + + MRI.replaceRegWith(Dest.getReg(), ResultReg); + legalizeOperands(*UpdatedInst, MDT); + addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); +} + void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); @@ -5623,7 +6094,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, } void SIInstrInfo::addUsersToMoveToVALUWorklist( - unsigned DstReg, + Register DstReg, MachineRegisterInfo &MRI, SetVectorType &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), @@ -5723,20 +6194,60 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op, MachineInstr &SCCDefInst, SetVectorType &Worklist) const { + bool SCCUsedImplicitly = false; + // Ensure that def inst defines SCC, which is still live. assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() && !Op.isDead() && Op.getParent() == &SCCDefInst); + SmallVector<MachineInstr *, 4> CopyToDelete; // This assumes that all the users of SCC are in the same block // as the SCC def. for (MachineInstr &MI : // Skip the def inst itself. make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)), SCCDefInst.getParent()->end())) { // Check if SCC is used first. - if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) - Worklist.insert(&MI); + if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) { + if (MI.isCopy()) { + MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + unsigned DestReg = MI.getOperand(0).getReg(); + + for (auto &User : MRI.use_nodbg_instructions(DestReg)) { + if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) || + (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) { + User.getOperand(4).setReg(RI.getVCC()); + Worklist.insert(&User); + } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) { + User.getOperand(5).setReg(RI.getVCC()); + // No need to add to Worklist. + } + } + CopyToDelete.push_back(&MI); + } else { + if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 || + MI.getOpcode() == AMDGPU::S_CSELECT_B64) { + // This is an implicit use of SCC and it is really expected by + // the SCC users to handle. 
+ // We cannot preserve the edge to the user so add the explicit + // copy: SCC = COPY VCC. + // The copy will be cleaned up during the processing of the user + // in lowerSelect. + SCCUsedImplicitly = true; + } + + Worklist.insert(&MI); + } + } // Exit if we find another SCC def. if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) - return; + break; + } + for (auto &Copy : CopyToDelete) + Copy->eraseFromParent(); + + if (SCCUsedImplicitly) { + BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()), + SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC) + .addReg(RI.getVCC()); } } @@ -5789,7 +6300,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( } // Find the one SGPR operand we are allowed to use. -unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, +Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const { const MCInstrDesc &Desc = MI.getDesc(); @@ -5802,11 +6313,11 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, // // If the operand's class is an SGPR, we can never move it. - unsigned SGPRReg = findImplicitSGPRRead(MI); + Register SGPRReg = findImplicitSGPRRead(MI); if (SGPRReg != AMDGPU::NoRegister) return SGPRReg; - unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; + Register UsedSGPRs[3] = { AMDGPU::NoRegister }; const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0; i < 3; ++i) { @@ -5919,10 +6430,9 @@ bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { return isSMRD(Opc); } -bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { - unsigned Opc = MI.getOpcode(); - - return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); +bool SIInstrInfo::isHighLatencyDef(int Opc) const { + return get(Opc).mayLoad() && + (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc)); } unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI, @@ -6198,7 +6708,7 @@ MachineInstrBuilder SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - unsigned DestReg) const { + Register DestReg) const { if (ST.hasAddNoCarry()) return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg); @@ -6608,20 +7118,24 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl( // %0 may even spill. We can't spill $m0 normally (it would require copying to // a numbered SGPR anyway), and since it is in the SReg_32 register class, // TargetInstrInfo::foldMemoryOperand() is going to try. + // A similar issue also exists with spilling and reloading $exec registers. // // To prevent that, constrain the %0 register class here. if (MI.isFullCopy()) { Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); - - if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) { - MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass); - return nullptr; - } - - if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) { - MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass); - return nullptr; + if ((DstReg.isVirtual() || SrcReg.isVirtual()) && + (DstReg.isVirtual() != SrcReg.isVirtual())) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register VirtReg = DstReg.isVirtual() ? 
DstReg : SrcReg; + const TargetRegisterClass *RC = MRI.getRegClass(VirtReg); + if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) { + MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass); + return nullptr; + } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) { + MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass); + return nullptr; + } } } |
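Notes on a few of the hunks above, with standalone sketches; all helper names below are illustrative stand-ins, not LLVM APIs. First, the DS branch of the new getMemOperandsWithOffsetWidth folds the two element-sized offsets of a ds_read2/ds_write2-style access into a single byte offset only when they are consecutive, scaling by the element size (doubled for B64 variants and scaled by 64 for the *_ST64 forms). A minimal sketch of that arithmetic:

#include <cstdint>

// Fold offset0/offset1 of a two-offset DS access into one byte offset.
// EltSize is 4 or 8 depending on the B32/B64 variant; stride-64 forms
// scale it by 64, mirroring the hunk above.
static bool sketchFoldDS2Offsets(unsigned Offset0, unsigned Offset1,
                                 unsigned EltSize, bool IsStride64,
                                 int64_t &ByteOffset) {
  if (Offset0 + 1 != Offset1)
    return false; // the two offsets must name consecutive elements
  if (IsStride64)
    EltSize *= 64;
  ByteOffset = static_cast<int64_t>(EltSize) * Offset0;
  return true;
}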
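The rewritten shouldClusterMemOps drops the fixed 16-byte LoadClusterThreshold and instead caps the number of clustered operations based on the average bytes per operation accumulated so far: up to five ops when the average access is a dword or smaller, otherwise four. The decision, restated as a standalone sketch:

#include <cassert>

// Cluster cap derived from the average access size, as in the hunk above.
static bool sketchShouldCluster(unsigned NumLoads, unsigned NumBytes) {
  assert(NumLoads > 0 && NumBytes >= NumLoads && "invalid NumLoads/NumBytes");
  unsigned MaxNumLoads = (NumBytes <= 4 * NumLoads) ? 5 : 4;
  return NumLoads <= MaxNumLoads;
}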
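copyPhysReg now handles SCC on both sides of a copy: reading SCC into a 64-bit SGPR pair uses S_CSELECT_B64 dst, 1, 0, and writing SCC from an SGPR uses S_CMP_LG_U32 src, 0 (a 64-bit source is narrowed to its sub0 half first). Their value semantics, as a sketch:

#include <cstdint>

// SGPR64 <- SCC: S_CSELECT_B64 dst, 1, 0 materializes 1 or 0.
static uint64_t sketchCopySccToSgpr64(bool SCC) { return SCC ? 1u : 0u; }

// SCC <- SGPR: S_CMP_LG_U32 src, 0 sets SCC to (src != 0).
static bool sketchCopySgprToScc(uint32_t Src) { return Src != 0; }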
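The new 16-bit copyPhysReg path emits V_MOV_B32_sdwa with WORD_0/WORD_1 source and destination selects and dst_unused = UNUSED_PRESERVE, so the half of the destination VGPR that is not written survives. Its value-level effect, sketched in plain C++:

#include <cstdint>

// Move one 16-bit half of Src into one half of Dst, preserving the
// other half of Dst (UNUSED_PRESERVE).
static uint32_t sketchSdwaHalfCopy(uint32_t Dst, uint32_t Src,
                                   bool DstLow, bool SrcLow) {
  uint32_t Half = SrcLow ? (Src & 0xFFFFu) : (Src >> 16);
  return DstLow ? ((Dst & 0xFFFF0000u) | Half)
                : ((Dst & 0x0000FFFFu) | (Half << 16));
}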
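In FoldImmediate, a COPY that reads the hi16 subreg of a 32-bit immediate def now folds the arithmetically shifted upper half (APInt::ashr(16) followed by getSExtValue() in the hunk above). As a plain-integer sketch:

#include <cstdint>

// Immediate seen by a hi16 use of a 32-bit immediate def; the arithmetic
// shift preserves the sign (well-defined since C++20; matches APInt::ashr).
static int32_t sketchHi16FoldedImm(int32_t Imm) {
  return Imm >> 16;
}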
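checkInstOffsetsDoNotOverlap now runs both accesses through getMemOperandsWithOffsetWidth and compares every base operand, but the final disjointness test is still the half-open interval check in offsetsDoNotOverlap, restated here from the context lines above:

// Accesses [OffsetA, OffsetA + WidthA) and [OffsetB, OffsetB + WidthB)
// are disjoint when the lower interval ends at or before the higher begins.
static bool sketchOffsetsDoNotOverlap(int WidthA, int OffsetA,
                                      int WidthB, int OffsetB) {
  int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
  int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}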
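The isInlineConstant change routes the INT16 and V2INT16 operand types through integer-only checks because the hardware reads only the low 16 bits of a 32-bit literal for them, so the FP16 inline values do not work (llvm bugzilla 46302). Assuming the usual AMDGPU inline integer range of -16..64, the scalar case reduces to:

#include <cstdint>

// Hedged sketch of the scalar inline-integer test assumed above; the real
// check is AMDGPU::isInlinableIntLiteral.
static bool sketchIsInlinableIntLiteral(int64_t Imm) {
  return Imm >= -16 && Imm <= 64;
}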
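Finally, the MIMG verifier hunk recomputes the expected vaddr size with A16/G16 packing and then rounds non-NSA encodings up to the register tuples that actually exist. A condensed sketch of the counting (parameter names are illustrative):

// Expected number of 32-bit address registers for an image instruction.
static unsigned sketchMIMGAddrWords(unsigned NumExtraArgs, unsigned NumCoords,
                                    bool HasLodClampMip, bool HasGradients,
                                    unsigned NumGradients, bool IsA16,
                                    bool PackDerivatives, bool IsNSA) {
  unsigned Words = NumExtraArgs;
  unsigned Components = NumCoords + (HasLodClampMip ? 1 : 0);
  Words += IsA16 ? (Components + 1) / 2 : Components;
  if (HasGradients)
    // Packed derivatives keep the du and dv gradients in separate dwords.
    Words += PackDerivatives ? (NumGradients / 2 + 1) / 2 * 2 : NumGradients;
  if (!IsNSA) {
    // Non-NSA encodings only come in 1-4, 8, or 16 address registers.
    if (Words > 8)
      Words = 16;
    else if (Words > 4)
      Words = 8;
  }
  return Words;
}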