diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIInstrInfo.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 1019 |
1 files changed, 759 insertions, 260 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 278cf2b69ee3..0a06fa88b6b1 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -18,6 +18,7 @@ #include "GCNSubtarget.h" #include "SIMachineFunctionInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineDominators.h" @@ -105,9 +106,27 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) { return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx); } +static bool canRemat(const MachineInstr &MI) { + + if (SIInstrInfo::isVOP1(MI) || SIInstrInfo::isVOP2(MI) || + SIInstrInfo::isVOP3(MI) || SIInstrInfo::isSDWA(MI) || + SIInstrInfo::isSALU(MI)) + return true; + + if (SIInstrInfo::isSMRD(MI)) { + return !MI.memoperands_empty() && + llvm::all_of(MI.memoperands(), [](const MachineMemOperand *MMO) { + return MMO->isLoad() && MMO->isInvariant(); + }); + } + + return false; +} + bool SIInstrInfo::isReallyTriviallyReMaterializable( const MachineInstr &MI) const { - if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isSDWA(MI) || isSALU(MI)) { + + if (canRemat(MI)) { // Normally VALU use of exec would block the rematerialization, but that // is OK in this case to have an implicit exec read as all VALU do. // We really want all of the generic logic for this except for this. @@ -119,12 +138,13 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable( // There is difference to generic method which does not allow // rematerialization if there are virtual register uses. We allow this, // therefore this method includes SOP instructions as well. - return !MI.hasImplicitDef() && - MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() && - !MI.mayRaiseFPException(); + if (!MI.hasImplicitDef() && + MI.getNumImplicitOperands() == MI.getDesc().implicit_uses().size() && + !MI.mayRaiseFPException()) + return true; } - return false; + return TargetInstrInfo::isReallyTriviallyReMaterializable(MI); } // Returns true if the scalar result of a VALU instruction depends on exec. @@ -169,6 +189,48 @@ bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const { isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()); } +bool SIInstrInfo::isSafeToSink(MachineInstr &MI, + MachineBasicBlock *SuccToSinkTo, + MachineCycleInfo *CI) const { + // Allow sinking if MI edits lane mask (divergent i1 in sgpr). + if (MI.getOpcode() == AMDGPU::SI_IF_BREAK) + return true; + + MachineRegisterInfo &MRI = MI.getMF()->getRegInfo(); + // Check if sinking of MI would create temporal divergent use. + for (auto Op : MI.uses()) { + if (Op.isReg() && Op.getReg().isVirtual() && + RI.isSGPRClass(MRI.getRegClass(Op.getReg()))) { + MachineInstr *SgprDef = MRI.getVRegDef(Op.getReg()); + + // SgprDef defined inside cycle + MachineCycle *FromCycle = CI->getCycle(SgprDef->getParent()); + if (FromCycle == nullptr) + continue; + + MachineCycle *ToCycle = CI->getCycle(SuccToSinkTo); + // Check if there is a FromCycle that contains SgprDef's basic block but + // does not contain SuccToSinkTo and also has divergent exit condition. + while (FromCycle && !FromCycle->contains(ToCycle)) { + // After structurize-cfg, there should be exactly one cycle exit. + SmallVector<MachineBasicBlock *, 1> ExitBlocks; + FromCycle->getExitBlocks(ExitBlocks); + assert(ExitBlocks.size() == 1); + assert(ExitBlocks[0]->getSinglePredecessor()); + + // FromCycle has divergent exit condition. + if (hasDivergentBranch(ExitBlocks[0]->getSinglePredecessor())) { + return false; + } + + FromCycle = FromCycle->getParentCycle(); + } + } + } + + return true; +} + bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, int64_t &Offset0, int64_t &Offset1) const { @@ -479,8 +541,10 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1, } bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, + int64_t Offset1, bool OffsetIsScalable1, ArrayRef<const MachineOperand *> BaseOps2, - unsigned NumLoads, + int64_t Offset2, bool OffsetIsScalable2, + unsigned ClusterSize, unsigned NumBytes) const { // If the mem ops (to be clustered) do not have the same base ptr, then they // should not be clustered @@ -506,8 +570,8 @@ bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1, // (3) 9 <= LoadSize <= 12: cluster at max 2 mem ops // (4) 13 <= LoadSize <= 16: cluster at max 2 mem ops // (5) LoadSize >= 17: do not cluster - const unsigned LoadSize = NumBytes / NumLoads; - const unsigned NumDWORDs = ((LoadSize + 3) / 4) * NumLoads; + const unsigned LoadSize = NumBytes / ClusterSize; + const unsigned NumDWORDs = ((LoadSize + 3) / 4) * ClusterSize; return NumDWORDs <= 8; } @@ -619,7 +683,7 @@ static void indirectCopyToAGPR(const SIInstrInfo &TII, } RS.enterBasicBlockEnd(MBB); - RS.backward(MI); + RS.backward(std::next(MI)); // Ideally we want to have three registers for a long reg_sequence copy // to hide 2 waitstates between v_mov_b32 and accvgpr_write. @@ -680,23 +744,27 @@ static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { int16_t SubIdx = BaseIndices[Idx]; - Register Reg = RI.getSubReg(DestReg, SubIdx); + Register DestSubReg = RI.getSubReg(DestReg, SubIdx); + Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); unsigned Opcode = AMDGPU::S_MOV_B32; // Is SGPR aligned? If so try to combine with next. - Register Src = RI.getSubReg(SrcReg, SubIdx); - bool AlignedDest = ((Reg - AMDGPU::SGPR0) % 2) == 0; - bool AlignedSrc = ((Src - AMDGPU::SGPR0) % 2) == 0; + bool AlignedDest = ((DestSubReg - AMDGPU::SGPR0) % 2) == 0; + bool AlignedSrc = ((SrcSubReg - AMDGPU::SGPR0) % 2) == 0; if (AlignedDest && AlignedSrc && (Idx + 1 < BaseIndices.size())) { // Can use SGPR64 copy unsigned Channel = RI.getChannelFromSubReg(SubIdx); SubIdx = RI.getSubRegFromChannel(Channel, 2); + DestSubReg = RI.getSubReg(DestReg, SubIdx); + SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); Opcode = AMDGPU::S_MOV_B64; Idx++; } - LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) - .addReg(RI.getSubReg(SrcReg, SubIdx)) + LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), DestSubReg) + .addReg(SrcSubReg) .addReg(SrcReg, RegState::Implicit); if (!FirstMI) @@ -722,24 +790,32 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const { const TargetRegisterClass *RC = RI.getPhysRegBaseClass(DestReg); + unsigned Size = RI.getRegSizeInBits(*RC); + const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); + unsigned SrcSize = RI.getRegSizeInBits(*SrcRC); - // FIXME: This is hack to resolve copies between 16 bit and 32 bit - // registers until all patterns are fixed. - if (Fix16BitCopies && - ((RI.getRegSizeInBits(*RC) == 16) ^ - (RI.getRegSizeInBits(*RI.getPhysRegBaseClass(SrcReg)) == 16))) { - MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg; - MCRegister Super = RI.get32BitRegister(RegToFix); - assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix); - RegToFix = Super; + // The rest of copyPhysReg assumes Src and Dst size are the same size. + // TODO-GFX11_16BIT If all true 16 bit instruction patterns are completed can + // we remove Fix16BitCopies and this code block? + if (Fix16BitCopies) { + if (((Size == 16) != (SrcSize == 16))) { + // Non-VGPR Src and Dst will later be expanded back to 32 bits. + assert(ST.hasTrue16BitInsts()); + MCRegister &RegToFix = (Size == 32) ? DestReg : SrcReg; + MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16); + RegToFix = SubReg; - if (DestReg == SrcReg) { - // Insert empty bundle since ExpandPostRA expects an instruction here. - BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); - return; + if (DestReg == SrcReg) { + // Identity copy. Insert empty bundle since ExpandPostRA expects an + // instruction here. + BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE)); + return; + } + RC = RI.getPhysRegBaseClass(DestReg); + Size = RI.getRegSizeInBits(*RC); + SrcRC = RI.getPhysRegBaseClass(SrcReg); + SrcSize = RI.getRegSizeInBits(*SrcRC); } - - RC = RI.getPhysRegBaseClass(DestReg); } if (RC == &AMDGPU::VGPR_32RegClass) { @@ -863,10 +939,8 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - const unsigned Size = RI.getRegSizeInBits(*RC); if (Size == 16) { - assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) || - AMDGPU::VGPR_HI16RegClass.contains(SrcReg) || + assert(AMDGPU::VGPR_16RegClass.contains(SrcReg) || AMDGPU::SReg_LO16RegClass.contains(SrcReg) || AMDGPU::AGPR_LO16RegClass.contains(SrcReg)); @@ -904,6 +978,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + if (ST.hasTrue16BitInsts()) { + if (IsSGPRSrc) { + assert(SrcLow); + SrcReg = NewSrcReg; + } + // Use the smaller instruction encoding if possible. + if (AMDGPU::VGPR_16_Lo128RegClass.contains(DestReg) && + (IsSGPRSrc || AMDGPU::VGPR_16_Lo128RegClass.contains(SrcReg))) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e32), DestReg) + .addReg(SrcReg); + } else { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B16_t16_e64), DestReg) + .addImm(0) // src0_modifiers + .addReg(SrcReg) + .addImm(0); // op_sel + } + return; + } + if (IsSGPRSrc && !ST.hasSDWAScalar()) { if (!DstLow || !SrcLow) { reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc, @@ -930,14 +1023,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - const TargetRegisterClass *SrcRC = RI.getPhysRegBaseClass(SrcReg); if (RC == RI.getVGPR64Class() && (SrcRC == RC || RI.isSGPRClass(SrcRC))) { if (ST.hasMovB64()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_e32), DestReg) .addReg(SrcReg, getKillRegState(KillSrc)); return; } - if (ST.hasPackedFP32Ops()) { + if (ST.hasPkMovB32()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) .addImm(SISrcMods::OP_SEL_1) .addReg(SrcReg) @@ -984,7 +1076,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (ST.hasMovB64()) { Opcode = AMDGPU::V_MOV_B64_e32; EltSize = 8; - } else if (ST.hasPackedFP32Ops()) { + } else if (ST.hasPkMovB32()) { Opcode = AMDGPU::V_PK_MOV_B32; EltSize = 8; } @@ -1012,6 +1104,9 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, SubIdx = SubIndices[Idx]; else SubIdx = SubIndices[SubIndices.size() - Idx - 1]; + Register DestSubReg = RI.getSubReg(DestReg, SubIdx); + Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + assert(DestSubReg && SrcSubReg && "Failed to find subregs!"); bool IsFirstSubreg = Idx == 0; bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; @@ -1019,30 +1114,26 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { Register ImpDefSuper = IsFirstSubreg ? Register(DestReg) : Register(); Register ImpUseSuper = SrcReg; - indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), - RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, Overlap, - ImpDefSuper, ImpUseSuper); + indirectCopyToAGPR(*this, MBB, MI, DL, DestSubReg, SrcSubReg, UseKill, + *RS, Overlap, ImpDefSuper, ImpUseSuper); } else if (Opcode == AMDGPU::V_PK_MOV_B32) { - Register DstSubReg = RI.getSubReg(DestReg, SubIdx); - Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); MachineInstrBuilder MIB = - BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) - .addImm(SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) - .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) - .addImm(0) // op_sel_lo - .addImm(0) // op_sel_hi - .addImm(0) // neg_lo - .addImm(0) // neg_hi - .addImm(0) // clamp - .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestSubReg) + .addImm(SISrcMods::OP_SEL_1) + .addReg(SrcSubReg) + .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) + .addReg(SrcSubReg) + .addImm(0) // op_sel_lo + .addImm(0) // op_sel_hi + .addImm(0) // neg_lo + .addImm(0) // neg_hi + .addImm(0) // clamp + .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); if (IsFirstSubreg) MIB.addReg(DestReg, RegState::Define | RegState::Implicit); } else { MachineInstrBuilder Builder = - BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) - .addReg(RI.getSubReg(SrcReg, SubIdx)); + BuildMI(MBB, MI, DL, get(Opcode), DestSubReg).addReg(SrcSubReg); if (IsFirstSubreg) Builder.addReg(DestReg, RegState::Define | RegState::Implicit); @@ -1286,7 +1377,11 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (RI.isAGPRClass(DstRC)) return AMDGPU::COPY; - if (RI.getRegSizeInBits(*DstRC) == 32) { + if (RI.getRegSizeInBits(*DstRC) == 16) { + // Assume hi bits are unneeded. Only _e64 true16 instructions are legal + // before RA. + return RI.isSGPRClass(DstRC) ? AMDGPU::COPY : AMDGPU::V_MOV_B16_t16_e64; + } else if (RI.getRegSizeInBits(*DstRC) == 32) { return RI.isSGPRClass(DstRC) ? AMDGPU::S_MOV_B32 : AMDGPU::V_MOV_B32_e32; } else if (RI.getRegSizeInBits(*DstRC) == 64 && RI.isSGPRClass(DstRC)) { return AMDGPU::S_MOV_B64; @@ -1587,11 +1682,15 @@ static unsigned getAVSpillSaveOpcode(unsigned Size) { } } -static unsigned getWWMRegSpillSaveOpcode(unsigned Size) { +static unsigned getWWMRegSpillSaveOpcode(unsigned Size, + bool IsVectorSuperClass) { // Currently, there is only 32-bit WWM register spills needed. if (Size != 4) llvm_unreachable("unknown wwm register spill size"); + if (IsVectorSuperClass) + return AMDGPU::SI_SPILL_WWM_AV32_SAVE; + return AMDGPU::SI_SPILL_WWM_V32_SAVE; } @@ -1600,11 +1699,13 @@ static unsigned getVectorRegSpillSaveOpcode(Register Reg, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI) { + bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); + // Choose the right opcode if spilling a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) - return getWWMRegSpillSaveOpcode(Size); + return getWWMRegSpillSaveOpcode(Size, IsVectorSuperClass); - if (TRI.isVectorSuperClass(RC)) + if (IsVectorSuperClass) return getAVSpillSaveOpcode(Size); return TRI.isAGPRClass(RC) ? getAGPRSpillSaveOpcode(Size) @@ -1807,11 +1908,15 @@ static unsigned getAVSpillRestoreOpcode(unsigned Size) { } } -static unsigned getWWMRegSpillRestoreOpcode(unsigned Size) { +static unsigned getWWMRegSpillRestoreOpcode(unsigned Size, + bool IsVectorSuperClass) { // Currently, there is only 32-bit WWM register spills needed. if (Size != 4) llvm_unreachable("unknown wwm register spill size"); + if (IsVectorSuperClass) + return AMDGPU::SI_SPILL_WWM_AV32_RESTORE; + return AMDGPU::SI_SPILL_WWM_V32_RESTORE; } @@ -1819,11 +1924,13 @@ static unsigned getVectorRegSpillRestoreOpcode(Register Reg, const TargetRegisterClass *RC, unsigned Size, const SIRegisterInfo &TRI, const SIMachineFunctionInfo &MFI) { + bool IsVectorSuperClass = TRI.isVectorSuperClass(RC); + // Choose the right opcode if restoring a WWM register. if (MFI.checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) - return getWWMRegSpillRestoreOpcode(Size); + return getWWMRegSpillRestoreOpcode(Size, IsVectorSuperClass); - if (TRI.isVectorSuperClass(RC)) + if (IsVectorSuperClass) return getAVSpillRestoreOpcode(Size); return TRI.isAGPRClass(RC) ? getAGPRSpillRestoreOpcode(Size) @@ -2006,6 +2113,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { MI.setDesc(get(AMDGPU::S_AND_SAVEEXEC_B32)); break; + case AMDGPU::SI_SPILL_S32_TO_VGPR: + MI.setDesc(get(AMDGPU::V_WRITELANE_B32)); + break; + + case AMDGPU::SI_RESTORE_S32_FROM_VGPR: + MI.setDesc(get(AMDGPU::V_READLANE_B32)); + break; + case AMDGPU::V_MOV_B64_PSEUDO: { Register Dst = MI.getOperand(0).getReg(); Register DstLo = RI.getSubReg(Dst, AMDGPU::sub0); @@ -2024,7 +2139,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { APInt Imm(64, SrcOp.getImm()); APInt Lo(32, Imm.getLoBits(32).getZExtValue()); APInt Hi(32, Imm.getHiBits(32).getZExtValue()); - if (ST.hasPackedFP32Ops() && Lo == Hi && isInlineConstant(Lo)) { + if (ST.hasPkMovB32() && Lo == Hi && isInlineConstant(Lo)) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) .addImm(SISrcMods::OP_SEL_1) .addImm(Lo.getSExtValue()) @@ -2045,7 +2160,7 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { } } else { assert(SrcOp.isReg()); - if (ST.hasPackedFP32Ops() && + if (ST.hasPkMovB32() && !RI.isAGPR(MBB.getParent()->getRegInfo(), SrcOp.getReg())) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), Dst) .addImm(SISrcMods::OP_SEL_1) // src0_mod @@ -2275,23 +2390,34 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { Register Reg = MI.getOperand(0).getReg(); Register RegLo = RI.getSubReg(Reg, AMDGPU::sub0); Register RegHi = RI.getSubReg(Reg, AMDGPU::sub1); + MachineOperand OpLo = MI.getOperand(1); + MachineOperand OpHi = MI.getOperand(2); // Create a bundle so these instructions won't be re-ordered by the // post-RA scheduler. MIBundleBuilder Bundler(MBB, MI); Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_GETPC_B64), Reg)); - // Add 32-bit offset from this instruction to the start of the - // constant data. - Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo) - .addReg(RegLo) - .add(MI.getOperand(1))); + // What we want here is an offset from the value returned by s_getpc (which + // is the address of the s_add_u32 instruction) to the global variable, but + // since the encoding of $symbol starts 4 bytes after the start of the + // s_add_u32 instruction, we end up with an offset that is 4 bytes too + // small. This requires us to add 4 to the global variable offset in order + // to compute the correct address. Similarly for the s_addc_u32 instruction, + // the encoding of $symbol starts 12 bytes after the start of the s_add_u32 + // instruction. - MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) - .addReg(RegHi); - MIB.add(MI.getOperand(2)); + if (OpLo.isGlobal()) + OpLo.setOffset(OpLo.getOffset() + 4); + Bundler.append( + BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo).addReg(RegLo).add(OpLo)); + + if (OpHi.isGlobal()) + OpHi.setOffset(OpHi.getOffset() + 12); + Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi) + .addReg(RegHi) + .add(OpHi)); - Bundler.append(MIB); finalizeBundle(MBB, Bundler.begin()); MI.eraseFromParent(); @@ -2350,12 +2476,98 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } +void SIInstrInfo::reMaterialize(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, Register DestReg, + unsigned SubIdx, const MachineInstr &Orig, + const TargetRegisterInfo &RI) const { + + // Try shrinking the instruction to remat only the part needed for current + // context. + // TODO: Handle more cases. + unsigned Opcode = Orig.getOpcode(); + switch (Opcode) { + case AMDGPU::S_LOAD_DWORDX16_IMM: + case AMDGPU::S_LOAD_DWORDX8_IMM: { + if (SubIdx != 0) + break; + + if (I == MBB.end()) + break; + + if (I->isBundled()) + break; + + // Look for a single use of the register that is also a subreg. + Register RegToFind = Orig.getOperand(0).getReg(); + MachineOperand *UseMO = nullptr; + for (auto &CandMO : I->operands()) { + if (!CandMO.isReg() || CandMO.getReg() != RegToFind || CandMO.isDef()) + continue; + if (UseMO) { + UseMO = nullptr; + break; + } + UseMO = &CandMO; + } + if (!UseMO || UseMO->getSubReg() == AMDGPU::NoSubRegister) + break; + + unsigned Offset = RI.getSubRegIdxOffset(UseMO->getSubReg()); + unsigned SubregSize = RI.getSubRegIdxSize(UseMO->getSubReg()); + + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + assert(MRI.use_nodbg_empty(DestReg) && "DestReg should have no users yet."); + + unsigned NewOpcode = -1; + if (SubregSize == 256) + NewOpcode = AMDGPU::S_LOAD_DWORDX8_IMM; + else if (SubregSize == 128) + NewOpcode = AMDGPU::S_LOAD_DWORDX4_IMM; + else + break; + + const MCInstrDesc &TID = get(NewOpcode); + const TargetRegisterClass *NewRC = + RI.getAllocatableClass(getRegClass(TID, 0, &RI, *MF)); + MRI.setRegClass(DestReg, NewRC); + + UseMO->setReg(DestReg); + UseMO->setSubReg(AMDGPU::NoSubRegister); + + // Use a smaller load with the desired size, possibly with updated offset. + MachineInstr *MI = MF->CloneMachineInstr(&Orig); + MI->setDesc(TID); + MI->getOperand(0).setReg(DestReg); + MI->getOperand(0).setSubReg(AMDGPU::NoSubRegister); + if (Offset) { + MachineOperand *OffsetMO = getNamedOperand(*MI, AMDGPU::OpName::offset); + int64_t FinalOffset = OffsetMO->getImm() + Offset / 8; + OffsetMO->setImm(FinalOffset); + } + SmallVector<MachineMemOperand *> NewMMOs; + for (const MachineMemOperand *MemOp : Orig.memoperands()) + NewMMOs.push_back(MF->getMachineMemOperand(MemOp, MemOp->getPointerInfo(), + SubregSize / 8)); + MI->setMemRefs(*MF, NewMMOs); + + MBB.insert(I, MI); + return; + } + + default: + break; + } + + TargetInstrInfo::reMaterialize(MBB, I, DestReg, SubIdx, Orig, RI); +} + std::pair<MachineInstr*, MachineInstr*> SIInstrInfo::expandMovDPP64(MachineInstr &MI) const { assert (MI.getOpcode() == AMDGPU::V_MOV_B64_DPP_PSEUDO); if (ST.hasMovB64() && - AMDGPU::isLegal64BitDPPControl( + AMDGPU::isLegalDPALU_DPPControl( getNamedOperand(MI, AMDGPU::OpName::dpp_ctrl)->getImm())) { MI.setDesc(get(AMDGPU::V_MOV_B64_dpp)); return std::pair(&MI, nullptr); @@ -2482,6 +2694,9 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, if (CommutedOpcode == -1) return nullptr; + if (Src0Idx > Src1Idx) + std::swap(Src0Idx, Src1Idx); + assert(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0) == static_cast<int>(Src0Idx) && AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) == @@ -2564,14 +2779,8 @@ bool SIInstrInfo::isBranchOffsetInRange(unsigned BranchOp, return isIntN(BranchOffsetBits, BrOffset); } -MachineBasicBlock *SIInstrInfo::getBranchDestBlock( - const MachineInstr &MI) const { - if (MI.getOpcode() == AMDGPU::S_SETPC_B64) { - // This would be a difficult analysis to perform, but can always be legal so - // there's no need to analyze it. - return nullptr; - } - +MachineBasicBlock * +SIInstrInfo::getBranchDestBlock(const MachineInstr &MI) const { return MI.getOperand(0).getMBB(); } @@ -2882,7 +3091,6 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm())); if (!FBB) { - Cond[1].isUndef(); MachineInstr *CondBr = BuildMI(&MBB, DL, get(Opcode)) .addMBB(TBB); @@ -3087,6 +3295,7 @@ bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) { case AMDGPU::V_MOV_B64_e64: case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B64: + case AMDGPU::S_MOV_B64_IMM_PSEUDO: case AMDGPU::COPY: case AMDGPU::WWM_COPY: case AMDGPU::V_ACCVGPR_WRITE_B32_e64: @@ -3120,11 +3329,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, switch (DefMI.getOpcode()) { default: return false; + case AMDGPU::V_MOV_B64_e32: case AMDGPU::S_MOV_B64: - // TODO: We could fold 64-bit immediates, but this get complicated - // when there are sub-registers. - return false; - + case AMDGPU::V_MOV_B64_PSEUDO: + case AMDGPU::S_MOV_B64_IMM_PSEUDO: case AMDGPU::V_MOV_B32_e32: case AMDGPU::S_MOV_B32: case AMDGPU::V_ACCVGPR_WRITE_B32_e64: @@ -3137,19 +3345,45 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (!ImmOp->isImm()) return false; + auto getImmFor = [ImmOp](const MachineOperand &UseOp) -> int64_t { + int64_t Imm = ImmOp->getImm(); + switch (UseOp.getSubReg()) { + default: + return Imm; + case AMDGPU::sub0: + return Lo_32(Imm); + case AMDGPU::sub1: + return Hi_32(Imm); + case AMDGPU::lo16: + return APInt(16, Imm).getSExtValue(); + case AMDGPU::hi16: + return APInt(32, Imm).ashr(16).getSExtValue(); + case AMDGPU::sub1_lo16: + return APInt(16, Hi_32(Imm)).getSExtValue(); + case AMDGPU::sub1_hi16: + return APInt(32, Hi_32(Imm)).ashr(16).getSExtValue(); + } + }; + + assert(!DefMI.getOperand(0).getSubReg() && "Expected SSA form"); + unsigned Opc = UseMI.getOpcode(); if (Opc == AMDGPU::COPY) { + assert(!UseMI.getOperand(0).getSubReg() && "Expected SSA form"); + Register DstReg = UseMI.getOperand(0).getReg(); - bool Is16Bit = getOpSize(UseMI, 0) == 2; + unsigned OpSize = getOpSize(UseMI, 0); + bool Is16Bit = OpSize == 2; + bool Is64Bit = OpSize == 8; bool isVGPRCopy = RI.isVGPR(*MRI, DstReg); - unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32; - APInt Imm(32, ImmOp->getImm()); - - if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16) - Imm = Imm.ashr(16); + unsigned NewOpc = isVGPRCopy ? Is64Bit ? AMDGPU::V_MOV_B64_PSEUDO + : AMDGPU::V_MOV_B32_e32 + : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO + : AMDGPU::S_MOV_B32; + APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1))); if (RI.isAGPR(*MRI, DstReg)) { - if (!isInlineConstant(Imm)) + if (Is64Bit || !isInlineConstant(Imm)) return false; NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32_e64; } @@ -3209,14 +3443,32 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); // Multiplied part is the constant: Use v_madmk_{f16, f32}. - // We should only expect these to be on src0 due to canonicalization. - if (Src0->isReg() && Src0->getReg() == Reg) { - if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))) + if ((Src0->isReg() && Src0->getReg() == Reg) || + (Src1->isReg() && Src1->getReg() == Reg)) { + MachineOperand *RegSrc = + Src1->isReg() && Src1->getReg() == Reg ? Src0 : Src1; + if (!RegSrc->isReg()) + return false; + if (RI.isSGPRClass(MRI->getRegClass(RegSrc->getReg())) && + ST.getConstantBusLimit(Opc) < 2) return false; if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))) return false; + // If src2 is also a literal constant then we have to choose which one to + // fold. In general it is better to choose madak so that the other literal + // can be materialized in an sgpr instead of a vgpr: + // s_mov_b32 s0, literal + // v_madak_f32 v0, s0, v0, literal + // Instead of: + // v_mov_b32 v1, literal + // v_madmk_f32 v0, v0, literal, v1 + MachineInstr *Def = MRI->getUniqueVRegDef(Src2->getReg()); + if (Def && Def->isMoveImmediate() && + !isInlineConstant(Def->getOperand(1))) + return false; + unsigned NewOpc = IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 @@ -3225,18 +3477,22 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (pseudoToMCOpcode(NewOpc) == -1) return false; - // We need to swap operands 0 and 1 since madmk constant is at operand 1. + // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite + // would also require restricting their register classes. For now + // just bail out. + if (NewOpc == AMDGPU::V_FMAMK_F16_t16) + return false; - const int64_t Imm = ImmOp->getImm(); + const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1); // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. - Register Src1Reg = Src1->getReg(); - unsigned Src1SubReg = Src1->getSubReg(); - Src0->setReg(Src1Reg); - Src0->setSubReg(Src1SubReg); - Src0->setIsKill(Src1->isKill()); + Register SrcReg = RegSrc->getReg(); + unsigned SrcSubReg = RegSrc->getSubReg(); + Src0->setReg(SrcReg); + Src0->setSubReg(SrcSubReg); + Src0->setIsKill(RegSrc->isKill()); if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || @@ -3258,43 +3514,38 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // Added part is the constant: Use v_madak_{f16, f32}. if (Src2->isReg() && Src2->getReg() == Reg) { - // Not allowed to use constant bus for another operand. - // We can however allow an inline immediate as src0. - bool Src0Inlined = false; - if (Src0->isReg()) { - // Try to inline constant if possible. - // If the Def moves immediate and the use is single - // We are saving VGPR here. - MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); - if (Def && Def->isMoveImmediate() && - isInlineConstant(Def->getOperand(1)) && - MRI->hasOneUse(Src0->getReg())) { - Src0->ChangeToImmediate(Def->getOperand(1).getImm()); - Src0Inlined = true; - } else if ((Src0->getReg().isPhysical() && - (ST.getConstantBusLimit(Opc) <= 1 && - RI.isSGPRClass(RI.getPhysRegBaseClass(Src0->getReg())))) || - (Src0->getReg().isVirtual() && - (ST.getConstantBusLimit(Opc) <= 1 && - RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))) - return false; + if (ST.getConstantBusLimit(Opc) < 2) { + // Not allowed to use constant bus for another operand. + // We can however allow an inline immediate as src0. + bool Src0Inlined = false; + if (Src0->isReg()) { + // Try to inline constant if possible. + // If the Def moves immediate and the use is single + // We are saving VGPR here. + MachineInstr *Def = MRI->getUniqueVRegDef(Src0->getReg()); + if (Def && Def->isMoveImmediate() && + isInlineConstant(Def->getOperand(1)) && + MRI->hasOneUse(Src0->getReg())) { + Src0->ChangeToImmediate(Def->getOperand(1).getImm()); + Src0Inlined = true; + } else if (ST.getConstantBusLimit(Opc) <= 1 && + RI.isSGPRReg(*MRI, Src0->getReg())) { + return false; + } // VGPR is okay as Src0 - fallthrough - } + } - if (Src1->isReg() && !Src0Inlined ) { - // We have one slot for inlinable constant so far - try to fill it - MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); - if (Def && Def->isMoveImmediate() && - isInlineConstant(Def->getOperand(1)) && - MRI->hasOneUse(Src1->getReg()) && - commuteInstruction(UseMI)) { + if (Src1->isReg() && !Src0Inlined) { + // We have one slot for inlinable constant so far - try to fill it + MachineInstr *Def = MRI->getUniqueVRegDef(Src1->getReg()); + if (Def && Def->isMoveImmediate() && + isInlineConstant(Def->getOperand(1)) && + MRI->hasOneUse(Src1->getReg()) && commuteInstruction(UseMI)) Src0->ChangeToImmediate(Def->getOperand(1).getImm()); - } else if ((Src1->getReg().isPhysical() && - RI.isSGPRClass(RI.getPhysRegBaseClass(Src1->getReg()))) || - (Src1->getReg().isVirtual() && - RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))) - return false; + else if (RI.isSGPRReg(*MRI, Src1->getReg())) + return false; // VGPR is okay as Src1 - fallthrough + } } unsigned NewOpc = @@ -3305,7 +3556,11 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, if (pseudoToMCOpcode(NewOpc) == -1) return false; - const int64_t Imm = ImmOp->getImm(); + // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite + // would also require restricting their register classes. For now + // just bail out. + if (NewOpc == AMDGPU::V_FMAAK_F16_t16) + return false; // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. @@ -3317,7 +3572,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); // ChangingToImmediate adds Src2 back to the instruction. - Src2->ChangeToImmediate(Imm); + Src2->ChangeToImmediate(getImmFor(*Src2)); // These come before src2. removeModOperands(UseMI); @@ -3412,19 +3667,30 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa, if (isMUBUF(MIb) || isMTBUF(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(MIb) && !isSMRD(MIb); + if (isFLAT(MIb)) + return isFLATScratch(MIb); + + return !isSMRD(MIb); } if (isSMRD(MIa)) { if (isSMRD(MIb)) return checkInstOffsetsDoNotOverlap(MIa, MIb); - return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb); + if (isFLAT(MIb)) + return isFLATScratch(MIb); + + return !isMUBUF(MIb) && !isMTBUF(MIb); } if (isFLAT(MIa)) { - if (isFLAT(MIb)) + if (isFLAT(MIb)) { + if ((isFLATScratch(MIa) && isFLATGlobal(MIb)) || + (isFLATGlobal(MIa) && isFLATScratch(MIb))) + return true; + return checkInstOffsetsDoNotOverlap(MIa, MIb); + } return false; } @@ -3731,13 +3997,7 @@ bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI, } bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const { - return Opcode == AMDGPU::DS_ORDERED_COUNT || - Opcode == AMDGPU::DS_GWS_INIT || - Opcode == AMDGPU::DS_GWS_SEMA_V || - Opcode == AMDGPU::DS_GWS_SEMA_BR || - Opcode == AMDGPU::DS_GWS_SEMA_P || - Opcode == AMDGPU::DS_GWS_SEMA_RELEASE_ALL || - Opcode == AMDGPU::DS_GWS_BARRIER; + return Opcode == AMDGPU::DS_ORDERED_COUNT || isGWS(Opcode); } bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) { @@ -3782,7 +4042,9 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const // However, executing them with EXEC = 0 causes them to operate on undefined // data, which we avoid by returning true here. if (Opcode == AMDGPU::V_READFIRSTLANE_B32 || - Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32) + Opcode == AMDGPU::V_READLANE_B32 || Opcode == AMDGPU::V_WRITELANE_B32 || + Opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR || + Opcode == AMDGPU::SI_SPILL_S32_TO_VGPR) return true; return false; @@ -3836,9 +4098,7 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const { assert(!MO.isReg() && "isInlineConstant called on register operand!"); - if (!MO.isImm() || - OperandType < AMDGPU::OPERAND_SRC_FIRST || - OperandType > AMDGPU::OPERAND_SRC_LAST) + if (!MO.isImm()) return false; // MachineOperand provides no way to tell the true operand size, since it only @@ -3886,12 +4146,15 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, case AMDGPU::OPERAND_REG_IMM_V2INT16: case AMDGPU::OPERAND_REG_INLINE_C_V2INT16: case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16: - // This suffers the same problem as the scalar 16-bit cases. - return AMDGPU::isInlinableIntLiteralV216(Imm); + return (isInt<16>(Imm) || isUInt<16>(Imm)) && + AMDGPU::isInlinableIntLiteral((int16_t)Imm); case AMDGPU::OPERAND_REG_IMM_FP16: case AMDGPU::OPERAND_REG_IMM_FP16_DEFERRED: case AMDGPU::OPERAND_REG_INLINE_C_FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_FP16: { + case AMDGPU::OPERAND_REG_INLINE_AC_FP16: + case AMDGPU::OPERAND_REG_IMM_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: + case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { if (isInt<16>(Imm) || isUInt<16>(Imm)) { // A few special case instructions have 16-bit operands on subtargets // where 16-bit instructions are not legal. @@ -3904,17 +4167,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, return false; } - case AMDGPU::OPERAND_REG_IMM_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: - case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: { - uint32_t Trunc = static_cast<uint32_t>(Imm); - return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm()); - } case AMDGPU::OPERAND_KIMM32: case AMDGPU::OPERAND_KIMM16: return false; + case AMDGPU::OPERAND_INPUT_MODS: + case MCOI::OPERAND_IMMEDIATE: + // Always embedded in the instruction for free. + return true; + case MCOI::OPERAND_UNKNOWN: + case MCOI::OPERAND_REGISTER: + case MCOI::OPERAND_PCREL: + case MCOI::OPERAND_GENERIC_0: + case MCOI::OPERAND_GENERIC_1: + case MCOI::OPERAND_GENERIC_2: + case MCOI::OPERAND_GENERIC_3: + case MCOI::OPERAND_GENERIC_4: + case MCOI::OPERAND_GENERIC_5: + // Just ignore anything else. + return true; default: - llvm_unreachable("invalid bitwidth"); + llvm_unreachable("invalid operand type"); } } @@ -4163,7 +4435,9 @@ static bool shouldReadExec(const MachineInstr &MI) { if (SIInstrInfo::isVALU(MI)) { switch (MI.getOpcode()) { case AMDGPU::V_READLANE_B32: + case AMDGPU::SI_RESTORE_S32_FROM_VGPR: case AMDGPU::V_WRITELANE_B32: + case AMDGPU::SI_SPILL_S32_TO_VGPR: return false; } @@ -4788,20 +5062,10 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, } } - int DstIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vdst); - if (Opcode != AMDGPU::V_MOV_B64_DPP_PSEUDO && - ((DstIdx >= 0 && - (Desc.operands()[DstIdx].RegClass == AMDGPU::VReg_64RegClassID || - Desc.operands()[DstIdx].RegClass == - AMDGPU::VReg_64_Align2RegClassID)) || - ((Src0Idx >= 0 && - (Desc.operands()[Src0Idx].RegClass == AMDGPU::VReg_64RegClassID || - Desc.operands()[Src0Idx].RegClass == - AMDGPU::VReg_64_Align2RegClassID)))) && - !AMDGPU::isLegal64BitDPPControl(DC)) { + !AMDGPU::isLegalDPALU_DPPControl(DC) && AMDGPU::isDPALU_DPP(Desc)) { ErrInfo = "Invalid dpp_ctrl value: " - "64 bit dpp only support row_newbcast"; + "DP ALU dpp only support row_newbcast"; return false; } } @@ -4969,6 +5233,64 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64; case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ; case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ; + case AMDGPU::S_CVT_F32_I32: return AMDGPU::V_CVT_F32_I32_e64; + case AMDGPU::S_CVT_F32_U32: return AMDGPU::V_CVT_F32_U32_e64; + case AMDGPU::S_CVT_I32_F32: return AMDGPU::V_CVT_I32_F32_e64; + case AMDGPU::S_CVT_U32_F32: return AMDGPU::V_CVT_U32_F32_e64; + case AMDGPU::S_CVT_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64; + case AMDGPU::S_CVT_HI_F32_F16: return AMDGPU::V_CVT_F32_F16_t16_e64; + case AMDGPU::S_CVT_F16_F32: return AMDGPU::V_CVT_F16_F32_t16_e64; + case AMDGPU::S_CEIL_F32: return AMDGPU::V_CEIL_F32_e64; + case AMDGPU::S_FLOOR_F32: return AMDGPU::V_FLOOR_F32_e64; + case AMDGPU::S_TRUNC_F32: return AMDGPU::V_TRUNC_F32_e64; + case AMDGPU::S_RNDNE_F32: return AMDGPU::V_RNDNE_F32_e64; + case AMDGPU::S_CEIL_F16: return AMDGPU::V_CEIL_F16_t16_e64; + case AMDGPU::S_FLOOR_F16: return AMDGPU::V_FLOOR_F16_t16_e64; + case AMDGPU::S_TRUNC_F16: return AMDGPU::V_TRUNC_F16_t16_e64; + case AMDGPU::S_RNDNE_F16: return AMDGPU::V_RNDNE_F16_t16_e64; + case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64; + case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64; + case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64; + case AMDGPU::S_MAX_F32: return AMDGPU::V_MAX_F32_e64; + case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64; + case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64; + case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64; + case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64; + case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64; + case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64; + case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64; + case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64; + case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64; + case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32; + case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32; + case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64; + case AMDGPU::S_CMP_EQ_F32: return AMDGPU::V_CMP_EQ_F32_e64; + case AMDGPU::S_CMP_LE_F32: return AMDGPU::V_CMP_LE_F32_e64; + case AMDGPU::S_CMP_GT_F32: return AMDGPU::V_CMP_GT_F32_e64; + case AMDGPU::S_CMP_LG_F32: return AMDGPU::V_CMP_LG_F32_e64; + case AMDGPU::S_CMP_GE_F32: return AMDGPU::V_CMP_GE_F32_e64; + case AMDGPU::S_CMP_O_F32: return AMDGPU::V_CMP_O_F32_e64; + case AMDGPU::S_CMP_U_F32: return AMDGPU::V_CMP_U_F32_e64; + case AMDGPU::S_CMP_NGE_F32: return AMDGPU::V_CMP_NGE_F32_e64; + case AMDGPU::S_CMP_NLG_F32: return AMDGPU::V_CMP_NLG_F32_e64; + case AMDGPU::S_CMP_NGT_F32: return AMDGPU::V_CMP_NGT_F32_e64; + case AMDGPU::S_CMP_NLE_F32: return AMDGPU::V_CMP_NLE_F32_e64; + case AMDGPU::S_CMP_NEQ_F32: return AMDGPU::V_CMP_NEQ_F32_e64; + case AMDGPU::S_CMP_NLT_F32: return AMDGPU::V_CMP_NLT_F32_e64; + case AMDGPU::S_CMP_LT_F16: return AMDGPU::V_CMP_LT_F16_t16_e64; + case AMDGPU::S_CMP_EQ_F16: return AMDGPU::V_CMP_EQ_F16_t16_e64; + case AMDGPU::S_CMP_LE_F16: return AMDGPU::V_CMP_LE_F16_t16_e64; + case AMDGPU::S_CMP_GT_F16: return AMDGPU::V_CMP_GT_F16_t16_e64; + case AMDGPU::S_CMP_LG_F16: return AMDGPU::V_CMP_LG_F16_t16_e64; + case AMDGPU::S_CMP_GE_F16: return AMDGPU::V_CMP_GE_F16_t16_e64; + case AMDGPU::S_CMP_O_F16: return AMDGPU::V_CMP_O_F16_t16_e64; + case AMDGPU::S_CMP_U_F16: return AMDGPU::V_CMP_U_F16_t16_e64; + case AMDGPU::S_CMP_NGE_F16: return AMDGPU::V_CMP_NGE_F16_t16_e64; + case AMDGPU::S_CMP_NLG_F16: return AMDGPU::V_CMP_NLG_F16_t16_e64; + case AMDGPU::S_CMP_NGT_F16: return AMDGPU::V_CMP_NGT_F16_t16_e64; + case AMDGPU::S_CMP_NLE_F16: return AMDGPU::V_CMP_NLE_F16_t16_e64; + case AMDGPU::S_CMP_NEQ_F16: return AMDGPU::V_CMP_NEQ_F16_t16_e64; + case AMDGPU::S_CMP_NLT_F16: return AMDGPU::V_CMP_NLT_F16_t16_e64; } llvm_unreachable( "Unexpected scalar opcode without corresponding vector one!"); @@ -5123,13 +5445,10 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const { MO.ChangeToRegister(Reg, false); } -unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, - MachineRegisterInfo &MRI, - MachineOperand &SuperReg, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) - const { +unsigned SIInstrInfo::buildExtractSubReg( + MachineBasicBlock::iterator MI, MachineRegisterInfo &MRI, + const MachineOperand &SuperReg, const TargetRegisterClass *SuperRC, + unsigned SubIdx, const TargetRegisterClass *SubRC) const { MachineBasicBlock *MBB = MI->getParent(); DebugLoc DL = MI->getDebugLoc(); Register SubReg = MRI.createVirtualRegister(SubRC); @@ -5156,12 +5475,9 @@ unsigned SIInstrInfo::buildExtractSubReg(MachineBasicBlock::iterator MI, } MachineOperand SIInstrInfo::buildExtractSubRegOrImm( - MachineBasicBlock::iterator MII, - MachineRegisterInfo &MRI, - MachineOperand &Op, - const TargetRegisterClass *SuperRC, - unsigned SubIdx, - const TargetRegisterClass *SubRC) const { + MachineBasicBlock::iterator MII, MachineRegisterInfo &MRI, + const MachineOperand &Op, const TargetRegisterClass *SuperRC, + unsigned SubIdx, const TargetRegisterClass *SubRC) const { if (Op.isImm()) { if (SubIdx == AMDGPU::sub0) return MachineOperand::CreateImm(static_cast<int32_t>(Op.getImm())); @@ -5256,9 +5572,8 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return false; SGPRsUsed.insert(SGPR); } - } else if (InstDesc.operands()[i].OperandType == AMDGPU::OPERAND_KIMM32 || - (AMDGPU::isSISrcOperand(InstDesc, i) && - !isInlineConstant(Op, InstDesc.operands()[i]))) { + } else if (AMDGPU::isSISrcOperand(InstDesc, i) && + !isInlineConstant(Op, InstDesc.operands()[i])) { if (!LiteralLimit--) return false; if (--ConstantBusLimit <= 0) @@ -5306,6 +5621,27 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return true; } + if (MO->isImm()) { + uint64_t Imm = MO->getImm(); + bool Is64BitFPOp = OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_FP64; + bool Is64BitOp = Is64BitFPOp || + OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_INT64 || + OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2INT32 || + OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; + if (Is64BitOp && + !AMDGPU::isInlinableLiteral64(Imm, ST.hasInv2PiInlineImm())) { + if (!AMDGPU::isValid32BitLiteral(Imm, Is64BitFPOp)) + return false; + + // FIXME: We can use sign extended 64-bit literals, but only for signed + // operands. At the moment we do not know if an operand is signed. + // Such operand will be encoded as its low 32 bits and then either + // correctly sign extended or incorrectly zero extended by HW. + if (!Is64BitFPOp && (int32_t)Imm < 0) + return false; + } + } + // Handle non-register types that are treated like immediates. assert(MO->isImm() || MO->isTargetIndex() || MO->isFI() || MO->isGlobal()); @@ -5363,6 +5699,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI, if (Src1.isReg() && RI.isAGPR(MRI, Src1.getReg())) legalizeOpWithMove(MI, Src1Idx); + // Special case: V_FMAC_F32 and V_FMAC_F16 have src2. + if (Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F16_e32) { + int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + if (!RI.isVGPR(MRI, MI.getOperand(Src2Idx).getReg())) + legalizeOpWithMove(MI, Src2Idx); + } + // VOP2 src0 instructions support all operand types, so we don't need to check // their legality. If src1 is already legal, we don't need to do anything. if (isLegalRegOperand(MRI, InstrDesc.operands()[Src1Idx], Src1)) @@ -5512,6 +5855,11 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI, // legalize it. legalizeOpWithMove(MI, Idx); } + + // Special case: V_FMAC_F32 and V_FMAC_F16 have src2 tied to vdst. + if ((Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) && + !RI.isVGPR(MRI, MI.getOperand(VOP3Idx[2]).getReg())) + legalizeOpWithMove(MI, VOP3Idx[2]); } Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI, @@ -5883,6 +6231,17 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + // Save SCC. Waterfall Loop may overwrite SCC. + Register SaveSCCReg; + bool SCCNotDead = (MBB.computeRegisterLiveness(TRI, AMDGPU::SCC, MI, 30) != + MachineBasicBlock::LQR_Dead); + if (SCCNotDead) { + SaveSCCReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(MBB, Begin, DL, TII.get(AMDGPU::S_CSELECT_B32), SaveSCCReg) + .addImm(1) + .addImm(0); + } + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); // Save the EXEC mask @@ -5938,8 +6297,15 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI, emitLoadScalarOpsFromVGPRLoop(TII, MRI, MBB, *LoopBB, *BodyBB, DL, ScalarOps); - // Restore the EXEC mask MachineBasicBlock::iterator First = RemainderBB->begin(); + // Restore SCC + if (SCCNotDead) { + BuildMI(*RemainderBB, First, DL, TII.get(AMDGPU::S_CMP_LG_U32)) + .addReg(SaveSCCReg, RegState::Kill) + .addImm(0); + } + + // Restore the EXEC mask BuildMI(*RemainderBB, First, DL, TII.get(MovExecOpc), Exec).addReg(SaveExec); return BodyBB; } @@ -6124,6 +6490,18 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI, return CreatedBB; } + // Legalize S_BITREPLICATE, S_QUADMASK and S_WQM + if (MI.getOpcode() == AMDGPU::S_BITREPLICATE_B64_B32 || + MI.getOpcode() == AMDGPU::S_QUADMASK_B32 || + MI.getOpcode() == AMDGPU::S_QUADMASK_B64 || + MI.getOpcode() == AMDGPU::S_WQM_B32 || + MI.getOpcode() == AMDGPU::S_WQM_B64) { + MachineOperand &Src = MI.getOperand(1); + if (Src.isReg() && RI.hasVectorRegisters(MRI.getRegClass(Src.getReg()))) + Src.setReg(readlaneVGPRToSGPR(Src.getReg(), MI, MRI)); + return CreatedBB; + } + // Legalize MIMG and MUBUF/MTBUF for shaders. // // Shaders only generate MUBUF/MTBUF instructions via intrinsics or via @@ -6391,10 +6769,11 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, default: break; case AMDGPU::S_ADD_U64_PSEUDO: + NewOpcode = AMDGPU::V_ADD_U64_PSEUDO; + break; case AMDGPU::S_SUB_U64_PSEUDO: - splitScalar64BitAddSub(Worklist, Inst, MDT); - Inst.eraseFromParent(); - return; + NewOpcode = AMDGPU::V_SUB_U64_PSEUDO; + break; case AMDGPU::S_ADD_I32: case AMDGPU::S_SUB_I32: { // FIXME: The u32 versions currently selected use the carry. @@ -6644,21 +7023,78 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, case AMDGPU::S_CMP_LT_U32: case AMDGPU::S_CMP_LE_U32: case AMDGPU::S_CMP_EQ_U64: - case AMDGPU::S_CMP_LG_U64: { - const MCInstrDesc &NewDesc = get(NewOpcode); + case AMDGPU::S_CMP_LG_U64: + case AMDGPU::S_CMP_LT_F32: + case AMDGPU::S_CMP_EQ_F32: + case AMDGPU::S_CMP_LE_F32: + case AMDGPU::S_CMP_GT_F32: + case AMDGPU::S_CMP_LG_F32: + case AMDGPU::S_CMP_GE_F32: + case AMDGPU::S_CMP_O_F32: + case AMDGPU::S_CMP_U_F32: + case AMDGPU::S_CMP_NGE_F32: + case AMDGPU::S_CMP_NLG_F32: + case AMDGPU::S_CMP_NGT_F32: + case AMDGPU::S_CMP_NLE_F32: + case AMDGPU::S_CMP_NEQ_F32: + case AMDGPU::S_CMP_NLT_F32: + case AMDGPU::S_CMP_LT_F16: + case AMDGPU::S_CMP_EQ_F16: + case AMDGPU::S_CMP_LE_F16: + case AMDGPU::S_CMP_GT_F16: + case AMDGPU::S_CMP_LG_F16: + case AMDGPU::S_CMP_GE_F16: + case AMDGPU::S_CMP_O_F16: + case AMDGPU::S_CMP_U_F16: + case AMDGPU::S_CMP_NGE_F16: + case AMDGPU::S_CMP_NLG_F16: + case AMDGPU::S_CMP_NGT_F16: + case AMDGPU::S_CMP_NLE_F16: + case AMDGPU::S_CMP_NEQ_F16: + case AMDGPU::S_CMP_NLT_F16: { Register CondReg = MRI.createVirtualRegister(RI.getWaveMaskRegClass()); - MachineInstr *NewInstr = - BuildMI(*MBB, Inst, Inst.getDebugLoc(), NewDesc, CondReg) - .add(Inst.getOperand(0)) - .add(Inst.getOperand(1)); + auto NewInstr = + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode), CondReg) + .setMIFlags(Inst.getFlags()); + if (AMDGPU::getNamedOperandIdx(NewOpcode, + AMDGPU::OpName::src0_modifiers) >= 0) { + NewInstr + .addImm(0) // src0_modifiers + .add(Inst.getOperand(0)) // src0 + .addImm(0) // src1_modifiers + .add(Inst.getOperand(1)) // src1 + .addImm(0); // clamp + } else { + NewInstr + .add(Inst.getOperand(0)) + .add(Inst.getOperand(1)); + } legalizeOperands(*NewInstr, MDT); int SCCIdx = Inst.findRegisterDefOperandIdx(AMDGPU::SCC); MachineOperand SCCOp = Inst.getOperand(SCCIdx); addSCCDefUsersToVALUWorklist(SCCOp, Inst, Worklist, CondReg); Inst.eraseFromParent(); + return; } + case AMDGPU::S_CVT_HI_F32_F16: { + const DebugLoc &DL = Inst.getDebugLoc(); + Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg) + .addImm(16) + .add(Inst.getOperand(1)); + BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst) + .addImm(0) // src0_modifiers + .addReg(TmpReg) + .addImm(0) // clamp + .addImm(0); // omod + + MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst); + addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist); + Inst.eraseFromParent(); return; } + } if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) { // We cannot move this instruction to the VALU, so we should try to @@ -6702,8 +7138,61 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, // Use the new VALU Opcode. auto NewInstr = BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(NewOpcode)) .setMIFlags(Inst.getFlags()); - for (const MachineOperand &Op : Inst.explicit_operands()) - NewInstr->addOperand(Op); + if (isVOP3(NewOpcode)) { + // Intersperse VOP3 modifiers among the SALU operands. + NewInstr->addOperand(Inst.getOperand(0)); + if (AMDGPU::getNamedOperandIdx(NewOpcode, + AMDGPU::OpName::src0_modifiers) >= 0) + NewInstr.addImm(0); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src0) >= 0) + NewInstr->addOperand(Inst.getOperand(1)); + + if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { + // We are converting these to a BFE, so we need to add the missing + // operands for the size and offset. + unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; + NewInstr.addImm(0); + NewInstr.addImm(Size); + } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { + // The VALU version adds the second operand to the result, so insert an + // extra 0 operand. + NewInstr.addImm(0); + } else if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { + const MachineOperand &OffsetWidthOp = Inst.getOperand(2); + // If we need to move this to VGPRs, we need to unpack the second + // operand back into the 2 separate ones for bit offset and width. + assert(OffsetWidthOp.isImm() && + "Scalar BFE is only implemented for constant width and offset"); + uint32_t Imm = OffsetWidthOp.getImm(); + + uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. + uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. + NewInstr.addImm(Offset); + NewInstr.addImm(BitWidth); + } else { + if (AMDGPU::getNamedOperandIdx(NewOpcode, + AMDGPU::OpName::src1_modifiers) >= 0) + NewInstr.addImm(0); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src1) >= 0) + NewInstr->addOperand(Inst.getOperand(2)); + if (AMDGPU::getNamedOperandIdx(NewOpcode, + AMDGPU::OpName::src2_modifiers) >= 0) + NewInstr.addImm(0); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::src2) >= 0) + NewInstr->addOperand(Inst.getOperand(3)); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::clamp) >= 0) + NewInstr.addImm(0); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::omod) >= 0) + NewInstr.addImm(0); + if (AMDGPU::getNamedOperandIdx(NewOpcode, AMDGPU::OpName::op_sel) >= 0) + NewInstr.addImm(0); + } + } else { + // Just copy the SALU operands. + for (const MachineOperand &Op : Inst.explicit_operands()) + NewInstr->addOperand(Op); + } + // Remove any references to SCC. Vector instructions can't read from it, and // We're just about to add the implicit use / defs of VCC, and we don't want // both. @@ -6727,30 +7216,6 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist, NewDstReg = MRI.createVirtualRegister(NewDstRC); MRI.replaceRegWith(DstReg, NewDstReg); } - if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { - // We are converting these to a BFE, so we need to add the missing - // operands for the size and offset. - unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; - NewInstr.addImm(0); - NewInstr.addImm(Size); - } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { - // The VALU version adds the second operand to the result, so insert an - // extra 0 operand. - NewInstr.addImm(0); - } - if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = NewInstr->getOperand(2); - // If we need to move this to VGPRs, we need to unpack the second operand - // back into the 2 separate ones for bit offset and width. - assert(OffsetWidthOp.isImm() && - "Scalar BFE is only implemented for constant width and offset"); - uint32_t Imm = OffsetWidthOp.getImm(); - uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. - uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - NewInstr->removeOperand(2); - NewInstr.addImm(Offset); - NewInstr.addImm(BitWidth); - } fixImplicitOperands(*NewInstr); // Legalize the operands legalizeOperands(*NewInstr, MDT); @@ -6808,27 +7273,27 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, MachineOperand &Src1 = Inst.getOperand(2); MachineOperand &Cond = Inst.getOperand(3); - Register SCCSource = Cond.getReg(); - bool IsSCC = (SCCSource == AMDGPU::SCC); + Register CondReg = Cond.getReg(); + bool IsSCC = (CondReg == AMDGPU::SCC); // If this is a trivial select where the condition is effectively not SCC - // (SCCSource is a source of copy to SCC), then the select is semantically - // equivalent to copying SCCSource. Hence, there is no need to create + // (CondReg is a source of copy to SCC), then the select is semantically + // equivalent to copying CondReg. Hence, there is no need to create // V_CNDMASK, we can just use that and bail out. if (!IsSCC && Src0.isImm() && (Src0.getImm() == -1) && Src1.isImm() && (Src1.getImm() == 0)) { - MRI.replaceRegWith(Dest.getReg(), SCCSource); + MRI.replaceRegWith(Dest.getReg(), CondReg); return; } - const TargetRegisterClass *TC = - RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - - Register CopySCC = MRI.createVirtualRegister(TC); - + Register NewCondReg = CondReg; if (IsSCC) { + const TargetRegisterClass *TC = + RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + NewCondReg = MRI.createVirtualRegister(TC); + // Now look for the closest SCC def if it is a copy - // replacing the SCCSource with the COPY source register + // replacing the CondReg with the COPY source register bool CopyFound = false; for (MachineInstr &CandI : make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)), @@ -6836,7 +7301,7 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1) { if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) { - BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC) + BuildMI(MBB, MII, DL, get(AMDGPU::COPY), NewCondReg) .addReg(CandI.getOperand(1).getReg()); CopyFound = true; } @@ -6851,24 +7316,31 @@ void SIInstrInfo::lowerSelect(SIInstrWorklist &Worklist, MachineInstr &Inst, unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32; auto NewSelect = - BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0); + BuildMI(MBB, MII, DL, get(Opcode), NewCondReg).addImm(-1).addImm(0); NewSelect->getOperand(3).setIsUndef(Cond.isUndef()); } } - Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - - auto UpdatedInst = - BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg) - .addImm(0) - .add(Src1) // False - .addImm(0) - .add(Src0) // True - .addReg(IsSCC ? CopySCC : SCCSource); - - MRI.replaceRegWith(Dest.getReg(), ResultReg); - legalizeOperands(*UpdatedInst, MDT); - addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); + Register NewDestReg = MRI.createVirtualRegister( + RI.getEquivalentVGPRClass(MRI.getRegClass(Dest.getReg()))); + MachineInstr *NewInst; + if (Inst.getOpcode() == AMDGPU::S_CSELECT_B32) { + NewInst = BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), NewDestReg) + .addImm(0) + .add(Src1) // False + .addImm(0) + .add(Src0) // True + .addReg(NewCondReg); + } else { + NewInst = + BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B64_PSEUDO), NewDestReg) + .add(Src1) // False + .add(Src0) // True + .addReg(NewCondReg); + } + MRI.replaceRegWith(Dest.getReg(), NewDestReg); + legalizeOperands(*NewInst, MDT); + addUsersToMoveToVALUWorklist(NewDestReg, MRI, Worklist); } void SIInstrInfo::lowerScalarAbs(SIInstrWorklist &Worklist, @@ -8011,9 +8483,26 @@ unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, return AMDGPU::COPY; } -bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { - return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && - MI.modifiesRegister(AMDGPU::EXEC, &RI); +bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI, + Register Reg) const { + // We need to handle instructions which may be inserted during register + // allocation to handle the prolog. The initial prolog instruction may have + // been separated from the start of the block by spills and copies inserted + // needed by the prolog. However, the insertions for scalar registers can + // always be placed at the BB top as they are independent of the exec mask + // value. + bool IsNullOrVectorRegister = true; + if (Reg) { + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + IsNullOrVectorRegister = !RI.isSGPRClass(RI.getRegClassForReg(MRI, Reg)); + } + + uint16_t Opc = MI.getOpcode(); + // FIXME: Copies inserted in the block prolog for live-range split should also + // be included. + return IsNullOrVectorRegister && + (isSpillOpcode(Opc) || (!MI.isTerminator() && Opc != AMDGPU::COPY && + MI.modifiesRegister(AMDGPU::EXEC, &RI))); } MachineInstrBuilder @@ -8254,6 +8743,8 @@ static unsigned subtargetEncodingFamily(const GCNSubtarget &ST) { return SIEncodingFamily::GFX10; case AMDGPUSubtarget::GFX11: return SIEncodingFamily::GFX11; + case AMDGPUSubtarget::GFX12: + return SIEncodingFamily::GFX12; } llvm_unreachable("Unknown subtarget generation!"); } @@ -8313,6 +8804,12 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); + // TODO-GFX12: Remove this. + // Hack to allow some GFX12 codegen tests to run before all the encodings are + // implemented. + if (MCOp == (uint16_t)-1 && Gen == SIEncodingFamily::GFX12) + MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX11); + // -1 means that Opcode is already a native instruction. if (MCOp == -1) return Opcode; @@ -8603,9 +9100,8 @@ unsigned SIInstrInfo::getInstrLatency(const InstrItineraryData *ItinData, InstructionUniformity SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { unsigned opcode = MI.getOpcode(); - if (opcode == AMDGPU::G_INTRINSIC || - opcode == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS) { - auto IID = static_cast<Intrinsic::ID>(MI.getIntrinsicID()); + if (auto *GI = dyn_cast<GIntrinsic>(&MI)) { + auto IID = GI->getIntrinsicID(); if (AMDGPU::isIntrinsicSourceOfDivergence(IID)) return InstructionUniformity::NeverUniform; if (AMDGPU::isIntrinsicAlwaysUniform(IID)) @@ -8643,7 +9139,8 @@ SIInstrInfo::getGenericInstructionUniformity(const MachineInstr &MI) const { if (SIInstrInfo::isGenericAtomicRMWOpcode(opcode) || opcode == AMDGPU::G_ATOMIC_CMPXCHG || - opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS) { + opcode == AMDGPU::G_ATOMIC_CMPXCHG_WITH_SUCCESS || + AMDGPU::isGenericAtomic(opcode)) { return InstructionUniformity::NeverUniform; } return InstructionUniformity::Default; @@ -8656,7 +9153,9 @@ SIInstrInfo::getInstructionUniformity(const MachineInstr &MI) const { return InstructionUniformity::NeverUniform; unsigned opcode = MI.getOpcode(); - if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32) + if (opcode == AMDGPU::V_READLANE_B32 || + opcode == AMDGPU::V_READFIRSTLANE_B32 || + opcode == AMDGPU::SI_RESTORE_S32_FROM_VGPR) return InstructionUniformity::AlwaysUniform; if (isCopyInstr(MI)) { |
