Diffstat (limited to 'lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SIInstrInfo.cpp | 455
1 file changed, 379 insertions, 76 deletions
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 26a8d22062a9..05ac67d26620 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -36,7 +37,7 @@ BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                  cl::desc("Restrict range of branch instructions (DEBUG)"));
 
 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
-  : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
+  : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
 
 //===----------------------------------------------------------------------===//
 // TargetInstrInfo callbacks
@@ -315,7 +316,8 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
   const MachineOperand *SecondDst = nullptr;
 
   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
-      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
+      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
+      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
   } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
@@ -346,6 +348,21 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
   return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
 }
 
+static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              const DebugLoc &DL, unsigned DestReg,
+                              unsigned SrcReg, bool KillSrc) {
+  MachineFunction *MF = MBB.getParent();
+  DiagnosticInfoUnsupported IllegalCopy(*MF->getFunction(),
+                                        "illegal SGPR to VGPR copy",
+                                        DL, DS_Error);
+  LLVMContext &C = MF->getFunction()->getContext();
+  C.diagnose(IllegalCopy);
+
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
+    .addReg(SrcReg, getKillRegState(KillSrc));
+}
+
 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, unsigned DestReg,
@@ -369,7 +386,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       return;
     }
 
-    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+      return;
+    }
+
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
             .addReg(SrcReg, getKillRegState(KillSrc));
     return;
@@ -391,7 +412,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       return;
     }
 
-    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
+    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+      return;
+    }
+
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
             .addReg(SrcReg, getKillRegState(KillSrc));
     return;
@@ -415,8 +440,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       Opcode = AMDGPU::S_MOV_B32;
       EltSize = 4;
     }
+
+    if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
+      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+      return;
+    }
   }
 
+
   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
 
   bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
@@ -870,9 +901,10 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     MachineInstr *MovRel =
         BuildMI(MBB, MI, DL, MovRelDesc)
             .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
-            .addOperand(MI.getOperand(2))
+            .add(MI.getOperand(2))
             .addReg(VecReg, RegState::ImplicitDefine)
-            .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+            .addReg(VecReg,
+                    RegState::Implicit | (IsUndef ? RegState::Undef : 0));
 
     const int ImpDefIdx =
         MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
@@ -897,14 +929,14 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     // constant data.
     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
                        .addReg(RegLo)
-                       .addOperand(MI.getOperand(1)));
+                       .add(MI.getOperand(1)));
 
     MachineInstrBuilder MIB = BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                                   .addReg(RegHi);
 
     if (MI.getOperand(2).getTargetFlags() == SIInstrInfo::MO_NONE)
       MIB.addImm(0);
     else
-      MIB.addOperand(MI.getOperand(2));
+      MIB.add(MI.getOperand(2));
 
     Bundler.append(MIB);
     llvm::finalizeBundle(MBB, Bundler.begin());
@@ -1290,6 +1322,13 @@ unsigned SIInstrInfo::removeBranch(MachineBasicBlock &MBB,
   return Count;
 }
 
+// Copy the flags onto the implicit condition register operand.
+static void preserveCondRegFlags(MachineOperand &CondReg,
+                                 const MachineOperand &OrigCond) {
+  CondReg.setIsUndef(OrigCond.isUndef());
+  CondReg.setIsKill(OrigCond.isKill());
+}
+
 unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
                                    MachineBasicBlock *TBB,
                                    MachineBasicBlock *FBB,
@@ -1317,9 +1356,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
     .addMBB(TBB);
 
   // Copy the flags onto the implicit condition register operand.
-  MachineOperand &CondReg = CondBr->getOperand(1);
-  CondReg.setIsUndef(Cond[1].isUndef());
-  CondReg.setIsKill(Cond[1].isKill());
+  preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
 
   if (BytesAdded)
     *BytesAdded = 4;
@@ -1351,6 +1388,157 @@ bool SIInstrInfo::reverseBranchCondition(
   return false;
 }
 
+bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+                                  ArrayRef<MachineOperand> Cond,
+                                  unsigned TrueReg, unsigned FalseReg,
+                                  int &CondCycles,
+                                  int &TrueCycles, int &FalseCycles) const {
+  switch (Cond[0].getImm()) {
+  case VCCNZ:
+  case VCCZ: {
+    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
+    assert(MRI.getRegClass(FalseReg) == RC);
+
+    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
+
+    // Limit to equal cost for branch vs. N v_cndmask_b32s.
+    return !RI.isSGPRClass(RC) && NumInsts <= 6;
+  }
+  case SCC_TRUE:
+  case SCC_FALSE: {
+    // FIXME: We could insert for VGPRs if we could replace the original compare
+    // with a vector one.
+    const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+    const TargetRegisterClass *RC = MRI.getRegClass(TrueReg);
+    assert(MRI.getRegClass(FalseReg) == RC);
+
+    int NumInsts = AMDGPU::getRegBitWidth(RC->getID()) / 32;
+
+    // Multiples of 8 can do s_cselect_b64
+    if (NumInsts % 2 == 0)
+      NumInsts /= 2;
+
+    CondCycles = TrueCycles = FalseCycles = NumInsts; // ???
+    return RI.isSGPRClass(RC);
+  }
+  default:
+    return false;
+  }
+}
+
+void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
+                               MachineBasicBlock::iterator I, const DebugLoc &DL,
+                               unsigned DstReg, ArrayRef<MachineOperand> Cond,
+                               unsigned TrueReg, unsigned FalseReg) const {
+  BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
+  if (Pred == VCCZ || Pred == SCC_FALSE) {
+    Pred = static_cast<BranchPredicate>(-Pred);
+    std::swap(TrueReg, FalseReg);
+  }
+
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(DstReg);
+  unsigned DstSize = DstRC->getSize();
+
+  if (DstSize == 4) {
+    unsigned SelOp = Pred == SCC_TRUE ?
+      AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
+
+    // Instruction's operands are backwards from what is expected.
+    MachineInstr *Select =
+      BuildMI(MBB, I, DL, get(SelOp), DstReg)
+      .addReg(FalseReg)
+      .addReg(TrueReg);
+
+    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+    return;
+  }
+
+  if (DstSize == 8 && Pred == SCC_TRUE) {
+    MachineInstr *Select =
+      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
+      .addReg(FalseReg)
+      .addReg(TrueReg);
+
+    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+    return;
+  }
+
+  static const int16_t Sub0_15[] = {
+    AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+    AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+    AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+    AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+  };
+
+  static const int16_t Sub0_15_64[] = {
+    AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
+    AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
+    AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
+    AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
+  };
+
+  unsigned SelOp = AMDGPU::V_CNDMASK_B32_e32;
+  const TargetRegisterClass *EltRC = &AMDGPU::VGPR_32RegClass;
+  const int16_t *SubIndices = Sub0_15;
+  int NElts = DstSize / 4;
+
+  // 64-bit select is only available for SALU.
+  if (Pred == SCC_TRUE) {
+    SelOp = AMDGPU::S_CSELECT_B64;
+    EltRC = &AMDGPU::SGPR_64RegClass;
+    SubIndices = Sub0_15_64;
+
+    assert(NElts % 2 == 0);
+    NElts /= 2;
+  }
+
+  MachineInstrBuilder MIB = BuildMI(
+    MBB, I, DL, get(AMDGPU::REG_SEQUENCE), DstReg);
+
+  I = MIB->getIterator();
+
+  SmallVector<unsigned, 8> Regs;
+  for (int Idx = 0; Idx != NElts; ++Idx) {
+    unsigned DstElt = MRI.createVirtualRegister(EltRC);
+    Regs.push_back(DstElt);
+
+    unsigned SubIdx = SubIndices[Idx];
+
+    MachineInstr *Select =
+      BuildMI(MBB, I, DL, get(SelOp), DstElt)
+      .addReg(FalseReg, 0, SubIdx)
+      .addReg(TrueReg, 0, SubIdx);
+    preserveCondRegFlags(Select->getOperand(3), Cond[1]);
+
+    MIB.addReg(DstElt)
+       .addImm(SubIdx);
+  }
+}
+
+bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+  case AMDGPU::V_MOV_B64_PSEUDO: {
+    // If there are additional implicit register operands, this may be used for
+    // register indexing so the source register operand isn't simply copied.
+    unsigned NumOps = MI.getDesc().getNumOperands() +
+      MI.getDesc().getNumImplicitUses();
+
+    return MI.getNumOperands() == NumOps;
+  }
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::COPY:
+    return true;
+  default:
+    return false;
+  }
+}
+
 static void removeModOperands(MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
@@ -1400,15 +1588,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 
   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
       Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
-    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
-
-    // Don't fold if we are using source modifiers. The new VOP2 instructions
-    // don't have them.
-    if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
-        hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
-        hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
+    // Don't fold if we are using source or output modifiers. The new VOP2
+    // instructions don't have them.
+    if (hasAnyModifiersSet(UseMI))
       return false;
-    }
 
     const MachineOperand &ImmOp = DefMI.getOperand(1);
 
@@ -1421,6 +1604,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     if (isInlineConstant(UseMI, *Src0, ImmOp))
       return false;
 
+    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
 
@@ -1633,20 +1817,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
 
   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+  const MachineOperand *Src0Mods =
+    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src1Mods =
+    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
+  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
 
   return BuildMI(*MBB, MI, MI.getDebugLoc(),
                  get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
-      .addOperand(*Dst)
-      .addImm(0) // Src0 mods
-      .addOperand(*Src0)
-      .addImm(0) // Src1 mods
-      .addOperand(*Src1)
+      .add(*Dst)
+      .addImm(Src0Mods ? Src0Mods->getImm() : 0)
+      .add(*Src0)
+      .addImm(Src1Mods ? Src1Mods->getImm() : 0)
+      .add(*Src1)
       .addImm(0) // Src mods
-      .addOperand(*Src2)
-      .addImm(0) // clamp
-      .addImm(0); // omod
+      .add(*Src2)
+      .addImm(Clamp ? Clamp->getImm() : 0)
+      .addImm(Omod ? Omod->getImm() : 0);
 }
 
 // It's not generally safe to move VALU instructions across these since it will
@@ -1687,7 +1877,8 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
                                         ST.hasInv2PiInlineImm());
   case 16:
-    return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
+    return ST.has16BitInsts() &&
+           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
                                         ST.hasInv2PiInlineImm());
   default:
     llvm_unreachable("invalid bitwidth");
@@ -1705,24 +1896,43 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
 
   int64_t Imm = MO.getImm();
-  switch (operandBitWidth(OperandType)) {
-  case 32: {
+  switch (OperandType) {
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
     int32_t Trunc = static_cast<int32_t>(Imm);
     return Trunc == Imm &&
            AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
   }
-  case 64: {
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
     return AMDGPU::isInlinableLiteral64(MO.getImm(),
                                         ST.hasInv2PiInlineImm());
   }
-  case 16: {
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
+      // A few special case instructions have 16-bit operands on subtargets
+      // where 16-bit instructions are not legal.
+      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
+      // constants in these cases
      int16_t Trunc = static_cast<int16_t>(Imm);
-      return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+      return ST.has16BitInsts() &&
+             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
     }
 
     return false;
   }
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+    uint32_t Trunc = static_cast<uint32_t>(Imm);
+    return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
+  }
   default:
     llvm_unreachable("invalid bitwidth");
   }
@@ -1801,6 +2011,14 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
   return Mods && Mods->getImm();
 }
 
+bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
+  return hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+         hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+         hasModifiersSet(MI, AMDGPU::OpName::src2_modifiers) ||
+         hasModifiersSet(MI, AMDGPU::OpName::clamp) ||
+         hasModifiersSet(MI, AMDGPU::OpName::omod);
+}
+
 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                   const MachineOperand &MO,
                                   const MCOperandInfo &OpInfo) const {
@@ -2238,7 +2456,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
   unsigned Reg = MRI.createVirtualRegister(VRC);
   DebugLoc DL = MBB->findDebugLoc(I);
-  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO);
+  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).add(MO);
   MO.ChangeToRegister(Reg, false);
 }
 
@@ -2564,8 +2782,8 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
     return;
 
   unsigned DstReg = MRI.createVirtualRegister(DstRC);
-  MachineInstr *Copy = BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg)
-    .addOperand(Op);
+  MachineInstr *Copy =
+      BuildMI(InsertMBB, I, DL, get(AMDGPU::COPY), DstReg).add(Op);
 
   Op.setReg(DstReg);
   Op.setSubReg(0);
@@ -2810,13 +3028,13 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
         // Regular buffer load / store.
         MachineInstrBuilder MIB =
             BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
-                .addOperand(*VData)
+                .add(*VData)
                 .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                 // This will be replaced later
                 // with the new value of vaddr.
-                .addOperand(*SRsrc)
-                .addOperand(*SOffset)
-                .addOperand(*Offset);
+                .add(*SRsrc)
+                .add(*SOffset)
+                .add(*Offset);
 
         // Atomics do not have this operand.
         if (const MachineOperand *GLC =
@@ -2836,14 +3054,14 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const {
       } else {
         // Atomics with return.
         Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode))
-                     .addOperand(*VData)
-                     .addOperand(*VDataIn)
+                     .add(*VData)
+                     .add(*VDataIn)
                      .addReg(AMDGPU::NoRegister) // Dummy value for vaddr.
                      // This will be replaced later
                      // with the new value of vaddr.
-                     .addOperand(*SRsrc)
-                     .addOperand(*SOffset)
-                     .addOperand(*Offset)
+                     .add(*SRsrc)
+                     .add(*SOffset)
+                     .add(*Offset)
                      .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc))
                      .setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
       }
@@ -2970,6 +3188,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     case AMDGPU::S_BFE_U64:
    case AMDGPU::S_BFM_B64:
       llvm_unreachable("Moving this op to VALU not implemented");
+
+    case AMDGPU::S_PACK_LL_B32_B16:
+    case AMDGPU::S_PACK_LH_B32_B16:
+    case AMDGPU::S_PACK_HH_B32_B16: {
+      movePackToVALU(Worklist, MRI, Inst);
+      Inst.eraseFromParent();
+      continue;
+    }
     }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -3027,12 +3253,15 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
     unsigned NewDstReg = AMDGPU::NoRegister;
     if (HasDst) {
+      unsigned DstReg = Inst.getOperand(0).getReg();
+      if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+        continue;
+
       // Update the destination register class.
       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
       if (!NewDstRC)
         continue;
 
-      unsigned DstReg = Inst.getOperand(0).getReg();
       if (Inst.isCopy() &&
          TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
@@ -3112,15 +3341,13 @@ void SIInstrInfo::splitScalar64BitUnaryOp(
   const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0);
 
   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
-  BuildMI(MBB, MII, DL, InstDesc, DestSub0)
-    .addOperand(SrcReg0Sub0);
+  BuildMI(MBB, MII, DL, InstDesc, DestSub0).add(SrcReg0Sub0);
 
   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                        AMDGPU::sub1, Src0SubRC);
 
   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
-  BuildMI(MBB, MII, DL, InstDesc, DestSub1)
-    .addOperand(SrcReg0Sub1);
+  BuildMI(MBB, MII, DL, InstDesc, DestSub1).add(SrcReg0Sub1);
 
   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -3174,8 +3401,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
 
   unsigned DestSub0 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0)
-    .addOperand(SrcReg0Sub0)
-    .addOperand(SrcReg1Sub0);
+                              .add(SrcReg0Sub0)
+                              .add(SrcReg1Sub0);
 
   MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC,
                                                        AMDGPU::sub1, Src0SubRC);
@@ -3184,8 +3411,8 @@ void SIInstrInfo::splitScalar64BitBinaryOp(
 
   unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC);
   MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1)
-    .addOperand(SrcReg0Sub1)
-    .addOperand(SrcReg1Sub1);
+                              .add(SrcReg0Sub1)
+                              .add(SrcReg1Sub1);
 
   unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC);
   BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg)
@@ -3231,13 +3458,9 @@ void SIInstrInfo::splitScalar64BitBCNT(
   MachineOperand SrcRegSub1 = buildExtractSubRegOrImm(MII, MRI, Src, SrcRC,
                                                       AMDGPU::sub1, SrcSubRC);
 
-  BuildMI(MBB, MII, DL, InstDesc, MidReg)
-    .addOperand(SrcRegSub0)
-    .addImm(0);
+  BuildMI(MBB, MII, DL, InstDesc, MidReg).add(SrcRegSub0).addImm(0);
 
-  BuildMI(MBB, MII, DL, InstDesc, ResultReg)
-    .addOperand(SrcRegSub1)
-    .addReg(MidReg);
+  BuildMI(MBB, MII, DL, InstDesc, ResultReg).add(SrcRegSub1).addReg(MidReg);
 
   MRI.replaceRegWith(Dest.getReg(), ResultReg);
 
@@ -3326,6 +3549,68 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
   }
 }
 
+void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+                                 MachineRegisterInfo &MRI,
+                                 MachineInstr &Inst) const {
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineBasicBlock *MBB = Inst.getParent();
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  switch (Inst.getOpcode()) {
+  case AMDGPU::S_PACK_LL_B32_B16: {
+    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+    // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
+    // 0.
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+      .addImm(0xffff);
+
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
+      .addReg(ImmReg, RegState::Kill)
+      .add(Src0);
+
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
+      .add(Src1)
+      .addImm(16)
+      .addReg(TmpReg, RegState::Kill);
+    break;
+  }
+  case AMDGPU::S_PACK_LH_B32_B16: {
+    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+      .addImm(0xffff);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
+      .addReg(ImmReg, RegState::Kill)
+      .add(Src0)
+      .add(Src1);
+    break;
+  }
+  case AMDGPU::S_PACK_HH_B32_B16: {
+    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+      .addImm(16)
+      .add(Src0);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+      .addImm(0xffff);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
+      .add(Src1)
+      .addReg(ImmReg, RegState::Kill)
+      .addReg(TmpReg, RegState::Kill);
+    break;
+  }
+  default:
+    llvm_unreachable("unhandled s_pack_* instruction");
+  }
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
     MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
   // This assumes that all the users of SCC are in the same block
@@ -3448,10 +3733,13 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
   if (ST.isAmdHsaOS()) {
-    RsrcDataFormat |= (1ULL << 56);
+    // Set ATC = 1. GFX9 doesn't have this bit.
+    if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
+      RsrcDataFormat |= (1ULL << 56);
 
-    if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
-      // Set MTYPE = 2
+    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
+    // BTW, it disables TC L2 and therefore decreases performance.
+    if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
       RsrcDataFormat |= (2ULL << 59);
   }
 
@@ -3463,11 +3751,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
                     AMDGPU::RSRC_TID_ENABLE |
                     0xffffffff; // Size;
 
-  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+  // GFX9 doesn't have ELEMENT_SIZE.
+  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
+    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
+  }
 
-  Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
-            // IndexStride = 64
-            (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
+  // IndexStride = 64.
+  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
 
   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
   // Clear them unless we want a huge stride.
@@ -3496,7 +3787,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
     return AMDGPU::NoRegister;
 
   assert(!MI.memoperands_empty() &&
-         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
+         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
 
   FrameIndex = Addr->getIndex();
   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -3552,16 +3843,11 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
   if (DescSize != 0 && DescSize != 4)
     return DescSize;
 
-  if (Opc == AMDGPU::WAVE_BARRIER)
-    return 0;
-
   // 4-byte instructions may have a 32-bit literal encoded after them. Check
   // operands that could ever be literals.
   if (isVALU(MI) || isSALU(MI)) {
-    if (isFixedSize(MI)) {
-      assert(DescSize == 4);
+    if (isFixedSize(MI))
       return DescSize;
-    }
 
     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
     if (Src0Idx == -1)
@@ -3584,7 +3870,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     return 4;
 
   switch (Opc) {
-  case AMDGPU::SI_MASK_BRANCH:
  case TargetOpcode::IMPLICIT_DEF:
   case TargetOpcode::KILL:
   case TargetOpcode::DBG_VALUE:
@@ -3609,7 +3894,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
     return true;
 
   for (const MachineMemOperand *MMO : MI.memoperands()) {
-    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
+    if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
       return true;
   }
   return false;
@@ -3640,3 +3925,21 @@ ScheduleHazardRecognizer *
 SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const {
   return new GCNHazardRecognizer(MF);
 }
+
+bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const {
+  return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY &&
+         MI.modifiesRegister(AMDGPU::EXEC, &RI);
+}
+
+MachineInstrBuilder
+SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator I,
+                           const DebugLoc &DL,
+                           unsigned DestReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  unsigned UnusedCarry = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+
+  return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_I32_e64), DestReg)
+           .addReg(UnusedCarry, RegState::Define | RegState::Dead);
+}