Diffstat (limited to 'lib/Target/AMDGPU/SIInstrInfo.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SIInstrInfo.cpp | 1736
1 file changed, 904 insertions, 832 deletions
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 1e10d25e8fb7a..d171e21c8a4fe 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -12,14 +12,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "SIInstrInfo.h"
 #include "AMDGPUTargetMachine.h"
+#include "GCNHazardRecognizer.h"
 #include "SIDefines.h"
 #include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/IR/Function.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -27,8 +28,8 @@
 
 using namespace llvm;
 
-SIInstrInfo::SIInstrInfo(const AMDGPUSubtarget &st)
-    : AMDGPUInstrInfo(st), RI() {}
+SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
+    : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
 
 //===----------------------------------------------------------------------===//
 // TargetInstrInfo callbacks
 //===----------------------------------------------------------------------===//
@@ -74,12 +75,12 @@ static bool nodesHaveSameOperandValue(SDNode *N0, SDNode* N1, unsigned OpName) {
   return N0->getOperand(Op0Idx) == N1->getOperand(Op1Idx);
 }
 
-bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
+bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
                                                     AliasAnalysis *AA) const {
   // TODO: The generic check fails for VALU instructions that should be
   // rematerializable due to implicit reads of exec. We really want all of the
   // generic logic for this except for this.
-  switch (MI->getOpcode()) {
+  switch (MI.getOpcode()) {
   case AMDGPU::V_MOV_B32_e32:
   case AMDGPU::V_MOV_B32_e64:
   case AMDGPU::V_MOV_B64_PSEUDO:
@@ -201,18 +202,18 @@ static bool isStride64(unsigned Opc) {
   }
 }
 
-bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
-                                        unsigned &Offset,
+bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &LdSt, unsigned &BaseReg,
+                                        int64_t &Offset,
                                         const TargetRegisterInfo *TRI) const {
-  unsigned Opc = LdSt->getOpcode();
+  unsigned Opc = LdSt.getOpcode();
 
-  if (isDS(*LdSt)) {
-    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
-                                                      AMDGPU::OpName::offset);
+  if (isDS(LdSt)) {
+    const MachineOperand *OffsetImm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset);
    if (OffsetImm) {
       // Normal, single offset LDS instruction.
-      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
-                                                      AMDGPU::OpName::addr);
+      const MachineOperand *AddrReg =
+          getNamedOperand(LdSt, AMDGPU::OpName::addr);
 
       BaseReg = AddrReg->getReg();
       Offset = OffsetImm->getImm();
@@ -222,10 +223,10 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
     // The 2 offset instructions use offset0 and offset1 instead. We can treat
     // these as a load with a single offset if the 2 offsets are consecutive. We
     // will use this for some partially aligned loads.
-    const MachineOperand *Offset0Imm = getNamedOperand(*LdSt,
-                                                       AMDGPU::OpName::offset0);
-    const MachineOperand *Offset1Imm = getNamedOperand(*LdSt,
-                                                       AMDGPU::OpName::offset1);
+    const MachineOperand *Offset0Imm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset0);
+    const MachineOperand *Offset1Imm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset1);
 
     uint8_t Offset0 = Offset0Imm->getImm();
     uint8_t Offset1 = Offset1Imm->getImm();
 
@@ -235,19 +236,19 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
       // to bytes of the individual reads.
       unsigned EltSize;
-      if (LdSt->mayLoad())
-        EltSize = getOpRegClass(*LdSt, 0)->getSize() / 2;
+      if (LdSt.mayLoad())
+        EltSize = getOpRegClass(LdSt, 0)->getSize() / 2;
       else {
-        assert(LdSt->mayStore());
+        assert(LdSt.mayStore());
         int Data0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
-        EltSize = getOpRegClass(*LdSt, Data0Idx)->getSize();
+        EltSize = getOpRegClass(LdSt, Data0Idx)->getSize();
       }
 
       if (isStride64(Opc))
         EltSize *= 64;
 
-      const MachineOperand *AddrReg = getNamedOperand(*LdSt,
-                                                      AMDGPU::OpName::addr);
+      const MachineOperand *AddrReg =
+          getNamedOperand(LdSt, AMDGPU::OpName::addr);
       BaseReg = AddrReg->getReg();
       Offset = EltSize * Offset0;
       return true;
@@ -256,63 +257,91 @@ bool SIInstrInfo::getMemOpBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
     return false;
   }
 
-  if (isMUBUF(*LdSt) || isMTBUF(*LdSt)) {
+  if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
     if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset) != -1)
       return false;
 
-    const MachineOperand *AddrReg = getNamedOperand(*LdSt,
-                                                    AMDGPU::OpName::vaddr);
+    const MachineOperand *AddrReg =
+        getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
     if (!AddrReg)
       return false;
 
-    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
-                                                      AMDGPU::OpName::offset);
+    const MachineOperand *OffsetImm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset);
     BaseReg = AddrReg->getReg();
     Offset = OffsetImm->getImm();
     return true;
   }
 
-  if (isSMRD(*LdSt)) {
-    const MachineOperand *OffsetImm = getNamedOperand(*LdSt,
-                                                      AMDGPU::OpName::offset);
+  if (isSMRD(LdSt)) {
+    const MachineOperand *OffsetImm =
+        getNamedOperand(LdSt, AMDGPU::OpName::offset);
     if (!OffsetImm)
       return false;
 
-    const MachineOperand *SBaseReg = getNamedOperand(*LdSt,
-                                                     AMDGPU::OpName::sbase);
+    const MachineOperand *SBaseReg =
+        getNamedOperand(LdSt, AMDGPU::OpName::sbase);
     BaseReg = SBaseReg->getReg();
     Offset = OffsetImm->getImm();
     return true;
   }
 
+  if (isFLAT(LdSt)) {
+    const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+    BaseReg = AddrReg->getReg();
+    Offset = 0;
+    return true;
+  }
+
   return false;
 }
 
-bool SIInstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
-                                     MachineInstr *SecondLdSt,
-                                     unsigned NumLoads) const {
-  // TODO: This needs finer tuning
-  if (NumLoads > 4)
+bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
+                                      MachineInstr &SecondLdSt,
+                                      unsigned NumLoads) const {
+  const MachineOperand *FirstDst = nullptr;
+  const MachineOperand *SecondDst = nullptr;
+
+  if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
+    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
+    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
+  }
+
+  if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
+    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
+    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
+  }
+
+  if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
+      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
+    FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
+    SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
+  }
+
+  if (!FirstDst || !SecondDst)
     return false;
 
-  if (isDS(*FirstLdSt) && isDS(*SecondLdSt))
-    return true;
+  // Try to limit clustering based on the total number of bytes loaded
+  // rather than the number of instructions. This is done to help reduce
+  // register pressure. The method used is somewhat inexact, though,
+  // because it assumes that all loads in the cluster will load the
+  // same number of bytes as FirstLdSt.
 
-  if (isSMRD(*FirstLdSt) && isSMRD(*SecondLdSt))
-    return true;
+  // The unit of this value is bytes.
+  // FIXME: This needs finer tuning.
+  unsigned LoadClusterThreshold = 16;
 
-  if ((isMUBUF(*FirstLdSt) || isMTBUF(*FirstLdSt)) &&
-      (isMUBUF(*SecondLdSt) || isMTBUF(*SecondLdSt)))
-    return true;
+  const MachineRegisterInfo &MRI =
+      FirstLdSt.getParent()->getParent()->getRegInfo();
+  const TargetRegisterClass *DstRC = MRI.getRegClass(FirstDst->getReg());
 
-  return false;
+  return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
}
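The new heuristic caps a cluster by total bytes loaded rather than by instruction count. A minimal standalone sketch of the same arithmetic (wouldCluster is an invented name; the 16-byte threshold and the bytes-per-destination-register sizes mirror the hunk above):

    #include <cassert>

    // Stand-in for TargetRegisterClass::getSize(): bytes of one destination
    // register (4 for a 32-bit VGPR/SGPR, 16 for a 128-bit tuple).
    static bool wouldCluster(unsigned NumLoads, unsigned DstRegSizeInBytes) {
      const unsigned LoadClusterThreshold = 16; // bytes, as in the patch
      return NumLoads * DstRegSizeInBytes <= LoadClusterThreshold;
    }

    int main() {
      assert(wouldCluster(4, 4));   // four 32-bit loads = 16 bytes: clustered
      assert(!wouldCluster(3, 16)); // three 128-bit loads = 48 bytes: too many
      return 0;
    }

Note the inexactness the comment warns about: only the first load's destination class is consulted, so a cluster mixing load widths is sized as if every load matched FirstLdSt.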
 
-void
-SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
-                         MachineBasicBlock::iterator MI, DebugLoc DL,
-                         unsigned DestReg, unsigned SrcReg,
-                         bool KillSrc) const {
+void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              const DebugLoc &DL, unsigned DestReg,
+                              unsigned SrcReg, bool KillSrc) const {
 
   // If we are trying to copy to or from SCC, there is a bug somewhere else in
   // the backend. While it may be theoretically possible to do this, it should
@@ -361,7 +390,6 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
 
   unsigned Opcode;
   ArrayRef<int16_t> SubIndices;
-  bool Forward;
 
   if (AMDGPU::SReg_32RegClass.contains(DestReg)) {
     assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
@@ -445,10 +473,7 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     llvm_unreachable("Can't copy register!");
   }
 
-  if (RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg))
-    Forward = true;
-  else
-    Forward = false;
+  bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
 
   for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
     unsigned SubIdx;
@@ -463,10 +488,12 @@ SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     Builder.addReg(RI.getSubReg(SrcReg, SubIdx));
 
     if (Idx == SubIndices.size() - 1)
-      Builder.addReg(SrcReg, RegState::Kill | RegState::Implicit);
+      Builder.addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit);
 
     if (Idx == 0)
       Builder.addReg(DestReg, RegState::Define | RegState::Implicit);
+
+    Builder.addReg(SrcReg, RegState::Implicit);
   }
 }
 
@@ -525,6 +552,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_V32_SAVE;
   case 8:
     return AMDGPU::SI_SPILL_V64_SAVE;
+  case 12:
+    return AMDGPU::SI_SPILL_V96_SAVE;
   case 16:
     return AMDGPU::SI_SPILL_V128_SAVE;
   case 32:
@@ -558,19 +587,25 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   if (RI.isSGPRClass(RC)) {
     MFI->setHasSpilledSGPRs();
 
+    if (TargetRegisterInfo::isVirtualRegister(SrcReg) && RC->getSize() == 4) {
+      // m0 may not be allowed for readlane.
+      MachineRegisterInfo &MRI = MF->getRegInfo();
+      MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+    }
+
     // We are only allowed to create one new instruction when spilling
     // registers, so we need to use pseudo instruction for spilling
     // SGPRs.
     unsigned Opcode = getSGPRSpillSaveOpcode(RC->getSize());
     BuildMI(MBB, MI, DL, get(Opcode))
-      .addReg(SrcReg)                          // src
+      .addReg(SrcReg, getKillRegState(isKill)) // src
       .addFrameIndex(FrameIndex)               // frame_idx
       .addMemOperand(MMO);
 
     return;
   }
 
-  if (!ST.isVGPRSpillingEnabled(MFI)) {
+  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
     LLVMContext &Ctx = MF->getFunction()->getContext();
     Ctx.emitError("SIInstrInfo::storeRegToStackSlot - Do not know how to"
                   " spill register");
@@ -585,10 +620,11 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
   unsigned Opcode = getVGPRSpillSaveOpcode(RC->getSize());
   MFI->setHasSpilledVGPRs();
   BuildMI(MBB, MI, DL, get(Opcode))
-    .addReg(SrcReg)                           // src
+    .addReg(SrcReg, getKillRegState(isKill))  // src
    .addFrameIndex(FrameIndex)                // frame_idx
    .addReg(MFI->getScratchRSrcReg())         // scratch_rsrc
    .addReg(MFI->getScratchWaveOffsetReg())   // scratch_offset
+    .addImm(0)                                // offset
    .addMemOperand(MMO);
 }
 
@@ -615,6 +651,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
     return AMDGPU::SI_SPILL_V32_RESTORE;
   case 8:
     return AMDGPU::SI_SPILL_V64_RESTORE;
+  case 12:
+    return AMDGPU::SI_SPILL_V96_RESTORE;
   case 16:
     return AMDGPU::SI_SPILL_V128_RESTORE;
   case 32:
@@ -648,6 +686,13 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     // FIXME: Maybe this should not include a memoperand because it will be
     // lowered to non-memory instructions.
     unsigned Opcode = getSGPRSpillRestoreOpcode(RC->getSize());
+
+    if (TargetRegisterInfo::isVirtualRegister(DestReg) && RC->getSize() == 4) {
+      // m0 may not be allowed for readlane.
+      MachineRegisterInfo &MRI = MF->getRegInfo();
+      MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
+    }
+
     BuildMI(MBB, MI, DL, get(Opcode), DestReg)
       .addFrameIndex(FrameIndex) // frame_idx
       .addMemOperand(MMO);
@@ -655,7 +700,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     return;
   }
 
-  if (!ST.isVGPRSpillingEnabled(MFI)) {
+  if (!ST.isVGPRSpillingEnabled(*MF->getFunction())) {
     LLVMContext &Ctx = MF->getFunction()->getContext();
     Ctx.emitError("SIInstrInfo::loadRegFromStackSlot - Do not know how to"
                   " restore register");
@@ -671,20 +716,18 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
     .addFrameIndex(FrameIndex)              // frame_idx
     .addReg(MFI->getScratchRSrcReg())       // scratch_rsrc
     .addReg(MFI->getScratchWaveOffsetReg()) // scratch_offset
+    .addImm(0)                              // offset
     .addMemOperand(MMO);
 }
 
 /// \param @Offset Offset in bytes of the FrameIndex being spilled
-unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
-                                               MachineBasicBlock::iterator MI,
-                                               RegScavenger *RS, unsigned TmpReg,
-                                               unsigned FrameOffset,
-                                               unsigned Size) const {
+unsigned SIInstrInfo::calculateLDSSpillAddress(
+    MachineBasicBlock &MBB, MachineInstr &MI, RegScavenger *RS, unsigned TmpReg,
+    unsigned FrameOffset, unsigned Size) const {
   MachineFunction *MF = MBB.getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
-  const AMDGPUSubtarget &ST = MF->getSubtarget<AMDGPUSubtarget>();
-  const SIRegisterInfo *TRI =
-      static_cast<const SIRegisterInfo*>(ST.getRegisterInfo());
+  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   DebugLoc DL = MBB.findDebugLoc(MI);
   unsigned WorkGroupSize = MFI->getMaximumWorkGroupSize(*MF);
   unsigned WavefrontSize = ST.getWavefrontSize();
@@ -699,8 +742,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
     if (TIDReg == AMDGPU::NoRegister)
       return TIDReg;
 
-
-    if (MFI->getShaderType() == ShaderType::COMPUTE &&
+    if (!AMDGPU::isShader(MF->getFunction()->getCallingConv()) &&
         WorkGroupSize > WavefrontSize) {
 
       unsigned TIDIGXReg
@@ -716,7 +758,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
         Entry.addLiveIn(Reg);
       }
 
-      RS->enterBasicBlock(&Entry);
+      RS->enterBasicBlock(Entry);
       // FIXME: Can we scavenge an SReg_64 and access the subregs?
       unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
       unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
@@ -773,8 +815,10 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(MachineBasicBlock &MBB,
   return TmpReg;
 }
 
-void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI,
+void SIInstrInfo::insertWaitStates(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MI,
                                    int Count) const {
+  DebugLoc DL = MBB.findDebugLoc(MI);
   while (Count > 0) {
     int Arg;
     if (Count >= 8)
@@ -782,76 +826,87 @@ void SIInstrInfo::insertWaitStates(MachineBasicBlock::iterator MI,
     else
       Arg = Count - 1;
     Count -= 8;
-    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_NOP))
       .addImm(Arg);
   }
 }
 
-bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
-  MachineBasicBlock &MBB = *MI->getParent();
+void SIInstrInfo::insertNoop(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MI) const {
+  insertWaitStates(MBB, MI, 1);
+}
+
+unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default: return 1; // FIXME: Do wait states equal cycles?
+
+  case AMDGPU::S_NOP:
+    return MI.getOperand(0).getImm() + 1;
+  }
+}
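S_NOP's immediate encodes "wait N+1 states", so a request for Count wait states is split into ceil(Count/8) nops of at most 8 states each. A small sketch of the same splitting, checked against the imm+1 rule that getNumWaitStates applies (splitWaitStates is an invented helper name):

    #include <cassert>
    #include <vector>

    // Mirror of the insertWaitStates loop above: each S_NOP immediate i
    // covers i + 1 wait states, with at most 8 states (imm 7) per nop.
    static std::vector<int> splitWaitStates(int Count) {
      std::vector<int> NopImms;
      while (Count > 0) {
        int Arg = Count >= 8 ? 7 : Count - 1;
        Count -= 8;
        NopImms.push_back(Arg);
      }
      return NopImms;
    }

    int main() {
      // 10 wait states -> S_NOP 7 (8 states) followed by S_NOP 1 (2 states).
      std::vector<int> Imms = splitWaitStates(10);
      assert(Imms.size() == 2 && Imms[0] == 7 && Imms[1] == 1);
      int Total = 0;
      for (int Imm : Imms)
        Total += Imm + 1; // getNumWaitStates: imm + 1
      assert(Total == 10);
      return 0;
    }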
+
+bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
+  MachineBasicBlock &MBB = *MI.getParent();
   DebugLoc DL = MBB.findDebugLoc(MI);
-  switch (MI->getOpcode()) {
+  switch (MI.getOpcode()) {
   default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
-
-  case AMDGPU::SGPR_USE:
-    // This is just a placeholder for register allocation.
-    MI->eraseFromParent();
-    break;
-
   case AMDGPU::V_MOV_B64_PSEUDO: {
-    unsigned Dst = MI->getOperand(0).getReg();
+    unsigned Dst = MI.getOperand(0).getReg();
     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
 
-    const MachineOperand &SrcOp = MI->getOperand(1);
+    const MachineOperand &SrcOp = MI.getOperand(1);
     // FIXME: Will this work for 64-bit floating point immediates?
     assert(!SrcOp.isFPImm());
     if (SrcOp.isImm()) {
       APInt Imm(64, SrcOp.getImm());
       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
-        .addImm(Imm.getLoBits(32).getZExtValue())
-        .addReg(Dst, RegState::Implicit);
+          .addImm(Imm.getLoBits(32).getZExtValue())
+          .addReg(Dst, RegState::Implicit | RegState::Define);
       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
-        .addImm(Imm.getHiBits(32).getZExtValue())
-        .addReg(Dst, RegState::Implicit);
+          .addImm(Imm.getHiBits(32).getZExtValue())
+          .addReg(Dst, RegState::Implicit | RegState::Define);
     } else {
       assert(SrcOp.isReg());
       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo)
-        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
-        .addReg(Dst, RegState::Implicit);
+          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0))
+          .addReg(Dst, RegState::Implicit | RegState::Define);
       BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi)
-        .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
-        .addReg(Dst, RegState::Implicit);
+          .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1))
+          .addReg(Dst, RegState::Implicit | RegState::Define);
     }
-    MI->eraseFromParent();
+    MI.eraseFromParent();
     break;
   }
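The 64-bit pseudo move is expanded into two 32-bit moves, and an immediate source is split exactly as the APInt getLoBits/getHiBits calls above do. The same split with plain integers (the sample value is made up for illustration):

    #include <cassert>
    #include <cstdint>

    // Same split as Imm.getLoBits(32) / Imm.getHiBits(32) in the expansion.
    static void splitImm64(uint64_t Imm, uint32_t &Lo, uint32_t &Hi) {
      Lo = static_cast<uint32_t>(Imm & 0xFFFFFFFFu); // V_MOV_B32 into sub0
      Hi = static_cast<uint32_t>(Imm >> 32);         // V_MOV_B32 into sub1
    }

    int main() {
      uint32_t Lo, Hi;
      splitImm64(0x123456789ABCDEF0ull, Lo, Hi);
      assert(Lo == 0x9ABCDEF0u && Hi == 0x12345678u);
      return 0;
    }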
 
   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
-    unsigned Dst = MI->getOperand(0).getReg();
+    unsigned Dst = MI.getOperand(0).getReg();
     unsigned DstLo = RI.getSubReg(Dst, AMDGPU::sub0);
     unsigned DstHi = RI.getSubReg(Dst, AMDGPU::sub1);
-    unsigned Src0 = MI->getOperand(1).getReg();
-    unsigned Src1 = MI->getOperand(2).getReg();
-    const MachineOperand &SrcCond = MI->getOperand(3);
+    unsigned Src0 = MI.getOperand(1).getReg();
+    unsigned Src1 = MI.getOperand(2).getReg();
+    const MachineOperand &SrcCond = MI.getOperand(3);
 
     BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
-      .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
-      .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
-      .addOperand(SrcCond);
+        .addReg(RI.getSubReg(Src0, AMDGPU::sub0))
+        .addReg(RI.getSubReg(Src1, AMDGPU::sub0))
+        .addReg(SrcCond.getReg())
+        .addReg(Dst, RegState::Implicit | RegState::Define);
     BuildMI(MBB, MI, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
-      .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
-      .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
-      .addOperand(SrcCond);
-    MI->eraseFromParent();
+        .addReg(RI.getSubReg(Src0, AMDGPU::sub1))
+        .addReg(RI.getSubReg(Src1, AMDGPU::sub1))
+        .addReg(SrcCond.getReg(), getKillRegState(SrcCond.isKill()))
+        .addReg(Dst, RegState::Implicit | RegState::Define);
+    MI.eraseFromParent();
     break;
   }
 
-  case AMDGPU::SI_CONSTDATA_PTR: {
-    const SIRegisterInfo *TRI =
-        static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
+  case AMDGPU::SI_PC_ADD_REL_OFFSET: {
+    const SIRegisterInfo *TRI
+      = static_cast<const SIRegisterInfo *>(ST.getRegisterInfo());
     MachineFunction &MF = *MBB.getParent();
-    unsigned Reg = MI->getOperand(0).getReg();
+    unsigned Reg = MI.getOperand(0).getReg();
     unsigned RegLo = TRI->getSubReg(Reg, AMDGPU::sub0);
     unsigned RegHi = TRI->getSubReg(Reg, AMDGPU::sub1);
 
@@ -863,15 +918,15 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
     // Add 32-bit offset from this instruction to the start of the
     // constant data.
     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADD_U32), RegLo)
-                   .addReg(RegLo)
-                   .addOperand(MI->getOperand(1)));
+                       .addReg(RegLo)
+                       .addOperand(MI.getOperand(1)));
     Bundler.append(BuildMI(MF, DL, get(AMDGPU::S_ADDC_U32), RegHi)
                    .addReg(RegHi)
                    .addImm(0));
 
     llvm::finalizeBundle(MBB, Bundler.begin());
 
-    MI->eraseFromParent();
+    MI.eraseFromParent();
     break;
   }
   }
@@ -885,22 +940,21 @@ bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
 /// non-commutable pair of operand indices OpIdx0 and OpIdx1.
 /// Even though the instruction is commutable, the method may still
 /// fail to commute the operands; a null pointer is returned in such cases.
-MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
-                                                  bool NewMI,
+MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                   unsigned OpIdx0,
                                                   unsigned OpIdx1) const {
-  int CommutedOpcode = commuteOpcode(*MI);
+  int CommutedOpcode = commuteOpcode(MI);
   if (CommutedOpcode == -1)
     return nullptr;
 
-  int Src0Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                           AMDGPU::OpName::src0);
-  MachineOperand &Src0 = MI->getOperand(Src0Idx);
+  int Src0Idx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
   if (!Src0.isReg())
     return nullptr;
 
-  int Src1Idx = AMDGPU::getNamedOperandIdx(MI->getOpcode(),
-                                           AMDGPU::OpName::src1);
+  int Src1Idx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
 
   if ((OpIdx0 != static_cast<unsigned>(Src0Idx) ||
        OpIdx1 != static_cast<unsigned>(Src1Idx)) &&
@@ -908,33 +962,32 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
       OpIdx1 != static_cast<unsigned>(Src0Idx)))
     return nullptr;
 
-  MachineOperand &Src1 = MI->getOperand(Src1Idx);
+  MachineOperand &Src1 = MI.getOperand(Src1Idx);
 
-
-  if (isVOP2(*MI)) {
-    const MCInstrDesc &InstrDesc = MI->getDesc();
-    // For VOP2 instructions, any operand type is valid to use for src0. Make
-    // sure we can use the src1 as src0.
+  if (isVOP2(MI) || isVOPC(MI)) {
+    const MCInstrDesc &InstrDesc = MI.getDesc();
+    // For VOP2 and VOPC instructions, any operand type is valid to use for
+    // src0. Make sure we can use the src0 as src1.
     //
     // We could be stricter here and only allow commuting if there is a reason
     // to do so. i.e. if both operands are VGPRs there is no real benefit,
     // although MachineCSE attempts to find matches by commuting.
-    const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+    const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
     if (!isLegalRegOperand(MRI, InstrDesc.OpInfo[Src1Idx], Src0))
       return nullptr;
   }
 
+  MachineInstr *CommutedMI = &MI;
   if (!Src1.isReg()) {
     // Allow commuting instructions with Imm operands.
-    if (NewMI || !Src1.isImm() ||
-        (!isVOP2(*MI) && !isVOP3(*MI))) {
+    if (NewMI || !Src1.isImm() || (!isVOP2(MI) && !isVOP3(MI))) {
       return nullptr;
     }
 
     // Be sure to copy the source modifiers to the right place.
-    if (MachineOperand *Src0Mods
-          = getNamedOperand(*MI, AMDGPU::OpName::src0_modifiers)) {
-      MachineOperand *Src1Mods
-        = getNamedOperand(*MI, AMDGPU::OpName::src1_modifiers);
+    if (MachineOperand *Src0Mods =
+            getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)) {
+      MachineOperand *Src1Mods =
+          getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
 
       int Src0ModsVal = Src0Mods->getImm();
       if (!Src1Mods && Src0ModsVal != 0)
@@ -959,26 +1012,26 @@ MachineInstr *SIInstrInfo::commuteInstructionImpl(MachineInstr *MI,
     Src1.ChangeToRegister(Reg, false);
     Src1.setSubReg(SubReg);
   } else {
-    MI = TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
+    CommutedMI =
+        TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx0, OpIdx1);
   }
 
-  if (MI)
-    MI->setDesc(get(CommutedOpcode));
+  if (CommutedMI)
+    CommutedMI->setDesc(get(CommutedOpcode));
 
-  return MI;
+  return CommutedMI;
 }
 
 // This needs to be implemented because the source modifiers may be inserted
 // between the true commutable operands, and the base
 // TargetInstrInfo::commuteInstruction uses it.
-bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
-                                        unsigned &SrcOpIdx0,
+bool SIInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx0,
                                         unsigned &SrcOpIdx1) const {
-  const MCInstrDesc &MCID = MI->getDesc();
+  const MCInstrDesc &MCID = MI.getDesc();
   if (!MCID.isCommutable())
     return false;
 
-  unsigned Opc = MI->getOpcode();
+  unsigned Opc = MI.getOpcode();
   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
   if (Src0Idx == -1)
     return false;
@@ -986,24 +1039,24 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
   // FIXME: Workaround TargetInstrInfo::commuteInstruction asserting on
   // immediate. Also, immediate src0 operand is not handled in
   // SIInstrInfo::commuteInstruction();
-  if (!MI->getOperand(Src0Idx).isReg())
+  if (!MI.getOperand(Src0Idx).isReg())
     return false;
 
   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
   if (Src1Idx == -1)
     return false;
 
-  MachineOperand &Src1 = MI->getOperand(Src1Idx);
+  MachineOperand &Src1 = MI.getOperand(Src1Idx);
   if (Src1.isImm()) {
     // SIInstrInfo::commuteInstruction() does support commuting the immediate
     // operand src1 in 2 and 3 operand instructions.
-    if (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))
+    if (!isVOP2(MI.getOpcode()) && !isVOP3(MI.getOpcode()))
       return false;
   } else if (Src1.isReg()) {
     // If any source modifiers are set, the generic instruction commuting won't
     // understand how to copy the source modifiers.
-    if (hasModifiersSet(*MI, AMDGPU::OpName::src0_modifiers) ||
-        hasModifiersSet(*MI, AMDGPU::OpName::src1_modifiers))
+    if (hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+        hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))
       return false;
   } else
     return false;
@@ -1011,23 +1064,135 @@ bool SIInstrInfo::findCommutedOpIndices(MachineInstr *MI,
   return fixCommutedOpIndices(SrcOpIdx0, SrcOpIdx1, Src0Idx, Src1Idx);
 }
 
-MachineInstr *SIInstrInfo::buildMovInstr(MachineBasicBlock *MBB,
-                                         MachineBasicBlock::iterator I,
-                                         unsigned DstReg,
-                                         unsigned SrcReg) const {
-  return BuildMI(*MBB, I, MBB->findDebugLoc(I), get(AMDGPU::V_MOV_B32_e32),
-                 DstReg) .addReg(SrcReg);
+unsigned SIInstrInfo::getBranchOpcode(SIInstrInfo::BranchPredicate Cond) {
+  switch (Cond) {
+  case SIInstrInfo::SCC_TRUE:
+    return AMDGPU::S_CBRANCH_SCC1;
+  case SIInstrInfo::SCC_FALSE:
+    return AMDGPU::S_CBRANCH_SCC0;
+  case SIInstrInfo::VCCNZ:
+    return AMDGPU::S_CBRANCH_VCCNZ;
+  case SIInstrInfo::VCCZ:
+    return AMDGPU::S_CBRANCH_VCCZ;
+  case SIInstrInfo::EXECNZ:
+    return AMDGPU::S_CBRANCH_EXECNZ;
+  case SIInstrInfo::EXECZ:
+    return AMDGPU::S_CBRANCH_EXECZ;
+  default:
+    llvm_unreachable("invalid branch predicate");
+  }
+}
+
+SIInstrInfo::BranchPredicate SIInstrInfo::getBranchPredicate(unsigned Opcode) {
+  switch (Opcode) {
+  case AMDGPU::S_CBRANCH_SCC0:
+    return SCC_FALSE;
+  case AMDGPU::S_CBRANCH_SCC1:
+    return SCC_TRUE;
+  case AMDGPU::S_CBRANCH_VCCNZ:
+    return VCCNZ;
+  case AMDGPU::S_CBRANCH_VCCZ:
+    return VCCZ;
+  case AMDGPU::S_CBRANCH_EXECNZ:
+    return EXECNZ;
+  case AMDGPU::S_CBRANCH_EXECZ:
+    return EXECZ;
+  default:
+    return INVALID_BR;
+  }
 }
 
-bool SIInstrInfo::isMov(unsigned Opcode) const {
-  switch(Opcode) {
-  default: return false;
-  case AMDGPU::S_MOV_B32:
-  case AMDGPU::S_MOV_B64:
-  case AMDGPU::V_MOV_B32_e32:
-  case AMDGPU::V_MOV_B32_e64:
+bool SIInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                                MachineBasicBlock *&FBB,
+                                SmallVectorImpl<MachineOperand> &Cond,
+                                bool AllowModify) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+  if (I == MBB.end())
+    return false;
+
+  if (I->getOpcode() == AMDGPU::S_BRANCH) {
+    // Unconditional Branch
+    TBB = I->getOperand(0).getMBB();
     return false;
+  }
+
+  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+  if (Pred == INVALID_BR)
     return true;
+
+  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
+  Cond.push_back(MachineOperand::CreateImm(Pred));
+
+  ++I;
+
+  if (I == MBB.end()) {
+    // Conditional branch followed by fall-through.
+    TBB = CondBB;
+    return false;
+  }
+
+  if (I->getOpcode() == AMDGPU::S_BRANCH) {
+    TBB = CondBB;
+    FBB = I->getOperand(0).getMBB();
+    return false;
+  }
+
+  return true;
+}
+
+unsigned SIInstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+  MachineBasicBlock::iterator I = MBB.getFirstTerminator();
+
+  unsigned Count = 0;
+  while (I != MBB.end()) {
+    MachineBasicBlock::iterator Next = std::next(I);
+    I->eraseFromParent();
+    ++Count;
+    I = Next;
+  }
+
+  return Count;
+}
+
+unsigned SIInstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *TBB,
+                                   MachineBasicBlock *FBB,
+                                   ArrayRef<MachineOperand> Cond,
+                                   const DebugLoc &DL) const {
+
+  if (!FBB && Cond.empty()) {
+    BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+      .addMBB(TBB);
+    return 1;
+  }
+
+  assert(TBB && Cond[0].isImm());
+
+  unsigned Opcode
+    = getBranchOpcode(static_cast<BranchPredicate>(Cond[0].getImm()));
+
+  if (!FBB) {
+    BuildMI(&MBB, DL, get(Opcode))
+      .addMBB(TBB);
+    return 1;
   }
+
+  assert(TBB && FBB);
+
+  BuildMI(&MBB, DL, get(Opcode))
+    .addMBB(TBB);
+  BuildMI(&MBB, DL, get(AMDGPU::S_BRANCH))
+    .addMBB(FBB);
+
+  return 2;
+}
+
+bool SIInstrInfo::ReverseBranchCondition(
+  SmallVectorImpl<MachineOperand> &Cond) const {
+  assert(Cond.size() == 1);
+  Cond[0].setImm(-Cond[0].getImm());
+  return false;
 }
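ReverseBranchCondition can get away with a single setImm(-imm) only if every predicate and its opposite are encoded as matching +n/-n enumerators, with INVALID_BR at 0. The BranchPredicate enum lives in the header, which is not part of this diff, so the values below are an assumption that is merely consistent with getBranchPredicate and the negation trick:

    // Assumed layout of SIInstrInfo::BranchPredicate (declared in the
    // header, not shown in this diff): opposite predicates are paired as
    // +n / -n so that reversing a condition is just imm -> -imm.
    enum BranchPredicate {
      INVALID_BR = 0,
      SCC_TRUE = 1,
      SCC_FALSE = -1,
      VCCNZ = 2,
      VCCZ = -2,
      EXECNZ = -3,
      EXECZ = 3,
    };

    static_assert(SCC_TRUE == -SCC_FALSE, "reversal must be negation");
    static_assert(VCCNZ == -VCCZ, "reversal must be negation");
    static_assert(EXECNZ == -EXECZ, "reversal must be negation");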
 
 static void removeModOperands(MachineInstr &MI) {
@@ -1044,81 +1209,76 @@ static void removeModOperands(MachineInstr &MI) {
   MI.RemoveOperand(Src0ModIdx);
 }
 
-bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+// TODO: Maybe this should be removed and everything custom folded in
+// SIFoldOperands?
+bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
                                 unsigned Reg, MachineRegisterInfo *MRI) const {
   if (!MRI->hasOneNonDBGUse(Reg))
     return false;
 
-  unsigned Opc = UseMI->getOpcode();
+  unsigned Opc = UseMI.getOpcode();
   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64) {
     // Don't fold if we are using source modifiers. The new VOP2 instructions
     // don't have them.
-    if (hasModifiersSet(*UseMI, AMDGPU::OpName::src0_modifiers) ||
-        hasModifiersSet(*UseMI, AMDGPU::OpName::src1_modifiers) ||
-        hasModifiersSet(*UseMI, AMDGPU::OpName::src2_modifiers)) {
+    if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
+        hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
+        hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
      return false;
     }
 
-    MachineOperand *Src0 = getNamedOperand(*UseMI, AMDGPU::OpName::src0);
-    MachineOperand *Src1 = getNamedOperand(*UseMI, AMDGPU::OpName::src1);
-    MachineOperand *Src2 = getNamedOperand(*UseMI, AMDGPU::OpName::src2);
+    const MachineOperand &ImmOp = DefMI.getOperand(1);
+
+    // If this is a free constant, there's no reason to do this.
+    // TODO: We could fold this here instead of letting SIFoldOperands do it
+    // later.
+    if (isInlineConstant(ImmOp, 4))
+      return false;
+
+    MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
+    MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
+    MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
 
     // Multiplied part is the constant: Use v_madmk_f32
     // We should only expect these to be on src0 due to canonicalizations.
     if (Src0->isReg() && Src0->getReg() == Reg) {
-      if (!Src1->isReg() ||
-          (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+      if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
         return false;
 
-      if (!Src2->isReg() ||
-          (Src2->isReg() && RI.isSGPRClass(MRI->getRegClass(Src2->getReg()))))
+      if (!Src2->isReg() || RI.isSGPRClass(MRI->getRegClass(Src2->getReg())))
         return false;
 
-      // We need to do some weird looking operand shuffling since the madmk
-      // operands are out of the normal expected order with the multiplied
-      // constant as the last operand.
-      //
-      // v_mad_f32 src0, src1, src2 -> v_madmk_f32 src0 * src2K + src1
-      // src0 -> src2 K
-      // src1 -> src0
-      // src2 -> src1
+      // We need to swap operands 0 and 1 since madmk constant is at operand 1.
 
-      const int64_t Imm = DefMI->getOperand(1).getImm();
+      const int64_t Imm = DefMI.getOperand(1).getImm();
 
       // FIXME: This would be a lot easier if we could return a new instruction
       // instead of having to modify in place.
 
       // Remove these first since they are at the end.
-      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
-                                                      AMDGPU::OpName::omod));
-      UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
-                                                      AMDGPU::OpName::clamp));
+      UseMI.RemoveOperand(
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
+      UseMI.RemoveOperand(
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
 
       unsigned Src1Reg = Src1->getReg();
       unsigned Src1SubReg = Src1->getSubReg();
-      unsigned Src2Reg = Src2->getReg();
-      unsigned Src2SubReg = Src2->getSubReg();
       Src0->setReg(Src1Reg);
       Src0->setSubReg(Src1SubReg);
       Src0->setIsKill(Src1->isKill());
 
-      Src1->setReg(Src2Reg);
-      Src1->setSubReg(Src2SubReg);
-      Src1->setIsKill(Src2->isKill());
-
       if (Opc == AMDGPU::V_MAC_F32_e64) {
-        UseMI->untieRegOperand(
-            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+        UseMI.untieRegOperand(
+            AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
       }
 
-      Src2->ChangeToImmediate(Imm);
+      Src1->ChangeToImmediate(Imm);
 
-      removeModOperands(*UseMI);
-      UseMI->setDesc(get(AMDGPU::V_MADMK_F32));
+      removeModOperands(UseMI);
+      UseMI.setDesc(get(AMDGPU::V_MADMK_F32));
 
       bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
       if (DeleteDef)
-        DefMI->eraseFromParent();
+        DefMI.eraseFromParent();
 
       return true;
     }
 
@@ -1131,36 +1291,35 @@ bool SIInstrInfo::FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
         (Src0->isReg() && RI.isSGPRClass(MRI->getRegClass(Src0->getReg()))))
       return false;
 
-    if (!Src1->isReg() ||
-        (Src1->isReg() && RI.isSGPRClass(MRI->getRegClass(Src1->getReg()))))
+    if (!Src1->isReg() || RI.isSGPRClass(MRI->getRegClass(Src1->getReg())))
      return false;
 
-    const int64_t Imm = DefMI->getOperand(1).getImm();
+    const int64_t Imm = DefMI.getOperand(1).getImm();
 
     // FIXME: This would be a lot easier if we could return a new instruction
     // instead of having to modify in place.
 
     // Remove these first since they are at the end.
-    UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
-                                                    AMDGPU::OpName::omod));
-    UseMI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc,
-                                                    AMDGPU::OpName::clamp));
+    UseMI.RemoveOperand(
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod));
+    UseMI.RemoveOperand(
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp));
 
     if (Opc == AMDGPU::V_MAC_F32_e64) {
-      UseMI->untieRegOperand(
-          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
+      UseMI.untieRegOperand(
+          AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2));
     }
 
     // ChangingToImmediate adds Src2 back to the instruction.
     Src2->ChangeToImmediate(Imm);
 
     // These come before src2.
-    removeModOperands(*UseMI);
-    UseMI->setDesc(get(AMDGPU::V_MADAK_F32));
+    removeModOperands(UseMI);
+    UseMI.setDesc(get(AMDGPU::V_MADAK_F32));
 
     bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
     if (DeleteDef)
-      DefMI->eraseFromParent();
+      DefMI.eraseFromParent();
 
     return true;
   }
@@ -1177,17 +1336,20 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
   return LowOffset + LowWidth <= HighOffset;
 }
 
-bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
-                                               MachineInstr *MIb) const {
-  unsigned BaseReg0, Offset0;
-  unsigned BaseReg1, Offset1;
+bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr &MIa,
+                                               MachineInstr &MIb) const {
+  unsigned BaseReg0, BaseReg1;
+  int64_t Offset0, Offset1;
 
   if (getMemOpBaseRegImmOfs(MIa, BaseReg0, Offset0, &RI) &&
       getMemOpBaseRegImmOfs(MIb, BaseReg1, Offset1, &RI)) {
-    assert(MIa->hasOneMemOperand() && MIb->hasOneMemOperand() &&
-           "read2 / write2 not expected here yet");
-    unsigned Width0 = (*MIa->memoperands_begin())->getSize();
-    unsigned Width1 = (*MIb->memoperands_begin())->getSize();
+
+    if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
+      // FIXME: Handle ds_read2 / ds_write2.
+      return false;
+    }
+    unsigned Width0 = (*MIa.memoperands_begin())->getSize();
+    unsigned Width1 = (*MIb.memoperands_begin())->getSize();
     if (BaseReg0 == BaseReg1 &&
         offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
       return true;
@@ -1197,19 +1359,19 @@ bool SIInstrInfo::checkInstOffsetsDoNotOverlap(MachineInstr *MIa,
   return false;
 }
 
-bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
-                                                  MachineInstr *MIb,
+bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr &MIa,
+                                                  MachineInstr &MIb,
                                                   AliasAnalysis *AA) const {
-  assert(MIa && (MIa->mayLoad() || MIa->mayStore()) &&
+  assert((MIa.mayLoad() || MIa.mayStore()) &&
          "MIa must load from or modify a memory location");
-  assert(MIb && (MIb->mayLoad() || MIb->mayStore()) &&
+  assert((MIb.mayLoad() || MIb.mayStore()) &&
          "MIb must load from or modify a memory location");
 
-  if (MIa->hasUnmodeledSideEffects() || MIb->hasUnmodeledSideEffects())
+  if (MIa.hasUnmodeledSideEffects() || MIb.hasUnmodeledSideEffects())
     return false;
 
   // XXX - Can we relax this between address spaces?
-  if (MIa->hasOrderedMemoryRef() || MIb->hasOrderedMemoryRef())
+  if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
     return false;
 
   // TODO: Should we check the address space from the MachineMemOperand? That
@@ -1217,29 +1379,29 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(MachineInstr *MIa,
   // underlying address space, even if it was lowered to a different one,
   // e.g. private accesses lowered to use MUBUF instructions on a scratch
   // buffer.
-  if (isDS(*MIa)) {
-    if (isDS(*MIb))
+  if (isDS(MIa)) {
+    if (isDS(MIb))
       return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
-    return !isFLAT(*MIb);
+    return !isFLAT(MIb);
   }
 
-  if (isMUBUF(*MIa) || isMTBUF(*MIa)) {
-    if (isMUBUF(*MIb) || isMTBUF(*MIb))
+  if (isMUBUF(MIa) || isMTBUF(MIa)) {
+    if (isMUBUF(MIb) || isMTBUF(MIb))
       return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
-    return !isFLAT(*MIb) && !isSMRD(*MIb);
+    return !isFLAT(MIb) && !isSMRD(MIb);
   }
 
-  if (isSMRD(*MIa)) {
-    if (isSMRD(*MIb))
+  if (isSMRD(MIa)) {
+    if (isSMRD(MIb))
       return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
-    return !isFLAT(*MIb) && !isMUBUF(*MIa) && !isMTBUF(*MIa);
+    return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
   }
 
-  if (isFLAT(*MIa)) {
-    if (isFLAT(*MIb))
+  if (isFLAT(MIa)) {
+    if (isFLAT(MIb))
       return checkInstOffsetsDoNotOverlap(MIa, MIb);
 
     return false;
@@ -1249,35 +1411,49 @@
 }
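checkInstOffsetsDoNotOverlap reduces aliasing of two fixed-offset accesses from the same base register to an interval test: offsetsDoNotOverlap (defined in an earlier hunk) returns true when the lower access ends no later than the higher one starts. A worked sketch of the same test (disjoint is an invented name):

    #include <cassert>

    // Same check as offsetsDoNotOverlap above: with a shared base register,
    // byte ranges [Offset, Offset + Width) are disjoint iff the lower range
    // ends at or before the higher one begins.
    static bool disjoint(int WidthA, int OffsetA, int WidthB, int OffsetB) {
      int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
      int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
      int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
      return LowOffset + LowWidth <= HighOffset;
    }

    int main() {
      assert(disjoint(4, 0, 4, 4));  // bytes [0,4) vs [4,8): disjoint
      assert(!disjoint(8, 0, 4, 4)); // bytes [0,8) vs [4,8): overlap
      return 0;
    }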
 
 MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
-                                                 MachineBasicBlock::iterator &MI,
-                                                 LiveVariables *LV) const {
-
-  switch (MI->getOpcode()) {
-  default: return nullptr;
-  case AMDGPU::V_MAC_F32_e64: break;
-  case AMDGPU::V_MAC_F32_e32: {
-    const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
-    if (Src0->isImm() && !isInlineConstant(*Src0, 4))
-      return nullptr;
-    break;
-  }
+                                                 MachineInstr &MI,
+                                                 LiveVariables *LV) const {
+
+  switch (MI.getOpcode()) {
+  default:
+    return nullptr;
+  case AMDGPU::V_MAC_F32_e64:
+    break;
+  case AMDGPU::V_MAC_F32_e32: {
+    const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+    if (Src0->isImm() && !isInlineConstant(*Src0, 4))
+      return nullptr;
+    break;
+  }
   }
 
-  const MachineOperand *Dst = getNamedOperand(*MI, AMDGPU::OpName::dst);
-  const MachineOperand *Src0 = getNamedOperand(*MI, AMDGPU::OpName::src0);
-  const MachineOperand *Src1 = getNamedOperand(*MI, AMDGPU::OpName::src1);
-  const MachineOperand *Src2 = getNamedOperand(*MI, AMDGPU::OpName::src2);
+  const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
+  const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+  const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
 
-  return BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::V_MAD_F32))
-    .addOperand(*Dst)
-    .addImm(0) // Src0 mods
-    .addOperand(*Src0)
-    .addImm(0) // Src1 mods
-    .addOperand(*Src1)
-    .addImm(0) // Src mods
-    .addOperand(*Src2)
-    .addImm(0) // clamp
-    .addImm(0); // omod
+  return BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::V_MAD_F32))
+      .addOperand(*Dst)
+      .addImm(0) // Src0 mods
+      .addOperand(*Src0)
+      .addImm(0) // Src1 mods
+      .addOperand(*Src1)
+      .addImm(0) // Src mods
+      .addOperand(*Src2)
+      .addImm(0) // clamp
+      .addImm(0); // omod
+}
+
+bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+                                       const MachineBasicBlock *MBB,
+                                       const MachineFunction &MF) const {
+  // XXX - Do we want the SP check in the base implementation?
+
+  // Target-independent instructions do not have an implicit-use of EXEC, even
+  // when they operate on VGPRs. Treating EXEC modifications as scheduling
+  // boundaries prevents incorrect movements of such instructions.
+  return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
+         MI.modifiesRegister(AMDGPU::EXEC, &RI);
 }
 
 bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
@@ -1355,9 +1531,9 @@ static bool compareMachineOp(const MachineOperand &Op0,
   }
 }
 
-bool SIInstrInfo::isImmOperandLegal(const MachineInstr *MI, unsigned OpNo,
-                                    const MachineOperand &MO) const {
-  const MCOperandInfo &OpInfo = get(MI->getOpcode()).OpInfo[OpNo];
+bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
+                                    const MachineOperand &MO) const {
+  const MCOperandInfo &OpInfo = get(MI.getOpcode()).OpInfo[OpNo];
 
   assert(MO.isImm() || MO.isTargetIndex() || MO.isFI());
 
@@ -1418,14 +1594,10 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
     return true;
 
   // SGPRs use the constant bus
-  if (MO.getReg() == AMDGPU::M0 || MO.getReg() == AMDGPU::VCC ||
-      (!MO.isImplicit() &&
-       (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
-        AMDGPU::SGPR_64RegClass.contains(MO.getReg())))) {
-    return true;
-  }
-
-  return false;
+  return (MO.getReg() == AMDGPU::VCC || MO.getReg() == AMDGPU::M0 ||
+          (!MO.isImplicit() &&
+           (AMDGPU::SGPR_32RegClass.contains(MO.getReg()) ||
+            AMDGPU::SGPR_64RegClass.contains(MO.getReg()))));
 }
 
 static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
@@ -1448,10 +1620,33 @@ static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
   return AMDGPU::NoRegister;
 }
 
+static bool shouldReadExec(const MachineInstr &MI) {
+  if (SIInstrInfo::isVALU(MI)) {
+    switch (MI.getOpcode()) {
+    case AMDGPU::V_READLANE_B32:
+    case AMDGPU::V_READLANE_B32_si:
+    case AMDGPU::V_READLANE_B32_vi:
+    case AMDGPU::V_WRITELANE_B32:
+    case AMDGPU::V_WRITELANE_B32_si:
+    case AMDGPU::V_WRITELANE_B32_vi:
+      return false;
+    }
+
+    return true;
+  }
+
+  if (SIInstrInfo::isGenericOpcode(MI.getOpcode()) ||
+      SIInstrInfo::isSALU(MI) ||
+      SIInstrInfo::isSMRD(MI))
+    return false;
+
+  return true;
+}
+
-bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
+bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
                                     StringRef &ErrInfo) const {
-  uint16_t Opcode = MI->getOpcode();
-  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+  uint16_t Opcode = MI.getOpcode();
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
@@ -1459,14 +1654,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
   // Make sure the number of operands is correct.
   const MCInstrDesc &Desc = get(Opcode);
   if (!Desc.isVariadic() &&
-      Desc.getNumOperands() != MI->getNumExplicitOperands()) {
-     ErrInfo = "Instruction has wrong number of operands.";
-     return false;
+      Desc.getNumOperands() != MI.getNumExplicitOperands()) {
+    ErrInfo = "Instruction has wrong number of operands.";
+    return false;
   }
 
   // Make sure the register classes are correct.
   for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
-    if (MI->getOperand(i).isFPImm()) {
+    if (MI.getOperand(i).isFPImm()) {
       ErrInfo = "FPImm Machine Operands are not supported. ISel should bitcast "
                 "all fp values to integers.";
       return false;
     }
 
     int RegClass = Desc.OpInfo[i].RegClass;
 
     switch (Desc.OpInfo[i].OperandType) {
     case MCOI::OPERAND_REGISTER:
-      if (MI->getOperand(i).isImm()) {
+      if (MI.getOperand(i).isImm()) {
         ErrInfo = "Illegal immediate value for operand.";
         return false;
       }
       break;
     case AMDGPU::OPERAND_REG_IMM32:
       break;
     case AMDGPU::OPERAND_REG_INLINE_C:
-      if (isLiteralConstant(MI->getOperand(i),
+      if (isLiteralConstant(MI.getOperand(i),
                             RI.getRegClass(RegClass)->getSize())) {
         ErrInfo = "Illegal immediate value for operand.";
         return false;
       }
       break;
     case MCOI::OPERAND_IMMEDIATE:
+    case AMDGPU::OPERAND_KIMM32:
       // Check if this operand is an immediate.
       // FrameIndex operands will be replaced by immediates, so they are
       // allowed.
-      if (!MI->getOperand(i).isImm() && !MI->getOperand(i).isFI()) {
+      if (!MI.getOperand(i).isImm() && !MI.getOperand(i).isFI()) {
         ErrInfo = "Expected immediate, but got non-immediate";
         return false;
       }
@@ -1503,12 +1699,13 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
       continue;
     }
 
-    if (!MI->getOperand(i).isReg())
+    if (!MI.getOperand(i).isReg())
       continue;
 
     if (RegClass != -1) {
-      unsigned Reg = MI->getOperand(i).getReg();
-      if (TargetRegisterInfo::isVirtualRegister(Reg))
+      unsigned Reg = MI.getOperand(i).getReg();
+      if (Reg == AMDGPU::NoRegister ||
+          TargetRegisterInfo::isVirtualRegister(Reg))
        continue;
 
       const TargetRegisterClass *RC = RI.getRegClass(RegClass);
@@ -1519,23 +1716,26 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
     }
   }
 
-  // Verify VOP*
-  if (isVOP1(*MI) || isVOP2(*MI) || isVOP3(*MI) || isVOPC(*MI)) {
+  if (isVOP1(MI) || isVOP2(MI) || isVOP3(MI) || isVOPC(MI)) {
     // Only look at the true operands. Only a real operand can use the constant
     // bus, and we don't want to check pseudo-operands like the source modifier
     // flags.
     const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
 
     unsigned ConstantBusCount = 0;
-    unsigned SGPRUsed = findImplicitSGPRRead(*MI);
+
+    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
+      ++ConstantBusCount;
+
+    unsigned SGPRUsed = findImplicitSGPRRead(MI);
     if (SGPRUsed != AMDGPU::NoRegister)
       ++ConstantBusCount;
 
     for (int OpIdx : OpIndices) {
       if (OpIdx == -1)
         break;
-      const MachineOperand &MO = MI->getOperand(OpIdx);
+      const MachineOperand &MO = MI.getOperand(OpIdx);
       if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
         if (MO.isReg()) {
           if (MO.getReg() != SGPRUsed)
@@ -1555,9 +1755,9 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
   // Verify misc. restrictions on specific instructions.
   if (Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F32 ||
       Desc.getOpcode() == AMDGPU::V_DIV_SCALE_F64) {
-    const MachineOperand &Src0 = MI->getOperand(Src0Idx);
-    const MachineOperand &Src1 = MI->getOperand(Src1Idx);
-    const MachineOperand &Src2 = MI->getOperand(Src2Idx);
+    const MachineOperand &Src0 = MI.getOperand(Src0Idx);
+    const MachineOperand &Src1 = MI.getOperand(Src1Idx);
+    const MachineOperand &Src2 = MI.getOperand(Src2Idx);
     if (Src0.isReg() && Src1.isReg() && Src2.isReg()) {
       if (!compareMachineOp(Src0, Src1) &&
           !compareMachineOp(Src0, Src2)) {
@@ -1569,9 +1769,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr *MI,
 
   // Make sure we aren't losing exec uses in the td files. This mostly requires
   // being careful when using let Uses to try to add other use registers.
-  if (!isGenericOpcode(Opcode) && !isSALU(Opcode) && !isSMRD(Opcode)) {
-    const MachineOperand *Exec = MI->findRegisterUseOperand(AMDGPU::EXEC);
-    if (!Exec || !Exec->isImplicit()) {
+  if (shouldReadExec(MI)) {
+    if (!MI.hasRegisterImplicitUseOperand(AMDGPU::EXEC)) {
       ErrInfo = "VALU instruction does not implicitly read exec mask";
       return false;
     }
@@ -1624,22 +1823,18 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
   case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
   case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
-  case AMDGPU::S_LOAD_DWORD_IMM:
-  case AMDGPU::S_LOAD_DWORD_SGPR:
-  case AMDGPU::S_LOAD_DWORD_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
-  case AMDGPU::S_LOAD_DWORDX2_IMM:
-  case AMDGPU::S_LOAD_DWORDX2_SGPR:
-  case AMDGPU::S_LOAD_DWORDX2_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
-  case AMDGPU::S_LOAD_DWORDX4_IMM:
-  case AMDGPU::S_LOAD_DWORDX4_SGPR:
-  case AMDGPU::S_LOAD_DWORDX4_IMM_ci:
-    return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
+  case AMDGPU::S_CMP_EQ_U32: return AMDGPU::V_CMP_EQ_U32_e32;
+  case AMDGPU::S_CMP_LG_U32: return AMDGPU::V_CMP_NE_U32_e32;
+  case AMDGPU::S_CMP_GT_U32: return AMDGPU::V_CMP_GT_U32_e32;
+  case AMDGPU::S_CMP_GE_U32: return AMDGPU::V_CMP_GE_U32_e32;
+  case AMDGPU::S_CMP_LT_U32: return AMDGPU::V_CMP_LT_U32_e32;
+  case AMDGPU::S_CMP_LE_U32: return AMDGPU::V_CMP_LE_U32_e32;
   case AMDGPU::S_BCNT1_I32_B32: return AMDGPU::V_BCNT_U32_B32_e64;
   case AMDGPU::S_FF1_I32_B32: return AMDGPU::V_FFBL_B32_e32;
   case AMDGPU::S_FLBIT_I32_B32: return AMDGPU::V_FFBH_U32_e32;
   case AMDGPU::S_FLBIT_I32: return AMDGPU::V_FFBH_I32_e64;
+  case AMDGPU::S_CBRANCH_SCC0: return AMDGPU::S_CBRANCH_VCCZ;
+  case AMDGPU::S_CBRANCH_SCC1: return AMDGPU::S_CBRANCH_VCCNZ;
   }
 }
 
@@ -1676,12 +1871,12 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
   }
 }
 
-void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
+void SIInstrInfo::legalizeOpWithMove(MachineInstr &MI, unsigned OpIdx) const {
   MachineBasicBlock::iterator I = MI;
-  MachineBasicBlock *MBB = MI->getParent();
-  MachineOperand &MO = MI->getOperand(OpIdx);
+  MachineBasicBlock *MBB = MI.getParent();
+  MachineOperand &MO = MI.getOperand(OpIdx);
   MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-  unsigned RCID = get(MI->getOpcode()).OpInfo[OpIdx].RegClass;
+  unsigned RCID = get(MI.getOpcode()).OpInfo[OpIdx].RegClass;
   const TargetRegisterClass *RC = RI.getRegClass(RCID);
   unsigned Opcode = AMDGPU::V_MOV_B32_e32;
   if (MO.isReg())
@@ -1689,7 +1884,6 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
     Opcode = AMDGPU::COPY;
   else if (RI.isSGPRClass(RC))
     Opcode = AMDGPU::S_MOV_B32;
 
-
   const TargetRegisterClass *VRC = RI.getEquivalentVGPRClass(RC);
   if (RI.getCommonSubClass(&AMDGPU::VReg_64RegClass, VRC))
     VRC = &AMDGPU::VReg_64RegClass;
@@ -1698,8 +1892,7 @@ void SIInstrInfo::legalizeOpWithMove(MachineInstr *MI, unsigned OpIdx) const {
 
   unsigned Reg = MRI.createVirtualRegister(VRC);
   DebugLoc DL = MBB->findDebugLoc(I);
-  BuildMI(*MI->getParent(), I, DL, get(Opcode), Reg)
-    .addOperand(MO);
+  BuildMI(*MI.getParent(), I, DL, get(Opcode), Reg).addOperand(MO);
   MO.ChangeToRegister(Reg, false);
 }
 
@@ -1758,11 +1951,11 @@ MachineOperand SIInstrInfo::buildExtractSubRegOrImm(
 }
 
 // Change the order of operands from (0, 1, 2) to (0, 2, 1)
-void SIInstrInfo::swapOperands(MachineBasicBlock::iterator Inst) const {
-  assert(Inst->getNumExplicitOperands() == 3);
-  MachineOperand Op1 = Inst->getOperand(1);
-  Inst->RemoveOperand(1);
-  Inst->addOperand(Op1);
+void SIInstrInfo::swapOperands(MachineInstr &Inst) const {
+  assert(Inst.getNumExplicitOperands() == 3);
+  MachineOperand Op1 = Inst.getOperand(1);
+  Inst.RemoveOperand(1);
+  Inst.addOperand(Op1);
 }
 
 bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
@@ -1804,26 +1997,32 @@ bool SIInstrInfo::isLegalVSrcOperand(const MachineRegisterInfo &MRI,
   return true;
 }
 
-bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
+bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
                                  const MachineOperand *MO) const {
-  const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
-  const MCInstrDesc &InstDesc = get(MI->getOpcode());
+  const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+  const MCInstrDesc &InstDesc = MI.getDesc();
   const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpIdx];
   const TargetRegisterClass *DefinedRC =
       OpInfo.RegClass != -1 ? RI.getRegClass(OpInfo.RegClass) : nullptr;
   if (!MO)
-    MO = &MI->getOperand(OpIdx);
+    MO = &MI.getOperand(OpIdx);
+
+  if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
+
+    RegSubRegPair SGPRUsed;
+    if (MO->isReg())
+      SGPRUsed = RegSubRegPair(MO->getReg(), MO->getSubReg());
 
-  if (isVALU(*MI) &&
-      usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
-    unsigned SGPRUsed =
-        MO->isReg() ? MO->getReg() : (unsigned)AMDGPU::NoRegister;
-    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+    for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
       if (i == OpIdx)
         continue;
-      const MachineOperand &Op = MI->getOperand(i);
-      if (Op.isReg() && Op.getReg() != SGPRUsed &&
-          usesConstantBus(MRI, Op, getOpSize(*MI, i))) {
+      const MachineOperand &Op = MI.getOperand(i);
+      if (Op.isReg()) {
+        if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
+            usesConstantBus(MRI, Op, getOpSize(MI, i))) {
+          return false;
+        }
+      } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
         return false;
       }
     }
@@ -1834,7 +2033,6 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
     return isLegalRegOperand(MRI, OpInfo, *MO);
   }
 
-
   // Handle non-register types that are treated like immediates.
   assert(MO->isImm() || MO->isTargetIndex() || MO->isFI());
 
@@ -1847,12 +2045,12 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr *MI, unsigned OpIdx,
 }
 
 void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
-                                       MachineInstr *MI) const {
-  unsigned Opc = MI->getOpcode();
+                                       MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
   const MCInstrDesc &InstrDesc = get(Opc);
 
   int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
-  MachineOperand &Src1 = MI->getOperand(Src1Idx);
+  MachineOperand &Src1 = MI.getOperand(Src1Idx);
 
   // If there is an implicit SGPR use such as VCC use for v_addc_u32/v_subb_u32
   // we need to only have one constant bus use.
   //
   // Note we do not need to worry about literal constants here. They are
   // disabled for the operand type for instructions because they will always
   // violate the one constant bus use rule.
-  bool HasImplicitSGPR = findImplicitSGPRRead(*MI) != AMDGPU::NoRegister;
+  bool HasImplicitSGPR = findImplicitSGPRRead(MI) != AMDGPU::NoRegister;
   if (HasImplicitSGPR) {
     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
-    MachineOperand &Src0 = MI->getOperand(Src0Idx);
+    MachineOperand &Src0 = MI.getOperand(Src0Idx);
 
     if (Src0.isReg() && RI.isSGPRReg(MRI, Src0.getReg()))
       legalizeOpWithMove(MI, Src0Idx);
@@ -1878,13 +2076,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
   // commute if it is possible. We only want to commute here if it improves
   // legality. This can be called a fairly large number of times so don't waste
   // compile time pointlessly swapping and checking legality again.
-  if (HasImplicitSGPR || !MI->isCommutable()) {
+  if (HasImplicitSGPR || !MI.isCommutable()) {
     legalizeOpWithMove(MI, Src1Idx);
     return;
   }
 
   int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
-  MachineOperand &Src0 = MI->getOperand(Src0Idx);
+  MachineOperand &Src0 = MI.getOperand(Src0Idx);
 
   // If src0 can be used as src1, commuting will make the operands legal.
   // Otherwise we have to give up and insert a move.
@@ -1897,13 +2095,13 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
     return;
   }
 
-  int CommutedOpc = commuteOpcode(*MI);
+  int CommutedOpc = commuteOpcode(MI);
   if (CommutedOpc == -1) {
     legalizeOpWithMove(MI, Src1Idx);
     return;
   }
 
-  MI->setDesc(get(CommutedOpc));
+  MI.setDesc(get(CommutedOpc));
 
   unsigned Src0Reg = Src0.getReg();
   unsigned Src0SubReg = Src0.getSubReg();
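As the comments above explain, commuting is preferred over materializing a copy when src0 can legally stand in src1's slot. A hypothetical shader-assembly illustration, not taken from this patch's tests (register choices are invented):

    // Illustrative only:
    //
    //   v_add_i32_e32 v0, vcc, v1, s2   ; illegal: SGPR s2 in src1
    //
    // legalizeOperandsVOP2 commutes rather than inserting a v_mov:
    //
    //   v_add_i32_e32 v0, vcc, s2, v1   ; legal: src0 may be an SGPR,
    //                                   ; src1 is now the VGPR v1
    //
    // Only when src0 cannot take src1's place (or the opcode has no
    // commuted form) does it fall back to legalizeOpWithMove.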
@@ -1925,10 +2123,9 @@ void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
 // operand, and since literal constants are not allowed and should never be
 // seen, we only need to worry about inserting copies if we use multiple SGPR
 // operands.
-void SIInstrInfo::legalizeOperandsVOP3(
-    MachineRegisterInfo &MRI,
-    MachineInstr *MI) const {
-  unsigned Opc = MI->getOpcode();
+void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
+                                       MachineInstr &MI) const {
+  unsigned Opc = MI.getOpcode();
 
   int VOP3Idx[3] = {
     AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
@@ -1943,7 +2140,7 @@ void SIInstrInfo::legalizeOperandsVOP3(
     int Idx = VOP3Idx[i];
     if (Idx == -1)
       break;
-    MachineOperand &MO = MI->getOperand(Idx);
+    MachineOperand &MO = MI.getOperand(Idx);
 
     // We should never see a VOP3 instruction with an illegal immediate operand.
     if (!MO.isReg())
@@ -1964,32 +2161,78 @@ void SIInstrInfo::legalizeOperandsVOP3(
   }
 }
 
-void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
-  MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+                                         MachineRegisterInfo &MRI) const {
+  const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
+  const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
+  unsigned DstReg = MRI.createVirtualRegister(SRC);
+  unsigned SubRegs = VRC->getSize() / 4;
+
+  SmallVector<unsigned, 8> SRegs;
+  for (unsigned i = 0; i < SubRegs; ++i) {
+    unsigned SGPR = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+    BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+            get(AMDGPU::V_READFIRSTLANE_B32), SGPR)
+        .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
+    SRegs.push_back(SGPR);
+  }
+
+  MachineInstrBuilder MIB =
+      BuildMI(*UseMI.getParent(), UseMI, UseMI.getDebugLoc(),
+              get(AMDGPU::REG_SEQUENCE), DstReg);
+  for (unsigned i = 0; i < SubRegs; ++i) {
+    MIB.addReg(SRegs[i]);
+    MIB.addImm(RI.getSubRegFromChannel(i));
+  }
+  return DstReg;
+}
+
+void SIInstrInfo::legalizeOperandsSMRD(MachineRegisterInfo &MRI,
+                                       MachineInstr &MI) const {
+
+  // If the pointer is stored in VGPRs, then we need to move it to
+  // SGPRs using v_readfirstlane. This is safe because we only select
+  // loads with uniform pointers to SMRD instructions, so we know the
+  // pointer value is uniform.
+  MachineOperand *SBase = getNamedOperand(MI, AMDGPU::OpName::sbase);
+  if (SBase && !RI.isSGPRClass(MRI.getRegClass(SBase->getReg()))) {
+    unsigned SGPR = readlaneVGPRToSGPR(SBase->getReg(), MI, MRI);
+    SBase->setReg(SGPR);
+  }
+}
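readlaneVGPRToSGPR rebuilds an SGPR value one 32-bit lane at a time, so a 64-bit SMRD base pointer that ended up in VGPRs costs two v_readfirstlane_b32 plus a REG_SEQUENCE. A hypothetical before/after sketch of the resulting machine IR (virtual register numbers are invented for illustration):

    // Before: uniform pointer held in a 64-bit VGPR pair.
    //   %vptr:vreg_64 = ...
    //   %sdst = S_LOAD_DWORD_IMM %vptr, 0      ; illegal: sbase must be SGPRs
    //
    // After legalizeOperandsSMRD:
    //   %s0:sgpr_32 = V_READFIRSTLANE_B32 %vptr:sub0
    //   %s1:sgpr_32 = V_READFIRSTLANE_B32 %vptr:sub1
    //   %sbase:sreg_64 = REG_SEQUENCE %s0, sub0, %s1, sub1
    //   %sdst = S_LOAD_DWORD_IMM %sbase, 0
    //
    // This is only safe because, as the comment above notes, SMRD is selected
    // only for loads with uniform pointers, so every lane holds the same value
    // and reading lane 0 loses nothing.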
- for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { - MachineOperand &Op = MI->getOperand(I); + for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; unsigned DstReg = MRI.createVirtualRegister(RC); // MI is a PHI instruction. - MachineBasicBlock *InsertBB = MI->getOperand(I + 1).getMBB(); + MachineBasicBlock *InsertBB = MI.getOperand(I + 1).getMBB(); MachineBasicBlock::iterator Insert = InsertBB->getFirstTerminator(); - BuildMI(*InsertBB, Insert, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); + BuildMI(*InsertBB, Insert, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); Op.setReg(DstReg); } } @@ -2030,15 +2273,15 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // REG_SEQUENCE doesn't really require operand legalization, but if one has a // VGPR dest type and SGPR sources, insert copies so all operands are // VGPRs. This seems to help operand folding / the register coalescer. - if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) { - MachineBasicBlock *MBB = MI->getParent(); - const TargetRegisterClass *DstRC = getOpRegClass(*MI, 0); + if (MI.getOpcode() == AMDGPU::REG_SEQUENCE) { + MachineBasicBlock *MBB = MI.getParent(); + const TargetRegisterClass *DstRC = getOpRegClass(MI, 0); if (RI.hasVGPRs(DstRC)) { // Update all the operands so they are VGPR register classes. These may // not be the same register class because REG_SEQUENCE supports mixing // subregister index types e.g. sub0_sub1 + sub2 + sub3 - for (unsigned I = 1, E = MI->getNumOperands(); I != E; I += 2) { - MachineOperand &Op = MI->getOperand(I); + for (unsigned I = 1, E = MI.getNumOperands(); I != E; I += 2) { + MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !TargetRegisterInfo::isVirtualRegister(Op.getReg())) continue; @@ -2049,8 +2292,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { unsigned DstReg = MRI.createVirtualRegister(VRC); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), DstReg) - .addOperand(Op); + BuildMI(*MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), DstReg) + .addOperand(Op); Op.setReg(DstReg); Op.setIsKill(); @@ -2062,17 +2305,33 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // Legalize INSERT_SUBREG // src0 must have the same register class as dst - if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) { - unsigned Dst = MI->getOperand(0).getReg(); - unsigned Src0 = MI->getOperand(1).getReg(); + if (MI.getOpcode() == AMDGPU::INSERT_SUBREG) { + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src0 = MI.getOperand(1).getReg(); const TargetRegisterClass *DstRC = MRI.getRegClass(Dst); const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0); if (DstRC != Src0RC) { - MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock &MBB = *MI.getParent(); unsigned NewSrc0 = MRI.createVirtualRegister(DstRC); - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0) - .addReg(Src0); - MI->getOperand(1).setReg(NewSrc0); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::COPY), NewSrc0) + .addReg(Src0); + MI.getOperand(1).setReg(NewSrc0); + } + return; + } + + // Legalize MIMG + if (isMIMG(MI)) { + MachineOperand *SRsrc = getNamedOperand(MI, AMDGPU::OpName::srsrc); + if (SRsrc && !RI.isSGPRClass(MRI.getRegClass(SRsrc->getReg()))) { + unsigned SGPR = readlaneVGPRToSGPR(SRsrc->getReg(), MI, MRI); + SRsrc->setReg(SGPR); + } + + MachineOperand *SSamp = getNamedOperand(MI, 
AMDGPU::OpName::ssamp); + if (SSamp && !RI.isSGPRClass(MRI.getRegClass(SSamp->getReg()))) { + unsigned SGPR = readlaneVGPRToSGPR(SSamp->getReg(), MI, MRI); + SSamp->setReg(SGPR); } return; } @@ -2081,11 +2340,11 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { // FIXME: If we start using the non-addr64 instructions for compute, we // may need to legalize them here. int SRsrcIdx = - AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::srsrc); + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::srsrc); if (SRsrcIdx != -1) { // We have an MUBUF instruction - MachineOperand *SRsrc = &MI->getOperand(SRsrcIdx); - unsigned SRsrcRC = get(MI->getOpcode()).OpInfo[SRsrcIdx].RegClass; + MachineOperand *SRsrc = &MI.getOperand(SRsrcIdx); + unsigned SRsrcRC = get(MI.getOpcode()).OpInfo[SRsrcIdx].RegClass; if (RI.getCommonSubClass(MRI.getRegClass(SRsrc->getReg()), RI.getRegClass(SRsrcRC))) { // The operands are legal. @@ -2093,7 +2352,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { return; } - MachineBasicBlock &MBB = *MI->getParent(); + MachineBasicBlock &MBB = *MI.getParent(); // Extract the ptr from the resource descriptor. unsigned SRsrcPtr = buildExtractSubReg(MI, MRI, *SRsrc, @@ -2107,30 +2366,27 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); // Zero64 = 0 - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B64), - Zero64) - .addImm(0); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B64), Zero64) + .addImm(0); // SRsrcFormatLo = RSRC_DATA_FORMAT{31-0} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatLo) - .addImm(RsrcDataFormat & 0xFFFFFFFF); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatLo) + .addImm(RsrcDataFormat & 0xFFFFFFFF); // SRsrcFormatHi = RSRC_DATA_FORMAT{63-32} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - SRsrcFormatHi) - .addImm(RsrcDataFormat >> 32); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::S_MOV_B32), SRsrcFormatHi) + .addImm(RsrcDataFormat >> 32); // NewSRsrc = {Zero64, SRsrcFormat} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) - .addReg(Zero64) - .addImm(AMDGPU::sub0_sub1) - .addReg(SRsrcFormatLo) - .addImm(AMDGPU::sub2) - .addReg(SRsrcFormatHi) - .addImm(AMDGPU::sub3); - - MachineOperand *VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewSRsrc) + .addReg(Zero64) + .addImm(AMDGPU::sub0_sub1) + .addReg(SRsrcFormatLo) + .addImm(AMDGPU::sub2) + .addReg(SRsrcFormatHi) + .addImm(AMDGPU::sub3); + + MachineOperand *VAddr = getNamedOperand(MI, AMDGPU::OpName::vaddr); unsigned NewVAddr = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); if (VAddr) { // This is already an ADDR64 instruction so we need to add the pointer @@ -2139,7 +2395,7 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { unsigned NewVAddrHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); // NewVaddrLo = SRsrcPtr:sub0 + VAddr:sub0 - DebugLoc DL = MI->getDebugLoc(); + DebugLoc DL = MI.getDebugLoc(); BuildMI(MBB, MI, DL, get(AMDGPU::V_ADD_I32_e32), NewVAddrLo) .addReg(SRsrcPtr, 0, AMDGPU::sub0) .addReg(VAddr->getReg(), 0, AMDGPU::sub0); @@ -2150,82 +2406,82 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { .addReg(VAddr->getReg(), 0, AMDGPU::sub1); // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) - .addReg(NewVAddrLo) - 
.addImm(AMDGPU::sub0) - .addReg(NewVAddrHi) - .addImm(AMDGPU::sub1); + BuildMI(MBB, MI, MI.getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) + .addReg(NewVAddrLo) + .addImm(AMDGPU::sub0) + .addReg(NewVAddrHi) + .addImm(AMDGPU::sub1); } else { // This instruction is the _OFFSET variant, so we need to convert it to // ADDR64. - assert(MBB.getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() - < AMDGPUSubtarget::VOLCANIC_ISLANDS && + assert(MBB.getParent()->getSubtarget<SISubtarget>().getGeneration() + < SISubtarget::VOLCANIC_ISLANDS && "FIXME: Need to emit flat atomics here"); - MachineOperand *VData = getNamedOperand(*MI, AMDGPU::OpName::vdata); - MachineOperand *Offset = getNamedOperand(*MI, AMDGPU::OpName::offset); - MachineOperand *SOffset = getNamedOperand(*MI, AMDGPU::OpName::soffset); - unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI->getOpcode()); + MachineOperand *VData = getNamedOperand(MI, AMDGPU::OpName::vdata); + MachineOperand *Offset = getNamedOperand(MI, AMDGPU::OpName::offset); + MachineOperand *SOffset = getNamedOperand(MI, AMDGPU::OpName::soffset); + unsigned Addr64Opcode = AMDGPU::getAddr64Inst(MI.getOpcode()); // Atomics with return have an additional tied operand and are // missing some of the special bits. - MachineOperand *VDataIn = getNamedOperand(*MI, AMDGPU::OpName::vdata_in); + MachineOperand *VDataIn = getNamedOperand(MI, AMDGPU::OpName::vdata_in); MachineInstr *Addr64; if (!VDataIn) { // Regular buffer load / store. - MachineInstrBuilder MIB - = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. - .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset); + MachineInstrBuilder MIB = + BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr. + .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset); // Atomics do not have this operand. - if (const MachineOperand *GLC - = getNamedOperand(*MI, AMDGPU::OpName::glc)) { + if (const MachineOperand *GLC = + getNamedOperand(MI, AMDGPU::OpName::glc)) { MIB.addImm(GLC->getImm()); } - MIB.addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)); + MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)); - if (const MachineOperand *TFE - = getNamedOperand(*MI, AMDGPU::OpName::tfe)) { + if (const MachineOperand *TFE = + getNamedOperand(MI, AMDGPU::OpName::tfe)) { MIB.addImm(TFE->getImm()); } - MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + MIB.setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); Addr64 = MIB; } else { // Atomics with return. - Addr64 = BuildMI(MBB, MI, MI->getDebugLoc(), get(Addr64Opcode)) - .addOperand(*VData) - .addOperand(*VDataIn) - .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. - // This will be replaced later - // with the new value of vaddr. - .addOperand(*SRsrc) - .addOperand(*SOffset) - .addOperand(*Offset) - .addImm(getNamedImmOperand(*MI, AMDGPU::OpName::slc)) - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); + Addr64 = BuildMI(MBB, MI, MI.getDebugLoc(), get(Addr64Opcode)) + .addOperand(*VData) + .addOperand(*VDataIn) + .addReg(AMDGPU::NoRegister) // Dummy value for vaddr. + // This will be replaced later + // with the new value of vaddr.
+ .addOperand(*SRsrc) + .addOperand(*SOffset) + .addOperand(*Offset) + .addImm(getNamedImmOperand(MI, AMDGPU::OpName::slc)) + .setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); } - MI->removeFromParent(); - MI = Addr64; + MI.removeFromParent(); // NewVaddr = {NewVaddrHi, NewVaddrLo} - BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), NewVAddr) - .addReg(SRsrcPtr, 0, AMDGPU::sub0) - .addImm(AMDGPU::sub0) - .addReg(SRsrcPtr, 0, AMDGPU::sub1) - .addImm(AMDGPU::sub1); - - VAddr = getNamedOperand(*MI, AMDGPU::OpName::vaddr); - SRsrc = getNamedOperand(*MI, AMDGPU::OpName::srsrc); + BuildMI(MBB, Addr64, Addr64->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), + NewVAddr) + .addReg(SRsrcPtr, 0, AMDGPU::sub0) + .addImm(AMDGPU::sub0) + .addReg(SRsrcPtr, 0, AMDGPU::sub1) + .addImm(AMDGPU::sub1); + + VAddr = getNamedOperand(*Addr64, AMDGPU::OpName::vaddr); + SRsrc = getNamedOperand(*Addr64, AMDGPU::OpName::srsrc); } // Update the instruction to use NewVaddr @@ -2235,300 +2491,85 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const { } } -void SIInstrInfo::splitSMRD(MachineInstr *MI, - const TargetRegisterClass *HalfRC, - unsigned HalfImmOp, unsigned HalfSGPROp, - MachineInstr *&Lo, MachineInstr *&Hi) const { - - DebugLoc DL = MI->getDebugLoc(); - MachineBasicBlock *MBB = MI->getParent(); - MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned RegLo = MRI.createVirtualRegister(HalfRC); - unsigned RegHi = MRI.createVirtualRegister(HalfRC); - unsigned HalfSize = HalfRC->getSize(); - const MachineOperand *OffOp = - getNamedOperand(*MI, AMDGPU::OpName::offset); - const MachineOperand *SBase = getNamedOperand(*MI, AMDGPU::OpName::sbase); - - // The SMRD has an 8-bit offset in dwords on SI and a 20-bit offset in bytes - // on VI. - - bool IsKill = SBase->isKill(); - if (OffOp) { - bool isVI = - MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() >= - AMDGPUSubtarget::VOLCANIC_ISLANDS; - unsigned OffScale = isVI ? 1 : 4; - // Handle the _IMM variant - unsigned LoOffset = OffOp->getImm() * OffScale; - unsigned HiOffset = LoOffset + HalfSize; - Lo = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegLo) - // Use addReg instead of addOperand - // to make sure kill flag is cleared. - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addImm(LoOffset / OffScale); - - if (!isUInt<20>(HiOffset) || (!isVI && !isUInt<8>(HiOffset / OffScale))) { - unsigned OffsetSGPR = - MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_MOV_B32), OffsetSGPR) - .addImm(HiOffset); // The offset in register is in bytes. 
- Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } else { - Hi = BuildMI(*MBB, MI, DL, get(HalfImmOp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addImm(HiOffset / OffScale); - } - } else { - // Handle the _SGPR variant - MachineOperand *SOff = getNamedOperand(*MI, AMDGPU::OpName::soff); - Lo = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegLo) - .addReg(SBase->getReg(), 0, SBase->getSubReg()) - .addOperand(*SOff); - unsigned OffsetSGPR = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - BuildMI(*MBB, MI, DL, get(AMDGPU::S_ADD_I32), OffsetSGPR) - .addReg(SOff->getReg(), 0, SOff->getSubReg()) - .addImm(HalfSize); - Hi = BuildMI(*MBB, MI, DL, get(HalfSGPROp), RegHi) - .addReg(SBase->getReg(), getKillRegState(IsKill), - SBase->getSubReg()) - .addReg(OffsetSGPR); - } - - unsigned SubLo, SubHi; - const TargetRegisterClass *NewDstRC; - switch (HalfSize) { - case 4: - SubLo = AMDGPU::sub0; - SubHi = AMDGPU::sub1; - NewDstRC = &AMDGPU::VReg_64RegClass; - break; - case 8: - SubLo = AMDGPU::sub0_sub1; - SubHi = AMDGPU::sub2_sub3; - NewDstRC = &AMDGPU::VReg_128RegClass; - break; - case 16: - SubLo = AMDGPU::sub0_sub1_sub2_sub3; - SubHi = AMDGPU::sub4_sub5_sub6_sub7; - NewDstRC = &AMDGPU::VReg_256RegClass; - break; - case 32: - SubLo = AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7; - SubHi = AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15; - NewDstRC = &AMDGPU::VReg_512RegClass; - break; - default: - llvm_unreachable("Unhandled HalfSize"); - } - - unsigned OldDst = MI->getOperand(0).getReg(); - unsigned NewDst = MRI.createVirtualRegister(NewDstRC); - - MRI.replaceRegWith(OldDst, NewDst); - - BuildMI(*MBB, MI, DL, get(AMDGPU::REG_SEQUENCE), NewDst) - .addReg(RegLo) - .addImm(SubLo) - .addReg(RegHi) - .addImm(SubHi); -} - -void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, - MachineRegisterInfo &MRI, - SmallVectorImpl<MachineInstr *> &Worklist) const { - MachineBasicBlock *MBB = MI->getParent(); - int DstIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::dst); - assert(DstIdx != -1); - unsigned DstRCID = get(MI->getOpcode()).OpInfo[DstIdx].RegClass; - switch(RI.getRegClass(DstRCID)->getSize()) { - case 4: - case 8: - case 16: { - unsigned NewOpcode = getVALUOp(*MI); - unsigned RegOffset; - unsigned ImmOffset; - - if (MI->getOperand(2).isReg()) { - RegOffset = MI->getOperand(2).getReg(); - ImmOffset = 0; - } else { - assert(MI->getOperand(2).isImm()); - // SMRD instructions take a dword offset on SI and a byte offset on VI - // and MUBUF instructions always take a byte offset.
- ImmOffset = MI->getOperand(2).getImm(); - if (MBB->getParent()->getSubtarget<AMDGPUSubtarget>().getGeneration() <= - AMDGPUSubtarget::SEA_ISLANDS) - ImmOffset <<= 2; - RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - - if (isUInt<12>(ImmOffset)) { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(0); - } else { - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), - RegOffset) - .addImm(ImmOffset); - ImmOffset = 0; - } - } - - unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass); - unsigned DWord0 = RegOffset; - unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - uint64_t RsrcDataFormat = getDefaultRsrcDataFormat(); - - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1) - .addImm(0); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2) - .addImm(RsrcDataFormat & 0xFFFFFFFF); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3) - .addImm(RsrcDataFormat >> 32); - BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc) - .addReg(DWord0) - .addImm(AMDGPU::sub0) - .addReg(DWord1) - .addImm(AMDGPU::sub1) - .addReg(DWord2) - .addImm(AMDGPU::sub2) - .addReg(DWord3) - .addImm(AMDGPU::sub3); - - const MCInstrDesc &NewInstDesc = get(NewOpcode); - const TargetRegisterClass *NewDstRC - = RI.getRegClass(NewInstDesc.OpInfo[0].RegClass); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - unsigned DstReg = MI->getOperand(0).getReg(); - MRI.replaceRegWith(DstReg, NewDstReg); - - MachineInstr *NewInst = - BuildMI(*MBB, MI, MI->getDebugLoc(), NewInstDesc, NewDstReg) - .addOperand(MI->getOperand(1)) // sbase - .addReg(SRsrc) - .addImm(0) - .addImm(ImmOffset) - .addImm(0) // glc - .addImm(0) // slc - .addImm(0) // tfe - .setMemRefs(MI->memoperands_begin(), MI->memoperands_end()); - MI->eraseFromParent(); - - legalizeOperands(NewInst); - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); - break; - } - case 32: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_128RegClass, AMDGPU::S_LOAD_DWORDX4_IMM, - AMDGPU::S_LOAD_DWORDX4_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI, Worklist); - moveSMRDToVALU(Hi, MRI, Worklist); - break; - } - - case 64: { - MachineInstr *Lo, *Hi; - splitSMRD(MI, &AMDGPU::SReg_256RegClass, AMDGPU::S_LOAD_DWORDX8_IMM, - AMDGPU::S_LOAD_DWORDX8_SGPR, Lo, Hi); - MI->eraseFromParent(); - moveSMRDToVALU(Lo, MRI, Worklist); - moveSMRDToVALU(Hi, MRI, Worklist); - break; - } - } -} - void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { SmallVector<MachineInstr *, 128> Worklist; Worklist.push_back(&TopInst); while (!Worklist.empty()) { - MachineInstr *Inst = Worklist.pop_back_val(); - MachineBasicBlock *MBB = Inst->getParent(); + MachineInstr &Inst = *Worklist.pop_back_val(); + MachineBasicBlock *MBB = Inst.getParent(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - unsigned Opcode = Inst->getOpcode(); - unsigned NewOpcode = getVALUOp(*Inst); + unsigned Opcode = Inst.getOpcode(); + unsigned NewOpcode = getVALUOp(Inst); // Handle some special cases switch (Opcode) { default: - if (isSMRD(*Inst)) { - moveSMRDToVALU(Inst, MRI, Worklist); - continue; - } break; case AMDGPU::S_AND_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_AND_B32_e64); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case 
AMDGPU::S_OR_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_OR_B32_e64); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_XOR_B64: splitScalar64BitBinaryOp(Worklist, Inst, AMDGPU::V_XOR_B32_e64); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_NOT_B64: splitScalar64BitUnaryOp(Worklist, Inst, AMDGPU::V_NOT_B32_e32); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_BCNT1_I32_B64: splitScalar64BitBCNT(Worklist, Inst); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; case AMDGPU::S_BFE_I64: { splitScalar64BitBFE(Worklist, Inst); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; } case AMDGPU::S_LSHL_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B32: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B32_e64; swapOperands(Inst); } break; case AMDGPU::S_LSHL_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHLREV_B64; swapOperands(Inst); } break; case AMDGPU::S_ASHR_I64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_ASHRREV_I64; swapOperands(Inst); } break; case AMDGPU::S_LSHR_B64: - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) { + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) { NewOpcode = AMDGPU::V_LSHRREV_B64; swapOperands(Inst); } @@ -2536,9 +2577,18 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { case AMDGPU::S_ABS_I32: lowerScalarAbs(Worklist, Inst); - Inst->eraseFromParent(); + Inst.eraseFromParent(); continue; + case AMDGPU::S_CBRANCH_SCC0: + case AMDGPU::S_CBRANCH_SCC1: + // Clear unused bits of vcc + BuildMI(*MBB, Inst, Inst.getDebugLoc(), get(AMDGPU::S_AND_B64), + AMDGPU::VCC) + .addReg(AMDGPU::EXEC) + .addReg(AMDGPU::VCC); + break; + case AMDGPU::S_BFE_U64: case AMDGPU::S_BFM_B64: llvm_unreachable("Moving this op to VALU not implemented"); @@ -2553,34 +2603,36 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { // Use the new VALU Opcode. const MCInstrDesc &NewDesc = get(NewOpcode); - Inst->setDesc(NewDesc); + Inst.setDesc(NewDesc); // Remove any references to SCC. Vector instructions can't read from it, and // we're just about to add the implicit use / defs of VCC, and we don't want // both. - for (unsigned i = Inst->getNumOperands() - 1; i > 0; --i) { - MachineOperand &Op = Inst->getOperand(i); - if (Op.isReg() && Op.getReg() == AMDGPU::SCC) - Inst->RemoveOperand(i); + for (unsigned i = Inst.getNumOperands() - 1; i > 0; --i) { + MachineOperand &Op = Inst.getOperand(i); + if (Op.isReg() && Op.getReg() == AMDGPU::SCC) { + Inst.RemoveOperand(i); + addSCCDefUsersToVALUWorklist(Inst, Worklist); + } } if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) { // We are converting these to a BFE, so we need to add the missing // operands for the size and offset.
unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16; - Inst->addOperand(MachineOperand::CreateImm(0)); - Inst->addOperand(MachineOperand::CreateImm(Size)); + Inst.addOperand(MachineOperand::CreateImm(0)); + Inst.addOperand(MachineOperand::CreateImm(Size)); } else if (Opcode == AMDGPU::S_BCNT1_I32_B32) { // The VALU version adds the second operand to the result, so insert an // extra 0 operand. - Inst->addOperand(MachineOperand::CreateImm(0)); + Inst.addOperand(MachineOperand::CreateImm(0)); } - Inst->addImplicitDefUseOperands(*Inst->getParent()->getParent()); + Inst.addImplicitDefUseOperands(*Inst.getParent()->getParent()); if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) { - const MachineOperand &OffsetWidthOp = Inst->getOperand(2); + const MachineOperand &OffsetWidthOp = Inst.getOperand(2); // If we need to move this to VGPRs, we need to unpack the second operand // back into the 2 separate ones for bit offset and width. assert(OffsetWidthOp.isImm() && @@ -2589,50 +2641,41 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. - Inst->RemoveOperand(2); // Remove old immediate. - Inst->addOperand(MachineOperand::CreateImm(Offset)); - Inst->addOperand(MachineOperand::CreateImm(BitWidth)); + Inst.RemoveOperand(2); // Remove old immediate. + Inst.addOperand(MachineOperand::CreateImm(Offset)); + Inst.addOperand(MachineOperand::CreateImm(BitWidth)); } - // Update the destination register class. - const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(*Inst); - if (!NewDstRC) - continue; + bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef(); + unsigned NewDstReg = AMDGPU::NoRegister; + if (HasDst) { + // Update the destination register class. 
+ const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst); + if (!NewDstRC) + continue; - unsigned DstReg = Inst->getOperand(0).getReg(); - unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC); - MRI.replaceRegWith(DstReg, NewDstReg); + unsigned DstReg = Inst.getOperand(0).getReg(); + NewDstReg = MRI.createVirtualRegister(NewDstRC); + MRI.replaceRegWith(DstReg, NewDstReg); + } // Legalize the operands legalizeOperands(Inst); - addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); + if (HasDst) + addUsersToMoveToVALUWorklist(NewDstReg, MRI, Worklist); } } -//===----------------------------------------------------------------------===// -// Indirect addressing callbacks -//===----------------------------------------------------------------------===// - -unsigned SIInstrInfo::calculateIndirectAddress(unsigned RegIndex, - unsigned Channel) const { - assert(Channel == 0); - return RegIndex; -} - -const TargetRegisterClass *SIInstrInfo::getIndirectAddrRegClass() const { - return &AMDGPU::VGPR_32RegClass; -} - void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); + DebugLoc DL = Inst.getDebugLoc(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src = Inst->getOperand(1); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src = Inst.getOperand(1); unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -2649,15 +2692,14 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, } void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); + SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - DebugLoc DL = Inst->getDebugLoc(); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + DebugLoc DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; @@ -2703,16 +2745,15 @@ void SIInstrInfo::splitScalar64BitUnaryOp( } void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst, - unsigned Opcode) const { - MachineBasicBlock &MBB = *Inst->getParent(); + SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, + unsigned Opcode) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src0 = Inst->getOperand(1); - MachineOperand &Src1 = Inst->getOperand(2); - DebugLoc DL = Inst->getDebugLoc(); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src0 = Inst.getOperand(1); + MachineOperand &Src1 = Inst.getOperand(2); + DebugLoc DL = Inst.getDebugLoc(); MachineBasicBlock::iterator MII = Inst; @@ -2738,9 +2779,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp( const TargetRegisterClass *NewDestSubRC = RI.getSubRegClass(NewDestRC, AMDGPU::sub0); unsigned DestSub0 = 
MRI.createVirtualRegister(NewDestSubRC); - MachineInstr *LoHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub0) - .addOperand(SrcReg0Sub0) - .addOperand(SrcReg1Sub0); + MachineInstr &LoHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub0) + .addOperand(SrcReg0Sub0) + .addOperand(SrcReg1Sub0); MachineOperand SrcReg0Sub1 = buildExtractSubRegOrImm(MII, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); @@ -2748,9 +2789,9 @@ void SIInstrInfo::splitScalar64BitBinaryOp( AMDGPU::sub1, Src1SubRC); unsigned DestSub1 = MRI.createVirtualRegister(NewDestSubRC); - MachineInstr *HiHalf = BuildMI(MBB, MII, DL, InstDesc, DestSub1) - .addOperand(SrcReg0Sub1) - .addOperand(SrcReg1Sub1); + MachineInstr &HiHalf = *BuildMI(MBB, MII, DL, InstDesc, DestSub1) + .addOperand(SrcReg0Sub1) + .addOperand(SrcReg1Sub1); unsigned FullDestReg = MRI.createVirtualRegister(NewDestRC); BuildMI(MBB, MII, DL, get(TargetOpcode::REG_SEQUENCE), FullDestReg) @@ -2770,16 +2811,16 @@ void SIInstrInfo::splitScalar64BitBinaryOp( addUsersToMoveToVALUWorklist(FullDestReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); +void SIInstrInfo::splitScalar64BitBCNT( + SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); + DebugLoc DL = Inst.getDebugLoc(); - MachineOperand &Dest = Inst->getOperand(0); - MachineOperand &Src = Inst->getOperand(1); + MachineOperand &Dest = Inst.getOperand(0); + MachineOperand &Src = Inst.getOperand(1); const MCInstrDesc &InstDesc = get(AMDGPU::V_BCNT_U32_B32_e64); const TargetRegisterClass *SrcRC = Src.isReg() ? @@ -2812,24 +2853,22 @@ void SIInstrInfo::splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist } void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, - MachineInstr *Inst) const { - MachineBasicBlock &MBB = *Inst->getParent(); + MachineInstr &Inst) const { + MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); MachineBasicBlock::iterator MII = Inst; - DebugLoc DL = Inst->getDebugLoc(); + DebugLoc DL = Inst.getDebugLoc(); - MachineOperand &Dest = Inst->getOperand(0); - uint32_t Imm = Inst->getOperand(2).getImm(); + MachineOperand &Dest = Inst.getOperand(0); + uint32_t Imm = Inst.getOperand(2).getImm(); uint32_t Offset = Imm & 0x3f; // Extract bits [5:0]. uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16]. (void) Offset; // Only sext_inreg cases handled. 
- assert(Inst->getOpcode() == AMDGPU::S_BFE_I64 && - BitWidth <= 32 && - Offset == 0 && - "Not implemented"); + assert(Inst.getOpcode() == AMDGPU::S_BFE_I64 && BitWidth <= 32 && + Offset == 0 && "Not implemented"); if (BitWidth < 32) { unsigned MidRegLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -2837,9 +2876,9 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); BuildMI(MBB, MII, DL, get(AMDGPU::V_BFE_I32), MidRegLo) - .addReg(Inst->getOperand(1).getReg(), 0, AMDGPU::sub0) - .addImm(0) - .addImm(BitWidth); + .addReg(Inst.getOperand(1).getReg(), 0, AMDGPU::sub0) + .addImm(0) + .addImm(BitWidth); BuildMI(MBB, MII, DL, get(AMDGPU::V_ASHRREV_I32_e32), MidRegHi) .addImm(31) @@ -2856,7 +2895,7 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, return; } - MachineOperand &Src = Inst->getOperand(1); + MachineOperand &Src = Inst.getOperand(1); unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VReg_64RegClass); @@ -2887,6 +2926,22 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } +void SIInstrInfo::addSCCDefUsersToVALUWorklist( + MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const { + // This assumes that all the users of SCC are in the same block + // as the SCC def. + for (MachineInstr &MI : + llvm::make_range(MachineBasicBlock::iterator(SCCDefInst), + SCCDefInst.getParent()->end())) { + // Exit if we find another SCC def. + if (MI.findRegisterDefOperandIdx(AMDGPU::SCC) != -1) + return; + + if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) + Worklist.push_back(&MI); + } +} + const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( const MachineInstr &Inst) const { const TargetRegisterClass *NewDstRC = getOpRegClass(Inst, 0); @@ -2912,9 +2967,9 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( } // Find the one SGPR operand we are allowed to use. -unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, +unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const { - const MCInstrDesc &Desc = MI->getDesc(); + const MCInstrDesc &Desc = MI.getDesc(); // Find the one SGPR operand we are allowed to use. // @@ -2925,19 +2980,19 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, // // If the operand's class is an SGPR, we can never move it. 
- unsigned SGPRReg = findImplicitSGPRRead(*MI); + unsigned SGPRReg = findImplicitSGPRRead(MI); if (SGPRReg != AMDGPU::NoRegister) return SGPRReg; unsigned UsedSGPRs[3] = { AMDGPU::NoRegister }; - const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo(); + const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); for (unsigned i = 0; i < 3; ++i) { int Idx = OpIndices[i]; if (Idx == -1) break; - const MachineOperand &MO = MI->getOperand(Idx); + const MachineOperand &MO = MI.getOperand(Idx); if (!MO.isReg()) continue; @@ -2981,70 +3036,6 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr *MI, return SGPRReg; } -MachineInstrBuilder SIInstrInfo::buildIndirectWrite( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_DST_V1)) - .addReg(IndirectBaseReg, RegState::Define) - .addOperand(I->getOperand(0)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0) - .addReg(ValueReg); -} - -MachineInstrBuilder SIInstrInfo::buildIndirectRead( - MachineBasicBlock *MBB, - MachineBasicBlock::iterator I, - unsigned ValueReg, - unsigned Address, unsigned OffsetReg) const { - const DebugLoc &DL = MBB->findDebugLoc(I); - unsigned IndirectBaseReg = AMDGPU::VGPR_32RegClass.getRegister( - getIndirectIndexBegin(*MBB->getParent())); - - return BuildMI(*MBB, I, DL, get(AMDGPU::SI_INDIRECT_SRC_V1)) - .addOperand(I->getOperand(0)) - .addOperand(I->getOperand(1)) - .addReg(IndirectBaseReg) - .addReg(OffsetReg) - .addImm(0); - -} - -void SIInstrInfo::reserveIndirectRegisters(BitVector &Reserved, - const MachineFunction &MF) const { - int End = getIndirectIndexEnd(MF); - int Begin = getIndirectIndexBegin(MF); - - if (End == -1) - return; - - - for (int Index = Begin; Index <= End; ++Index) - Reserved.set(AMDGPU::VGPR_32RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 1); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_64RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 2); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_96RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 3); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_128RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 7); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_256RegClass.getRegister(Index)); - - for (int Index = std::max(0, Begin - 15); Index <= End; ++Index) - Reserved.set(AMDGPU::VReg_512RegClass.getRegister(Index)); -} - MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI, unsigned OperandName) const { int Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), OperandName); @@ -3059,9 +3050,9 @@ uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const { if (ST.isAmdHsaOS()) { RsrcDataFormat |= (1ULL << 56); - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - // Set MTYPE = 2 - RsrcDataFormat |= (2ULL << 59); + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) + // Set MTYPE = 2 + RsrcDataFormat |= (2ULL << 59); } return RsrcDataFormat; @@ -3072,22 +3063,103 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const { AMDGPU::RSRC_TID_ENABLE | 0xffffffff; // Size; + uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1; + + Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) | + // 
IndexStride = 64 + (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT); + // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17]. // Clear them unless we want a huge stride. - if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) Rsrc23 &= ~AMDGPU::RSRC_DATA_FORMAT; return Rsrc23; } -bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr *MI) const { - unsigned Opc = MI->getOpcode(); +bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); return isSMRD(Opc); } -bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr *MI) const { - unsigned Opc = MI->getOpcode(); +bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc); } + +unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { + unsigned Opc = MI.getOpcode(); + const MCInstrDesc &Desc = getMCOpcodeFromPseudo(Opc); + unsigned DescSize = Desc.getSize(); + + // If we have a definitive size, we can use it. Otherwise we need to inspect + // the operands to know the size. + if (DescSize == 8 || DescSize == 4) + return DescSize; + + assert(DescSize == 0); + + // 4-byte instructions may have a 32-bit literal encoded after them. Check + // operands that could ever be literals. + if (isVALU(MI) || isSALU(MI)) { + int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0); + if (Src0Idx == -1) + return 4; // No operands. + + if (isLiteralConstant(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx))) + return 8; + + int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); + if (Src1Idx == -1) + return 4; + + if (isLiteralConstant(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx))) + return 8; + + return 4; + } + + switch (Opc) { + case TargetOpcode::IMPLICIT_DEF: + case TargetOpcode::KILL: + case TargetOpcode::DBG_VALUE: + case TargetOpcode::BUNDLE: + case TargetOpcode::EH_LABEL: + return 0; + case TargetOpcode::INLINEASM: { + const MachineFunction *MF = MI.getParent()->getParent(); + const char *AsmStr = MI.getOperand(0).getSymbolName(); + return getInlineAsmLength(AsmStr, *MF->getTarget().getMCAsmInfo()); + } + default: + llvm_unreachable("unable to find instruction size"); + } +} + +ArrayRef<std::pair<int, const char *>> +SIInstrInfo::getSerializableTargetIndices() const { + static const std::pair<int, const char *> TargetIndices[] = { + {AMDGPU::TI_CONSTDATA_START, "amdgpu-constdata-start"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD0, "amdgpu-scratch-rsrc-dword0"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD1, "amdgpu-scratch-rsrc-dword1"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD2, "amdgpu-scratch-rsrc-dword2"}, + {AMDGPU::TI_SCRATCH_RSRC_DWORD3, "amdgpu-scratch-rsrc-dword3"}}; + return makeArrayRef(TargetIndices); +} + +/// This is used by the post-RA scheduler (SchedulePostRAList.cpp). The +/// post-RA version of misched uses CreateTargetMIHazardRecognizer. +ScheduleHazardRecognizer * +SIInstrInfo::CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, +                                                const ScheduleDAG *DAG) const { + return new GCNHazardRecognizer(DAG->MF); +} + +/// This is the hazard recognizer used at -O0 by the PostRAHazardRecognizer +/// pass. +ScheduleHazardRecognizer * +SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const { + return new GCNHazardRecognizer(MF); +}
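
For readers unfamiliar with the interface behind the two CreateTargetPostRAHazardRecognizer overrides added at the end of the patch: both return implementations of LLVM's generic ScheduleHazardRecognizer. The sketch below shows the shape of that interface with a deliberately trivial, hypothetical NoopHazardRecognizer; it is illustrative only and not part of this change. The real implementation used here is GCNHazardRecognizer, which lives elsewhere in the tree.

#include "llvm/CodeGen/ScheduleHazardRecognizer.h"

namespace {
// Trivial sketch: a recognizer that never reports a hazard. A real
// implementation such as GCNHazardRecognizer records recently issued
// instructions in these callbacks and returns NoopHazard when the
// scheduler must insert a delay before the candidate instruction.
class NoopHazardRecognizer : public llvm::ScheduleHazardRecognizer {
public:
  HazardType getHazardType(llvm::SUnit *SU, int Stalls) override {
    return NoHazard; // SU may always issue in the current cycle.
  }
  unsigned PreEmitNoops(llvm::SUnit *SU) override {
    return 0; // Number of nops to emit before SU.
  }
  void EmitInstruction(llvm::SUnit *SU) override {} // Record an issued instruction.
  void AdvanceCycle() override {} // Advance the modeled cycle.
  void Reset() override {}        // Drop all pending hazard state.
};
} // end anonymous namespace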
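
The operand scan in the new getInstSizeInBytes encodes a simple rule: an otherwise 4-byte VALU/SALU encoding grows to 8 bytes when src0 or src1 is a 32-bit literal constant, since the literal is emitted as an extra dword after the instruction. A standalone restatement of that rule, with hypothetical parameter names standing in for the SIInstrInfo queries:

// Sketch only: the two *IsLiteral flags stand in for the
// isLiteralConstant(...) checks on src0/src1 in the function above.
unsigned valuOrSaluSizeInBytes(bool HasSrc0, bool Src0IsLiteral,
                               bool HasSrc1, bool Src1IsLiteral) {
  if (!HasSrc0)
    return 4; // No source operands, so no literal dword can follow.
  if (Src0IsLiteral)
    return 8; // The literal adds one dword to the encoding.
  if (!HasSrc1)
    return 4;
  return Src1IsLiteral ? 8 : 4;
}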