Diffstat (limited to 'lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp | 114
1 file changed, 83 insertions(+), 31 deletions(-)
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index 74d77d328019..4eac03168760 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -142,16 +142,20 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
   switch (Opc) {
   case AMDGPU::V_MAC_F32_e64:
   case AMDGPU::V_MAC_F16_e64:
-  case AMDGPU::V_FMAC_F32_e64: {
+  case AMDGPU::V_FMAC_F32_e64:
+  case AMDGPU::V_FMAC_F16_e64: {
     // Special case for mac. Since this is replaced with mad when folded into
     // src2, we need to check the legality for the final instruction.
     int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
     if (static_cast<int>(OpNo) == Src2Idx) {
-      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
-      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+      bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
+                   Opc == AMDGPU::V_FMAC_F16_e64;
+      bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
+                   Opc == AMDGPU::V_FMAC_F32_e64;
       unsigned Opc = IsFMA ?
-        AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+        (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
+        (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
       const MCInstrDesc &MadDesc = TII->get(Opc);
       return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType);
     }
@@ -235,9 +239,11 @@ static bool updateOperand(FoldCandidate &Fold,
 
   if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
     MachineBasicBlock *MBB = MI->getParent();
-    auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
-    if (Liveness != MachineBasicBlock::LQR_Dead)
+    auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI, 16);
+    if (Liveness != MachineBasicBlock::LQR_Dead) {
+      LLVM_DEBUG(dbgs() << "Not shrinking " << MI << " due to vcc liveness\n");
       return false;
+    }
 
     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
     int Op32 = Fold.getShrinkOpcode();
@@ -248,7 +254,7 @@ static bool updateOperand(FoldCandidate &Fold,
     bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
 
     const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
-    unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
+    Register NewReg0 = MRI.createVirtualRegister(Dst0RC);
 
     MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
@@ -314,12 +320,15 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
   // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
   unsigned Opc = MI->getOpcode();
   if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
-       Opc == AMDGPU::V_FMAC_F32_e64) &&
+       Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e64) &&
       (int)OpNo == AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)) {
-    bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64;
-    bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64;
+    bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e64 ||
+                 Opc == AMDGPU::V_FMAC_F16_e64;
+    bool IsF32 = Opc == AMDGPU::V_MAC_F32_e64 ||
+                 Opc == AMDGPU::V_FMAC_F32_e64;
     unsigned NewOpc = IsFMA ?
-      AMDGPU::V_FMA_F32 : (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
+      (IsF32 ? AMDGPU::V_FMA_F32 : AMDGPU::V_FMA_F16_gfx9) :
+      (IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16);
 
     // Check if changing this to a v_mad_{f16, f32} instruction will allow us
     // to fold the operand.
@@ -435,7 +444,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
       OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
     return false;
 
-  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
+  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
+      TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
     UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
     return true;
   }
@@ -443,8 +453,8 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
   if (!OpToFold.isReg())
     return false;
 
-  unsigned UseReg = OpToFold.getReg();
-  if (!TargetRegisterInfo::isVirtualRegister(UseReg))
+  Register UseReg = OpToFold.getReg();
+  if (!Register::isVirtualRegister(UseReg))
     return false;
 
   if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
@@ -481,6 +491,9 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
       return false; // Can only fold splat constants
   }
 
+  if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
+    return false;
+
   FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
   return true;
 }
@@ -518,7 +531,7 @@ void SIFoldOperands::foldOperand(
   // REG_SEQUENCE instructions, so we have to fold them into the
   // uses of REG_SEQUENCE.
   if (UseMI->isRegSequence()) {
-    unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
+    Register RegSeqDstReg = UseMI->getOperand(0).getReg();
     unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
 
     MachineRegisterInfo::use_iterator Next;
@@ -569,15 +582,18 @@ void SIFoldOperands::foldOperand(
     OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
 
   if (FoldingImmLike && UseMI->isCopy()) {
-    unsigned DestReg = UseMI->getOperand(0).getReg();
-    const TargetRegisterClass *DestRC
-      = TargetRegisterInfo::isVirtualRegister(DestReg) ?
-      MRI->getRegClass(DestReg) :
-      TRI->getPhysRegClass(DestReg);
-
-    unsigned SrcReg = UseMI->getOperand(1).getReg();
-    if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
-        TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+    Register DestReg = UseMI->getOperand(0).getReg();
+
+    // Don't fold into a copy to a physical register. Doing so would interfere
+    // with the register coalescer's logic which would avoid redundant
+    // initializations.
+    if (DestReg.isPhysical())
+      return;
+
+    const TargetRegisterClass *DestRC = MRI->getRegClass(DestReg);
+
+    Register SrcReg = UseMI->getOperand(1).getReg();
+    if (SrcReg.isVirtual()) { // XXX - This can be an assert?
       const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
       if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
         MachineRegisterInfo::use_iterator NextUse;
@@ -613,10 +629,17 @@ void SIFoldOperands::foldOperand(
       return;
 
     UseMI->setDesc(TII->get(MovOp));
+    MachineInstr::mop_iterator ImpOpI = UseMI->implicit_operands().begin();
+    MachineInstr::mop_iterator ImpOpE = UseMI->implicit_operands().end();
+    while (ImpOpI != ImpOpE) {
+      MachineInstr::mop_iterator Tmp = ImpOpI;
+      ImpOpI++;
+      UseMI->RemoveOperand(UseMI->getOperandNo(Tmp));
+    }
     CopiesToReplace.push_back(UseMI);
   } else {
     if (UseMI->isCopy() && OpToFold.isReg() &&
-        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
+        Register::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
         TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
         TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
         !UseMI->getOperand(1).getSubReg()) {
@@ -677,6 +700,9 @@ void SIFoldOperands::foldOperand(
       // =>
       // %sgpr1 = COPY %sgpr0
       UseMI->setDesc(TII->get(AMDGPU::COPY));
+      UseMI->getOperand(1).setReg(OpToFold.getReg());
+      UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
+      UseMI->getOperand(1).setIsKill(false);
       UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
       return;
     }
@@ -708,7 +734,7 @@ void SIFoldOperands::foldOperand(
 
   // Split 64-bit constants into 32-bits for folding.
   if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
-    unsigned UseReg = UseOp.getReg();
+    Register UseReg = UseOp.getReg();
     const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
 
     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
@@ -810,7 +836,7 @@ static MachineOperand *getImmOrMaterializedImm(MachineRegisterInfo &MRI,
   if (Op.isReg()) {
     // If this has a subregister, it obviously is a register source.
     if (Op.getSubReg() != AMDGPU::NoSubRegister ||
-        !TargetRegisterInfo::isVirtualRegister(Op.getReg()))
+        !Register::isVirtualRegister(Op.getReg()))
       return &Op;
 
     MachineInstr *Def = MRI.getVRegDef(Op.getReg());
@@ -1073,6 +1099,13 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
     Copy->addImplicitDefUseOperands(*MF);
 
   for (FoldCandidate &Fold : FoldList) {
+    if (Fold.isReg() && Register::isVirtualRegister(Fold.OpToFold->getReg())) {
+      Register Reg = Fold.OpToFold->getReg();
+      MachineInstr *DefMI = Fold.OpToFold->getParent();
+      if (DefMI->readsRegister(AMDGPU::EXEC, TRI) &&
+          execMayBeModifiedBeforeUse(*MRI, Reg, *DefMI, *Fold.UseMI))
+        continue;
+    }
     if (updateOperand(Fold, *TII, *TRI, *ST)) {
       // Clear kill flags.
       if (Fold.isReg()) {
@@ -1316,6 +1349,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
 
   for (MachineBasicBlock *MBB : depth_first(&MF)) {
     MachineBasicBlock::iterator I, Next;
+
+    MachineOperand *CurrentKnownM0Val = nullptr;
     for (I = MBB->begin(); I != MBB->end(); I = Next) {
       Next = std::next(I);
       MachineInstr &MI = *I;
@@ -1328,6 +1363,25 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
         if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
             !tryFoldOMod(MI))
           tryFoldClamp(MI);
+
+        // Saw an unknown clobber of m0, so we no longer know what it is.
+        if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
+          CurrentKnownM0Val = nullptr;
+        continue;
+      }
+
+      // Specially track simple redefs of m0 to the same value in a block, so we
+      // can erase the later ones.
+      if (MI.getOperand(0).getReg() == AMDGPU::M0) {
+        MachineOperand &NewM0Val = MI.getOperand(1);
+        if (CurrentKnownM0Val && CurrentKnownM0Val->isIdenticalTo(NewM0Val)) {
+          MI.eraseFromParent();
+          continue;
+        }
+
+        // We aren't tracking other physical registers
+        CurrentKnownM0Val = (NewM0Val.isReg() && NewM0Val.getReg().isPhysical()) ?
+          nullptr : &NewM0Val;
         continue;
       }
 
@@ -1339,8 +1393,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       if (!FoldingImm && !OpToFold.isReg())
         continue;
 
-      if (OpToFold.isReg() &&
-          !TargetRegisterInfo::isVirtualRegister(OpToFold.getReg()))
+      if (OpToFold.isReg() && !Register::isVirtualRegister(OpToFold.getReg()))
         continue;
 
       // Prevent folding operands backwards in the function. For example,
@@ -1350,8 +1403,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       //    ...
       //    %vgpr0 = V_MOV_B32_e32 1, implicit %exec
       MachineOperand &Dst = MI.getOperand(0);
-      if (Dst.isReg() &&
-          !TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
+      if (Dst.isReg() && !Register::isVirtualRegister(Dst.getReg()))
         continue;
 
       foldInstOperand(MI, OpToFold);
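
A note on the recurring MAC/FMAC-to-MAD/FMA mapping in the hunks above: folding an immediate into src2 turns v_mac/v_fmac into the corresponding v_mad/v_fma, so inline-constant legality has to be checked against the post-rewrite opcode, and the V_FMAC_F16_e64 case added here has to map to V_FMA_F16_gfx9. The following is a minimal standalone sketch of that mapping, not LLVM code; the enum values and the helper name foldedOpcodeFor are illustrative only.

    // Standalone model of the opcode selection shared by
    // isInlineConstantIfFolded and tryAddToFoldList in the patch above.
    // The enum and helper are hypothetical; only the mapping logic is real.
    #include <cassert>

    enum Opcode {
      V_MAC_F32_e64, V_MAC_F16_e64, V_FMAC_F32_e64, V_FMAC_F16_e64,
      V_MAD_F32, V_MAD_F16, V_FMA_F32, V_FMA_F16_gfx9
    };

    // FMAC variants fold to FMA, MAC variants fold to MAD; the F16 inputs
    // select the F16 result opcodes (the gfx9 encoding in the FMA case).
    static Opcode foldedOpcodeFor(Opcode Opc) {
      bool IsFMA = Opc == V_FMAC_F32_e64 || Opc == V_FMAC_F16_e64;
      bool IsF32 = Opc == V_MAC_F32_e64 || Opc == V_FMAC_F32_e64;
      return IsFMA ? (IsF32 ? V_FMA_F32 : V_FMA_F16_gfx9)
                   : (IsF32 ? V_MAD_F32 : V_MAD_F16);
    }

    int main() {
      assert(foldedOpcodeFor(V_FMAC_F16_e64) == V_FMA_F16_gfx9); // case added here
      assert(foldedOpcodeFor(V_MAC_F16_e64) == V_MAD_F16);
      assert(foldedOpcodeFor(V_FMAC_F32_e64) == V_FMA_F32);
      return 0;
    }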
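The m0 bookkeeping added to runOnMachineFunction enforces one invariant per basic block: remember the operand last copied into m0, erase a later foldable copy of an identical value, and forget the tracked value on any other clobber of m0. Below is a standalone sketch of that loop under simplified, assumed types; the Inst struct is hypothetical, not an LLVM type, and erasing an instruction is modeled by skipping it.

    // Models the CurrentKnownM0Val tracking with plain values instead of
    // MachineOperands.
    #include <cstdio>
    #include <optional>
    #include <vector>

    struct Inst {
      bool DefsM0;       // writes m0
      bool FoldableCopy; // is a simple "m0 = <value>" copy
      int Value;         // copied value (meaningful only for foldable copies)
    };

    int main() {
      std::vector<Inst> Block = {
          {true, true, 7},   // m0 = 7: start tracking 7
          {false, false, 0}, // unrelated instruction: tracking survives
          {true, true, 7},   // m0 = 7 again: redundant, erased
          {true, false, 0},  // unknown clobber of m0: forget the value
          {true, true, 7},   // m0 = 7: kept, value was no longer known
      };

      std::optional<int> KnownM0; // plays the role of CurrentKnownM0Val
      std::vector<Inst> Kept;
      for (const Inst &I : Block) {
        if (I.DefsM0 && I.FoldableCopy) {
          if (KnownM0 && *KnownM0 == I.Value)
            continue;        // identical redef: erase it
          KnownM0 = I.Value; // track the new known value
        } else if (I.DefsM0) {
          KnownM0.reset();   // unknown clobber: m0 contents unknown
        }
        Kept.push_back(I);
      }
      std::printf("kept %zu of %zu instructions\n", Kept.size(), Block.size());
      return 0;
    }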