author     Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
commit     e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree       599ab169a01f1c86eda9adc774edaedde2f2db5b /lib/Target/AMDGPU/SIFoldOperands.cpp
parent     1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Diffstat (limited to 'lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r--  lib/Target/AMDGPU/SIFoldOperands.cpp  363
1 file changed, 282 insertions, 81 deletions
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index f4e866958369..74d77d328019 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1,9 +1,8 @@
 //===-- SIFoldOperands.cpp - Fold operands ----------------------------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 /// \file
 //===----------------------------------------------------------------------===//
@@ -51,7 +50,7 @@ struct FoldCandidate {
     } else if (FoldOp->isFI()) {
       FrameIndexToFold = FoldOp->getIndex();
     } else {
-      assert(FoldOp->isReg());
+      assert(FoldOp->isReg() || FoldOp->isGlobal());
       OpToFold = FoldOp;
     }
   }
@@ -68,6 +67,8 @@ struct FoldCandidate {
     return Kind == MachineOperand::MO_Register;
   }

+  bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+
   bool isCommuted() const {
     return Commuted;
   }
@@ -88,10 +89,11 @@ public:
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
   const GCNSubtarget *ST;
+  const SIMachineFunctionInfo *MFI;

   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
-                   unsigned UseOpIdx,
+                   int UseOpIdx,
                    SmallVectorImpl<FoldCandidate> &FoldList,
                    SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;

@@ -160,19 +162,34 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
   }
 }

+// TODO: Add heuristic that the frame index might not fit in the addressing mode
+// immediate offset to avoid materializing in loops.
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+                              const MachineInstr &UseMI,
+                              int OpNo,
+                              const MachineOperand &OpToFold) {
+  return OpToFold.isFI() &&
+    (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+    OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
 FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }

 static bool updateOperand(FoldCandidate &Fold,
                           const SIInstrInfo &TII,
-                          const TargetRegisterInfo &TRI) {
+                          const TargetRegisterInfo &TRI,
+                          const GCNSubtarget &ST) {
   MachineInstr *MI = Fold.UseMI;
   MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
   assert(Old.isReg());

   if (Fold.isImm()) {
-    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
+    if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
+        !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
+        AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
+                                       ST.hasInv2PiInlineImm())) {
       // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
       // already set.
       unsigned Opcode = MI->getOpcode();
@@ -190,77 +207,94 @@ static bool updateOperand(FoldCandidate &Fold,
       unsigned Val = Mod.getImm();
       if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
         return false;
-      // If upper part is all zero we do not need op_sel_hi.
-      if (!isUInt<16>(Fold.ImmToFold)) {
-        if (!(Fold.ImmToFold & 0xffff)) {
-          Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+      // Only apply the following transformation if that operand requries
+      // a packed immediate.
+      switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
+      case AMDGPU::OPERAND_REG_IMM_V2FP16:
+      case AMDGPU::OPERAND_REG_IMM_V2INT16:
+      case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+      case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+        // If upper part is all zero we do not need op_sel_hi.
+        if (!isUInt<16>(Fold.ImmToFold)) {
+          if (!(Fold.ImmToFold & 0xffff)) {
+            Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+            Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+            Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+            return true;
+          }
           Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
-          Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+          Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
           return true;
         }
-        Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+        break;
+      default:
+        break;
       }
     }
+  }

-    if (Fold.needsShrink()) {
-      MachineBasicBlock *MBB = MI->getParent();
-      auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
-      if (Liveness != MachineBasicBlock::LQR_Dead)
-        return false;
-
-      MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-      int Op32 = Fold.getShrinkOpcode();
-      MachineOperand &Dst0 = MI->getOperand(0);
-      MachineOperand &Dst1 = MI->getOperand(1);
-      assert(Dst0.isDef() && Dst1.isDef());
-
-      bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
+  if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
+    MachineBasicBlock *MBB = MI->getParent();
+    auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
+    if (Liveness != MachineBasicBlock::LQR_Dead)
+      return false;

-      const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
-      unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
-      const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
-      unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
+    MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+    int Op32 = Fold.getShrinkOpcode();
+    MachineOperand &Dst0 = MI->getOperand(0);
+    MachineOperand &Dst1 = MI->getOperand(1);
+    assert(Dst0.isDef() && Dst1.isDef());

-      MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
+    bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());

-      if (HaveNonDbgCarryUse) {
-        BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
-          .addReg(AMDGPU::VCC, RegState::Kill);
-      }
+    const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
+    unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);

-      // Keep the old instruction around to avoid breaking iterators, but
-      // replace the outputs with dummy registers.
-      Dst0.setReg(NewReg0);
-      Dst1.setReg(NewReg1);
+    MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);

-      if (Fold.isCommuted())
-        TII.commuteInstruction(*Inst32, false);
-      return true;
+    if (HaveNonDbgCarryUse) {
+      BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
+        .addReg(AMDGPU::VCC, RegState::Kill);
     }

-    Old.ChangeToImmediate(Fold.ImmToFold);
+    // Keep the old instruction around to avoid breaking iterators, but
+    // replace it with a dummy instruction to remove uses.
+    //
+    // FIXME: We should not invert how this pass looks at operands to avoid
+    // this. Should track set of foldable movs instead of looking for uses
+    // when looking at a use.
+    Dst0.setReg(NewReg0);
+    for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
+      MI->RemoveOperand(I);
+    MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
+
+    if (Fold.isCommuted())
+      TII.commuteInstruction(*Inst32, false);
     return true;
   }

   assert(!Fold.needsShrink() && "not handled");

-  if (Fold.isFI()) {
-    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+  if (Fold.isImm()) {
+    Old.ChangeToImmediate(Fold.ImmToFold);
     return true;
   }

-  MachineOperand *New = Fold.OpToFold;
-  if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
-      TargetRegisterInfo::isVirtualRegister(New->getReg())) {
-    Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
-
-    Old.setIsUndef(New->isUndef());
+  if (Fold.isGlobal()) {
+    Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
+                   Fold.OpToFold->getTargetFlags());
     return true;
   }

-  // FIXME: Handle physical registers.
+  if (Fold.isFI()) {
+    Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+    return true;
+  }

-  return false;
+  MachineOperand *New = Fold.OpToFold;
+  Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+  Old.setIsUndef(New->isUndef());
+  return true;
 }

 static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
@@ -277,7 +311,6 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
                              MachineOperand *OpToFold,
                              const SIInstrInfo *TII) {
   if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-
     // Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
     unsigned Opc = MI->getOpcode();
     if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
@@ -344,7 +377,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
     if ((Opc == AMDGPU::V_ADD_I32_e64 ||
          Opc == AMDGPU::V_SUB_I32_e64 ||
          Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
-        OpToFold->isImm()) {
+        (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
       MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();

       // Verify the other operand is a VGPR, otherwise we would violate the
@@ -357,7 +390,10 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,

       assert(MI->getOperand(1).isDef());

-      int Op32 = AMDGPU::getVOPe32(Opc);
+      // Make sure to get the 32-bit version of the commuted opcode.
+      unsigned MaybeCommutedOpc = MI->getOpcode();
+      int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
+
       FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true, Op32));
       return true;
     }
@@ -384,10 +420,75 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
   //return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
 }

+static bool tryToFoldACImm(const SIInstrInfo *TII,
+                           const MachineOperand &OpToFold,
+                           MachineInstr *UseMI,
+                           unsigned UseOpIdx,
+                           SmallVectorImpl<FoldCandidate> &FoldList) {
+  const MCInstrDesc &Desc = UseMI->getDesc();
+  const MCOperandInfo *OpInfo = Desc.OpInfo;
+  if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
+    return false;
+
+  uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
+  if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
+      OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+    return false;
+
+  if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
+    UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
+    return true;
+  }
+
+  if (!OpToFold.isReg())
+    return false;
+
+  unsigned UseReg = OpToFold.getReg();
+  if (!TargetRegisterInfo::isVirtualRegister(UseReg))
+    return false;
+
+  if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
+        return FC.UseMI == UseMI; }) != FoldList.end())
+    return false;
+
+  MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
+  const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+  if (!Def || !Def->isRegSequence())
+    return false;
+
+  int64_t Imm;
+  MachineOperand *Op;
+  for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
+    const MachineOperand &Sub = Def->getOperand(I);
+    if (!Sub.isReg() || Sub.getSubReg())
+      return false;
+    MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
+    while (SubDef && !SubDef->isMoveImmediate() &&
+           !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
+      SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
+    if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
+      return false;
+    Op = &SubDef->getOperand(1);
+    auto SubImm = Op->getImm();
+    if (I == 1) {
+      if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
+        return false;
+
+      Imm = SubImm;
+      continue;
+    }
+    if (Imm != SubImm)
+      return false; // Can only fold splat constants
+  }
+
+  FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
+  return true;
+}
+
 void SIFoldOperands::foldOperand(
   MachineOperand &OpToFold,
   MachineInstr *UseMI,
-  unsigned UseOpIdx,
+  int UseOpIdx,
   SmallVectorImpl<FoldCandidate> &FoldList,
   SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
   const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
@@ -420,11 +521,18 @@ void SIFoldOperands::foldOperand(
     unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
     unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();

+    MachineRegisterInfo::use_iterator Next;
     for (MachineRegisterInfo::use_iterator
            RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
-         RSUse != RSE; ++RSUse) {
+         RSUse != RSE; RSUse = Next) {
+      Next = std::next(RSUse);

       MachineInstr *RSUseMI = RSUse->getParent();
+
+      if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
+                         RSUse.getOperandNo(), FoldList))
+        continue;
+
       if (RSUse->getSubReg() != RegSeqDstSubReg)
         continue;
@@ -435,10 +543,32 @@ void SIFoldOperands::foldOperand(
     return;
   }

+  if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
+    return;

-  bool FoldingImm = OpToFold.isImm();
+  if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+    // Sanity check that this is a stack access.
+    // FIXME: Should probably use stack pseudos before frame lowering.
+    MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+    if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+                           SOff->getReg() != MFI->getStackPtrOffsetReg()))
+      return;
+
+    if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+        MFI->getScratchRSrcReg())
+      return;

-  if (FoldingImm && UseMI->isCopy()) {
+    // A frame index will resolve to a positive constant, so it should always be
+    // safe to fold the addressing mode, even pre-GFX9.
+    UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+    SOff->setReg(MFI->getStackPtrOffsetReg());
+    return;
+  }
+
+  bool FoldingImmLike =
+      OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
+
+  if (FoldingImmLike && UseMI->isCopy()) {
     unsigned DestReg = UseMI->getOperand(0).getReg();
     const TargetRegisterClass *DestRC
       = TargetRegisterInfo::isVirtualRegister(DestReg) ?
@@ -449,7 +579,7 @@ void SIFoldOperands::foldOperand(
     if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
         TargetRegisterInfo::isVirtualRegister(SrcReg)) {
       const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
-      if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
+      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
         MachineRegisterInfo::use_iterator NextUse;
         SmallVector<FoldCandidate, 4> CopyUses;
         for (MachineRegisterInfo::use_iterator
@@ -467,6 +597,14 @@ void SIFoldOperands::foldOperand(
       }
     }

+    if (DestRC == &AMDGPU::AGPR_32RegClass &&
+        TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+      UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+      UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+      CopiesToReplace.push_back(UseMI);
+      return;
+    }
+
     // In order to fold immediates into copies, we need to change the
     // copy to a MOV.
@@ -479,18 +617,71 @@ void SIFoldOperands::foldOperand(
   } else {
     if (UseMI->isCopy() && OpToFold.isReg() &&
         TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
-        TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
-        TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
-        TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
+        TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
+        TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
         !UseMI->getOperand(1).getSubReg()) {
+      unsigned Size = TII->getOpSize(*UseMI, 1);
       UseMI->getOperand(1).setReg(OpToFold.getReg());
       UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
       UseMI->getOperand(1).setIsKill(false);
       CopiesToReplace.push_back(UseMI);
       OpToFold.setIsKill(false);
+      if (Size != 4)
+        return;
+      if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+          TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
+        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+      else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+               TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
+        UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
       return;
     }

+    unsigned UseOpc = UseMI->getOpcode();
+    if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
+        (UseOpc == AMDGPU::V_READLANE_B32 &&
+         (int)UseOpIdx ==
+         AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
+      // %vgpr = V_MOV_B32 imm
+      // %sgpr = V_READFIRSTLANE_B32 %vgpr
+      // =>
+      // %sgpr = S_MOV_B32 imm
+      if (FoldingImmLike) {
+        if (execMayBeModifiedBeforeUse(*MRI,
+                                       UseMI->getOperand(UseOpIdx).getReg(),
+                                       *OpToFold.getParent(),
+                                       *UseMI))
+          return;
+
+        UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
+
+        // FIXME: ChangeToImmediate should clear subreg
+        UseMI->getOperand(1).setSubReg(0);
+        if (OpToFold.isImm())
+          UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+        else
+          UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
+        UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
+        return;
+      }
+
+      if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
+        if (execMayBeModifiedBeforeUse(*MRI,
+                                       UseMI->getOperand(UseOpIdx).getReg(),
+                                       *OpToFold.getParent(),
+                                       *UseMI))
+          return;
+
+        // %vgpr = COPY %sgpr0
+        // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
+        // =>
+        // %sgpr1 = COPY %sgpr0
+        UseMI->setDesc(TII->get(AMDGPU::COPY));
+        UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
+        return;
+      }
+    }
+
     const MCInstrDesc &UseDesc = UseMI->getDesc();

     // Don't fold into target independent nodes. Target independent opcodes
@@ -501,7 +692,7 @@ void SIFoldOperands::foldOperand(
     return;
   }

-  if (!FoldingImm) {
+  if (!FoldingImmLike) {
     tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);

     // FIXME: We could try to change the instruction from 64-bit to 32-bit
@@ -515,14 +706,10 @@ void SIFoldOperands::foldOperand(
   const TargetRegisterClass *FoldRC =
     TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);

-
   // Split 64-bit constants into 32-bits for folding.
   if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
     unsigned UseReg = UseOp.getReg();
-    const TargetRegisterClass *UseRC
-      = TargetRegisterInfo::isVirtualRegister(UseReg) ?
-      MRI->getRegClass(UseReg) :
-      TRI->getPhysRegClass(UseReg);
+    const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);

     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
       return;
@@ -763,14 +950,23 @@ static bool tryFoldInst(const SIInstrInfo *TII,
       Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
     const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
     const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
-    if (Src1->isIdenticalTo(*Src0)) {
+    int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+    int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+    if (Src1->isIdenticalTo(*Src0) &&
+        (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
+        (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
       LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
+      auto &NewDesc =
+          TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
       int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
       if (Src2Idx != -1)
         MI->RemoveOperand(Src2Idx);
       MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
-      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
-                                               : getMovOpc(false)));
+      if (Src1ModIdx != -1)
+        MI->RemoveOperand(Src1ModIdx);
+      if (Src0ModIdx != -1)
+        MI->RemoveOperand(Src0ModIdx);
+      mutateCopyOp(*MI, NewDesc);
       LLVM_DEBUG(dbgs() << *MI << '\n');
       return true;
     }
@@ -788,7 +984,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
   SmallVector<FoldCandidate, 4> FoldList;
   MachineOperand &Dst = MI.getOperand(0);

-  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+  bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
   if (FoldingImm) {
     unsigned NumLiteralUses = 0;
     MachineOperand *NonInlineUse = nullptr;
@@ -840,6 +1036,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
       // in some cases. A better heuristic is needed.
       if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
         foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+      } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+        foldOperand(OpToFold, UseMI, OpNo, FoldList,
+                    CopiesToReplace);
       } else {
         if (++NumLiteralUses == 1) {
           NonInlineUse = &*Use;
@@ -874,7 +1073,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
       Copy->addImplicitDefUseOperands(*MF);

   for (FoldCandidate &Fold : FoldList) {
-    if (updateOperand(Fold, *TII, *TRI)) {
+    if (updateOperand(Fold, *TII, *TRI, *ST)) {
       // Clear kill flags.
       if (Fold.isReg()) {
         assert(Fold.OpToFold && Fold.OpToFold->isReg());
@@ -926,7 +1125,8 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {

     // Having a 0 op_sel_hi would require swizzling the output in the source
     // instruction, which we can't do.
-    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
+    unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
+                                                      : 0u;
     if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
       return nullptr;
     return Src0;
@@ -1105,13 +1305,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
-
-  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MFI = MF.getInfo<SIMachineFunctionInfo>();

   // omod is ignored by hardware if IEEE bit is enabled. omod also does not
   // correctly handle signed zeros.
   //
-  bool IsIEEEMode = ST->enableIEEEBit(MF);
+  // FIXME: Also need to check strictfp
+  bool IsIEEEMode = MFI->getMode().IEEE;
   bool HasNSZ = MFI->hasNoSignedZerosFPMath();

   for (MachineBasicBlock *MBB : depth_first(&MF)) {
@@ -1132,7 +1332,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       }

       MachineOperand &OpToFold = MI.getOperand(1);
-      bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+      bool FoldingImm =
+          OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();

       // FIXME: We could also be folding things like TargetIndexes.
       if (!FoldingImm && !OpToFold.isReg())
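The op_sel/op_sel_hi rewrite in the updateOperand() hunk above is the least obvious part of this change, so here is a small standalone C++ sketch of just that decision. It is not LLVM code: the OP_SEL_* bit values and the PackedImmFold struct are illustrative stand-ins, and the real pass only reaches this point after the extra guards added in the patch (isInlinableLiteralV216() and the check that op_sel is not already set).

// Standalone sketch (not the LLVM API) of the packed-immediate half selection
// performed for V2INT16/V2FP16 operands in updateOperand() above.
#include <cstdint>
#include <cstdio>

namespace {

// Illustrative stand-ins for SISrcMods::OP_SEL_0 / OP_SEL_1; the bit
// positions here are made up for the example.
constexpr unsigned OP_SEL_0 = 1u << 2;
constexpr unsigned OP_SEL_1 = 1u << 3;

struct PackedImmFold {
  uint32_t EncodedImm; // 16-bit literal actually written into the operand
  unsigned SrcMods;    // updated source-modifier bits
  bool Rewritten;      // false -> the 32-bit literal is folded unchanged
};

// Mirrors the branch structure of the hunk: only a literal that does not fit
// in 16 bits needs its modifiers rewritten; otherwise it is folded as-is.
PackedImmFold foldPackedImm(uint32_t Imm, unsigned SrcMods) {
  if (Imm > 0xffffu) {
    if ((Imm & 0xffffu) == 0) {
      // Only the high half is populated: select it via op_sel and clear
      // op_sel_hi, encoding just the upper 16 bits.
      SrcMods |= OP_SEL_0;
      SrcMods &= ~OP_SEL_1;
      return {(Imm >> 16) & 0xffffu, SrcMods, true};
    }
    // Low half is populated: keep it and clear op_sel_hi.
    SrcMods &= ~OP_SEL_1;
    return {Imm & 0xffffu, SrcMods, true};
  }
  // Fits in 16 bits: leave the modifiers alone.
  return {Imm, SrcMods, false};
}

} // namespace

int main() {
  const unsigned DefaultMods = OP_SEL_1; // op_sel_hi set, op_sel clear
  for (uint32_t Imm : {0x00000040u, 0x00400000u, 0x00400040u}) {
    PackedImmFold F = foldPackedImm(Imm, DefaultMods);
    std::printf("imm=0x%08x -> encoded=0x%04x op_sel=%d op_sel_hi=%d rewritten=%d\n",
                Imm, F.EncodedImm, (F.SrcMods & OP_SEL_0) != 0,
                (F.SrcMods & OP_SEL_1) != 0, F.Rewritten);
  }
  return 0;
}

The sample inputs exercise the three paths: a literal that fits in 16 bits, one with only the high half set, and one with both halves set, printing which 16-bit half would be encoded and how the modifier bits change.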