author     Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2019-08-20 20:50:12 +0000
commit     e6d1592492a3a379186bfb02bd0f4eda0669c0d5 (patch)
tree       599ab169a01f1c86eda9adc774edaedde2f2db5b    /lib/Target/AMDGPU/SIFoldOperands.cpp
parent     1a56a5ead7a2e84bee8240f5f6b033b5f1707154 (diff)
Diffstat (limited to 'lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r--    lib/Target/AMDGPU/SIFoldOperands.cpp    363
1 file changed, 282 insertions(+), 81 deletions(-)
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index f4e866958369..74d77d328019 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1,9 +1,8 @@
//===-- SIFoldOperands.cpp - Fold operands --- ----------------------------===//
//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
/// \file
//===----------------------------------------------------------------------===//
@@ -51,7 +50,7 @@ struct FoldCandidate {
} else if (FoldOp->isFI()) {
FrameIndexToFold = FoldOp->getIndex();
} else {
- assert(FoldOp->isReg());
+ assert(FoldOp->isReg() || FoldOp->isGlobal());
OpToFold = FoldOp;
}
}
@@ -68,6 +67,8 @@ struct FoldCandidate {
return Kind == MachineOperand::MO_Register;
}
+ bool isGlobal() const { return Kind == MachineOperand::MO_GlobalAddress; }
+
bool isCommuted() const {
return Commuted;
}
@@ -88,10 +89,11 @@ public:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
const GCNSubtarget *ST;
+ const SIMachineFunctionInfo *MFI;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
- unsigned UseOpIdx,
+ int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const;
@@ -160,19 +162,34 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII,
}
}
+// TODO: Add heuristic that the frame index might not fit in the addressing mode
+// immediate offset to avoid materializing in loops.
+static bool frameIndexMayFold(const SIInstrInfo *TII,
+ const MachineInstr &UseMI,
+ int OpNo,
+ const MachineOperand &OpToFold) {
+ return OpToFold.isFI() &&
+ (TII->isMUBUF(UseMI) || TII->isFLATScratch(UseMI)) &&
+ OpNo == AMDGPU::getNamedOperandIdx(UseMI.getOpcode(), AMDGPU::OpName::vaddr);
+}
+
FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
static bool updateOperand(FoldCandidate &Fold,
const SIInstrInfo &TII,
- const TargetRegisterInfo &TRI) {
+ const TargetRegisterInfo &TRI,
+ const GCNSubtarget &ST) {
MachineInstr *MI = Fold.UseMI;
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
assert(Old.isReg());
if (Fold.isImm()) {
- if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked) {
+ if (MI->getDesc().TSFlags & SIInstrFlags::IsPacked &&
+ !(MI->getDesc().TSFlags & SIInstrFlags::IsMAI) &&
+ AMDGPU::isInlinableLiteralV216(static_cast<uint16_t>(Fold.ImmToFold),
+ ST.hasInv2PiInlineImm())) {
// Set op_sel/op_sel_hi on this operand or bail out if op_sel is
// already set.
unsigned Opcode = MI->getOpcode();
@@ -190,77 +207,94 @@ static bool updateOperand(FoldCandidate &Fold,
unsigned Val = Mod.getImm();
if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
return false;
- // If upper part is all zero we do not need op_sel_hi.
- if (!isUInt<16>(Fold.ImmToFold)) {
- if (!(Fold.ImmToFold & 0xffff)) {
- Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ // Only apply the following transformation if that operand requires
+ // a packed immediate.
+ switch (TII.get(Opcode).OpInfo[OpNo].OperandType) {
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ // If upper part is all zero we do not need op_sel_hi.
+ if (!isUInt<16>(Fold.ImmToFold)) {
+ if (!(Fold.ImmToFold & 0xffff)) {
+ Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ return true;
+ }
Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
return true;
}
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ break;
+ default:
+ break;
}
}
+ }
- if (Fold.needsShrink()) {
- MachineBasicBlock *MBB = MI->getParent();
- auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
- if (Liveness != MachineBasicBlock::LQR_Dead)
- return false;
-
- MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
- int Op32 = Fold.getShrinkOpcode();
- MachineOperand &Dst0 = MI->getOperand(0);
- MachineOperand &Dst1 = MI->getOperand(1);
- assert(Dst0.isDef() && Dst1.isDef());
-
- bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
+ if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
+ MachineBasicBlock *MBB = MI->getParent();
+ auto Liveness = MBB->computeRegisterLiveness(&TRI, AMDGPU::VCC, MI);
+ if (Liveness != MachineBasicBlock::LQR_Dead)
+ return false;
- const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
- unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
- const TargetRegisterClass *Dst1RC = MRI.getRegClass(Dst1.getReg());
- unsigned NewReg1 = MRI.createVirtualRegister(Dst1RC);
+ MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ int Op32 = Fold.getShrinkOpcode();
+ MachineOperand &Dst0 = MI->getOperand(0);
+ MachineOperand &Dst1 = MI->getOperand(1);
+ assert(Dst0.isDef() && Dst1.isDef());
- MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
+ bool HaveNonDbgCarryUse = !MRI.use_nodbg_empty(Dst1.getReg());
- if (HaveNonDbgCarryUse) {
- BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
- .addReg(AMDGPU::VCC, RegState::Kill);
- }
+ const TargetRegisterClass *Dst0RC = MRI.getRegClass(Dst0.getReg());
+ unsigned NewReg0 = MRI.createVirtualRegister(Dst0RC);
- // Keep the old instruction around to avoid breaking iterators, but
- // replace the outputs with dummy registers.
- Dst0.setReg(NewReg0);
- Dst1.setReg(NewReg1);
+ MachineInstr *Inst32 = TII.buildShrunkInst(*MI, Op32);
- if (Fold.isCommuted())
- TII.commuteInstruction(*Inst32, false);
- return true;
+ if (HaveNonDbgCarryUse) {
+ BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::COPY), Dst1.getReg())
+ .addReg(AMDGPU::VCC, RegState::Kill);
}
- Old.ChangeToImmediate(Fold.ImmToFold);
+ // Keep the old instruction around to avoid breaking iterators, but
+ // replace it with a dummy instruction to remove uses.
+ //
+ // FIXME: We should not invert how this pass looks at operands to avoid
+ // this. Should track set of foldable movs instead of looking for uses
+ // when looking at a use.
+ Dst0.setReg(NewReg0);
+ for (unsigned I = MI->getNumOperands() - 1; I > 0; --I)
+ MI->RemoveOperand(I);
+ MI->setDesc(TII.get(AMDGPU::IMPLICIT_DEF));
+
+ if (Fold.isCommuted())
+ TII.commuteInstruction(*Inst32, false);
return true;
}
assert(!Fold.needsShrink() && "not handled");
- if (Fold.isFI()) {
- Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+ if (Fold.isImm()) {
+ Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
- MachineOperand *New = Fold.OpToFold;
- if (TargetRegisterInfo::isVirtualRegister(Old.getReg()) &&
- TargetRegisterInfo::isVirtualRegister(New->getReg())) {
- Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
-
- Old.setIsUndef(New->isUndef());
+ if (Fold.isGlobal()) {
+ Old.ChangeToGA(Fold.OpToFold->getGlobal(), Fold.OpToFold->getOffset(),
+ Fold.OpToFold->getTargetFlags());
return true;
}
- // FIXME: Handle physical registers.
+ if (Fold.isFI()) {
+ Old.ChangeToFrameIndex(Fold.FrameIndexToFold);
+ return true;
+ }
- return false;
+ MachineOperand *New = Fold.OpToFold;
+ Old.substVirtReg(New->getReg(), New->getSubReg(), TRI);
+ Old.setIsUndef(New->isUndef());
+ return true;
}
static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
@@ -277,7 +311,6 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineOperand *OpToFold,
const SIInstrInfo *TII) {
if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
-
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
unsigned Opc = MI->getOpcode();
if ((Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
@@ -344,7 +377,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
if ((Opc == AMDGPU::V_ADD_I32_e64 ||
Opc == AMDGPU::V_SUB_I32_e64 ||
Opc == AMDGPU::V_SUBREV_I32_e64) && // FIXME
- OpToFold->isImm()) {
+ (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
// Verify the other operand is a VGPR, otherwise we would violate the
@@ -357,7 +390,10 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
assert(MI->getOperand(1).isDef());
- int Op32 = AMDGPU::getVOPe32(Opc);
+ // Make sure to get the 32-bit version of the commuted opcode.
+ unsigned MaybeCommutedOpc = MI->getOpcode();
+ int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
+
FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
Op32));
return true;
@@ -384,10 +420,75 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
+static bool tryToFoldACImm(const SIInstrInfo *TII,
+ const MachineOperand &OpToFold,
+ MachineInstr *UseMI,
+ unsigned UseOpIdx,
+ SmallVectorImpl<FoldCandidate> &FoldList) {
+ const MCInstrDesc &Desc = UseMI->getDesc();
+ const MCOperandInfo *OpInfo = Desc.OpInfo;
+ if (!OpInfo || UseOpIdx >= Desc.getNumOperands())
+ return false;
+
+ uint8_t OpTy = OpInfo[UseOpIdx].OperandType;
+ if (OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
+ OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST)
+ return false;
+
+ if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy)) {
+ UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
+ return true;
+ }
+
+ if (!OpToFold.isReg())
+ return false;
+
+ unsigned UseReg = OpToFold.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(UseReg))
+ return false;
+
+ if (llvm::find_if(FoldList, [UseMI](const FoldCandidate &FC) {
+ return FC.UseMI == UseMI; }) != FoldList.end())
+ return false;
+
+ MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
+ const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+ if (!Def || !Def->isRegSequence())
+ return false;
+
+ int64_t Imm;
+ MachineOperand *Op;
+ for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
+ const MachineOperand &Sub = Def->getOperand(I);
+ if (!Sub.isReg() || Sub.getSubReg())
+ return false;
+ MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
+ while (SubDef && !SubDef->isMoveImmediate() &&
+ !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
+ SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
+ if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
+ return false;
+ Op = &SubDef->getOperand(1);
+ auto SubImm = Op->getImm();
+ if (I == 1) {
+ if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
+ return false;
+
+ Imm = SubImm;
+ continue;
+ }
+ if (Imm != SubImm)
+ return false; // Can only fold splat constants
+ }
+
+ FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
+ return true;
+}
+
void SIFoldOperands::foldOperand(
MachineOperand &OpToFold,
MachineInstr *UseMI,
- unsigned UseOpIdx,
+ int UseOpIdx,
SmallVectorImpl<FoldCandidate> &FoldList,
SmallVectorImpl<MachineInstr *> &CopiesToReplace) const {
const MachineOperand &UseOp = UseMI->getOperand(UseOpIdx);
@@ -420,11 +521,18 @@ void SIFoldOperands::foldOperand(
unsigned RegSeqDstReg = UseMI->getOperand(0).getReg();
unsigned RegSeqDstSubReg = UseMI->getOperand(UseOpIdx + 1).getImm();
+ MachineRegisterInfo::use_iterator Next;
for (MachineRegisterInfo::use_iterator
RSUse = MRI->use_begin(RegSeqDstReg), RSE = MRI->use_end();
- RSUse != RSE; ++RSUse) {
+ RSUse != RSE; RSUse = Next) {
+ Next = std::next(RSUse);
MachineInstr *RSUseMI = RSUse->getParent();
+
+ if (tryToFoldACImm(TII, UseMI->getOperand(0), RSUseMI,
+ RSUse.getOperandNo(), FoldList))
+ continue;
+
if (RSUse->getSubReg() != RegSeqDstSubReg)
continue;
@@ -435,10 +543,32 @@ void SIFoldOperands::foldOperand(
return;
}
+ if (tryToFoldACImm(TII, OpToFold, UseMI, UseOpIdx, FoldList))
+ return;
- bool FoldingImm = OpToFold.isImm();
+ if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
+ // Sanity check that this is a stack access.
+ // FIXME: Should probably use stack pseudos before frame lowering.
+ MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+ if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
+ SOff->getReg() != MFI->getStackPtrOffsetReg()))
+ return;
+
+ if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
+ MFI->getScratchRSrcReg())
+ return;
- if (FoldingImm && UseMI->isCopy()) {
+ // A frame index will resolve to a positive constant, so it should always be
+ // safe to fold the addressing mode, even pre-GFX9.
+ UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
+ SOff->setReg(MFI->getStackPtrOffsetReg());
+ return;
+ }
+
+ bool FoldingImmLike =
+ OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
+
+ if (FoldingImmLike && UseMI->isCopy()) {
unsigned DestReg = UseMI->getOperand(0).getReg();
const TargetRegisterClass *DestRC
= TargetRegisterInfo::isVirtualRegister(DestReg) ?
@@ -449,7 +579,7 @@ void SIFoldOperands::foldOperand(
if (TargetRegisterInfo::isVirtualRegister(DestReg) &&
TargetRegisterInfo::isVirtualRegister(SrcReg)) {
const TargetRegisterClass * SrcRC = MRI->getRegClass(SrcReg);
- if (TRI->isSGPRClass(SrcRC) && TRI->hasVGPRs(DestRC)) {
+ if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
MachineRegisterInfo::use_iterator NextUse;
SmallVector<FoldCandidate, 4> CopyUses;
for (MachineRegisterInfo::use_iterator
@@ -467,6 +597,14 @@ void SIFoldOperands::foldOperand(
}
}
+ if (DestRC == &AMDGPU::AGPR_32RegClass &&
+ TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+ UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ CopiesToReplace.push_back(UseMI);
+ return;
+ }
+
// In order to fold immediates into copies, we need to change the
// copy to a MOV.
@@ -479,18 +617,71 @@ void SIFoldOperands::foldOperand(
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
- TargetRegisterInfo::isVirtualRegister(UseMI->getOperand(1).getReg()) &&
- TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
- TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()) &&
+ TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
!UseMI->getOperand(1).getSubReg()) {
+ unsigned Size = TII->getOpSize(*UseMI, 1);
UseMI->getOperand(1).setReg(OpToFold.getReg());
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
UseMI->getOperand(1).setIsKill(false);
CopiesToReplace.push_back(UseMI);
OpToFold.setIsKill(false);
+ if (Size != 4)
+ return;
+ if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isVGPR(*MRI, UseMI->getOperand(1).getReg()))
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32));
+ else if (TRI->isVGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ TRI->isAGPR(*MRI, UseMI->getOperand(1).getReg()))
+ UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_READ_B32));
return;
}
+ unsigned UseOpc = UseMI->getOpcode();
+ if (UseOpc == AMDGPU::V_READFIRSTLANE_B32 ||
+ (UseOpc == AMDGPU::V_READLANE_B32 &&
+ (int)UseOpIdx ==
+ AMDGPU::getNamedOperandIdx(UseOpc, AMDGPU::OpName::src0))) {
+ // %vgpr = V_MOV_B32 imm
+ // %sgpr = V_READFIRSTLANE_B32 %vgpr
+ // =>
+ // %sgpr = S_MOV_B32 imm
+ if (FoldingImmLike) {
+ if (execMayBeModifiedBeforeUse(*MRI,
+ UseMI->getOperand(UseOpIdx).getReg(),
+ *OpToFold.getParent(),
+ *UseMI))
+ return;
+
+ UseMI->setDesc(TII->get(AMDGPU::S_MOV_B32));
+
+ // FIXME: ChangeToImmediate should clear subreg
+ UseMI->getOperand(1).setSubReg(0);
+ if (OpToFold.isImm())
+ UseMI->getOperand(1).ChangeToImmediate(OpToFold.getImm());
+ else
+ UseMI->getOperand(1).ChangeToFrameIndex(OpToFold.getIndex());
+ UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
+ return;
+ }
+
+ if (OpToFold.isReg() && TRI->isSGPRReg(*MRI, OpToFold.getReg())) {
+ if (execMayBeModifiedBeforeUse(*MRI,
+ UseMI->getOperand(UseOpIdx).getReg(),
+ *OpToFold.getParent(),
+ *UseMI))
+ return;
+
+ // %vgpr = COPY %sgpr0
+ // %sgpr1 = V_READFIRSTLANE_B32 %vgpr
+ // =>
+ // %sgpr1 = COPY %sgpr0
+ UseMI->setDesc(TII->get(AMDGPU::COPY));
+ UseMI->RemoveOperand(2); // Remove exec read (or src1 for readlane)
+ return;
+ }
+ }
+
const MCInstrDesc &UseDesc = UseMI->getDesc();
// Don't fold into target independent nodes. Target independent opcodes
@@ -501,7 +692,7 @@ void SIFoldOperands::foldOperand(
return;
}
- if (!FoldingImm) {
+ if (!FoldingImmLike) {
tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
// FIXME: We could try to change the instruction from 64-bit to 32-bit
@@ -515,14 +706,10 @@ void SIFoldOperands::foldOperand(
const TargetRegisterClass *FoldRC =
TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
-
// Split 64-bit constants into 32-bits for folding.
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
unsigned UseReg = UseOp.getReg();
- const TargetRegisterClass *UseRC
- = TargetRegisterInfo::isVirtualRegister(UseReg) ?
- MRI->getRegClass(UseReg) :
- TRI->getPhysRegClass(UseReg);
+ const TargetRegisterClass *UseRC = MRI->getRegClass(UseReg);
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
return;
@@ -763,14 +950,23 @@ static bool tryFoldInst(const SIInstrInfo *TII,
Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
- if (Src1->isIdenticalTo(*Src0)) {
+ int Src1ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
+ int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+ if (Src1->isIdenticalTo(*Src0) &&
+ (Src1ModIdx == -1 || !MI->getOperand(Src1ModIdx).getImm()) &&
+ (Src0ModIdx == -1 || !MI->getOperand(Src0ModIdx).getImm())) {
LLVM_DEBUG(dbgs() << "Folded " << *MI << " into ");
+ auto &NewDesc =
+ TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY : getMovOpc(false));
int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
if (Src2Idx != -1)
MI->RemoveOperand(Src2Idx);
MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
- mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
- : getMovOpc(false)));
+ if (Src1ModIdx != -1)
+ MI->RemoveOperand(Src1ModIdx);
+ if (Src0ModIdx != -1)
+ MI->RemoveOperand(Src0ModIdx);
+ mutateCopyOp(*MI, NewDesc);
LLVM_DEBUG(dbgs() << *MI << '\n');
return true;
}
@@ -788,7 +984,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
SmallVector<FoldCandidate, 4> FoldList;
MachineOperand &Dst = MI.getOperand(0);
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+ bool FoldingImm = OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
if (FoldingImm) {
unsigned NumLiteralUses = 0;
MachineOperand *NonInlineUse = nullptr;
@@ -840,6 +1036,9 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
// in some cases. A better heuristic is needed.
if (isInlineConstantIfFolded(TII, *UseMI, OpNo, OpToFold)) {
foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace);
+ } else if (frameIndexMayFold(TII, *UseMI, OpNo, OpToFold)) {
+ foldOperand(OpToFold, UseMI, OpNo, FoldList,
+ CopiesToReplace);
} else {
if (++NumLiteralUses == 1) {
NonInlineUse = &*Use;
@@ -874,7 +1073,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
Copy->addImplicitDefUseOperands(*MF);
for (FoldCandidate &Fold : FoldList) {
- if (updateOperand(Fold, *TII, *TRI)) {
+ if (updateOperand(Fold, *TII, *TRI, *ST)) {
// Clear kill flags.
if (Fold.isReg()) {
assert(Fold.OpToFold && Fold.OpToFold->isReg());
@@ -926,7 +1125,8 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
// Having a 0 op_sel_hi would require swizzling the output in the source
// instruction, which we can't do.
- unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1 : 0;
+ unsigned UnsetMods = (Op == AMDGPU::V_PK_MAX_F16) ? SISrcMods::OP_SEL_1
+ : 0u;
if (Src0Mods != UnsetMods && Src1Mods != UnsetMods)
return nullptr;
return Src0;
@@ -1105,13 +1305,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
ST = &MF.getSubtarget<GCNSubtarget>();
TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
-
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MFI = MF.getInfo<SIMachineFunctionInfo>();
// omod is ignored by hardware if IEEE bit is enabled. omod also does not
// correctly handle signed zeros.
//
- bool IsIEEEMode = ST->enableIEEEBit(MF);
+ // FIXME: Also need to check strictfp
+ bool IsIEEEMode = MFI->getMode().IEEE;
bool HasNSZ = MFI->hasNoSignedZerosFPMath();
for (MachineBasicBlock *MBB : depth_first(&MF)) {
@@ -1132,7 +1332,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
}
MachineOperand &OpToFold = MI.getOperand(1);
- bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
+ bool FoldingImm =
+ OpToFold.isImm() || OpToFold.isFI() || OpToFold.isGlobal();
// FIXME: We could also be folding things like TargetIndexes.
if (!FoldingImm && !OpToFold.isReg())