Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 235
1 file changed, 184 insertions(+), 51 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 4eac03168760..2ff8baf29394 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -14,7 +14,7 @@
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -312,6 +312,19 @@ static bool isUseMIInFoldList(ArrayRef<FoldCandidate> FoldList,
return false;
}
+static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
+ MachineInstr *MI, unsigned OpNo,
+ MachineOperand *FoldOp, bool Commuted = false,
+ int ShrinkOp = -1) {
+ // Skip additional folding on the same operand.
+ for (FoldCandidate &Fold : FoldList)
+ if (Fold.UseMI == MI && Fold.UseOpNo == OpNo)
+ return;
+ LLVM_DEBUG(dbgs() << "Append " << (Commuted ? "commuted" : "normal")
+ << " operand " << OpNo << "\n " << *MI << '\n');
+ FoldList.push_back(FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp));
+}
+
static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold,
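
The appendFoldCandidate helper introduced above centralizes the FoldList.push_back calls and drops a candidate when the same (UseMI, OpNo) operand is already queued. Below is a minimal standalone sketch of that dedup-on-append pattern, using simplified stand-in types rather than the real FoldCandidate/MachineInstr API:

    // Sketch only: toy types modelling the dedup key (use instruction, operand
    // number) that appendFoldCandidate checks before appending.
    #include <cstdio>
    #include <vector>

    struct Candidate {          // hypothetical stand-in for FoldCandidate
      int UseInstrId;           // stands in for the MachineInstr pointer
      unsigned UseOpNo;
      int Value;
      bool Commuted;
    };

    static void appendCandidate(std::vector<Candidate> &List, int InstrId,
                                unsigned OpNo, int Value, bool Commuted = false) {
      for (const Candidate &C : List)
        if (C.UseInstrId == InstrId && C.UseOpNo == OpNo)
          return;               // skip additional folding on the same operand
      List.push_back({InstrId, OpNo, Value, Commuted});
    }

    int main() {
      std::vector<Candidate> List;
      appendCandidate(List, /*InstrId=*/7, /*OpNo=*/1, 42);
      appendCandidate(List, 7, 1, 13);   // dropped: operand 1 already queued
      appendCandidate(List, 7, 2, 13);   // kept: different operand
      std::printf("%zu candidates\n", List.size());   // prints "2 candidates"
      return 0;
    }
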
@@ -344,7 +357,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Special case for s_setreg_b32
if (Opc == AMDGPU::S_SETREG_B32 && OpToFold->isImm()) {
MI->setDesc(TII->get(AMDGPU::S_SETREG_IMM32_B32));
- FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+ appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
return true;
}
@@ -403,8 +416,7 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
unsigned MaybeCommutedOpc = MI->getOpcode();
int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
- FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true,
- Op32));
+ appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
return true;
}
@@ -412,11 +424,34 @@ static bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
return false;
}
- FoldList.push_back(FoldCandidate(MI, CommuteOpNo, OpToFold, true));
+ appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
return true;
}
- FoldList.push_back(FoldCandidate(MI, OpNo, OpToFold));
+ // Check the case where we might introduce a second constant operand to a
+ // scalar instruction
+ if (TII->isSALU(MI->getOpcode())) {
+ const MCInstrDesc &InstDesc = MI->getDesc();
+ const MCOperandInfo &OpInfo = InstDesc.OpInfo[OpNo];
+ const SIRegisterInfo &SRI = TII->getRegisterInfo();
+
+ // Fine if the operand can be encoded as an inline constant
+ if (OpToFold->isImm()) {
+ if (!SRI.opCanUseInlineConstant(OpInfo.OperandType) ||
+ !TII->isInlineConstant(*OpToFold, OpInfo)) {
+ // Otherwise check for another constant
+ for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
+ auto &Op = MI->getOperand(i);
+ if (OpNo != i &&
+ TII->isLiteralConstantLike(Op, OpInfo)) {
+ return false;
+ }
+ }
+ }
+ }
+ }
+
+ appendFoldCandidate(FoldList, MI, OpNo, OpToFold);
return true;
}
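
The new SALU check above guards against introducing a second literal: a scalar ALU instruction can encode at most one 32-bit literal constant, so a non-inline immediate may only be folded if no other operand already needs the literal slot. A simplified, self-contained sketch of that rule follows; the inline-constant range below is an illustrative stand-in for the real opCanUseInlineConstant/isInlineConstant queries:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    static bool isInlineConstant(int64_t Imm) {
      return Imm >= -16 && Imm <= 64;   // simplified integer inline range
    }

    // Operands of a hypothetical scalar instruction.
    struct Operand { bool IsImm; int64_t Imm; };

    static bool canFoldImm(const std::vector<Operand> &Ops, unsigned OpNo,
                           int64_t NewImm) {
      if (isInlineConstant(NewImm))
        return true;                    // inline constants are always fine
      for (unsigned I = 0, E = Ops.size(); I != E; ++I)
        if (I != OpNo && Ops[I].IsImm && !isInlineConstant(Ops[I].Imm))
          return false;                 // another operand already uses the literal
      return true;
    }

    int main() {
      std::vector<Operand> Ops = {{false, 0}, {true, 0x12345}};  // op1 is a literal
      std::printf("%d\n", canFoldImm(Ops, 0, 7));        // 1: inline constant
      std::printf("%d\n", canFoldImm(Ops, 0, 0x99999));  // 0: would need 2 literals
      return 0;
    }
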
@@ -429,6 +464,42 @@ static bool isUseSafeToFold(const SIInstrInfo *TII,
//return !MI.hasRegisterImplicitUseOperand(UseMO.getReg());
}
+// Find a def of the UseReg, check if it is a reg_sequence and find initializers
+// for each subreg, tracking it to foldable inline immediate if possible.
+// Returns true on success.
+static bool getRegSeqInit(
+ SmallVectorImpl<std::pair<MachineOperand*, unsigned>> &Defs,
+ Register UseReg, uint8_t OpTy,
+ const SIInstrInfo *TII, const MachineRegisterInfo &MRI) {
+ MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
+ if (!Def || !Def->isRegSequence())
+ return false;
+
+ for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
+ MachineOperand *Sub = &Def->getOperand(I);
+ assert (Sub->isReg());
+
+ for (MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub->getReg());
+ SubDef && Sub->isReg() && !Sub->getSubReg() &&
+ TII->isFoldableCopy(*SubDef);
+ SubDef = MRI.getUniqueVRegDef(Sub->getReg())) {
+ MachineOperand *Op = &SubDef->getOperand(1);
+ if (Op->isImm()) {
+ if (TII->isInlineConstant(*Op, OpTy))
+ Sub = Op;
+ break;
+ }
+ if (!Op->isReg())
+ break;
+ Sub = Op;
+ }
+
+ Defs.push_back(std::make_pair(Sub, Def->getOperand(I + 1).getImm()));
+ }
+
+ return true;
+}
+
static bool tryToFoldACImm(const SIInstrInfo *TII,
const MachineOperand &OpToFold,
MachineInstr *UseMI,
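
getRegSeqInit walks the (value, subreg-index) operand pairs of a REG_SEQUENCE def and chases each value through foldable copies until it bottoms out in an immediate or an opaque register. A toy sketch of that walk, with illustrative data structures in place of MachineRegisterInfo and MachineInstr:

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <utility>
    #include <vector>

    struct Def {               // a toy def: either an immediate or a copy of Src
      bool IsImm;
      int64_t Imm;
      int Src;                 // valid when !IsImm; -1 means "not a foldable copy"
    };

    using DefMap = std::map<int, Def>;      // vreg id -> its single def
    using Init = std::pair<bool, int64_t>;  // (isImm, value-or-vreg)

    static std::vector<Init> collectInits(const std::vector<int> &SeqElems,
                                          const DefMap &Defs) {
      std::vector<Init> Out;
      for (int Reg : SeqElems) {
        int Cur = Reg;
        Init I{false, Cur};
        while (true) {
          auto It = Defs.find(Cur);
          if (It == Defs.end())
            break;                                      // unknown def: keep the reg
          if (It->second.IsImm) { I = {true, It->second.Imm}; break; }
          if (It->second.Src < 0)
            break;                                      // not a foldable copy
          Cur = It->second.Src;
          I = {false, Cur};
        }
        Out.push_back(I);
      }
      return Out;
    }

    int main() {
      // %1 = 42; %2 = COPY %1; REG_SEQUENCE %2, %3  -> inits: imm 42, reg %3
      DefMap Defs = {{1, {true, 42, -1}}, {2, {false, 0, 1}}};
      for (Init I : collectInits({2, 3}, Defs))
        std::printf(I.first ? "imm %lld\n" : "reg %%%lld\n", (long long)I.second);
      return 0;
    }
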
@@ -462,39 +533,30 @@ static bool tryToFoldACImm(const SIInstrInfo *TII,
return false;
MachineRegisterInfo &MRI = UseMI->getParent()->getParent()->getRegInfo();
- const MachineInstr *Def = MRI.getUniqueVRegDef(UseReg);
- if (!Def || !Def->isRegSequence())
+ SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+ if (!getRegSeqInit(Defs, UseReg, OpTy, TII, MRI))
return false;
- int64_t Imm;
- MachineOperand *Op;
- for (unsigned I = 1, E = Def->getNumExplicitOperands(); I < E; I += 2) {
- const MachineOperand &Sub = Def->getOperand(I);
- if (!Sub.isReg() || Sub.getSubReg())
- return false;
- MachineInstr *SubDef = MRI.getUniqueVRegDef(Sub.getReg());
- while (SubDef && !SubDef->isMoveImmediate() &&
- !SubDef->getOperand(1).isImm() && TII->isFoldableCopy(*SubDef))
- SubDef = MRI.getUniqueVRegDef(SubDef->getOperand(1).getReg());
- if (!SubDef || !SubDef->isMoveImmediate() || !SubDef->getOperand(1).isImm())
+ int32_t Imm;
+ for (unsigned I = 0, E = Defs.size(); I != E; ++I) {
+ const MachineOperand *Op = Defs[I].first;
+ if (!Op->isImm())
return false;
- Op = &SubDef->getOperand(1);
+
auto SubImm = Op->getImm();
- if (I == 1) {
- if (!TII->isInlineConstant(SubDef->getOperand(1), OpTy))
+ if (!I) {
+ Imm = SubImm;
+ if (!TII->isInlineConstant(*Op, OpTy) ||
+ !TII->isOperandLegal(*UseMI, UseOpIdx, Op))
return false;
- Imm = SubImm;
continue;
}
if (Imm != SubImm)
return false; // Can only fold splat constants
}
- if (!TII->isOperandLegal(*UseMI, UseOpIdx, Op))
- return false;
-
- FoldList.push_back(FoldCandidate(UseMI, UseOpIdx, Op));
+ appendFoldCandidate(FoldList, UseMI, UseOpIdx, Defs[0].first);
return true;
}
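
With the initializers collected by getRegSeqInit, tryToFoldACImm now only folds a "splat" reg_sequence: the first element fixes the immediate and every later element must match it. A standalone sketch of that splat test, with the legality and inline-constant checks of the real code elided:

    #include <cstdint>
    #include <cstdio>
    #include <optional>
    #include <vector>

    static std::optional<int64_t> getSplatImm(const std::vector<int64_t> &Elems) {
      std::optional<int64_t> Splat;
      for (int64_t V : Elems) {
        if (!Splat)
          Splat = V;                   // first element fixes the expected value
        else if (*Splat != V)
          return std::nullopt;         // can only fold splat constants
      }
      return Splat;
    }

    int main() {
      std::vector<int64_t> A = {1, 1, 1, 1}, B = {1, 2, 1, 1};
      std::printf("%d %d\n", (bool)getSplatImm(A), (bool)getSplatImm(B));  // 1 0
      return 0;
    }
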
@@ -513,18 +575,6 @@ void SIFoldOperands::foldOperand(
if (UseOp.isReg() && OpToFold.isReg()) {
if (UseOp.isImplicit() || UseOp.getSubReg() != AMDGPU::NoSubRegister)
return;
-
- // Don't fold subregister extracts into tied operands, only if it is a full
- // copy since a subregister use tied to a full register def doesn't really
- // make sense. e.g. don't fold:
- //
- // %1 = COPY %0:sub1
- // %2<tied3> = V_MAC_{F16, F32} %3, %4, %1<tied0>
- //
- // into
- // %2<tied3> = V_MAC_{F16, F32} %3, %4, %0:sub1<tied0>
- if (UseOp.isTied() && OpToFold.getSubReg() != AMDGPU::NoSubRegister)
- return;
}
// Special case for REG_SEQUENCE: We can't fold literals into
@@ -639,16 +689,97 @@ void SIFoldOperands::foldOperand(
CopiesToReplace.push_back(UseMI);
} else {
if (UseMI->isCopy() && OpToFold.isReg() &&
- Register::isVirtualRegister(UseMI->getOperand(0).getReg()) &&
- TRI->isVectorRegister(*MRI, UseMI->getOperand(0).getReg()) &&
- TRI->isVectorRegister(*MRI, UseMI->getOperand(1).getReg()) &&
+ UseMI->getOperand(0).getReg().isVirtual() &&
!UseMI->getOperand(1).getSubReg()) {
+ LLVM_DEBUG(dbgs() << "Folding " << OpToFold
+ << "\n into " << *UseMI << '\n');
unsigned Size = TII->getOpSize(*UseMI, 1);
- UseMI->getOperand(1).setReg(OpToFold.getReg());
+ Register UseReg = OpToFold.getReg();
+ UseMI->getOperand(1).setReg(UseReg);
UseMI->getOperand(1).setSubReg(OpToFold.getSubReg());
UseMI->getOperand(1).setIsKill(false);
CopiesToReplace.push_back(UseMI);
OpToFold.setIsKill(false);
+
+ // That is very tricky to store a value into an AGPR. v_accvgpr_write_b32
+ // can only accept VGPR or inline immediate. Recreate a reg_sequence with
+ // its initializers right here, so we will rematerialize immediates and
+ // avoid copies via different reg classes.
+ SmallVector<std::pair<MachineOperand*, unsigned>, 32> Defs;
+ if (Size > 4 && TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
+ getRegSeqInit(Defs, UseReg, AMDGPU::OPERAND_REG_INLINE_C_INT32, TII,
+ *MRI)) {
+ const DebugLoc &DL = UseMI->getDebugLoc();
+ MachineBasicBlock &MBB = *UseMI->getParent();
+
+ UseMI->setDesc(TII->get(AMDGPU::REG_SEQUENCE));
+ for (unsigned I = UseMI->getNumOperands() - 1; I > 0; --I)
+ UseMI->RemoveOperand(I);
+
+ MachineInstrBuilder B(*MBB.getParent(), UseMI);
+ DenseMap<TargetInstrInfo::RegSubRegPair, Register> VGPRCopies;
+ SmallSetVector<TargetInstrInfo::RegSubRegPair, 32> SeenAGPRs;
+ for (unsigned I = 0; I < Size / 4; ++I) {
+ MachineOperand *Def = Defs[I].first;
+ TargetInstrInfo::RegSubRegPair CopyToVGPR;
+ if (Def->isImm() &&
+ TII->isInlineConstant(*Def, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
+ int64_t Imm = Def->getImm();
+
+ auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+ BuildMI(MBB, UseMI, DL,
+ TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addImm(Imm);
+ B.addReg(Tmp);
+ } else if (Def->isReg() && TRI->isAGPR(*MRI, Def->getReg())) {
+ auto Src = getRegSubRegPair(*Def);
+ Def->setIsKill(false);
+ if (!SeenAGPRs.insert(Src)) {
+ // We cannot build a reg_sequence out of the same registers, they
+ // must be copied. Better do it here before copyPhysReg() created
+ // several reads to do the AGPR->VGPR->AGPR copy.
+ CopyToVGPR = Src;
+ } else {
+ B.addReg(Src.Reg, Def->isUndef() ? RegState::Undef : 0,
+ Src.SubReg);
+ }
+ } else {
+ assert(Def->isReg());
+ Def->setIsKill(false);
+ auto Src = getRegSubRegPair(*Def);
+
+ // Direct copy from SGPR to AGPR is not possible. To avoid creation
+ // of exploded copies SGPR->VGPR->AGPR in the copyPhysReg() later,
+ // create a copy here and track if we already have such a copy.
+ if (TRI->isSGPRReg(*MRI, Src.Reg)) {
+ CopyToVGPR = Src;
+ } else {
+ auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+ BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Tmp).add(*Def);
+ B.addReg(Tmp);
+ }
+ }
+
+ if (CopyToVGPR.Reg) {
+ Register Vgpr;
+ if (VGPRCopies.count(CopyToVGPR)) {
+ Vgpr = VGPRCopies[CopyToVGPR];
+ } else {
+ Vgpr = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ BuildMI(MBB, UseMI, DL, TII->get(AMDGPU::COPY), Vgpr).add(*Def);
+ VGPRCopies[CopyToVGPR] = Vgpr;
+ }
+ auto Tmp = MRI->createVirtualRegister(&AMDGPU::AGPR_32RegClass);
+ BuildMI(MBB, UseMI, DL,
+ TII->get(AMDGPU::V_ACCVGPR_WRITE_B32), Tmp).addReg(Vgpr);
+ B.addReg(Tmp);
+ }
+
+ B.addImm(Defs[I].second);
+ }
+ LLVM_DEBUG(dbgs() << "Folded " << *UseMI << '\n');
+ return;
+ }
+
if (Size != 4)
return;
if (TRI->isAGPR(*MRI, UseMI->getOperand(0).getReg()) &&
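
A key detail of the reg_sequence rebuild above is the VGPRCopies map: SGPR sources and repeated AGPR sources must be routed through an intermediate VGPR, and the pass creates that VGPR copy once per distinct (reg, subreg) source and reuses it, rather than letting copyPhysReg later expand a fresh SGPR->VGPR->AGPR chain per lane. A toy sketch of that memoization; names are illustrative, not the MachineInstrBuilder API:

    #include <cstdio>
    #include <map>
    #include <utility>

    using RegSubReg = std::pair<int, int>;   // stand-in for RegSubRegPair

    struct CopyCache {
      std::map<RegSubReg, int> VGPRCopies;   // source -> already-created VGPR
      int NextVReg = 100;
      int Emitted = 0;

      int getVGPRFor(RegSubReg Src) {
        auto It = VGPRCopies.find(Src);
        if (It != VGPRCopies.end())
          return It->second;                 // reuse the existing copy
        int Vgpr = NextVReg++;
        ++Emitted;                           // "emit" one SGPR->VGPR copy
        VGPRCopies[Src] = Vgpr;
        return Vgpr;
      }
    };

    int main() {
      CopyCache C;
      // Four lanes of a reg_sequence all read the same SGPR: only one copy made.
      for (int Lane = 0; Lane < 4; ++Lane)
        C.getVGPRFor({5, 0});
      std::printf("copies emitted: %d\n", C.Emitted);   // prints 1
      return 0;
    }
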
@@ -1099,6 +1230,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
Copy->addImplicitDefUseOperands(*MF);
for (FoldCandidate &Fold : FoldList) {
+ assert(!Fold.isReg() || Fold.OpToFold);
if (Fold.isReg() && Register::isVirtualRegister(Fold.OpToFold->getReg())) {
Register Reg = Fold.OpToFold->getReg();
MachineInstr *DefMI = Fold.OpToFold->getParent();
@@ -1249,8 +1381,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_e64: {
// If output denormals are enabled, omod is ignored.
- if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
- (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
+ if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32Denormals) ||
+ (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16Denormals))
return std::make_pair(nullptr, SIOutMods::NONE);
const MachineOperand *RegOp = nullptr;
@@ -1279,8 +1411,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64: {
// If output denormals are enabled, omod is ignored.
- if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
- (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
+ if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32Denormals) ||
+ (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16Denormals))
return std::make_pair(nullptr, SIOutMods::NONE);
// Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
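
Both omod hunks replace the subtarget-wide hasFP32Denormals/hasFP16Denormals queries with the per-function FP mode held by SIMachineFunctionInfo; omod is still skipped whenever output denormals are enabled. A small sketch of that per-function gating, with illustrative field names standing in for the real mode structure:

    #include <cstdio>

    struct FPMode { bool FP32Denormals; bool FP64FP16Denormals; };

    enum class OpKind { F32, F16 };

    static bool omodIgnored(OpKind K, const FPMode &Mode) {
      // If output denormals are enabled, the output modifier is ignored by HW.
      return (K == OpKind::F32 && Mode.FP32Denormals) ||
             (K == OpKind::F16 && Mode.FP64FP16Denormals);
    }

    int main() {
      FPMode DenormOn{true, false}, DenormOff{false, false};
      std::printf("%d %d\n", omodIgnored(OpKind::F32, DenormOn),
                  omodIgnored(OpKind::F32, DenormOff));   // prints "1 0"
      return 0;
    }
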
@@ -1358,15 +1490,16 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
tryFoldInst(TII, &MI);
if (!TII->isFoldableCopy(MI)) {
+ // Saw an unknown clobber of m0, so we no longer know what it is.
+ if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
+ CurrentKnownM0Val = nullptr;
+
// TODO: Omod might be OK if there is NSZ only on the source
// instruction, and not the omod multiply.
if (IsIEEEMode || (!HasNSZ && !MI.getFlag(MachineInstr::FmNsz)) ||
!tryFoldOMod(MI))
tryFoldClamp(MI);
- // Saw an unknown clobber of m0, so we no longer know what it is.
- if (CurrentKnownM0Val && MI.modifiesRegister(AMDGPU::M0, TRI))
- CurrentKnownM0Val = nullptr;
continue;
}
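
The m0 hunk moves the "unknown clobber" invalidation ahead of the omod/clamp folding in the non-foldable-copy path, so the cached m0 value is forgotten as soon as a clobbering instruction is seen. A toy sketch of that forget-on-clobber tracking pattern, using stand-in types:

    #include <cstdio>
    #include <optional>
    #include <vector>

    struct Inst { bool WritesM0; bool IsImmMoveToM0; int Imm; };

    int main() {
      std::optional<int> KnownM0;          // stands in for CurrentKnownM0Val
      std::vector<Inst> Block = {
          {true, true, 7},                 // s_mov_b32 m0, 7
          {false, false, 0},               // unrelated instruction
          {true, false, 0},                // unknown clobber of m0
      };
      for (const Inst &I : Block) {
        if (I.IsImmMoveToM0)
          KnownM0 = I.Imm;                 // learn the value
        else if (I.WritesM0)
          KnownM0.reset();                 // saw an unknown clobber: forget it
      }
      std::printf("m0 known: %d\n", KnownM0.has_value());   // prints 0
      return 0;
    }
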
@@ -1409,5 +1542,5 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
foldInstOperand(MI, OpToFold);
}
}
- return false;
+ return true;
}