Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIFoldOperands.cpp')
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 310
 1 file changed, 191 insertions(+), 119 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 9f1d6038f1b6..709de612d81d 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -80,6 +80,10 @@ public:
bool updateOperand(FoldCandidate &Fold) const;
+ bool canUseImmWithOpSel(FoldCandidate &Fold) const;
+
+ bool tryFoldImmWithOpSel(FoldCandidate &Fold) const;
+
bool tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold) const;
@@ -196,60 +200,85 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
-bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const {
MachineInstr *MI = Fold.UseMI;
MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
- assert(Old.isReg());
+ const uint64_t TSFlags = MI->getDesc().TSFlags;
+ assert(Old.isReg() && Fold.isImm());
- const uint64_t TSFlags = MI->getDesc().TSFlags;
- if (Fold.isImm()) {
- if (TSFlags & SIInstrFlags::IsPacked && !(TSFlags & SIInstrFlags::IsMAI) &&
- (!ST->hasDOTOpSelHazard() || !(TSFlags & SIInstrFlags::IsDOT)) &&
- AMDGPU::isFoldableLiteralV216(Fold.ImmToFold,
- ST->hasInv2PiInlineImm())) {
- // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
- // already set.
- unsigned Opcode = MI->getOpcode();
- int OpNo = MI->getOperandNo(&Old);
- int ModIdx = -1;
- if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
- ModIdx = AMDGPU::OpName::src0_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
- ModIdx = AMDGPU::OpName::src1_modifiers;
- else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
- ModIdx = AMDGPU::OpName::src2_modifiers;
- assert(ModIdx != -1);
- ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
- MachineOperand &Mod = MI->getOperand(ModIdx);
- unsigned Val = Mod.getImm();
- if (!(Val & SISrcMods::OP_SEL_0) && (Val & SISrcMods::OP_SEL_1)) {
- // Only apply the following transformation if that operand requires
- // a packed immediate.
- switch (TII->get(Opcode).operands()[OpNo].OperandType) {
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_IMM_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- // If upper part is all zero we do not need op_sel_hi.
- if (!isUInt<16>(Fold.ImmToFold)) {
- if (!(Fold.ImmToFold & 0xffff)) {
- Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
- return true;
- }
- Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
- Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
- return true;
- }
- break;
- default:
- break;
- }
- }
- }
+ if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) ||
+ (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT)) ||
+ isUInt<16>(Fold.ImmToFold) ||
+ !AMDGPU::isFoldableLiteralV216(Fold.ImmToFold, ST->hasInv2PiInlineImm()))
+ return false;
+
+ unsigned Opcode = MI->getOpcode();
+ int OpNo = MI->getOperandNo(&Old);
+ uint8_t OpType = TII->get(Opcode).operands()[OpNo].OperandType;
+ switch (OpType) {
+ default:
+ return false;
+ case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ break;
+ }
+
+ return true;
+}
+
+bool SIFoldOperands::tryFoldImmWithOpSel(FoldCandidate &Fold) const {
+ MachineInstr *MI = Fold.UseMI;
+ MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+ unsigned Opcode = MI->getOpcode();
+ int OpNo = MI->getOperandNo(&Old);
+
+ // Set op_sel/op_sel_hi on this operand or bail out if op_sel is
+ // already set.
+ int ModIdx = -1;
+ if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0))
+ ModIdx = AMDGPU::OpName::src0_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1))
+ ModIdx = AMDGPU::OpName::src1_modifiers;
+ else if (OpNo == AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2))
+ ModIdx = AMDGPU::OpName::src2_modifiers;
+ assert(ModIdx != -1);
+ ModIdx = AMDGPU::getNamedOperandIdx(Opcode, ModIdx);
+ MachineOperand &Mod = MI->getOperand(ModIdx);
+ unsigned Val = Mod.getImm();
+ if ((Val & SISrcMods::OP_SEL_0) || !(Val & SISrcMods::OP_SEL_1))
+ return false;
+
+ // Fold one 16-bit half of the immediate and adjust op_sel/op_sel_hi to
+ // match: take the high half if the low half is all zero, else the low half.
+ if (!(Fold.ImmToFold & 0xffff)) {
+ MachineOperand New =
+ MachineOperand::CreateImm((Fold.ImmToFold >> 16) & 0xffff);
+ if (!TII->isOperandLegal(*MI, OpNo, &New))
+ return false;
+ Mod.setImm(Mod.getImm() | SISrcMods::OP_SEL_0);
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate((Fold.ImmToFold >> 16) & 0xffff);
+ return true;
}
+ MachineOperand New = MachineOperand::CreateImm(Fold.ImmToFold & 0xffff);
+ if (!TII->isOperandLegal(*MI, OpNo, &New))
+ return false;
+ Mod.setImm(Mod.getImm() & ~SISrcMods::OP_SEL_1);
+ Old.ChangeToImmediate(Fold.ImmToFold & 0xffff);
+ return true;
+}
+
+bool SIFoldOperands::updateOperand(FoldCandidate &Fold) const {
+ MachineInstr *MI = Fold.UseMI;
+ MachineOperand &Old = MI->getOperand(Fold.UseOpNo);
+ assert(Old.isReg());
+
+ if (Fold.isImm() && canUseImmWithOpSel(Fold))
+ return tryFoldImmWithOpSel(Fold);
if ((Fold.isImm() || Fold.isFI() || Fold.isGlobal()) && Fold.needsShrink()) {
MachineBasicBlock *MBB = MI->getParent();
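
The net effect of this split: canUseImmWithOpSel now carries all of the legality checks (packed V2 operand type, no MAI/DOT-hazard cases, a foldable V216 literal whose upper half is nonzero), while tryFoldImmWithOpSel performs only the rewrite, which boils down to a small bit manipulation on the src modifiers plus a 16-bit immediate. Below is a standalone sketch of that rewrite; it is not LLVM code, and the SISrcMods bit positions are taken from SIDefines.h, so treat them as an assumption here.

    #include <cstdint>
    #include <cstdio>
    #include <optional>

    // Assumed SISrcMods bit positions (per SIDefines.h).
    constexpr unsigned OP_SEL_0 = 1u << 2;
    constexpr unsigned OP_SEL_1 = 1u << 3;

    struct Split {
      uint16_t Imm;  // 16-bit literal that replaces the 32-bit packed one
      unsigned Mods; // updated src modifiers
    };

    // Mirrors the bit updates in tryFoldImmWithOpSel. Mods must be in the
    // default state the pass checks for: OP_SEL_0 clear, OP_SEL_1 set. The
    // canUseImmWithOpSel legality checks are assumed to have passed.
    std::optional<Split> splitPackedImm(uint32_t Imm, unsigned Mods) {
      if ((Mods & OP_SEL_0) || !(Mods & OP_SEL_1))
        return std::nullopt; // op_sel already customized: bail out
      if ((Imm & 0xffffu) == 0) // low half all zero: keep the high half
        return Split{uint16_t(Imm >> 16), (Mods | OP_SEL_0) & ~OP_SEL_1};
      return Split{uint16_t(Imm & 0xffffu), Mods & ~OP_SEL_1};
    }

    int main() {
      // 0x3c000000 packs {hi = 1.0h, lo = 0.0h}: only the high half is kept.
      if (auto S = splitPackedImm(0x3c000000u, OP_SEL_1))
        std::printf("imm=0x%04x mods=0x%x\n", S->Imm, S->Mods);
    }
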
@@ -345,9 +374,50 @@ static void appendFoldCandidate(SmallVectorImpl<FoldCandidate> &FoldList,
bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MachineInstr *MI, unsigned OpNo,
MachineOperand *OpToFold) const {
- if (!TII->isOperandLegal(*MI, OpNo, OpToFold)) {
+ const unsigned Opc = MI->getOpcode();
+
+ auto tryToFoldAsFMAAKorMK = [&]() {
+ if (!OpToFold->isImm())
+ return false;
+
+ const bool TryAK = OpNo == 3;
+ const unsigned NewOpc = TryAK ? AMDGPU::S_FMAAK_F32 : AMDGPU::S_FMAMK_F32;
+ MI->setDesc(TII->get(NewOpc));
+
+ // We have to fold into the operand that will become the Imm, not into OpNo.
+ bool FoldAsFMAAKorMK =
+ tryAddToFoldList(FoldList, MI, TryAK ? 3 : 2, OpToFold);
+ if (FoldAsFMAAKorMK) {
+ // Untie Src2 of fmac.
+ MI->untieRegOperand(3);
+ // For fmamk swap operands 1 and 2 if OpToFold was meant for operand 1.
+ if (OpNo == 1) {
+ MachineOperand &Op1 = MI->getOperand(1);
+ MachineOperand &Op2 = MI->getOperand(2);
+ Register OldReg = Op1.getReg();
+ // Operand 2 might be an inlinable constant
+ if (Op2.isImm()) {
+ Op1.ChangeToImmediate(Op2.getImm());
+ Op2.ChangeToRegister(OldReg, false);
+ } else {
+ Op1.setReg(Op2.getReg());
+ Op2.setReg(OldReg);
+ }
+ }
+ return true;
+ }
+ MI->setDesc(TII->get(Opc));
+ return false;
+ };
+
+ bool IsLegal = TII->isOperandLegal(*MI, OpNo, OpToFold);
+ if (!IsLegal && OpToFold->isImm()) {
+ FoldCandidate Fold(MI, OpNo, OpToFold);
+ IsLegal = canUseImmWithOpSel(Fold);
+ }
+
+ if (!IsLegal) {
// Special case for v_mac_{f16, f32}_e64 if we are trying to fold into src2
- unsigned Opc = MI->getOpcode();
unsigned NewOpc = macToMad(Opc);
if (NewOpc != AMDGPU::INSTRUCTION_LIST_END) {
// Check if changing this to a v_mad_{f16, f32} instruction will allow us
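
The new tryToFoldAsFMAAKorMK lambda in the hunk above deserves a worked example. It retags S_FMAC_F32 so a literal can live in the K slot: folds into the addend (OpNo 3) become S_FMAAK_F32, folds into a multiplicand become S_FMAMK_F32, Src2 is untied, and operands 1 and 2 are swapped when the literal was aimed at operand 1. The toy model below is not the MachineInstr API, and the operand layouts in the comments are a reading of the SOP2 definitions, so treat them as assumptions.

    #include <array>
    #include <cstdio>
    #include <string>
    #include <utility>

    // Assumed layouts:
    //   s_fmac_f32  d, s0, s1, s2 : d = s0*s1 + s2   (d tied to s2)
    //   s_fmamk_f32 d, s0, K,  s1 : d = s0*K  + s1
    std::array<std::string, 4> foldSrc0AsFMAMK(std::array<std::string, 4> Ops,
                                               const std::string &K) {
      // The pass queues the fold into slot 2 (fmamk's literal slot) and,
      // because the literal was aimed at slot 1, swaps slots 1 and 2 so the
      // surviving register multiplicand lands in src0. Commutativity of the
      // multiply is what makes the swap sound.
      std::swap(Ops[1], Ops[2]);
      Ops[2] = K; // the queued fold is applied later; modeled eagerly here
      return Ops;
    }

    int main() {
      // s_fmac_f32 %d, %a, %b, %c with K folding into %a
      //   ==> s_fmamk_f32 %d, %b, K, %c
      auto Ops = foldSrc0AsFMAMK({"%d", "%a", "%b", "%c"}, "0x42f60000");
      std::printf("%s, %s, %s, %s\n", Ops[0].c_str(), Ops[1].c_str(),
                  Ops[2].c_str(), Ops[3].c_str());
    }
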
@@ -367,6 +437,13 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
MI->setDesc(TII->get(Opc));
}
+ // Special case for s_fmac_f32 if we are trying to fold into Src2.
+ // By transforming into fmaak we can untie Src2 and make folding legal.
+ if (Opc == AMDGPU::S_FMAC_F32 && OpNo == 3) {
+ if (tryToFoldAsFMAAKorMK())
+ return true;
+ }
+
// Special case for s_setreg_b32
if (OpToFold->isImm()) {
unsigned ImmOpc = 0;
@@ -387,66 +464,72 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
if (isUseMIInFoldList(FoldList, MI))
return false;
- unsigned CommuteOpNo = OpNo;
-
// Operand is not legal, so try to commute the instruction to
// see if this makes it possible to fold.
- unsigned CommuteIdx0 = TargetInstrInfo::CommuteAnyOperandIndex;
- unsigned CommuteIdx1 = TargetInstrInfo::CommuteAnyOperandIndex;
- bool CanCommute = TII->findCommutedOpIndices(*MI, CommuteIdx0, CommuteIdx1);
-
- if (CanCommute) {
- if (CommuteIdx0 == OpNo)
- CommuteOpNo = CommuteIdx1;
- else if (CommuteIdx1 == OpNo)
- CommuteOpNo = CommuteIdx0;
- }
-
+ unsigned CommuteOpNo = TargetInstrInfo::CommuteAnyOperandIndex;
+ bool CanCommute = TII->findCommutedOpIndices(*MI, OpNo, CommuteOpNo);
+ if (!CanCommute)
+ return false;
// One of operands might be an Imm operand, and OpNo may refer to it after
// the call of commuteInstruction() below. Such situations are avoided
// here explicitly as OpNo must be a register operand to be a candidate
// for memory folding.
- if (CanCommute && (!MI->getOperand(CommuteIdx0).isReg() ||
- !MI->getOperand(CommuteIdx1).isReg()))
+ if (!MI->getOperand(OpNo).isReg() || !MI->getOperand(CommuteOpNo).isReg())
return false;
- if (!CanCommute ||
- !TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1))
+ if (!TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo))
return false;
+ int Op32 = -1;
if (!TII->isOperandLegal(*MI, CommuteOpNo, OpToFold)) {
- if ((Opc == AMDGPU::V_ADD_CO_U32_e64 ||
- Opc == AMDGPU::V_SUB_CO_U32_e64 ||
- Opc == AMDGPU::V_SUBREV_CO_U32_e64) && // FIXME
- (OpToFold->isImm() || OpToFold->isFI() || OpToFold->isGlobal())) {
-
- // Verify the other operand is a VGPR, otherwise we would violate the
- // constant bus restriction.
- unsigned OtherIdx = CommuteOpNo == CommuteIdx0 ? CommuteIdx1 : CommuteIdx0;
- MachineOperand &OtherOp = MI->getOperand(OtherIdx);
- if (!OtherOp.isReg() ||
- !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
- return false;
-
- assert(MI->getOperand(1).isDef());
+ if ((Opc != AMDGPU::V_ADD_CO_U32_e64 && Opc != AMDGPU::V_SUB_CO_U32_e64 &&
+ Opc != AMDGPU::V_SUBREV_CO_U32_e64) || // FIXME
+ (!OpToFold->isImm() && !OpToFold->isFI() && !OpToFold->isGlobal())) {
+ TII->commuteInstruction(*MI, false, OpNo, CommuteOpNo);
+ return false;
+ }
- // Make sure to get the 32-bit version of the commuted opcode.
- unsigned MaybeCommutedOpc = MI->getOpcode();
- int Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
+ // Verify the other operand is a VGPR, otherwise we would violate the
+ // constant bus restriction.
+ MachineOperand &OtherOp = MI->getOperand(OpNo);
+ if (!OtherOp.isReg() ||
+ !TII->getRegisterInfo().isVGPR(*MRI, OtherOp.getReg()))
+ return false;
- appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
- return true;
- }
+ assert(MI->getOperand(1).isDef());
- TII->commuteInstruction(*MI, false, CommuteIdx0, CommuteIdx1);
- return false;
+ // Make sure to get the 32-bit version of the commuted opcode.
+ unsigned MaybeCommutedOpc = MI->getOpcode();
+ Op32 = AMDGPU::getVOPe32(MaybeCommutedOpc);
}
- appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true);
+ appendFoldCandidate(FoldList, MI, CommuteOpNo, OpToFold, true, Op32);
return true;
}
+ // An inlinable constant might already have been folded into the Imm
+ // operand of fmaak or fmamk while we are now trying to fold a
+ // non-inlinable constant.
+ if ((Opc == AMDGPU::S_FMAAK_F32 || Opc == AMDGPU::S_FMAMK_F32) &&
+ !OpToFold->isReg() && !TII->isInlineConstant(*OpToFold)) {
+ unsigned ImmIdx = Opc == AMDGPU::S_FMAAK_F32 ? 3 : 2;
+ MachineOperand &OpImm = MI->getOperand(ImmIdx);
+ if (!OpImm.isReg() &&
+ TII->isInlineConstant(*MI, MI->getOperand(OpNo), OpImm))
+ return tryToFoldAsFMAAKorMK();
+ }
+
+ // Special case for s_fmac_f32 if we are trying to fold into Src0 or Src1.
+ // By changing into fmamk we can untie Src2.
+ // If the fold into Src0 happens first and Src0 is identical to Src1, avoid
+ // transforming into fmamk, which requires commuting, as that would make the
+ // later fold into Src1 fail due to the wrong OpNo being used.
+ if (Opc == AMDGPU::S_FMAC_F32 &&
+ (OpNo != 1 || !MI->getOperand(1).isIdenticalTo(MI->getOperand(2)))) {
+ if (tryToFoldAsFMAAKorMK())
+ return true;
+ }
+
// Check the case where we might introduce a second constant operand to a
// scalar instruction
if (TII->isSALU(MI->getOpcode())) {
@@ -458,7 +541,8 @@ bool SIFoldOperands::tryAddToFoldList(SmallVectorImpl<FoldCandidate> &FoldList,
// Otherwise check for another constant
for (unsigned i = 0, e = InstDesc.getNumOperands(); i != e; ++i) {
auto &Op = MI->getOperand(i);
- if (OpNo != i && !Op.isReg() && !TII->isInlineConstant(Op, OpInfo))
+ if (OpNo != i && !Op.isReg() &&
+ !TII->isInlineConstant(Op, InstDesc.operands()[i]))
return false;
}
}
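
The tail of this hunk keeps the SALU single-literal rule: after the commute-based attempts, a fold that would introduce a non-inline constant is rejected if any other operand already holds one. A condensed model follows, with the inline-constant test simplified (the real check also accepts a handful of FP bit patterns such as 0.5 and 1.0):

    #include <cstdint>
    #include <vector>

    // Assumed inline range for AMDGPU integer operands: -16..64.
    bool isInlineConstantImm(int64_t Imm) { return Imm >= -16 && Imm <= 64; }

    struct Opnd {
      bool IsReg;
      int64_t Imm; // meaningful only when !IsReg
    };

    // A SALU encoding can carry at most one 32-bit literal, so folding one
    // in is legal only if every other non-register operand is inline.
    bool canIntroduceLiteral(const std::vector<Opnd> &Ops, unsigned FoldOpNo) {
      for (unsigned I = 0, E = Ops.size(); I != E; ++I)
        if (I != FoldOpNo && !Ops[I].IsReg && !isInlineConstantImm(Ops[I].Imm))
          return false; // a second literal would not encode
      return true;
    }
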
@@ -516,13 +600,10 @@ bool SIFoldOperands::tryToFoldACImm(
if (UseOpIdx >= Desc.getNumOperands())
return false;
- uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
- if ((OpTy < AMDGPU::OPERAND_REG_INLINE_AC_FIRST ||
- OpTy > AMDGPU::OPERAND_REG_INLINE_AC_LAST) &&
- (OpTy < AMDGPU::OPERAND_REG_INLINE_C_FIRST ||
- OpTy > AMDGPU::OPERAND_REG_INLINE_C_LAST))
+ if (!AMDGPU::isSISrcInlinableOperand(Desc, UseOpIdx))
return false;
+ uint8_t OpTy = Desc.operands()[UseOpIdx].OperandType;
if (OpToFold.isImm() && TII->isInlineConstant(OpToFold, OpTy) &&
TII->isOperandLegal(*UseMI, UseOpIdx, &OpToFold)) {
UseMI->getOperand(UseOpIdx).ChangeToImmediate(OpToFold.getImm());
@@ -671,24 +752,6 @@ void SIFoldOperands::foldOperand(
const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
if (!DestReg.isPhysical()) {
- if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
- SmallVector<FoldCandidate, 4> CopyUses;
- for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
- // There's no point trying to fold into an implicit operand.
- if (Use.isImplicit())
- continue;
-
- CopyUses.emplace_back(Use.getParent(),
- Use.getParent()->getOperandNo(&Use),
- &UseMI->getOperand(1));
- }
-
- for (auto &F : CopyUses) {
- foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList,
- CopiesToReplace);
- }
- }
-
if (DestRC == &AMDGPU::AGPR_32RegClass &&
TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
@@ -1035,6 +1098,9 @@ SIFoldOperands::getImmOrMaterializedImm(MachineOperand &Op) const {
// selection.
// TODO: See if a frame index with a fixed offset can fold.
bool SIFoldOperands::tryConstantFoldOp(MachineInstr *MI) const {
+ if (!MI->allImplicitDefsAreDead())
+ return false;
+
unsigned Opc = MI->getOpcode();
int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
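
The new early-out guards against live implicit defs: SALU arithmetic such as S_AND_B32 implicitly defines $scc, and if a later S_CBRANCH_SCC1 still reads it, constant-folding the instruction into a plain move would drop that def. A minimal stand-in for the check, on a deliberately simplified operand shape:

    #include <vector>

    // Simplified stand-in for MachineInstr::allImplicitDefsAreDead(): the
    // fold is only safe when every implicit def carries a dead flag.
    struct ImplicitDefInfo { bool IsDead; };

    bool allImplicitDefsAreDead(const std::vector<ImplicitDefInfo> &Defs) {
      for (const auto &D : Defs)
        if (!D.IsDead)
          return false;
      return true;
    }
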
@@ -1340,6 +1406,7 @@ const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
case AMDGPU::V_MAX_F32_e64:
case AMDGPU::V_MAX_F16_e64:
case AMDGPU::V_MAX_F16_t16_e64:
+ case AMDGPU::V_MAX_F16_fake16_e64:
case AMDGPU::V_MAX_F64_e64:
case AMDGPU::V_PK_MAX_F16: {
if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
@@ -1435,7 +1502,8 @@ static int getOModValue(unsigned Opc, int64_t Val) {
}
}
case AMDGPU::V_MUL_F16_e64:
- case AMDGPU::V_MUL_F16_t16_e64: {
+ case AMDGPU::V_MUL_F16_t16_e64:
+ case AMDGPU::V_MUL_F16_fake16_e64: {
switch (static_cast<uint16_t>(Val)) {
case 0x3800: // 0.5
return SIOutMods::DIV2;
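
This hunk extends the fp16 multiplier-to-omod mapping to the new fake16 opcode. The mapping itself is a three-entry table over fp16 bit patterns; restated standalone below, with the entries the diff context truncates filled in as assumptions (the 2.0 and 4.0 cases and the SIOutMods values are not quoted from this patch):

    #include <cstdint>

    // Assumed SIOutMods codes: NONE = 0, DIV2 = 1, MUL2 = 2, MUL4 = 3.
    enum OutMod { NONE = 0, DIV2 = 1, MUL2 = 2, MUL4 = 3 };

    OutMod getOModValueF16(uint16_t Bits) {
      switch (Bits) {
      case 0x3800: return DIV2; // 0.5 in IEEE half precision
      case 0x4000: return MUL2; // 2.0
      case 0x4400: return MUL4; // 4.0
      default:     return NONE; // multiplier has no omod encoding
      }
    }
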
@@ -1462,12 +1530,14 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_MUL_F64_e64:
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_t16_e64:
+ case AMDGPU::V_MUL_F16_fake16_e64:
case AMDGPU::V_MUL_F16_e64: {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_MUL_F32_e64 &&
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
((Op == AMDGPU::V_MUL_F64_e64 || Op == AMDGPU::V_MUL_F16_e64 ||
- Op == AMDGPU::V_MUL_F16_t16_e64) &&
+ Op == AMDGPU::V_MUL_F16_t16_e64 ||
+ Op == AMDGPU::V_MUL_F16_fake16_e64) &&
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
return std::pair(nullptr, SIOutMods::NONE);
@@ -1497,12 +1567,14 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_ADD_F64_e64:
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64:
- case AMDGPU::V_ADD_F16_t16_e64: {
+ case AMDGPU::V_ADD_F16_t16_e64:
+ case AMDGPU::V_ADD_F16_fake16_e64: {
// If output denormals are enabled, omod is ignored.
if ((Op == AMDGPU::V_ADD_F32_e64 &&
MFI->getMode().FP32Denormals.Output != DenormalMode::PreserveSign) ||
((Op == AMDGPU::V_ADD_F64_e64 || Op == AMDGPU::V_ADD_F16_e64 ||
- Op == AMDGPU::V_ADD_F16_t16_e64) &&
+ Op == AMDGPU::V_ADD_F16_t16_e64 ||
+ Op == AMDGPU::V_ADD_F16_fake16_e64) &&
MFI->getMode().FP64FP16Denormals.Output != DenormalMode::PreserveSign))
return std::pair(nullptr, SIOutMods::NONE);
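
Both the mul and add cases repeat the same denormal guard: per the comments above, omod is ignored when output denormals are enabled, so folding a real multiply or add into an output modifier would silently change results. Condensed into one predicate (a sketch, not the pass's API):

    enum class DenormOut { PreserveSign, IEEE };

    // F32 ops consult the FP32 output mode; F64/F16 (including the t16 and
    // fake16 variants) consult the shared FP64FP16 mode.
    bool omodFoldingLegal(bool IsF32, DenormOut FP32Out, DenormOut FP64FP16Out) {
      return (IsF32 ? FP32Out : FP64FP16Out) == DenormOut::PreserveSign;
    }
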