aboutsummaryrefslogtreecommitdiff
path: root/lib/Target/AMDGPU/SIFoldOperands.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r--lib/Target/AMDGPU/SIFoldOperands.cpp275
1 files changed, 244 insertions, 31 deletions
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index a5c0d4923d6b..d63414735b95 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,6 +12,7 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -66,6 +67,7 @@ public:
MachineRegisterInfo *MRI;
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
+ const SISubtarget *ST;
void foldOperand(MachineOperand &OpToFold,
MachineInstr *UseMI,
@@ -75,6 +77,12 @@ public:
void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
+ const MachineOperand *isClamp(const MachineInstr &MI) const;
+ bool tryFoldClamp(MachineInstr &MI);
+
+ std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
+ bool tryFoldOMod(MachineInstr &MI);
+
public:
SIFoldOperands() : MachineFunctionPass(ID) {
initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
@@ -131,27 +139,6 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
return new SIFoldOperands();
}
-static bool isSafeToFold(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- case AMDGPU::V_MOV_B32_e32:
- case AMDGPU::V_MOV_B32_e64:
- case AMDGPU::V_MOV_B64_PSEUDO: {
- // If there are additional implicit register operands, this may be used for
- // register indexing so the source register operand isn't simply copied.
- unsigned NumOps = MI.getDesc().getNumOperands() +
- MI.getDesc().getNumImplicitUses();
-
- return MI.getNumOperands() == NumOps;
- }
- case AMDGPU::S_MOV_B32:
- case AMDGPU::S_MOV_B64:
- case AMDGPU::COPY:
- return true;
- default:
- return false;
- }
-}
-
static bool updateOperand(FoldCandidate &Fold,
const TargetRegisterInfo &TRI) {
MachineInstr *MI = Fold.UseMI;
@@ -359,8 +346,6 @@ void SIFoldOperands::foldOperand(
const TargetRegisterClass *FoldRC =
TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
- APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
- OpToFold.getImm());
// Split 64-bit constants into 32-bits for folding.
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
@@ -370,21 +355,25 @@ void SIFoldOperands::foldOperand(
MRI->getRegClass(UseReg) :
TRI->getPhysRegClass(UseReg);
- assert(Imm.getBitWidth() == 64);
-
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
return;
+ APInt Imm(64, OpToFold.getImm());
if (UseOp.getSubReg() == AMDGPU::sub0) {
Imm = Imm.getLoBits(32);
} else {
assert(UseOp.getSubReg() == AMDGPU::sub1);
Imm = Imm.getHiBits(32);
}
+
+ MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+ return;
}
- MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
- tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+
+
+ tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
}
static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
@@ -581,6 +570,32 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
return false;
}
+// Try to fold an instruction into a simpler one
+static bool tryFoldInst(const SIInstrInfo *TII,
+ MachineInstr *MI) {
+ unsigned Opc = MI->getOpcode();
+
+ if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
+ Opc == AMDGPU::V_CNDMASK_B32_e64 ||
+ Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
+ const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
+ if (Src1->isIdenticalTo(*Src0)) {
+ DEBUG(dbgs() << "Folded " << *MI << " into ");
+ int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ if (Src2Idx != -1)
+ MI->RemoveOperand(Src2Idx);
+ MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
+ mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
+ : getMovOpc(false)));
+ DEBUG(dbgs() << *MI << '\n');
+ return true;
+ }
+ }
+
+ return false;
+}
+
void SIFoldOperands::foldInstOperand(MachineInstr &MI,
MachineOperand &OpToFold) const {
// We need mutate the operands of new mov instructions to add implicit
@@ -682,20 +697,213 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
}
DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+ tryFoldInst(TII, Fold.UseMI);
}
}
}
+const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
+ unsigned Op = MI.getOpcode();
+ switch (Op) {
+ case AMDGPU::V_MAX_F32_e64:
+ case AMDGPU::V_MAX_F16_e64:
+ case AMDGPU::V_MAX_F64: {
+ if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
+ return nullptr;
+
+ // Make sure sources are identical.
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (!Src0->isReg() || Src0->getSubReg() != Src1->getSubReg() ||
+ Src0->getSubReg() != AMDGPU::NoSubRegister)
+ return nullptr;
+
+ // Can't fold up if we have modifiers.
+ if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return nullptr;
+ return Src0;
+ }
+ default:
+ return nullptr;
+ }
+}
+
+// We obviously have multiple uses in a clamp since the register is used twice
+// in the same instruction.
+static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
+ int Count = 0;
+ for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
+ I != E; ++I) {
+ if (++Count > 1)
+ return false;
+ }
+
+ return true;
+}
+
+bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
+ const MachineOperand *ClampSrc = isClamp(MI);
+ if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
+ return false;
+
+ MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+ if (!TII->hasFPClamp(*Def))
+ return false;
+ MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
+ if (!DefClamp)
+ return false;
+
+ DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
+
+ // Clamp is applied after omod, so it is OK if omod is set.
+ DefClamp->setImm(1);
+ MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+ MI.eraseFromParent();
+ return true;
+}
+
+static int getOModValue(unsigned Opc, int64_t Val) {
+ switch (Opc) {
+ case AMDGPU::V_MUL_F32_e64: {
+ switch (static_cast<uint32_t>(Val)) {
+ case 0x3f000000: // 0.5
+ return SIOutMods::DIV2;
+ case 0x40000000: // 2.0
+ return SIOutMods::MUL2;
+ case 0x40800000: // 4.0
+ return SIOutMods::MUL4;
+ default:
+ return SIOutMods::NONE;
+ }
+ }
+ case AMDGPU::V_MUL_F16_e64: {
+ switch (static_cast<uint16_t>(Val)) {
+ case 0x3800: // 0.5
+ return SIOutMods::DIV2;
+ case 0x4000: // 2.0
+ return SIOutMods::MUL2;
+ case 0x4400: // 4.0
+ return SIOutMods::MUL4;
+ default:
+ return SIOutMods::NONE;
+ }
+ }
+ default:
+ llvm_unreachable("invalid mul opcode");
+ }
+}
+
+// FIXME: Does this really not support denormals with f16?
+// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
+// handled, so will anything other than that break?
+std::pair<const MachineOperand *, int>
+SIFoldOperands::isOMod(const MachineInstr &MI) const {
+ unsigned Op = MI.getOpcode();
+ switch (Op) {
+ case AMDGPU::V_MUL_F32_e64:
+ case AMDGPU::V_MUL_F16_e64: {
+ // If output denormals are enabled, omod is ignored.
+ if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
+ (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ const MachineOperand *RegOp = nullptr;
+ const MachineOperand *ImmOp = nullptr;
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+ if (Src0->isImm()) {
+ ImmOp = Src0;
+ RegOp = Src1;
+ } else if (Src1->isImm()) {
+ ImmOp = Src1;
+ RegOp = Src0;
+ } else
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ int OMod = getOModValue(Op, ImmOp->getImm());
+ if (OMod == SIOutMods::NONE ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
+ TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ return std::make_pair(RegOp, OMod);
+ }
+ case AMDGPU::V_ADD_F32_e64:
+ case AMDGPU::V_ADD_F16_e64: {
+ // If output denormals are enabled, omod is ignored.
+ if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
+ (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
+ return std::make_pair(nullptr, SIOutMods::NONE);
+
+ // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
+ const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+ if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
+ Src0->getSubReg() == Src1->getSubReg() &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
+ !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+ return std::make_pair(Src0, SIOutMods::MUL2);
+
+ return std::make_pair(nullptr, SIOutMods::NONE);
+ }
+ default:
+ return std::make_pair(nullptr, SIOutMods::NONE);
+ }
+}
+
+// FIXME: Does this need to check IEEE bit on function?
+bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
+ const MachineOperand *RegOp;
+ int OMod;
+ std::tie(RegOp, OMod) = isOMod(MI);
+ if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
+ RegOp->getSubReg() != AMDGPU::NoSubRegister ||
+ !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
+ return false;
+
+ MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
+ MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
+ if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
+ return false;
+
+ // Clamp is applied after omod. If the source already has clamp set, don't
+ // fold it.
+ if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
+ return false;
+
+ DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+
+ DefOMod->setImm(OMod);
+ MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+ MI.eraseFromParent();
+ return true;
+}
+
bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(*MF.getFunction()))
return false;
- const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-
MRI = &MF.getRegInfo();
- TII = ST.getInstrInfo();
+ ST = &MF.getSubtarget<SISubtarget>();
+ TII = ST->getInstrInfo();
TRI = &TII->getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ // omod is ignored by hardware if IEEE bit is enabled. omod also does not
+ // correctly handle signed zeros.
+ //
+ // TODO: Check nsz on instructions when fast math flags are preserved to MI
+ // level.
+ bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
+
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; ++BI) {
@@ -705,8 +913,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
Next = std::next(I);
MachineInstr &MI = *I;
- if (!isSafeToFold(MI))
+ tryFoldInst(TII, &MI);
+
+ if (!TII->isFoldableCopy(MI)) {
+ if (IsIEEEMode || !tryFoldOMod(MI))
+ tryFoldClamp(MI);
continue;
+ }
MachineOperand &OpToFold = MI.getOperand(1);
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();