Diffstat (limited to 'lib/Target/AMDGPU/SIFoldOperands.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SIFoldOperands.cpp | 275
1 file changed, 244 insertions(+), 31 deletions(-)
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index a5c0d4923d6b..d63414735b95 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,6 +12,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -66,6 +67,7 @@ public:
   MachineRegisterInfo *MRI;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
+  const SISubtarget *ST;
 
   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
@@ -75,6 +77,12 @@ public:
 
   void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
+  const MachineOperand *isClamp(const MachineInstr &MI) const;
+  bool tryFoldClamp(MachineInstr &MI);
+
+  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
+  bool tryFoldOMod(MachineInstr &MI);
+
 public:
   SIFoldOperands() : MachineFunctionPass(ID) {
     initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
@@ -131,27 +139,6 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
 
-static bool isSafeToFold(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  case AMDGPU::V_MOV_B32_e32:
-  case AMDGPU::V_MOV_B32_e64:
-  case AMDGPU::V_MOV_B64_PSEUDO: {
-    // If there are additional implicit register operands, this may be used for
-    // register indexing so the source register operand isn't simply copied.
-    unsigned NumOps = MI.getDesc().getNumOperands() +
-      MI.getDesc().getNumImplicitUses();
-
-    return MI.getNumOperands() == NumOps;
-  }
-  case AMDGPU::S_MOV_B32:
-  case AMDGPU::S_MOV_B64:
-  case AMDGPU::COPY:
-    return true;
-  default:
-    return false;
-  }
-}
-
 static bool updateOperand(FoldCandidate &Fold,
                           const TargetRegisterInfo &TRI) {
   MachineInstr *MI = Fold.UseMI;
@@ -359,8 +346,6 @@ void SIFoldOperands::foldOperand(
     const TargetRegisterClass *FoldRC =
       TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
 
-    APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
-              OpToFold.getImm());
 
     // Split 64-bit constants into 32-bits for folding.
     if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
@@ -370,21 +355,25 @@ void SIFoldOperands::foldOperand(
       MRI->getRegClass(UseReg) :
       TRI->getPhysRegClass(UseReg);
 
-    assert(Imm.getBitWidth() == 64);
-
     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
       return;
 
+    APInt Imm(64, OpToFold.getImm());
     if (UseOp.getSubReg() == AMDGPU::sub0) {
      Imm = Imm.getLoBits(32);
     } else {
       assert(UseOp.getSubReg() == AMDGPU::sub1);
       Imm = Imm.getHiBits(32);
     }
+
+    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+    return;
   }
 
-  MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
-  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+
+
+  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
 }
 
 static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
@@ -581,6 +570,32 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
   return false;
 }
 
+// Try to fold an instruction into a simpler one
+static bool tryFoldInst(const SIInstrInfo *TII,
+                        MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+
+  if (Opc == AMDGPU::V_CNDMASK_B32_e32 ||
+      Opc == AMDGPU::V_CNDMASK_B32_e64 ||
+      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
+    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
+    if (Src1->isIdenticalTo(*Src0)) {
+      DEBUG(dbgs() << "Folded " << *MI << " into ");
+      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+      if (Src2Idx != -1)
+        MI->RemoveOperand(Src2Idx);
+      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
+      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
+                                               : getMovOpc(false)));
+      DEBUG(dbgs() << *MI << '\n');
+      return true;
+    }
+  }
+
+  return false;
+}
+
 void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                      MachineOperand &OpToFold) const {
   // We need mutate the operands of new mov instructions to add implicit
@@ -682,20 +697,213 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
       }
       DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
             static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+      tryFoldInst(TII, Fold.UseMI);
     }
   }
 }
 
+const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
+  unsigned Op = MI.getOpcode();
+  switch (Op) {
+  case AMDGPU::V_MAX_F32_e64:
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F64: {
+    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
+      return nullptr;
+
+    // Make sure sources are identical.
+    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    if (!Src0->isReg() || Src0->getSubReg() != Src1->getSubReg() ||
+        Src0->getSubReg() != AMDGPU::NoSubRegister)
+      return nullptr;
+
+    // Can't fold up if we have modifiers.
+    if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+      return nullptr;
+    return Src0;
+  }
+  default:
+    return nullptr;
+  }
+}
+
+// We obviously have multiple uses in a clamp since the register is used twice
+// in the same instruction.
+static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
+  int Count = 0;
+  for (auto I = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
+       I != E; ++I) {
+    if (++Count > 1)
+      return false;
+  }
+
+  return true;
+}
+
+bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
+  const MachineOperand *ClampSrc = isClamp(MI);
+  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
+    return false;
+
+  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+  if (!TII->hasFPClamp(*Def))
+    return false;
+  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
+  if (!DefClamp)
+    return false;
+
+  DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
+
+  // Clamp is applied after omod, so it is OK if omod is set.
+  DefClamp->setImm(1);
+  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+  MI.eraseFromParent();
+  return true;
+}
+
+static int getOModValue(unsigned Opc, int64_t Val) {
+  switch (Opc) {
+  case AMDGPU::V_MUL_F32_e64: {
+    switch (static_cast<uint32_t>(Val)) {
+    case 0x3f000000: // 0.5
+      return SIOutMods::DIV2;
+    case 0x40000000: // 2.0
+      return SIOutMods::MUL2;
+    case 0x40800000: // 4.0
+      return SIOutMods::MUL4;
+    default:
+      return SIOutMods::NONE;
+    }
+  }
+  case AMDGPU::V_MUL_F16_e64: {
+    switch (static_cast<uint16_t>(Val)) {
+    case 0x3800: // 0.5
+      return SIOutMods::DIV2;
+    case 0x4000: // 2.0
+      return SIOutMods::MUL2;
+    case 0x4400: // 4.0
+      return SIOutMods::MUL4;
+    default:
+      return SIOutMods::NONE;
+    }
+  }
+  default:
+    llvm_unreachable("invalid mul opcode");
+  }
+}
+
+// FIXME: Does this really not support denormals with f16?
+// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
+// handled, so will anything other than that break?
+std::pair<const MachineOperand *, int>
+SIFoldOperands::isOMod(const MachineInstr &MI) const {
+  unsigned Op = MI.getOpcode();
+  switch (Op) {
+  case AMDGPU::V_MUL_F32_e64:
+  case AMDGPU::V_MUL_F16_e64: {
+    // If output denormals are enabled, omod is ignored.
+    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
+        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    const MachineOperand *RegOp = nullptr;
+    const MachineOperand *ImmOp = nullptr;
+    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    if (Src0->isImm()) {
+      ImmOp = Src0;
+      RegOp = Src1;
+    } else if (Src1->isImm()) {
+      ImmOp = Src1;
+      RegOp = Src0;
+    } else
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    int OMod = getOModValue(Op, ImmOp->getImm());
+    if (OMod == SIOutMods::NONE ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    return std::make_pair(RegOp, OMod);
+  }
+  case AMDGPU::V_ADD_F32_e64:
+  case AMDGPU::V_ADD_F16_e64: {
+    // If output denormals are enabled, omod is ignored.
+    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
+        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
+    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
+        Src0->getSubReg() == Src1->getSubReg() &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+      return std::make_pair(Src0, SIOutMods::MUL2);
+
+    return std::make_pair(nullptr, SIOutMods::NONE);
+  }
+  default:
+    return std::make_pair(nullptr, SIOutMods::NONE);
+  }
+}
+
+// FIXME: Does this need to check IEEE bit on function?
+bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
+  const MachineOperand *RegOp;
+  int OMod;
+  std::tie(RegOp, OMod) = isOMod(MI);
+  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
+      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
+      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
+    return false;
+
+  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
+  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
+  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
+    return false;
+
+  // Clamp is applied after omod. If the source already has clamp set, don't
+  // fold it.
+  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
+    return false;
+
+  DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+
+  DefOMod->setImm(OMod);
+  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+  MI.eraseFromParent();
+  return true;
+}
+
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
 
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-
   MRI = &MF.getRegInfo();
-  TII = ST.getInstrInfo();
+  ST = &MF.getSubtarget<SISubtarget>();
+  TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
 
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
+  // correctly handle signed zeros.
+  //
+  // TODO: Check nsz on instructions when fast math flags are preserved to MI
+  // level.
+  bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
@@ -705,8 +913,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
-      if (!isSafeToFold(MI))
+      tryFoldInst(TII, &MI);
+
+      if (!TII->isFoldableCopy(MI)) {
+        if (IsIEEEMode || !tryFoldOMod(MI))
+          tryFoldClamp(MI);
         continue;
+      }
 
       MachineOperand &OpToFold = MI.getOperand(1);
       bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
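The foldOperand hunks above defer the APInt construction until a 64-bit immediate actually has to be split across sub0/sub1 uses. A minimal standalone sketch of that split with plain integers instead of llvm::APInt (the value and variable names are illustrative only, not from the patch):

#include <cassert>
#include <cstdint>

int main() {
  // A 64-bit immediate feeding 32-bit subregister uses: a sub0 use reads the
  // low half (APInt::getLoBits(32)), a sub1 use the high half (getHiBits(32)).
  uint64_t Imm = 0x400921fb54442d18; // bit pattern of the double 3.14159...
  uint32_t Sub0 = static_cast<uint32_t>(Imm);       // folded into sub0 uses
  uint32_t Sub1 = static_cast<uint32_t>(Imm >> 32); // folded into sub1 uses
  assert(Sub0 == 0x54442d18u && Sub1 == 0x400921fbu);
  return 0;
}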
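tryFoldInst rewrites a v_cndmask whose two value sources are identical into a plain copy: once src0 and src1 match, the select no longer depends on the condition, so the extra operands can be dropped and the opcode mutated to COPY (or a mov when the source is an immediate). A scalar model of that identity (a toy function, not an LLVM API):

#include <cassert>

// Toy model of v_cndmask_b32 dst, src0, src1, cc: picks src1 when the
// condition bit is set, src0 otherwise.
static int cndmask(int Src0, int Src1, bool CC) { return CC ? Src1 : Src0; }

int main() {
  // With identical sources the condition is irrelevant: the result is always
  // the common source, which is why the instruction can become a copy.
  for (bool CC : {false, true})
    assert(cndmask(7, 7, CC) == 7);
  return 0;
}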
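tryFoldClamp relies on v_max_f32 d, x, x with the clamp bit set being a canonical clamp of x, so the bit can migrate onto x's defining instruction and the max can be deleted. A numeric sanity check of that identity, assuming the usual reading of the VOP3 clamp modifier as clamping to [0, 1]:

#include <algorithm>
#include <cassert>

// The VOP3 clamp modifier modeled as a clamp to [0, 1].
static float clamp01(float V) { return std::min(std::max(V, 0.0f), 1.0f); }

int main() {
  for (float X : {-2.0f, 0.0f, 0.25f, 1.0f, 3.5f}) {
    float ViaMax = clamp01(std::max(X, X)); // the pattern isClamp() matches
    float ViaDef = clamp01(X);              // clamp bit folded onto the def
    assert(ViaMax == ViaDef);               // same result, one less v_max
  }
  return 0;
}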
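The omod fold leans on two identities: fadd x, x is exactly 2*x (the DAGCombiner canonicalization isOMod looks through), and the 0.5/2.0/4.0 multipliers that getOModValue recognizes are exact power-of-two scalings for normal floats, one reason the fold is skipped when output denormals are enabled. A quick check of those identities in plain C++, independent of the pass:

#include <cassert>

int main() {
  for (float X : {0.375f, 1.0f, 96.5f, -12.0f}) {
    assert(X + X == 2.0f * X);             // fadd x, x behaves like omod MUL2
    assert(4.0f * X == 2.0f * (2.0f * X)); // MUL4 is two exact doublings
    assert(0.5f * X == X / 2.0f);          // DIV2 agrees with scaling by 0.5
  }
  return 0;
}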