diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp | 382 |
1 files changed, 382 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp new file mode 100644 index 000000000000..301e6f6d6f42 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp @@ -0,0 +1,382 @@ +//=== lib/CodeGen/GlobalISel/AMDGPUCombinerHelper.cpp ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUCombinerHelper.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/Target/TargetMachine.h" + +using namespace llvm; +using namespace MIPatternMatch; + +LLVM_READNONE +static bool fnegFoldsIntoMI(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case AMDGPU::G_FADD: + case AMDGPU::G_FSUB: + case AMDGPU::G_FMUL: + case AMDGPU::G_FMA: + case AMDGPU::G_FMAD: + case AMDGPU::G_FMINNUM: + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM_IEEE: + case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_FSIN: + case AMDGPU::G_FPEXT: + case AMDGPU::G_INTRINSIC_TRUNC: + case AMDGPU::G_FPTRUNC: + case AMDGPU::G_FRINT: + case AMDGPU::G_FNEARBYINT: + case AMDGPU::G_INTRINSIC_ROUND: + case AMDGPU::G_INTRINSIC_ROUNDEVEN: + case AMDGPU::G_FCANONICALIZE: + case AMDGPU::G_AMDGPU_RCP_IFLAG: + case AMDGPU::G_AMDGPU_FMIN_LEGACY: + case AMDGPU::G_AMDGPU_FMAX_LEGACY: + return true; + case AMDGPU::G_INTRINSIC: { + unsigned IntrinsicID = MI.getIntrinsicID(); + switch (IntrinsicID) { + case Intrinsic::amdgcn_rcp: + case Intrinsic::amdgcn_rcp_legacy: + case Intrinsic::amdgcn_sin: + case Intrinsic::amdgcn_fmul_legacy: + case Intrinsic::amdgcn_fmed3: + case Intrinsic::amdgcn_fma_legacy: + return true; + default: + return false; + } + } + default: + return false; + } +} + +/// \p returns true if the operation will definitely need to use a 64-bit +/// encoding, and thus will use a VOP3 encoding regardless of the source +/// modifiers. +LLVM_READONLY +static bool opMustUseVOP3Encoding(const MachineInstr &MI, + const MachineRegisterInfo &MRI) { + return MI.getNumOperands() > + (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 4u : 3u) || + MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64; +} + +// Most FP instructions support source modifiers. +LLVM_READONLY +static bool hasSourceMods(const MachineInstr &MI) { + if (!MI.memoperands().empty()) + return false; + + switch (MI.getOpcode()) { + case AMDGPU::COPY: + case AMDGPU::G_SELECT: + case AMDGPU::G_FDIV: + case AMDGPU::G_FREM: + case TargetOpcode::INLINEASM: + case TargetOpcode::INLINEASM_BR: + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: + case AMDGPU::G_BITCAST: + case AMDGPU::G_ANYEXT: + case AMDGPU::G_BUILD_VECTOR: + case AMDGPU::G_BUILD_VECTOR_TRUNC: + case AMDGPU::G_PHI: + return false; + case AMDGPU::G_INTRINSIC: { + unsigned IntrinsicID = MI.getIntrinsicID(); + switch (IntrinsicID) { + case Intrinsic::amdgcn_interp_p1: + case Intrinsic::amdgcn_interp_p2: + case Intrinsic::amdgcn_interp_mov: + case Intrinsic::amdgcn_interp_p1_f16: + case Intrinsic::amdgcn_interp_p2_f16: + case Intrinsic::amdgcn_div_scale: + return false; + default: + return true; + } + } + default: + return true; + } +} + +static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, + unsigned CostThreshold = 4) { + // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus + // it is truly free to use a source modifier in all cases. If there are + // multiple users but for each one will necessitate using VOP3, there will be + // a code size increase. Try to avoid increasing code size unless we know it + // will save on the instruction count. + unsigned NumMayIncreaseSize = 0; + Register Dst = MI.getOperand(0).getReg(); + for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) { + if (!hasSourceMods(Use)) + return false; + + if (!opMustUseVOP3Encoding(Use, MRI)) { + if (++NumMayIncreaseSize > CostThreshold) + return false; + } + } + return true; +} + +static bool mayIgnoreSignedZero(MachineInstr &MI) { + const TargetOptions &Options = MI.getMF()->getTarget().Options; + return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz); +} + +static bool isInv2Pi(const APFloat &APF) { + static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118)); + static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983)); + static const APFloat KF64(APFloat::IEEEdouble(), + APInt(64, 0x3fc45f306dc9c882)); + + return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) || + APF.bitwiseIsEqual(KF64); +} + +// 0 and 1.0 / (0.5 * pi) do not have inline immmediates, so there is an +// additional cost to negate them. +static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg, + MachineRegisterInfo &MRI) { + Optional<FPValueAndVReg> FPValReg; + if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) { + if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative()) + return true; + + const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>(); + if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value)) + return true; + } + return false; +} + +static unsigned inverseMinMax(unsigned Opc) { + switch (Opc) { + case AMDGPU::G_FMAXNUM: + return AMDGPU::G_FMINNUM; + case AMDGPU::G_FMINNUM: + return AMDGPU::G_FMAXNUM; + case AMDGPU::G_FMAXNUM_IEEE: + return AMDGPU::G_FMINNUM_IEEE; + case AMDGPU::G_FMINNUM_IEEE: + return AMDGPU::G_FMAXNUM_IEEE; + case AMDGPU::G_AMDGPU_FMAX_LEGACY: + return AMDGPU::G_AMDGPU_FMIN_LEGACY; + case AMDGPU::G_AMDGPU_FMIN_LEGACY: + return AMDGPU::G_AMDGPU_FMAX_LEGACY; + default: + llvm_unreachable("invalid min/max opcode"); + } +} + +bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI, + MachineInstr *&MatchInfo) { + Register Src = MI.getOperand(1).getReg(); + MatchInfo = MRI.getVRegDef(Src); + + // If the input has multiple uses and we can either fold the negate down, or + // the other uses cannot, give up. This both prevents unprofitable + // transformations and infinite loops: we won't repeatedly try to fold around + // a negate that has no 'good' form. + if (MRI.hasOneNonDBGUse(Src)) { + if (allUsesHaveSourceMods(MI, MRI, 0)) + return false; + } else { + if (fnegFoldsIntoMI(*MatchInfo) && + (allUsesHaveSourceMods(MI, MRI) || + !allUsesHaveSourceMods(*MatchInfo, MRI))) + return false; + } + + switch (MatchInfo->getOpcode()) { + case AMDGPU::G_FMINNUM: + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM_IEEE: + case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_AMDGPU_FMIN_LEGACY: + case AMDGPU::G_AMDGPU_FMAX_LEGACY: + // 0 doesn't have a negated inline immediate. + return !isConstantCostlierToNegate(*MatchInfo, + MatchInfo->getOperand(2).getReg(), MRI); + case AMDGPU::G_FADD: + case AMDGPU::G_FSUB: + case AMDGPU::G_FMA: + case AMDGPU::G_FMAD: + return mayIgnoreSignedZero(*MatchInfo); + case AMDGPU::G_FMUL: + case AMDGPU::G_FPEXT: + case AMDGPU::G_INTRINSIC_TRUNC: + case AMDGPU::G_FPTRUNC: + case AMDGPU::G_FRINT: + case AMDGPU::G_FNEARBYINT: + case AMDGPU::G_INTRINSIC_ROUND: + case AMDGPU::G_INTRINSIC_ROUNDEVEN: + case AMDGPU::G_FSIN: + case AMDGPU::G_FCANONICALIZE: + case AMDGPU::G_AMDGPU_RCP_IFLAG: + return true; + case AMDGPU::G_INTRINSIC: { + unsigned IntrinsicID = MatchInfo->getIntrinsicID(); + switch (IntrinsicID) { + case Intrinsic::amdgcn_rcp: + case Intrinsic::amdgcn_rcp_legacy: + case Intrinsic::amdgcn_sin: + case Intrinsic::amdgcn_fmul_legacy: + case Intrinsic::amdgcn_fmed3: + return true; + case Intrinsic::amdgcn_fma_legacy: + return mayIgnoreSignedZero(*MatchInfo); + default: + return false; + } + } + default: + return false; + } +} + +void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI, + MachineInstr *&MatchInfo) { + // Transform: + // %A = inst %Op1, ... + // %B = fneg %A + // + // into: + // + // (if %A has one use, specifically fneg above) + // %B = inst (maybe fneg %Op1), ... + // + // (if %A has multiple uses) + // %B = inst (maybe fneg %Op1), ... + // %A = fneg %B + + // Replace register in operand with a register holding negated value. + auto NegateOperand = [&](MachineOperand &Op) { + Register Reg = Op.getReg(); + if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg)))) + Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0); + replaceRegOpWith(MRI, Op, Reg); + }; + + // Replace either register in operands with a register holding negated value. + auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) { + Register XReg = X.getReg(); + Register YReg = Y.getReg(); + if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg)))) + replaceRegOpWith(MRI, X, XReg); + else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg)))) + replaceRegOpWith(MRI, Y, YReg); + else { + YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0); + replaceRegOpWith(MRI, Y, YReg); + } + }; + + Builder.setInstrAndDebugLoc(*MatchInfo); + + // Negate appropriate operands so that resulting value of MatchInfo is + // negated. + switch (MatchInfo->getOpcode()) { + case AMDGPU::G_FADD: + case AMDGPU::G_FSUB: + NegateOperand(MatchInfo->getOperand(1)); + NegateOperand(MatchInfo->getOperand(2)); + break; + case AMDGPU::G_FMUL: + NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2)); + break; + case AMDGPU::G_FMINNUM: + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM_IEEE: + case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_AMDGPU_FMIN_LEGACY: + case AMDGPU::G_AMDGPU_FMAX_LEGACY: { + NegateOperand(MatchInfo->getOperand(1)); + NegateOperand(MatchInfo->getOperand(2)); + unsigned Opposite = inverseMinMax(MatchInfo->getOpcode()); + replaceOpcodeWith(*MatchInfo, Opposite); + break; + } + case AMDGPU::G_FMA: + case AMDGPU::G_FMAD: + NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2)); + NegateOperand(MatchInfo->getOperand(3)); + break; + case AMDGPU::G_FPEXT: + case AMDGPU::G_INTRINSIC_TRUNC: + case AMDGPU::G_FRINT: + case AMDGPU::G_FNEARBYINT: + case AMDGPU::G_INTRINSIC_ROUND: + case AMDGPU::G_INTRINSIC_ROUNDEVEN: + case AMDGPU::G_FSIN: + case AMDGPU::G_FCANONICALIZE: + case AMDGPU::G_AMDGPU_RCP_IFLAG: + case AMDGPU::G_FPTRUNC: + NegateOperand(MatchInfo->getOperand(1)); + break; + case AMDGPU::G_INTRINSIC: { + unsigned IntrinsicID = MatchInfo->getIntrinsicID(); + switch (IntrinsicID) { + case Intrinsic::amdgcn_rcp: + case Intrinsic::amdgcn_rcp_legacy: + case Intrinsic::amdgcn_sin: + NegateOperand(MatchInfo->getOperand(2)); + break; + case Intrinsic::amdgcn_fmul_legacy: + NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3)); + break; + case Intrinsic::amdgcn_fmed3: + NegateOperand(MatchInfo->getOperand(2)); + NegateOperand(MatchInfo->getOperand(3)); + NegateOperand(MatchInfo->getOperand(4)); + break; + case Intrinsic::amdgcn_fma_legacy: + NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3)); + NegateOperand(MatchInfo->getOperand(4)); + break; + default: + llvm_unreachable("folding fneg not supported for this intrinsic"); + } + break; + } + default: + llvm_unreachable("folding fneg not supported for this instruction"); + } + + Register Dst = MI.getOperand(0).getReg(); + Register MatchInfoDst = MatchInfo->getOperand(0).getReg(); + + if (MRI.hasOneNonDBGUse(MatchInfoDst)) { + // MatchInfo now has negated value so use that instead of old Dst. + replaceRegWith(MRI, Dst, MatchInfoDst); + } else { + // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa + // but replaceRegWith will replace defs as well. It is easier to replace one + // def with a new register. + LLT Type = MRI.getType(Dst); + Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type); + replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo); + + // MatchInfo now has negated value so use that instead of old Dst. + replaceRegWith(MRI, Dst, NegatedMatchInfo); + + // Recreate non negated value for other uses of old MatchInfoDst + Builder.setInstrAndDebugLoc(MI); + Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags()); + } + + MI.eraseFromParent(); + return; +} |
