Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp  447
1 file changed, 447 insertions(+), 0 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
new file mode 100644
index 000000000000..e5a376ab7357
--- /dev/null
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -0,0 +1,447 @@
+//=== lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCombinerHelper.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
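+// Returns true if an fneg of MI's result can instead be folded into MI itself,
+// either through source modifiers or by switching to the inverse opcode
+// (e.g. fmin <-> fmax). This is a brief summary of the switch below.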
+LLVM_READNONE
+static bool fnegFoldsIntoMI(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AMDGPU::G_FADD:
+ case AMDGPU::G_FSUB:
+ case AMDGPU::G_FMUL:
+ case AMDGPU::G_FMA:
+ case AMDGPU::G_FMAD:
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
+ case AMDGPU::G_FSIN:
+ case AMDGPU::G_FPEXT:
+ case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_FPTRUNC:
+ case AMDGPU::G_FRINT:
+ case AMDGPU::G_FNEARBYINT:
+ case AMDGPU::G_INTRINSIC_ROUND:
+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+ case AMDGPU::G_FCANONICALIZE:
+ case AMDGPU::G_AMDGPU_RCP_IFLAG:
+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+ return true;
+ case AMDGPU::G_INTRINSIC: {
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_sin:
+ case Intrinsic::amdgcn_fmul_legacy:
+ case Intrinsic::amdgcn_fmed3:
+ case Intrinsic::amdgcn_fma_legacy:
+ return true;
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+}
+
+/// \returns true if the operation will definitely need to use a 64-bit
+/// encoding, and thus will use a VOP3 encoding regardless of the source
+/// modifiers.
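+///
+/// For instance, a three-source operation such as G_FMA (one def plus three
+/// uses) exceeds the operand-count check below, and a 64-bit scalar result is
+/// treated the same way.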
+LLVM_READONLY
+static bool opMustUseVOP3Encoding(const MachineInstr &MI,
+ const MachineRegisterInfo &MRI) {
+ return MI.getNumOperands() > (isa<GIntrinsic>(MI) ? 4u : 3u) ||
+ MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
+}
+
+// Most FP instructions support source modifiers.
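+// ("Source modifiers" are the free negate/absolute-value modifiers on VALU
+// source operands, so the switch below only lists the known exceptions.)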
+LLVM_READONLY
+static bool hasSourceMods(const MachineInstr &MI) {
+ if (!MI.memoperands().empty())
+ return false;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::COPY:
+ case AMDGPU::G_SELECT:
+ case AMDGPU::G_FDIV:
+ case AMDGPU::G_FREM:
+ case TargetOpcode::INLINEASM:
+ case TargetOpcode::INLINEASM_BR:
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+ case AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS:
+ case AMDGPU::G_BITCAST:
+ case AMDGPU::G_ANYEXT:
+ case AMDGPU::G_BUILD_VECTOR:
+ case AMDGPU::G_BUILD_VECTOR_TRUNC:
+ case AMDGPU::G_PHI:
+ return false;
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT: {
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MI).getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_interp_mov:
+ case Intrinsic::amdgcn_interp_p1_f16:
+ case Intrinsic::amdgcn_interp_p2_f16:
+ case Intrinsic::amdgcn_div_scale:
+ return false;
+ default:
+ return true;
+ }
+ }
+ default:
+ return true;
+ }
+}
+
+static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
+ unsigned CostThreshold = 4) {
+  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding anyway, so
+  // for them a source modifier is truly free. If there are multiple users and
+  // folding the modifier into each of them would force a switch to VOP3, there
+  // will be a code size increase. Try to avoid increasing code size unless we
+  // know it will save on the instruction count.
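+  // (Illustratively, with the default CostThreshold of 4 this allows at most
+  // four users to grow to the larger encoding before giving up.)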
+ unsigned NumMayIncreaseSize = 0;
+ Register Dst = MI.getOperand(0).getReg();
+ for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
+ if (!hasSourceMods(Use))
+ return false;
+
+ if (!opMustUseVOP3Encoding(Use, MRI)) {
+ if (++NumMayIncreaseSize > CostThreshold)
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool mayIgnoreSignedZero(MachineInstr &MI) {
+ const TargetOptions &Options = MI.getMF()->getTarget().Options;
+ return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
+}
+
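+// The bit patterns below are 1/(2*pi) (~0.15915494) in half, single and double
+// precision; some subtargets can encode this value as an inline immediate.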
+static bool isInv2Pi(const APFloat &APF) {
+ static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
+ static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
+ static const APFloat KF64(APFloat::IEEEdouble(),
+ APInt(64, 0x3fc45f306dc9c882));
+
+ return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
+ APF.bitwiseIsEqual(KF64);
+}
+
+// +0.0 and 1.0 / (2.0 * pi) have inline immediates, but their negated forms
+// do not, so there is an additional cost to negate these constants.
+static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
+ MachineRegisterInfo &MRI) {
+ std::optional<FPValueAndVReg> FPValReg;
+ if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
+ if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
+ return true;
+
+ const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
+ if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
+ return true;
+ }
+ return false;
+}
+
+static unsigned inverseMinMax(unsigned Opc) {
+ switch (Opc) {
+ case AMDGPU::G_FMAXNUM:
+ return AMDGPU::G_FMINNUM;
+ case AMDGPU::G_FMINNUM:
+ return AMDGPU::G_FMAXNUM;
+ case AMDGPU::G_FMAXNUM_IEEE:
+ return AMDGPU::G_FMINNUM_IEEE;
+ case AMDGPU::G_FMINNUM_IEEE:
+ return AMDGPU::G_FMAXNUM_IEEE;
+ case AMDGPU::G_FMAXIMUM:
+ return AMDGPU::G_FMINIMUM;
+ case AMDGPU::G_FMINIMUM:
+ return AMDGPU::G_FMAXIMUM;
+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+ return AMDGPU::G_AMDGPU_FMIN_LEGACY;
+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+ return AMDGPU::G_AMDGPU_FMAX_LEGACY;
+ default:
+ llvm_unreachable("invalid min/max opcode");
+ }
+}
+
+bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
+ MachineInstr *&MatchInfo) {
+ Register Src = MI.getOperand(1).getReg();
+ MatchInfo = MRI.getVRegDef(Src);
+
+ // If the input has multiple uses and we can either fold the negate down, or
+ // the other uses cannot, give up. This both prevents unprofitable
+ // transformations and infinite loops: we won't repeatedly try to fold around
+ // a negate that has no 'good' form.
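+  //
+  // For instance, if this fneg is the only user of Src and every user of the
+  // fneg's result can absorb a source modifier for free, the negate is already
+  // effectively free where it is, so there is nothing to gain from rewriting.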
+ if (MRI.hasOneNonDBGUse(Src)) {
+ if (allUsesHaveSourceMods(MI, MRI, 0))
+ return false;
+ } else {
+ if (fnegFoldsIntoMI(*MatchInfo) &&
+ (allUsesHaveSourceMods(MI, MRI) ||
+ !allUsesHaveSourceMods(*MatchInfo, MRI)))
+ return false;
+ }
+
+ switch (MatchInfo->getOpcode()) {
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+ // 0 doesn't have a negated inline immediate.
+ return !isConstantCostlierToNegate(*MatchInfo,
+ MatchInfo->getOperand(2).getReg(), MRI);
+ case AMDGPU::G_FADD:
+ case AMDGPU::G_FSUB:
+ case AMDGPU::G_FMA:
+ case AMDGPU::G_FMAD:
+ return mayIgnoreSignedZero(*MatchInfo);
+ case AMDGPU::G_FMUL:
+ case AMDGPU::G_FPEXT:
+ case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_FPTRUNC:
+ case AMDGPU::G_FRINT:
+ case AMDGPU::G_FNEARBYINT:
+ case AMDGPU::G_INTRINSIC_ROUND:
+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+ case AMDGPU::G_FSIN:
+ case AMDGPU::G_FCANONICALIZE:
+ case AMDGPU::G_AMDGPU_RCP_IFLAG:
+ return true;
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT: {
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_sin:
+ case Intrinsic::amdgcn_fmul_legacy:
+ case Intrinsic::amdgcn_fmed3:
+ return true;
+ case Intrinsic::amdgcn_fma_legacy:
+ return mayIgnoreSignedZero(*MatchInfo);
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+}
+
+void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
+ MachineInstr *&MatchInfo) {
+ // Transform:
+ // %A = inst %Op1, ...
+ // %B = fneg %A
+ //
+ // into:
+ //
+ // (if %A has one use, specifically fneg above)
+ // %B = inst (maybe fneg %Op1), ...
+ //
+ // (if %A has multiple uses)
+ // %B = inst (maybe fneg %Op1), ...
+ // %A = fneg %B
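+  //
+  // For instance (an illustrative sketch, not exact MIR):
+  //   %A = G_FMINNUM %X, %Y
+  //   %B = fneg %A
+  // becomes the equivalent of
+  //   %B = G_FMAXNUM (fneg %X), (fneg %Y)
+  // with a trailing %A = fneg %B emitted only if %A had other users.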
+
+  // Replace the register in Op with a register holding the negated value.
+ auto NegateOperand = [&](MachineOperand &Op) {
+ Register Reg = Op.getReg();
+ if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
+ Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
+ replaceRegOpWith(MRI, Op, Reg);
+ };
+
+  // Replace either operand's register with a register holding the negated
+  // value.
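+  // (Negating a single factor is enough to negate a product, so prefer an
+  // operand that already carries an fneg we can strip; otherwise negate Y.)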
+ auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
+ Register XReg = X.getReg();
+ Register YReg = Y.getReg();
+ if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
+ replaceRegOpWith(MRI, X, XReg);
+ else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
+ replaceRegOpWith(MRI, Y, YReg);
+ else {
+ YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
+ replaceRegOpWith(MRI, Y, YReg);
+ }
+ };
+
+ Builder.setInstrAndDebugLoc(*MatchInfo);
+
+ // Negate appropriate operands so that resulting value of MatchInfo is
+ // negated.
+ switch (MatchInfo->getOpcode()) {
+ case AMDGPU::G_FADD:
+ case AMDGPU::G_FSUB:
+ NegateOperand(MatchInfo->getOperand(1));
+ NegateOperand(MatchInfo->getOperand(2));
+ break;
+ case AMDGPU::G_FMUL:
+ NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
+ break;
+ case AMDGPU::G_FMINNUM:
+ case AMDGPU::G_FMAXNUM:
+ case AMDGPU::G_FMINNUM_IEEE:
+ case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM:
+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+ case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
+ NegateOperand(MatchInfo->getOperand(1));
+ NegateOperand(MatchInfo->getOperand(2));
+ unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
+ replaceOpcodeWith(*MatchInfo, Opposite);
+ break;
+ }
+ case AMDGPU::G_FMA:
+ case AMDGPU::G_FMAD:
+ NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
+ NegateOperand(MatchInfo->getOperand(3));
+ break;
+ case AMDGPU::G_FPEXT:
+ case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_FRINT:
+ case AMDGPU::G_FNEARBYINT:
+ case AMDGPU::G_INTRINSIC_ROUND:
+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+ case AMDGPU::G_FSIN:
+ case AMDGPU::G_FCANONICALIZE:
+ case AMDGPU::G_AMDGPU_RCP_IFLAG:
+ case AMDGPU::G_FPTRUNC:
+ NegateOperand(MatchInfo->getOperand(1));
+ break;
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT: {
+ Intrinsic::ID IntrinsicID = cast<GIntrinsic>(MatchInfo)->getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_sin:
+ NegateOperand(MatchInfo->getOperand(2));
+ break;
+ case Intrinsic::amdgcn_fmul_legacy:
+ NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
+ break;
+ case Intrinsic::amdgcn_fmed3:
+ NegateOperand(MatchInfo->getOperand(2));
+ NegateOperand(MatchInfo->getOperand(3));
+ NegateOperand(MatchInfo->getOperand(4));
+ break;
+ case Intrinsic::amdgcn_fma_legacy:
+ NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
+ NegateOperand(MatchInfo->getOperand(4));
+ break;
+ default:
+ llvm_unreachable("folding fneg not supported for this intrinsic");
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("folding fneg not supported for this instruction");
+ }
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register MatchInfoDst = MatchInfo->getOperand(0).getReg();
+
+ if (MRI.hasOneNonDBGUse(MatchInfoDst)) {
+ // MatchInfo now has negated value so use that instead of old Dst.
+ replaceRegWith(MRI, Dst, MatchInfoDst);
+ } else {
+ // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
+ // but replaceRegWith will replace defs as well. It is easier to replace one
+ // def with a new register.
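+    // (That is: MatchInfo's def is renamed to a fresh register holding the
+    // negated value, Dst's uses are redirected to it, and the fneg built below
+    // recreates the original value for MatchInfoDst's remaining users.)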
+ LLT Type = MRI.getType(Dst);
+ Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
+ replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);
+
+ // MatchInfo now has negated value so use that instead of old Dst.
+ replaceRegWith(MRI, Dst, NegatedMatchInfo);
+
+    // Recreate the non-negated value for other uses of the old MatchInfoDst.
+ auto NextInst = ++MatchInfo->getIterator();
+ Builder.setInstrAndDebugLoc(*NextInst);
+ Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
+ }
+
+ MI.eraseFromParent();
+}
+
+// TODO: Should return converted value / extension source and avoid introducing
+// intermediate fptruncs in the apply function.
+static bool isFPExtFromF16OrConst(const MachineRegisterInfo &MRI,
+ Register Reg) {
+ const MachineInstr *Def = MRI.getVRegDef(Reg);
+ if (Def->getOpcode() == TargetOpcode::G_FPEXT) {
+ Register SrcReg = Def->getOperand(1).getReg();
+ return MRI.getType(SrcReg) == LLT::scalar(16);
+ }
+
+ if (Def->getOpcode() == TargetOpcode::G_FCONSTANT) {
+ APFloat Val = Def->getOperand(1).getFPImm()->getValueAPF();
+ bool LosesInfo = true;
+ Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
+ return !LosesInfo;
+ }
+
+ return false;
+}
+
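+// Match a G_FPTRUNC of a single-use f32 value (the promoted fmed3 result, as
+// wired up by the combiner pattern) whose three med3 sources are all either
+// extended from f16 or constants exactly representable in f16; such a med3
+// can instead be performed at f16. This is a summary of the checks below.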
+bool AMDGPUCombinerHelper::matchExpandPromotedF16FMed3(MachineInstr &MI,
+ Register Src0,
+ Register Src1,
+ Register Src2) {
+ assert(MI.getOpcode() == TargetOpcode::G_FPTRUNC);
+ Register SrcReg = MI.getOperand(1).getReg();
+ if (!MRI.hasOneNonDBGUse(SrcReg) || MRI.getType(SrcReg) != LLT::scalar(32))
+ return false;
+
+ return isFPExtFromF16OrConst(MRI, Src0) && isFPExtFromF16OrConst(MRI, Src1) &&
+ isFPExtFromF16OrConst(MRI, Src2);
+}
+
+void AMDGPUCombinerHelper::applyExpandPromotedF16FMed3(MachineInstr &MI,
+ Register Src0,
+ Register Src1,
+ Register Src2) {
+  // We expect the fptrunc (fpext x) pairs to fold away and any constant
+  // sources to be constant folded.
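+  // The median is then rebuilt at f16 as
+  //   med3(x, y, z) = min(max(x, y), max(min(x, y), z))
+  // which is exactly the min/max sequence constructed below.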
+ Src0 = Builder.buildFPTrunc(LLT::scalar(16), Src0).getReg(0);
+ Src1 = Builder.buildFPTrunc(LLT::scalar(16), Src1).getReg(0);
+ Src2 = Builder.buildFPTrunc(LLT::scalar(16), Src2).getReg(0);
+
+ LLT Ty = MRI.getType(Src0);
+ auto A1 = Builder.buildFMinNumIEEE(Ty, Src0, Src1);
+ auto B1 = Builder.buildFMaxNumIEEE(Ty, Src0, Src1);
+ auto C1 = Builder.buildFMaxNumIEEE(Ty, A1, Src2);
+ Builder.buildFMinNumIEEE(MI.getOperand(0), B1, C1);
+ MI.eraseFromParent();
+}