Vendor import of llvm-project main llvmorg-14-init-10186-gff7f2cfa959b. - src

diff options


context:
space:
mode:

author	Dimitry Andric <dim@FreeBSD.org>	2021-11-19 20:06:13 +0000
committer	Dimitry Andric <dim@FreeBSD.org>	2021-11-19 20:06:13 +0000
commit	c0981da47d5696fe36474fcf86b4ce03ae3ff818 (patch)
tree	f42add1021b9f2ac6a69ac7cf6c4499962739a45 /llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
parent	344a3780b2e33f6ca763666c380202b18aab72a3 (diff)

vendor/llvm-project/llvmorg-14-init-10186-gff7f2cfa959b

Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp')

-rw-r--r--

llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp

382

1 files changed, 382 insertions, 0 deletions

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
new file mode 100644
index 000000000000..301e6f6d6f42
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp

@@ -0,0 +1,382 @@

+//=== lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp --------------------------===//

+//

+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.

+// See https://llvm.org/LICENSE.txt for license information.

+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+//

+//===----------------------------------------------------------------------===//

+#include "AMDGPUCombinerHelper.h"

+#include "GCNSubtarget.h"

+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"

+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"

+#include "llvm/IR/IntrinsicsAMDGPU.h"

+#include "llvm/Target/TargetMachine.h"

+using namespace llvm;

+using namespace MIPatternMatch;

+LLVM_READNONE

+static bool fnegFoldsIntoMI(const MachineInstr &MI) {

+ switch (MI.getOpcode()) {

+ case AMDGPU::G_FADD:

+ case AMDGPU::G_FSUB:

+ case AMDGPU::G_FMUL:

+ case AMDGPU::G_FMA:

+ case AMDGPU::G_FMAD:

+ case AMDGPU::G_FMINNUM:

+ case AMDGPU::G_FMAXNUM:

+ case AMDGPU::G_FMINNUM_IEEE:

+ case AMDGPU::G_FMAXNUM_IEEE:

+ case AMDGPU::G_FSIN:

+ case AMDGPU::G_FPEXT:

+ case AMDGPU::G_INTRINSIC_TRUNC:

+ case AMDGPU::G_FPTRUNC:

+ case AMDGPU::G_FRINT:

+ case AMDGPU::G_FNEARBYINT:

+ case AMDGPU::G_INTRINSIC_ROUND:

+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:

+ case AMDGPU::G_FCANONICALIZE:

+ case AMDGPU::G_AMDGPU_RCP_IFLAG:

+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:

+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:

+ return true;

+ case AMDGPU::G_INTRINSIC: {

+ unsigned IntrinsicID = MI.getIntrinsicID();

+ switch (IntrinsicID) {

+ case Intrinsic::amdgcn_rcp:

+ case Intrinsic::amdgcn_rcp_legacy:

+ case Intrinsic::amdgcn_sin:

+ case Intrinsic::amdgcn_fmul_legacy:

+ case Intrinsic::amdgcn_fmed3:

+ case Intrinsic::amdgcn_fma_legacy:

+ return true;

+ default:

+ return false;

+ }

+ default:

+ return false;

+ }

+/// \returns true if the operation will definitely need to use a 64-bit

+/// encoding, and thus will use a VOP3 encoding regardless of the source

+/// modifiers.

+LLVM_READONLY

+static bool opMustUseVOP3Encoding(const MachineInstr &MI,

+ const MachineRegisterInfo &MRI) {

+ return MI.getNumOperands() >

+ (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 4u : 3u) ||

+ MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;

+// Most FP instructions support source modifiers.

+LLVM_READONLY

+static bool hasSourceMods(const MachineInstr &MI) {

+ if (!MI.memoperands().empty())

+ return false;

+ switch (MI.getOpcode()) {

+ case AMDGPU::COPY:

+ case AMDGPU::G_SELECT:

+ case AMDGPU::G_FDIV:

+ case AMDGPU::G_FREM:

+ case TargetOpcode::INLINEASM:

+ case TargetOpcode::INLINEASM_BR:

+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:

+ case AMDGPU::G_BITCAST:

+ case AMDGPU::G_ANYEXT:

+ case AMDGPU::G_BUILD_VECTOR:

+ case AMDGPU::G_BUILD_VECTOR_TRUNC:

+ case AMDGPU::G_PHI:

+ return false;

+ case AMDGPU::G_INTRINSIC: {

+ unsigned IntrinsicID = MI.getIntrinsicID();

+ switch (IntrinsicID) {

+ case Intrinsic::amdgcn_interp_p1:

+ case Intrinsic::amdgcn_interp_p2:

+ case Intrinsic::amdgcn_interp_mov:

+ case Intrinsic::amdgcn_interp_p1_f16:

+ case Intrinsic::amdgcn_interp_p2_f16:

+ case Intrinsic::amdgcn_div_scale:

+ return false;

+ default:

+ return true;

+ }

+ default:

+ return true;

+ }

+static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,

+ unsigned CostThreshold = 4) {

+ // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus

+ // it is truly free to use a source modifier in all cases. If there are

+ // multiple users but each one will necessitate using VOP3, there will be

+ // a code size increase. Try to avoid increasing code size unless we know it

+ // will save on the instruction count.

+ unsigned NumMayIncreaseSize = 0;

+ Register Dst = MI.getOperand(0).getReg();

+ for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {

+ if (!hasSourceMods(Use))

+ return false;

+ if (!opMustUseVOP3Encoding(Use, MRI)) {

+ if (++NumMayIncreaseSize > CostThreshold)

+ return false;

+ }

+ return true;

+static bool mayIgnoreSignedZero(MachineInstr &MI) {

+ const TargetOptions &Options = MI.getMF()->getTarget().Options;

+ return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);

+static bool isInv2Pi(const APFloat &APF) {

+ static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));

+ static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));

+ static const APFloat KF64(APFloat::IEEEdouble(),

+ APInt(64, 0x3fc45f306dc9c882));

+ return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||

+ APF.bitwiseIsEqual(KF64);

+// 0 and 1.0 / (2.0 * pi) do not have negated inline immediates, so there is an

+// additional cost to negate them.

+static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,

+ MachineRegisterInfo &MRI) {

+ Optional<FPValueAndVReg> FPValReg;

+ if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {

+ if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())

+ return true;

+ const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();

+ if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))

+ return true;

+ }

+ return false;

+static unsigned inverseMinMax(unsigned Opc) {

+ switch (Opc) {

+ case AMDGPU::G_FMAXNUM:

+ return AMDGPU::G_FMINNUM;

+ case AMDGPU::G_FMINNUM:

+ return AMDGPU::G_FMAXNUM;

+ case AMDGPU::G_FMAXNUM_IEEE:

+ return AMDGPU::G_FMINNUM_IEEE;

+ case AMDGPU::G_FMINNUM_IEEE:

+ return AMDGPU::G_FMAXNUM_IEEE;

+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:

+ return AMDGPU::G_AMDGPU_FMIN_LEGACY;

+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:

+ return AMDGPU::G_AMDGPU_FMAX_LEGACY;

+ default:

+ llvm_unreachable("invalid min/max opcode");

+ }

+bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,

+ MachineInstr *&MatchInfo) {

+ Register Src = MI.getOperand(1).getReg();

+ MatchInfo = MRI.getVRegDef(Src);

+ // If the input has multiple uses and we can either fold the negate down, or

+ // the other uses cannot, give up. This both prevents unprofitable

+ // transformations and infinite loops: we won't repeatedly try to fold around

+ // a negate that has no 'good' form.

+ if (MRI.hasOneNonDBGUse(Src)) {

+ if (allUsesHaveSourceMods(MI, MRI, 0))

+ return false;

+ } else {

+ if (fnegFoldsIntoMI(*MatchInfo) &&

+ (allUsesHaveSourceMods(MI, MRI) ||

+ !allUsesHaveSourceMods(*MatchInfo, MRI)))

+ return false;

+ }

+ switch (MatchInfo->getOpcode()) {

+ case AMDGPU::G_FMINNUM:

+ case AMDGPU::G_FMAXNUM:

+ case AMDGPU::G_FMINNUM_IEEE:

+ case AMDGPU::G_FMAXNUM_IEEE:

+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:

+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:

+ // 0 doesn't have a negated inline immediate.

+ return !isConstantCostlierToNegate(*MatchInfo,

+ MatchInfo->getOperand(2).getReg(), MRI);

+ case AMDGPU::G_FADD:

+ case AMDGPU::G_FSUB:

+ case AMDGPU::G_FMA:

+ case AMDGPU::G_FMAD:

+ return mayIgnoreSignedZero(*MatchInfo);

+ case AMDGPU::G_FMUL:

+ case AMDGPU::G_FPEXT:

+ case AMDGPU::G_INTRINSIC_TRUNC:

+ case AMDGPU::G_FPTRUNC:

+ case AMDGPU::G_FRINT:

+ case AMDGPU::G_FNEARBYINT:

+ case AMDGPU::G_INTRINSIC_ROUND:

+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:

+ case AMDGPU::G_FSIN:

+ case AMDGPU::G_FCANONICALIZE:

+ case AMDGPU::G_AMDGPU_RCP_IFLAG:

+ return true;

+ case AMDGPU::G_INTRINSIC: {

+ unsigned IntrinsicID = MatchInfo->getIntrinsicID();

+ switch (IntrinsicID) {

+ case Intrinsic::amdgcn_rcp:

+ case Intrinsic::amdgcn_rcp_legacy:

+ case Intrinsic::amdgcn_sin:

+ case Intrinsic::amdgcn_fmul_legacy:

+ case Intrinsic::amdgcn_fmed3:

+ return true;

+ case Intrinsic::amdgcn_fma_legacy:

+ return mayIgnoreSignedZero(*MatchInfo);

+ default:

+ return false;

+ }

+ default:

+ return false;

+ }

+void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,

+ MachineInstr *&MatchInfo) {

+ // Transform:

+ // %A = inst %Op1, ...

+ // %B = fneg %A

+ //

+ // into:

+ //

+ // (if %A has one use, specifically fneg above)

+ // %B = inst (maybe fneg %Op1), ...

+ //

+ // (if %A has multiple uses)

+ // %B = inst (maybe fneg %Op1), ...

+ // %A = fneg %B

+ // Replace register in operand with a register holding negated value.

+ auto NegateOperand = [&](MachineOperand &Op) {

+ Register Reg = Op.getReg();

+ if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))

+ Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);

+ replaceRegOpWith(MRI, Op, Reg);

+ };

+ // Replace either register in operands with a register holding negated value.

+ auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {

+ Register XReg = X.getReg();

+ Register YReg = Y.getReg();

+ if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))

+ replaceRegOpWith(MRI, X, XReg);

+ else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))

+ replaceRegOpWith(MRI, Y, YReg);

+ else {

+ YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);

+ replaceRegOpWith(MRI, Y, YReg);

+ }

+ };

+ Builder.setInstrAndDebugLoc(*MatchInfo);

+ // Negate appropriate operands so that resulting value of MatchInfo is

+ // negated.

+ switch (MatchInfo->getOpcode()) {

+ case AMDGPU::G_FADD:

+ case AMDGPU::G_FSUB:

+ NegateOperand(MatchInfo->getOperand(1));

+ NegateOperand(MatchInfo->getOperand(2));

+ break;

+ case AMDGPU::G_FMUL:

+ NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));

+ break;

+ case AMDGPU::G_FMINNUM:

+ case AMDGPU::G_FMAXNUM:

+ case AMDGPU::G_FMINNUM_IEEE:

+ case AMDGPU::G_FMAXNUM_IEEE:

+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:

+ case AMDGPU::G_AMDGPU_FMAX_LEGACY: {

+ NegateOperand(MatchInfo->getOperand(1));

+ NegateOperand(MatchInfo->getOperand(2));

+ unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());

+ replaceOpcodeWith(*MatchInfo, Opposite);

+ break;

+ }

+ case AMDGPU::G_FMA:

+ case AMDGPU::G_FMAD:

+ NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));

+ NegateOperand(MatchInfo->getOperand(3));

+ break;

+ case AMDGPU::G_FPEXT:

+ case AMDGPU::G_INTRINSIC_TRUNC:

+ case AMDGPU::G_FRINT:

+ case AMDGPU::G_FNEARBYINT:

+ case AMDGPU::G_INTRINSIC_ROUND:

+ case AMDGPU::G_INTRINSIC_ROUNDEVEN:

+ case AMDGPU::G_FSIN:

+ case AMDGPU::G_FCANONICALIZE:

+ case AMDGPU::G_AMDGPU_RCP_IFLAG:

+ case AMDGPU::G_FPTRUNC:

+ NegateOperand(MatchInfo->getOperand(1));

+ break;

+ case AMDGPU::G_INTRINSIC: {

+ unsigned IntrinsicID = MatchInfo->getIntrinsicID();

+ switch (IntrinsicID) {

+ case Intrinsic::amdgcn_rcp:

+ case Intrinsic::amdgcn_rcp_legacy:

+ case Intrinsic::amdgcn_sin:

+ NegateOperand(MatchInfo->getOperand(2));

+ break;

+ case Intrinsic::amdgcn_fmul_legacy:

+ NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));

+ break;

+ case Intrinsic::amdgcn_fmed3:

+ NegateOperand(MatchInfo->getOperand(2));

+ NegateOperand(MatchInfo->getOperand(3));

+ NegateOperand(MatchInfo->getOperand(4));

+ break;

+ case Intrinsic::amdgcn_fma_legacy:

+ NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));

+ NegateOperand(MatchInfo->getOperand(4));

+ break;

+ default:

+ llvm_unreachable("folding fneg not supported for this intrinsic");

+ }

+ break;

+ }

+ default:

+ llvm_unreachable("folding fneg not supported for this instruction");

+ }

+ Register Dst = MI.getOperand(0).getReg();

+ Register MatchInfoDst = MatchInfo->getOperand(0).getReg();

+ if (MRI.hasOneNonDBGUse(MatchInfoDst)) {

+ // MatchInfo now has negated value so use that instead of old Dst.

+ replaceRegWith(MRI, Dst, MatchInfoDst);

+ } else {

+ // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa

+ // but replaceRegWith will replace defs as well. It is easier to replace one

+ // def with a new register.

+ LLT Type = MRI.getType(Dst);

+ Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);

+ replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);

+ // MatchInfo now has negated value so use that instead of old Dst.

+ replaceRegWith(MRI, Dst, NegatedMatchInfo);

+ // Recreate the non-negated value for other uses of the old MatchInfoDst.

+ Builder.setInstrAndDebugLoc(MI);

+ Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());

+ }

+ MI.eraseFromParent();

+ return;