1 files changed, 382 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
new file mode 100644
index 000000000000..301e6f6d6f42
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp
@@ -0,0 +1,382 @@
+//=== lib/Target/AMDGPU/AMDGPUCombinerHelper.cpp --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUCombinerHelper.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+LLVM_READNONE // True if MI is an operation that an fneg can be folded into.
+static bool fnegFoldsIntoMI(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case AMDGPU::G_FADD:
+  case AMDGPU::G_FSUB:
+  case AMDGPU::G_FMUL:
+  case AMDGPU::G_FMA:
+  case AMDGPU::G_FMAD:
+  case AMDGPU::G_FMINNUM:
+  case AMDGPU::G_FMAXNUM:
+  case AMDGPU::G_FMINNUM_IEEE:
+  case AMDGPU::G_FMAXNUM_IEEE:
+  case AMDGPU::G_FSIN:
+  case AMDGPU::G_FPEXT:
+  case AMDGPU::G_INTRINSIC_TRUNC:
+  case AMDGPU::G_FPTRUNC:
+  case AMDGPU::G_FRINT:
+  case AMDGPU::G_FNEARBYINT:
+  case AMDGPU::G_INTRINSIC_ROUND:
+  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+  case AMDGPU::G_FCANONICALIZE:
+  case AMDGPU::G_AMDGPU_RCP_IFLAG:
+  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+    return true; // Every generic opcode listed above qualifies.
+  case AMDGPU::G_INTRINSIC: {
+    unsigned IntrinsicID = MI.getIntrinsicID();
+    switch (IntrinsicID) { // Only these AMDGPU intrinsics qualify.
+    case Intrinsic::amdgcn_rcp:
+    case Intrinsic::amdgcn_rcp_legacy:
+    case Intrinsic::amdgcn_sin:
+    case Intrinsic::amdgcn_fmul_legacy:
+    case Intrinsic::amdgcn_fmed3:
+    case Intrinsic::amdgcn_fma_legacy:
+      return true;
+    default:
+      return false; // Any other intrinsic does not.
+    }
+  }
+  default:
+    return false; // Any other opcode does not.
+  }
+}
+
+/// \returns true if \p MI will definitely need a 64-bit (VOP3) encoding no
+/// matter which source modifiers are used: it has more than 3 operands (4 for
+/// G_INTRINSIC), or the type of operand 0 has a 64-bit scalar size.
+LLVM_READONLY
+static bool opMustUseVOP3Encoding(const MachineInstr &MI,
+                                  const MachineRegisterInfo &MRI) {
+  return MI.getNumOperands() >
+             (MI.getOpcode() == AMDGPU::G_INTRINSIC ? 4u : 3u) ||
+         MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() == 64;
+}
+
+// Most FP instructions take source modifiers; returns false for those below.
+LLVM_READONLY
+static bool hasSourceMods(const MachineInstr &MI) {
+  if (!MI.memoperands().empty()) // Anything with memory operands is rejected.
+    return false;
+
+  switch (MI.getOpcode()) {
+  case AMDGPU::COPY:
+  case AMDGPU::G_SELECT:
+  case AMDGPU::G_FDIV:
+  case AMDGPU::G_FREM:
+  case TargetOpcode::INLINEASM:
+  case TargetOpcode::INLINEASM_BR:
+  case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS:
+  case AMDGPU::G_BITCAST:
+  case AMDGPU::G_ANYEXT:
+  case AMDGPU::G_BUILD_VECTOR:
+  case AMDGPU::G_BUILD_VECTOR_TRUNC:
+  case AMDGPU::G_PHI:
+    return false; // The opcodes listed above are rejected.
+  case AMDGPU::G_INTRINSIC: {
+    unsigned IntrinsicID = MI.getIntrinsicID();
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_interp_p1:
+    case Intrinsic::amdgcn_interp_p2:
+    case Intrinsic::amdgcn_interp_mov:
+    case Intrinsic::amdgcn_interp_p1_f16:
+    case Intrinsic::amdgcn_interp_p2_f16:
+    case Intrinsic::amdgcn_div_scale:
+      return false; // These intrinsics are rejected as well.
+    default:
+      return true; // Every other intrinsic is assumed to have them.
+    }
+  }
+  default:
+    return true; // Default: assume source modifiers are available.
+  }
+}
+
+static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                  unsigned CostThreshold = 4) {
+  // Returns true if every non-debug user of MI's result has source modifiers
+  // and at most CostThreshold of them could otherwise avoid a VOP3 encoding.
+  // Users that must use VOP3 anyway (such as 3-operand FMA/MAD) get a source
+  // modifier for free; for the others it may increase code size, so avoid
+  // that unless we know it will save on the instruction count.
+  unsigned NumMayIncreaseSize = 0;
+  Register Dst = MI.getOperand(0).getReg();
+  for (const MachineInstr &Use : MRI.use_nodbg_instructions(Dst)) {
+    if (!hasSourceMods(Use)) // One user without source modifiers is enough.
+      return false;
+
+    if (!opMustUseVOP3Encoding(Use, MRI)) {
+      if (++NumMayIncreaseSize > CostThreshold) // Over the size budget.
+        return false;
+    }
+  }
+  return true;
+}
+
+static bool mayIgnoreSignedZero(MachineInstr &MI) { // nsz option or MI flag.
+  const TargetOptions &Options = MI.getMF()->getTarget().Options;
+  return Options.NoSignedZerosFPMath || MI.getFlag(MachineInstr::MIFlag::FmNsz);
+}
+
+static bool isInv2Pi(const APFloat &APF) { // Bitwise compare with 1/(2*pi).
+  static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
+  static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
+  static const APFloat KF64(APFloat::IEEEdouble(),
+                            APInt(64, 0x3fc45f306dc9c882));
+
+  return APF.bitwiseIsEqual(KF16) || APF.bitwiseIsEqual(KF32) ||
+         APF.bitwiseIsEqual(KF64); // Matches any of the three bit patterns.
+}
+
+// +0 and 1.0 / (2.0 * pi) do not have negated inline immediates, so there is
+// an additional cost to negate them.
+static bool isConstantCostlierToNegate(MachineInstr &MI, Register Reg,
+                                       MachineRegisterInfo &MRI) {
+  Optional<FPValueAndVReg> FPValReg;
+  if (mi_match(Reg, MRI, m_GFCstOrSplat(FPValReg))) {
+    if (FPValReg->Value.isZero() && !FPValReg->Value.isNegative())
+      return true; // Positive zero.
+
+    const GCNSubtarget &ST = MI.getMF()->getSubtarget<GCNSubtarget>();
+    if (ST.hasInv2PiInlineImm() && isInv2Pi(FPValReg->Value))
+      return true; // 1/(2*pi), where it is an inline immediate.
+  }
+  return false; // Not an FP constant/splat, or not one of the two above.
+}
+
+static unsigned inverseMinMax(unsigned Opc) { // Swaps min <-> max, same flavor.
+  switch (Opc) {
+  case AMDGPU::G_FMAXNUM:
+    return AMDGPU::G_FMINNUM;
+  case AMDGPU::G_FMINNUM:
+    return AMDGPU::G_FMAXNUM;
+  case AMDGPU::G_FMAXNUM_IEEE:
+    return AMDGPU::G_FMINNUM_IEEE;
+  case AMDGPU::G_FMINNUM_IEEE:
+    return AMDGPU::G_FMAXNUM_IEEE;
+  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+    return AMDGPU::G_AMDGPU_FMIN_LEGACY;
+  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+    return AMDGPU::G_AMDGPU_FMAX_LEGACY;
+  default:
+    llvm_unreachable("invalid min/max opcode"); // Only min/max is expected.
+  }
+}
+
+bool AMDGPUCombinerHelper::matchFoldableFneg(MachineInstr &MI,
+                                             MachineInstr *&MatchInfo) {
+  Register Src = MI.getOperand(1).getReg(); // MI is the fneg; Src its input.
+  MatchInfo = MRI.getVRegDef(Src); // Out-param: the instruction defining Src.
+
+  // Give up if the negate is already free in every user (all have source
+  // modifiers and must use VOP3 anyway), or if the input has other uses and
+  // either the negate is cheap in its own users or the input's users lack
+  // source modifiers. This avoids unprofitable folds and infinite loops.
+  if (MRI.hasOneNonDBGUse(Src)) {
+    if (allUsesHaveSourceMods(MI, MRI, 0))
+      return false;
+  } else {
+    if (fnegFoldsIntoMI(*MatchInfo) &&
+        (allUsesHaveSourceMods(MI, MRI) ||
+         !allUsesHaveSourceMods(*MatchInfo, MRI)))
+      return false;
+  }
+
+  switch (MatchInfo->getOpcode()) {
+  case AMDGPU::G_FMINNUM:
+  case AMDGPU::G_FMAXNUM:
+  case AMDGPU::G_FMINNUM_IEEE:
+  case AMDGPU::G_FMAXNUM_IEEE:
+  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+  case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+    // Skip if operand 2 is a constant without a negated inline immediate.
+    return !isConstantCostlierToNegate(*MatchInfo,
+                                       MatchInfo->getOperand(2).getReg(), MRI);
+  case AMDGPU::G_FADD:
+  case AMDGPU::G_FSUB:
+  case AMDGPU::G_FMA:
+  case AMDGPU::G_FMAD:
+    return mayIgnoreSignedZero(*MatchInfo); // Only when nsz applies.
+  case AMDGPU::G_FMUL:
+  case AMDGPU::G_FPEXT:
+  case AMDGPU::G_INTRINSIC_TRUNC:
+  case AMDGPU::G_FPTRUNC:
+  case AMDGPU::G_FRINT:
+  case AMDGPU::G_FNEARBYINT:
+  case AMDGPU::G_INTRINSIC_ROUND:
+  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+  case AMDGPU::G_FSIN:
+  case AMDGPU::G_FCANONICALIZE:
+  case AMDGPU::G_AMDGPU_RCP_IFLAG:
+    return true; // Accepted unconditionally.
+  case AMDGPU::G_INTRINSIC: {
+    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_rcp:
+    case Intrinsic::amdgcn_rcp_legacy:
+    case Intrinsic::amdgcn_sin:
+    case Intrinsic::amdgcn_fmul_legacy:
+    case Intrinsic::amdgcn_fmed3:
+      return true; // Accepted unconditionally.
+    case Intrinsic::amdgcn_fma_legacy:
+      return mayIgnoreSignedZero(*MatchInfo); // Only when nsz applies.
+    default:
+      return false; // Not a supported intrinsic.
+    }
+  }
+  default:
+    return false; // Not a supported opcode.
+  }
+}
+
+void AMDGPUCombinerHelper::applyFoldableFneg(MachineInstr &MI,
+                                             MachineInstr *&MatchInfo) {
+  // Fold the fneg MI into MatchInfo, the instruction defining its input:
+  // %A = inst %Op1, ...
+  // %B = fneg %A
+  //
+  // into:
+  //
+  // (if %A has one use, specifically fneg above)
+  // %B = inst (maybe fneg %Op1), ...
+  //
+  // (if %A has multiple uses)
+  // %B = inst (maybe fneg %Op1), ...
+  // %A = fneg %B
+
+  // Negate Op: reuse the input of an existing fneg, else build a new fneg.
+  auto NegateOperand = [&](MachineOperand &Op) {
+    Register Reg = Op.getReg();
+    if (!mi_match(Reg, MRI, m_GFNeg(m_Reg(Reg))))
+      Reg = Builder.buildFNeg(MRI.getType(Reg), Reg).getReg(0);
+    replaceRegOpWith(MRI, Op, Reg);
+  };
+
+  // Negate exactly one of X/Y: prefer dropping an existing fneg, else negate Y.
+  auto NegateEitherOperand = [&](MachineOperand &X, MachineOperand &Y) {
+    Register XReg = X.getReg();
+    Register YReg = Y.getReg();
+    if (mi_match(XReg, MRI, m_GFNeg(m_Reg(XReg))))
+      replaceRegOpWith(MRI, X, XReg);
+    else if (mi_match(YReg, MRI, m_GFNeg(m_Reg(YReg))))
+      replaceRegOpWith(MRI, Y, YReg);
+    else {
+      YReg = Builder.buildFNeg(MRI.getType(YReg), YReg).getReg(0);
+      replaceRegOpWith(MRI, Y, YReg);
+    }
+  };
+
+  Builder.setInstrAndDebugLoc(*MatchInfo);
+
+  // Negate appropriate operands so that resulting value of MatchInfo is
+  // negated.
+  switch (MatchInfo->getOpcode()) {
+  case AMDGPU::G_FADD:
+  case AMDGPU::G_FSUB:
+    NegateOperand(MatchInfo->getOperand(1)); // -(a op b) = (-a) op (-b)
+    NegateOperand(MatchInfo->getOperand(2));
+    break;
+  case AMDGPU::G_FMUL:
+    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
+    break;
+  case AMDGPU::G_FMINNUM:
+  case AMDGPU::G_FMAXNUM:
+  case AMDGPU::G_FMINNUM_IEEE:
+  case AMDGPU::G_FMAXNUM_IEEE:
+  case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+  case AMDGPU::G_AMDGPU_FMAX_LEGACY: {
+    NegateOperand(MatchInfo->getOperand(1)); // -min(a, b) = max(-a, -b)
+    NegateOperand(MatchInfo->getOperand(2));
+    unsigned Opposite = inverseMinMax(MatchInfo->getOpcode());
+    replaceOpcodeWith(*MatchInfo, Opposite);
+    break;
+  }
+  case AMDGPU::G_FMA:
+  case AMDGPU::G_FMAD:
+    NegateEitherOperand(MatchInfo->getOperand(1), MatchInfo->getOperand(2));
+    NegateOperand(MatchInfo->getOperand(3)); // -(a*b + c) = (-a)*b + (-c)
+    break;
+  case AMDGPU::G_FPEXT:
+  case AMDGPU::G_INTRINSIC_TRUNC:
+  case AMDGPU::G_FRINT:
+  case AMDGPU::G_FNEARBYINT:
+  case AMDGPU::G_INTRINSIC_ROUND:
+  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
+  case AMDGPU::G_FSIN:
+  case AMDGPU::G_FCANONICALIZE:
+  case AMDGPU::G_AMDGPU_RCP_IFLAG:
+  case AMDGPU::G_FPTRUNC:
+    NegateOperand(MatchInfo->getOperand(1)); // Unary: negate the one input.
+    break;
+  case AMDGPU::G_INTRINSIC: {
+    unsigned IntrinsicID = MatchInfo->getIntrinsicID();
+    switch (IntrinsicID) {
+    case Intrinsic::amdgcn_rcp:
+    case Intrinsic::amdgcn_rcp_legacy:
+    case Intrinsic::amdgcn_sin:
+      NegateOperand(MatchInfo->getOperand(2)); // Intrinsic inputs start at 2.
+      break;
+    case Intrinsic::amdgcn_fmul_legacy:
+      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
+      break;
+    case Intrinsic::amdgcn_fmed3:
+      NegateOperand(MatchInfo->getOperand(2)); // All three inputs are negated.
+      NegateOperand(MatchInfo->getOperand(3));
+      NegateOperand(MatchInfo->getOperand(4));
+      break;
+    case Intrinsic::amdgcn_fma_legacy:
+      NegateEitherOperand(MatchInfo->getOperand(2), MatchInfo->getOperand(3));
+      NegateOperand(MatchInfo->getOperand(4));
+      break;
+    default:
+      llvm_unreachable("folding fneg not supported for this intrinsic");
+    }
+    break;
+  }
+  default:
+    llvm_unreachable("folding fneg not supported for this instruction");
+  }
+
+  Register Dst = MI.getOperand(0).getReg();
+  Register MatchInfoDst = MatchInfo->getOperand(0).getReg();
+
+  if (MRI.hasOneNonDBGUse(MatchInfoDst)) { // Only used by the fneg MI.
+    // MatchInfo now has negated value so use that instead of old Dst.
+    replaceRegWith(MRI, Dst, MatchInfoDst);
+  } else {
+    // We want to swap all uses of Dst with uses of MatchInfoDst and vice versa
+    // but replaceRegWith will replace defs as well. It is easier to replace one
+    // def with a new register.
+    LLT Type = MRI.getType(Dst);
+    Register NegatedMatchInfo = MRI.createGenericVirtualRegister(Type);
+    replaceRegOpWith(MRI, MatchInfo->getOperand(0), NegatedMatchInfo);
+
+    // MatchInfo now has negated value so use that instead of old Dst.
+    replaceRegWith(MRI, Dst, NegatedMatchInfo);
+
+    // Recreate non negated value for other uses of old MatchInfoDst
+    Builder.setInstrAndDebugLoc(MI);
+    Builder.buildFNeg(MatchInfoDst, NegatedMatchInfo, MI.getFlags());
+  }
+
+  MI.eraseFromParent(); // The original fneg has been replaced in both cases.
+  return;
+}