diff options
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp | 192 |
1 files changed, 191 insertions, 1 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp index 12b5830ef930..3ce67a733c10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp @@ -16,6 +16,7 @@ #include "AMDGPURegisterBankInfo.h" #include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIMachineFunctionInfo.h" #include "llvm/CodeGen/GlobalISel/Combiner.h" #include "llvm/CodeGen/GlobalISel/CombinerHelper.h" #include "llvm/CodeGen/GlobalISel/CombinerInfo.h" @@ -23,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "amdgpu-regbank-combiner" @@ -36,13 +38,15 @@ protected: MachineRegisterInfo &MRI; const RegisterBankInfo &RBI; const TargetRegisterInfo &TRI; + const SIInstrInfo &TII; CombinerHelper &Helper; public: AMDGPURegBankCombinerHelper(MachineIRBuilder &B, CombinerHelper &Helper) : B(B), MF(B.getMF()), MRI(*B.getMRI()), RBI(*MF.getSubtarget().getRegBankInfo()), - TRI(*MF.getSubtarget().getRegisterInfo()), Helper(Helper){}; + TRI(*MF.getSubtarget().getRegisterInfo()), + TII(*MF.getSubtarget<GCNSubtarget>().getInstrInfo()), Helper(Helper){}; bool isVgprRegBank(Register Reg); Register getAsVgpr(Register Reg); @@ -63,7 +67,19 @@ public: Register &Val, CstTy &K0, CstTy &K1); bool matchIntMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + bool matchFPMinMaxToMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + bool matchFPMinMaxToClamp(MachineInstr &MI, Register &Reg); + bool matchFPMed3ToClamp(MachineInstr &MI, Register &Reg); void applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo); + void applyClamp(MachineInstr &MI, Register &Reg); + +private: + AMDGPU::SIModeRegisterDefaults getMode(); + bool getIEEE(); + bool getDX10Clamp(); + bool isFminnumIeee(const 
MachineInstr &MI); + bool isFCst(MachineInstr *MI); + bool isClampZeroToOne(MachineInstr *K0, MachineInstr *K1); }; bool AMDGPURegBankCombinerHelper::isVgprRegBank(Register Reg) { @@ -98,6 +114,13 @@ AMDGPURegBankCombinerHelper::getMinMaxPair(unsigned Opc) { case AMDGPU::G_UMAX: case AMDGPU::G_UMIN: return {AMDGPU::G_UMIN, AMDGPU::G_UMAX, AMDGPU::G_AMDGPU_UMED3}; + case AMDGPU::G_FMAXNUM: + case AMDGPU::G_FMINNUM: + return {AMDGPU::G_FMINNUM, AMDGPU::G_FMAXNUM, AMDGPU::G_AMDGPU_FMED3}; + case AMDGPU::G_FMAXNUM_IEEE: + case AMDGPU::G_FMINNUM_IEEE: + return {AMDGPU::G_FMINNUM_IEEE, AMDGPU::G_FMAXNUM_IEEE, + AMDGPU::G_AMDGPU_FMED3}; } } @@ -148,6 +171,146 @@ bool AMDGPURegBankCombinerHelper::matchIntMinMaxToMed3( return true; } +// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) +// ieee = true : min/max(SNaN, K) = QNaN, min/max(QNaN, K) = K +// ieee = false : min/max(NaN, K) = K +// clamp(NaN) = dx10_clamp ? 0.0 : NaN +// Consider values of min(max(Val, K0), K1) and max(min(Val, K1), K0) as input. +// Other operand commutes (see matchMed) give same result since min and max are +// commutative. + +// Try to replace fp min(max(Val, K0), K1) or max(min(Val, K1), K0), K0<=K1 +// with fmed3(Val, K0, K1) or clamp(Val). Clamp requires K0 = 0.0 and K1 = 1.0. 
+// Val = SNaN only for ieee = true +// fmed3(SNaN, K0, K1) = min(min(SNaN, K0), K1) = min(QNaN, K1) = K1 +// min(max(SNaN, K0), K1) = min(QNaN, K1) = K1 +// max(min(SNaN, K1), K0) = max(K1, K0) = K1 +// Val = NaN,ieee = false or Val = QNaN,ieee = true +// fmed3(NaN, K0, K1) = min(min(NaN, K0), K1) = min(K0, K1) = K0 +// min(max(NaN, K0), K1) = min(K0, K1) = K0 (can clamp when dx10_clamp = true) +// max(min(NaN, K1), K0) = max(K1, K0) = K1 != K0 +bool AMDGPURegBankCombinerHelper::matchFPMinMaxToMed3( + MachineInstr &MI, Med3MatchInfo &MatchInfo) { + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = MRI.getType(Dst); + if (Ty != LLT::scalar(16) && Ty != LLT::scalar(32)) + return false; + + auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); + + Register Val; + Optional<FPValueAndVReg> K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). Then see if K0 <= K1. + if (!matchMed<GFCstAndRegMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + if (K0->Value > K1->Value) + return false; + + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. fmed3(NaN, K0, K1) is equivalent to + // min(min(NaN, K0), K1). Safe to fold for min(max(Val, K0), K1) since inner + // nodes(max/min) have same behavior when one input is NaN and other isn't. + // Don't consider max(min(SNaN, K1), K0) since there is no isKnownNeverQNaN, + // also post-legalizer inputs to min/max are fcanonicalized (never SNaN). + if ((getIEEE() && isFminnumIeee(MI)) || isKnownNeverNaN(Dst, MRI)) { + // Don't fold single use constant that can't be inlined. 
+ if ((!MRI.hasOneNonDBGUse(K0->VReg) || TII.isInlineConstant(K0->Value)) && + (!MRI.hasOneNonDBGUse(K1->VReg) || TII.isInlineConstant(K1->Value))) { + MatchInfo = {OpcodeTriple.Med, Val, K0->VReg, K1->VReg}; + return true; + } + } + + return false; +} + +bool AMDGPURegBankCombinerHelper::matchFPMinMaxToClamp(MachineInstr &MI, + Register &Reg) { + // Clamp is available on all types after regbankselect (f16, f32, f64, v2f16). + auto OpcodeTriple = getMinMaxPair(MI.getOpcode()); + Register Val; + Optional<FPValueAndVReg> K0, K1; + // Match min(max(Val, K0), K1) or max(min(Val, K1), K0). + if (!matchMed<GFCstOrSplatGFCstMatch>(MI, MRI, OpcodeTriple, Val, K0, K1)) + return false; + + if (!K0->Value.isExactlyValue(0.0) || !K1->Value.isExactlyValue(1.0)) + return false; + + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. Only min(max(QNaN, 0.0), 1.0) evaluates + // to 0.0 requires dx10_clamp = true. + if ((getIEEE() && getDX10Clamp() && isFminnumIeee(MI) && + isKnownNeverSNaN(Val, MRI)) || + isKnownNeverNaN(MI.getOperand(0).getReg(), MRI)) { + Reg = Val; + return true; + } + + return false; +} + +// Replacing fmed3(NaN, 0.0, 1.0) with clamp. Requires dx10_clamp = true. +// Val = SNaN only for ieee = true. It is important which operand is NaN. 
+// min(min(SNaN, 0.0), 1.0) = min(QNaN, 1.0) = 1.0 +// min(min(SNaN, 1.0), 0.0) = min(QNaN, 0.0) = 0.0 +// min(min(0.0, 1.0), SNaN) = min(0.0, SNaN) = QNaN +// Val = NaN,ieee = false or Val = QNaN,ieee = true +// min(min(NaN, 0.0), 1.0) = min(0.0, 1.0) = 0.0 +// min(min(NaN, 1.0), 0.0) = min(1.0, 0.0) = 0.0 +// min(min(0.0, 1.0), NaN) = min(0.0, NaN) = 0.0 +bool AMDGPURegBankCombinerHelper::matchFPMed3ToClamp(MachineInstr &MI, + Register &Reg) { + if (MI.getIntrinsicID() != Intrinsic::amdgcn_fmed3) + return false; + + // In llvm-ir, clamp is often represented as an intrinsic call to + // @llvm.amdgcn.fmed3.f32(%Val, 0.0, 1.0). Check for other operand orders. + MachineInstr *Src0 = getDefIgnoringCopies(MI.getOperand(2).getReg(), MRI); + MachineInstr *Src1 = getDefIgnoringCopies(MI.getOperand(3).getReg(), MRI); + MachineInstr *Src2 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + + if (isFCst(Src0) && !isFCst(Src1)) + std::swap(Src0, Src1); + if (isFCst(Src1) && !isFCst(Src2)) + std::swap(Src1, Src2); + if (isFCst(Src0) && !isFCst(Src1)) + std::swap(Src0, Src1); + if (!isClampZeroToOne(Src1, Src2)) + return false; + + Register Val = Src0->getOperand(0).getReg(); + + auto isOp3Zero = [&]() { + MachineInstr *Op3 = getDefIgnoringCopies(MI.getOperand(4).getReg(), MRI); + if (Op3->getOpcode() == TargetOpcode::G_FCONSTANT) + return Op3->getOperand(1).getFPImm()->isExactlyValue(0.0); + return false; + }; + // For IEEE=false perform combine only when it's safe to assume that there are + // no NaN inputs. Most often MI is marked with nnan fast math flag. + // For IEEE=true consider NaN inputs. Requires dx10_clamp = true. Safe to fold + // when Val could be QNaN. If Val can also be SNaN third input should be 0.0. 
+ if (isKnownNeverNaN(MI.getOperand(0).getReg(), MRI) || + (getIEEE() && getDX10Clamp() && + (isKnownNeverSNaN(Val, MRI) || isOp3Zero()))) { + Reg = Val; + return true; + } + + return false; +} + +void AMDGPURegBankCombinerHelper::applyClamp(MachineInstr &MI, Register &Reg) { + B.setInstrAndDebugLoc(MI); + B.buildInstr(AMDGPU::G_AMDGPU_CLAMP, {MI.getOperand(0)}, {Reg}, + MI.getFlags()); + MI.eraseFromParent(); +} + void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, Med3MatchInfo &MatchInfo) { B.setInstrAndDebugLoc(MI); @@ -158,6 +321,33 @@ void AMDGPURegBankCombinerHelper::applyMed3(MachineInstr &MI, MI.eraseFromParent(); } +AMDGPU::SIModeRegisterDefaults AMDGPURegBankCombinerHelper::getMode() { + return MF.getInfo<SIMachineFunctionInfo>()->getMode(); +} + +bool AMDGPURegBankCombinerHelper::getIEEE() { return getMode().IEEE; } + +bool AMDGPURegBankCombinerHelper::getDX10Clamp() { return getMode().DX10Clamp; } + +bool AMDGPURegBankCombinerHelper::isFminnumIeee(const MachineInstr &MI) { + return MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE; +} + +bool AMDGPURegBankCombinerHelper::isFCst(MachineInstr *MI) { + return MI->getOpcode() == AMDGPU::G_FCONSTANT; +} + +bool AMDGPURegBankCombinerHelper::isClampZeroToOne(MachineInstr *K0, + MachineInstr *K1) { + if (isFCst(K0) && isFCst(K1)) { + const ConstantFP *KO_FPImm = K0->getOperand(1).getFPImm(); + const ConstantFP *K1_FPImm = K1->getOperand(1).getFPImm(); + return (KO_FPImm->isExactlyValue(0.0) && K1_FPImm->isExactlyValue(1.0)) || + (KO_FPImm->isExactlyValue(1.0) && K1_FPImm->isExactlyValue(0.0)); + } + return false; +} + class AMDGPURegBankCombinerHelperState { protected: CombinerHelper &Helper; |
