Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp | 359
1 file changed, 359 insertions, 0 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
new file mode 100644
index 000000000000..098b0e993886
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -0,0 +1,359 @@
+//=== lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does combining of machine instructions at the generic MI level,
+// after the legalizer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+
+#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+struct FMinFMaxLegacyInfo {
+  Register LHS;
+  Register RHS;
+  Register True;
+  Register False;
+  CmpInst::Predicate Pred;
+};
+
+// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
+static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
+  // FIXME: Combines should have subtarget predicates, and we shouldn't need
+  // this here.
+  if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
+    return false;
+
+  // FIXME: Type predicate on pattern
+  if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
+    return false;
+
+  Register Cond = MI.getOperand(1).getReg();
+  if (!MRI.hasOneNonDBGUse(Cond) ||
+      !mi_match(Cond, MRI,
+                m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
+    return false;
+
+  Info.True = MI.getOperand(2).getReg();
+  Info.False = MI.getOperand(3).getReg();
+
+  if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
+      !(Info.LHS == Info.False && Info.RHS == Info.True))
+    return false;
+
+  switch (Info.Pred) {
+  case CmpInst::FCMP_FALSE:
+  case CmpInst::FCMP_OEQ:
+  case CmpInst::FCMP_ONE:
+  case CmpInst::FCMP_ORD:
+  case CmpInst::FCMP_UNO:
+  case CmpInst::FCMP_UEQ:
+  case CmpInst::FCMP_UNE:
+  case CmpInst::FCMP_TRUE:
+    return false;
+  default:
+    return true;
+  }
+}
+
+static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
+                                              const FMinFMaxLegacyInfo &Info) {
+
+  auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
+    MachineIRBuilder MIB(MI);
+    MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
+  };
+
+  switch (Info.Pred) {
+  case CmpInst::FCMP_ULT:
+  case CmpInst::FCMP_ULE:
+    if (Info.LHS == Info.True)
+      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
+    else
+      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
+    break;
+  case CmpInst::FCMP_OLE:
+  case CmpInst::FCMP_OLT: {
+    // We need to permute the operands to get the correct NaN behavior. The
+    // selected operand is the second one based on the failing compare with
+    // NaN, so permute it based on the compare type the hardware uses.
+    if (Info.LHS == Info.True)
+      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
+    else
+      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
+    break;
+  }
+  case CmpInst::FCMP_UGE:
+  case CmpInst::FCMP_UGT: {
+    if (Info.LHS == Info.True)
+      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
+    else
+      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
+    break;
+  }
+  case CmpInst::FCMP_OGT:
+  case CmpInst::FCMP_OGE: {
+    if (Info.LHS == Info.True)
+      buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
+    else
+      buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
+    break;
+  }
+  default:
+    llvm_unreachable("predicate should not have matched");
+  }
+
+  MI.eraseFromParent();
+}
+
+static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
+                              MachineFunction &MF, CombinerHelper &Helper) {
+  Register DstReg = MI.getOperand(0).getReg();
+
+  // TODO: We could try to match extracting the higher bytes, which would be
+  // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+  // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+  // about in practice.
+  LLT Ty = MRI.getType(DstReg);
+  if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
+    Register SrcReg = MI.getOperand(1).getReg();
+    unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
+    assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
+    const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
+    return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
+  }
+
+  return false;
+}
+
+static void applyUCharToFloat(MachineInstr &MI) {
+  MachineIRBuilder B(MI);
+
+  const LLT S32 = LLT::scalar(32);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register SrcReg = MI.getOperand(1).getReg();
+  LLT Ty = B.getMRI()->getType(DstReg);
+  LLT SrcTy = B.getMRI()->getType(SrcReg);
+  if (SrcTy != S32)
+    SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
+
+  if (Ty == S32) {
+    B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
+                 {SrcReg}, MI.getFlags());
+  } else {
+    auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
+                             {SrcReg}, MI.getFlags());
+    B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
+  }
+
+  MI.eraseFromParent();
+}
+
+// FIXME: Should be able to have 2 separate matchdatas rather than custom
+// struct boilerplate.
+struct CvtF32UByteMatchInfo {
+  Register CvtVal;
+  unsigned ShiftOffset;
+};
+
+static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
+                              MachineFunction &MF,
+                              CvtF32UByteMatchInfo &MatchInfo) {
+  Register SrcReg = MI.getOperand(1).getReg();
+
+  // Look through G_ZEXT.
+  mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));
+
+  Register Src0;
+  int64_t ShiftAmt;
+  bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
+  if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
+    const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
+
+    unsigned ShiftOffset = 8 * Offset;
+    if (IsShr)
+      ShiftOffset += ShiftAmt;
+    else
+      ShiftOffset -= ShiftAmt;
+
+    MatchInfo.CvtVal = Src0;
+    MatchInfo.ShiftOffset = ShiftOffset;
+    return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
+  }
+
+  // TODO: Simplify demanded bits.
+  return false;
+}
+
+static void applyCvtF32UByteN(MachineInstr &MI,
+                              const CvtF32UByteMatchInfo &MatchInfo) {
+  MachineIRBuilder B(MI);
+  unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
+
+  const LLT S32 = LLT::scalar(32);
+  Register CvtSrc = MatchInfo.CvtVal;
+  LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
+  if (SrcTy != S32) {
+    assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
+    CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
+  }
+
+  assert(MI.getOpcode() != NewOpc);
+  B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
+  MI.eraseFromParent();
+}
+
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
+  GISelKnownBits *KB;
+  MachineDominatorTree *MDT;
+
+public:
+  AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
+
+  AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+                                  const AMDGPULegalizerInfo *LI,
+                                  GISelKnownBits *KB, MachineDominatorTree *MDT)
+      : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
+                     /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
+        KB(KB), MDT(MDT) {
+    if (!GeneratedRuleCfg.parseCommandLineOption())
+      report_fatal_error("Invalid rule identifier");
+  }
+
+  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+               MachineIRBuilder &B) const override;
+};
+
+bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
+                                              MachineInstr &MI,
+                                              MachineIRBuilder &B) const {
+  CombinerHelper Helper(Observer, B, KB, MDT);
+  AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+
+  if (Generated.tryCombineAll(Observer, MI, B, Helper))
+    return true;
+
+  switch (MI.getOpcode()) {
+  case TargetOpcode::G_SHL:
+  case TargetOpcode::G_LSHR:
+  case TargetOpcode::G_ASHR:
+    // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+    // common case, splitting this into a move and a 32-bit shift is faster
+    // and the same code size.
+    return Helper.tryCombineShiftToUnmerge(MI, 32);
+  }
+
+  return false;
+}
+
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+// Pass boilerplate
+// ================
+
+class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
+public:
+  static char ID;
+
+  AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
+
+  StringRef getPassName() const override {
+    return "AMDGPUPostLegalizerCombiner";
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+  bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<TargetPassConfig>();
+  AU.setPreservesCFG();
+  getSelectionDAGFallbackAnalysisUsage(AU);
+  AU.addRequired<GISelKnownBitsAnalysis>();
+  AU.addPreserved<GISelKnownBitsAnalysis>();
+  if (!IsOptNone) {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+  }
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
+  : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+  initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
+  if (MF.getProperties().hasProperty(
+          MachineFunctionProperties::Property::FailedISel))
+    return false;
+  auto *TPC = &getAnalysis<TargetPassConfig>();
+  const Function &F = MF.getFunction();
+  bool EnableOpt =
+      MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const AMDGPULegalizerInfo *LI
+    = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
+
+  GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+  MachineDominatorTree *MDT =
+      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+  AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+                                         F.hasMinSize(), LI, KB, MDT);
+  Combiner C(PCInfo, TPC);
+  return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AMDGPUPostLegalizerCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
+                      "Combine AMDGPU machine instrs after legalization",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
+                    "Combine AMDGPU machine instrs after legalization", false,
+                    false)
+
+namespace llvm {
+FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
+  return new AMDGPUPostLegalizerCombiner(IsOptNone);
+}
+} // end namespace llvm
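
The operand permutation in applySelectFCmpToFMinToFMaxLegacy hinges on the non-IEEE NaN behavior described in its comment: when the compare fails on a NaN input, the legacy instruction returns its second operand. A minimal sketch of that semantics in plain C++ (the fmin_legacy model here is illustrative, not an LLVM API):

#include <cassert>
#include <cmath>

// Hypothetical scalar model of G_AMDGPU_FMIN_LEGACY: an ordinary '<' compare
// is false whenever either input is NaN, so the second operand is returned
// on unordered inputs (fmax_legacy mirrors this with '>').
static float fmin_legacy(float x, float y) { return x < y ? x : y; }

int main() {
  const float a = std::nanf(""), b = 2.0f;
  // select (fcmp olt a, b), a, b: olt is false on NaN, so b is chosen,
  // matching fmin_legacy(a, b), which also yields its second operand.
  assert(fmin_legacy(a, b) == b);
  // select (fcmp ult a, b), a, b: ult is true on NaN, so a (the NaN) is
  // chosen; swapping the operands, as the combine does, reproduces that.
  assert(std::isnan(fmin_legacy(b, a)));
  return 0;
}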
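
matchUCharToFloat fires only when known-bits analysis (maskedValueIsZero) proves every bit above the low byte of the source is zero; under that premise the full unsigned-to-float conversion is equivalent to converting byte 0 alone. A plain-C++ sketch of that equivalence (cvt_f32_ubyte0 is an illustrative stand-in for the hardware conversion, not an LLVM API):

#include <cassert>
#include <cstdint>

// Illustrative model of G_AMDGPU_CVT_F32_UBYTE0: convert only byte 0.
static float cvt_f32_ubyte0(uint32_t x) {
  return static_cast<float>(x & 0xffu);
}

int main() {
  uint32_t x = 0x7b;  // bits [8, 32) are zero, as the matcher requires
  // With the high bits known zero, uitofp(x) and the single-byte convert
  // agree, so the combine can select the cheaper byte conversion.
  assert(static_cast<float>(x) == cvt_f32_ubyte0(x));
  return 0;
}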
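
matchCvtF32UByteN folds a constant shift into the byte index of the conversion: converting byte 0 of x >> 8 is the same as converting byte 1 of x, so a logical shift right adds to the effective byte offset and a shift left subtracts from it, provided the result stays byte-aligned and within bits [8, 32). A small model of that identity (illustrative names, not LLVM code):

#include <cassert>
#include <cstdint>

// Illustrative model of G_AMDGPU_CVT_F32_UBYTEn for n in [0, 3].
static float cvt_f32_ubyte(uint32_t x, unsigned n) {
  return static_cast<float>((x >> (8 * n)) & 0xffu);
}

int main() {
  uint32_t x = 0xdeadbeefu;
  // A logical shift right by 8 bumps the byte index: UBYTE0(x >> 8) ==
  // UBYTE1(x). This is the ShiftOffset += ShiftAmt case in the matcher.
  assert(cvt_f32_ubyte(x >> 8, 0) == cvt_f32_ubyte(x, 1));
  // A shift left by 8 lowers it: UBYTE2(x << 8) == UBYTE1(x), the
  // ShiftOffset -= ShiftAmt case.
  assert(cvt_f32_ubyte(x << 8, 2) == cvt_f32_ubyte(x, 1));
  return 0;
}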
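
The one hand-rolled case in combine() calls tryCombineShiftToUnmerge for 64-bit shifts, which the comment notes are quarter rate on some subtargets. The intuition, sketched below for the G_SHL case only: once the shift amount is at least 32, data only moves across the 32-bit register halves, so the operation reduces to a single 32-bit shift plus moves. A hedged plain-C++ model (shl64_split is an illustrative name, not the LLVM helper itself):

#include <cassert>
#include <cstdint>

// For 32 <= c < 64, x << c only populates the high 32-bit half, so it can
// be rewritten as a 32-bit shift of the low half moved into the high half.
static uint64_t shl64_split(uint64_t x, unsigned c) {
  assert(c >= 32 && c < 64);
  uint32_t lo = static_cast<uint32_t>(x);
  uint32_t hi = lo << (c - 32);             // single 32-bit shift
  return static_cast<uint64_t>(hi) << 32;   // low half is known zero
}

int main() {
  assert(shl64_split(0x00000001deadbeefULL, 36) ==
         (0x00000001deadbeefULL << 36));
  return 0;
}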