Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 237
1 file changed, 113 insertions, 124 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 940ec6f31c69..0b4b4776ad39 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -14,25 +14,17 @@
 #include "AMDGPUISelLowering.h"
 #include "AMDGPU.h"
-#include "AMDGPUCallLowering.h"
-#include "AMDGPUFrameLowering.h"
-#include "AMDGPUSubtarget.h"
-#include "AMDGPUTargetMachine.h"
-#include "Utils/AMDGPUBaseInfo.h"
-#include "R600MachineFunctionInfo.h"
-#include "SIInstrInfo.h"
+#include "AMDGPUInstrInfo.h"
+#include "AMDGPUMachineFunction.h"
+#include "GCNSubtarget.h"
 #include "SIMachineFunctionInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
+
 using namespace llvm;

 #include "AMDGPUGenCallingConv.inc"
@@ -320,6 +312,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::FNEARBYINT, MVT::f32, Custom);
   setOperationAction(ISD::FNEARBYINT, MVT::f64, Custom);

+  setOperationAction(ISD::FREM, MVT::f16, Custom);
   setOperationAction(ISD::FREM, MVT::f32, Custom);
   setOperationAction(ISD::FREM, MVT::f64, Custom);

@@ -396,6 +389,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::ROTL, MVT::i64, Expand);
   setOperationAction(ISD::ROTR, MVT::i64, Expand);

+  setOperationAction(ISD::MULHU, MVT::i16, Expand);
+  setOperationAction(ISD::MULHS, MVT::i16, Expand);
+
   setOperationAction(ISD::MUL, MVT::i64, Expand);
   setOperationAction(ISD::MULHU, MVT::i64, Expand);
   setOperationAction(ISD::MULHS, MVT::i64, Expand);
@@ -569,6 +565,17 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 }

+bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
+  if (getTargetMachine().Options.NoSignedZerosFPMath)
+    return true;
+
+  const auto Flags = Op.getNode()->getFlags();
+  if (Flags.hasNoSignedZeros())
+    return true;
+
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Target Information
 //===----------------------------------------------------------------------===//
@@ -598,6 +605,7 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case AMDGPUISD::FMIN_LEGACY:
   case AMDGPUISD::FMAX_LEGACY:
   case AMDGPUISD::FMED3:
+    // TODO: handle llvm.amdgcn.fma.legacy
     return true;
   default:
     return false;
@@ -781,34 +789,27 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
   return true;
 }

-bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
+bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode *N) const {
   switch (N->getOpcode()) {
-    default:
-      return false;
-    case ISD::EntryToken:
-    case ISD::TokenFactor:
+  case ISD::EntryToken:
+  case ISD::TokenFactor:
+    return true;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IntrID) {
+    case Intrinsic::amdgcn_readfirstlane:
+    case Intrinsic::amdgcn_readlane:
       return true;
-    case ISD::INTRINSIC_WO_CHAIN:
-    {
-      unsigned IntrID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
-      switch (IntrID) {
-        default:
-          return false;
-        case Intrinsic::amdgcn_readfirstlane:
-        case Intrinsic::amdgcn_readlane:
-          return true;
-      }
     }
-    break;
-    case ISD::LOAD:
-    {
-      if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
-          AMDGPUAS::CONSTANT_ADDRESS_32BIT)
-        return true;
-      return false;
-    }
-    break;
+    return false;
   }
+  case ISD::LOAD:
+    if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
+        AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+      return true;
+    return false;
+  }
+  return false;
 }

 SDValue AMDGPUTargetLowering::getNegatedExpression(
@@ -944,6 +945,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
   case CallingConv::Fast:
   case CallingConv::Cold:
     return CC_AMDGPU_Func;
+  case CallingConv::AMDGPU_Gfx:
+    return CC_SI_Gfx;
   case CallingConv::AMDGPU_KERNEL:
   case CallingConv::SPIR_KERNEL:
   default:
@@ -965,6 +968,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
   case CallingConv::AMDGPU_ES:
   case CallingConv::AMDGPU_LS:
     return RetCC_SI_Shader;
+  case CallingConv::AMDGPU_Gfx:
+    return RetCC_SI_Gfx;
   case CallingConv::C:
   case CallingConv::Fast:
   case CallingConv::Cold:
@@ -1017,10 +1022,14 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
   unsigned InIndex = 0;

   for (const Argument &Arg : Fn.args()) {
+    const bool IsByRef = Arg.hasByRefAttr();
     Type *BaseArgTy = Arg.getType();
-    Align Alignment = DL.getABITypeAlign(BaseArgTy);
-    MaxAlign = std::max(Alignment, MaxAlign);
-    unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
+    Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
+    MaybeAlign Alignment = IsByRef ? Arg.getParamAlign() : None;
+    if (!Alignment)
+      Alignment = DL.getABITypeAlign(MemArgTy);
+    MaxAlign = max(Alignment, MaxAlign);
+    uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);

     uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
     ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
@@ -1224,7 +1233,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   switch (Op.getOpcode()) {
   default:
     Op->print(errs(), &DAG);
-    llvm_unreachable("Custom lowering code for this"
+    llvm_unreachable("Custom lowering code for this "
                      "instruction is not implemented yet!");
     break;
   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
@@ -1295,7 +1304,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,

   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
-    if (!MFI->isEntryFunction()) {
+    if (!MFI->isModuleEntryFunction()) {
       SDLoc DL(Op);
       const Function &Fn = DAG.getMachineFunction().getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
@@ -1539,7 +1548,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
   SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
                                   Load->getChain(), BasePtr, SrcValue, LoMemVT,
                                   BaseAlign, Load->getMemOperand()->getFlags());
-  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, Size);
+  SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
   SDValue HiLoad =
       DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
@@ -1564,17 +1573,25 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
   return DAG.getMergeValues(Ops, SL);
 }

-// Widen a vector load from vec3 to vec4.
-SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
-                                              SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::WidenOrSplitVectorLoad(SDValue Op,
+                                                     SelectionDAG &DAG) const {
   LoadSDNode *Load = cast<LoadSDNode>(Op);
   EVT VT = Op.getValueType();
-  assert(VT.getVectorNumElements() == 3);
   SDValue BasePtr = Load->getBasePtr();
   EVT MemVT = Load->getMemoryVT();
   SDLoc SL(Op);
   const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
   unsigned BaseAlign = Load->getAlignment();
+  unsigned NumElements = MemVT.getVectorNumElements();
+
+  // Widen from vec3 to vec4 when the load is at least 8-byte aligned
+  // or 16-byte fully dereferenceable. Otherwise, split the vector load.
+  if (NumElements != 3 ||
+      (BaseAlign < 8 &&
+       !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
+    return SplitVectorLoad(Op, DAG);
+
+  assert(NumElements == 3);

   EVT WideVT =
       EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
@@ -2075,20 +2092,19 @@ SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
   return DAG.getMergeValues(Res, DL);
 }

-// (frem x, y) -> (fsub x, (fmul (ftrunc (fdiv x, y)), y))
+// (frem x, y) -> (fma (fneg (ftrunc (fdiv x, y))), y, x)
 SDValue AMDGPUTargetLowering::LowerFREM(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   EVT VT = Op.getValueType();
+  auto Flags = Op->getFlags();
   SDValue X = Op.getOperand(0);
   SDValue Y = Op.getOperand(1);

-  // TODO: Should this propagate fast-math-flags?
-
-  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y);
-  SDValue Floor = DAG.getNode(ISD::FTRUNC, SL, VT, Div);
-  SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Floor, Y);
-
-  return DAG.getNode(ISD::FSUB, SL, VT, X, Mul);
+  SDValue Div = DAG.getNode(ISD::FDIV, SL, VT, X, Y, Flags);
+  SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, VT, Div, Flags);
+  SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Trunc, Flags);
+  // TODO: For f32 use FMAD instead if !hasFastFMA32?
+  return DAG.getNode(ISD::FMA, SL, VT, Neg, Y, X, Flags);
 }

 SDValue AMDGPUTargetLowering::LowerFCEIL(SDValue Op, SelectionDAG &DAG) const {
@@ -2698,14 +2714,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_SINT(SDValue Op,

   // TODO: Factor out code common with LowerFP_TO_UINT.
   EVT SrcVT = Src.getValueType();
-  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+  if (SrcVT == MVT::f16 ||
+      (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
     SDLoc DL(Op);

-    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
-    SDValue FpToInt32 =
-        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
-
-    return FpToInt32;
+    SDValue FpToInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, FpToInt32);
   }

   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
@@ -2721,14 +2735,12 @@ SDValue AMDGPUTargetLowering::LowerFP_TO_UINT(SDValue Op,

   // TODO: Factor out code common with LowerFP_TO_SINT.
   EVT SrcVT = Src.getValueType();
-  if (Subtarget->has16BitInsts() && SrcVT == MVT::f16) {
+  if (SrcVT == MVT::f16 ||
+      (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
     SDLoc DL(Op);

-    SDValue FPExtend = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
-    SDValue FpToInt32 =
-        DAG.getNode(Op.getOpcode(), DL, MVT::i64, FPExtend);
-
-    return FpToInt32;
+    SDValue FpToUInt32 = DAG.getNode(Op.getOpcode(), DL, MVT::i32, Src);
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, FpToUInt32);
   }

   if (Op.getValueType() == MVT::i64 && Src.getValueType() == MVT::f64)
@@ -3204,7 +3216,7 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
   if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
     SDValue Elt0 = Vec.getOperand(0);
     EVT EltVT = Elt0.getValueType();
-    if (VT.getSizeInBits() <= EltVT.getSizeInBits()) {
+    if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
       if (EltVT.isFloatingPoint()) {
         Elt0 = DAG.getNode(ISD::BITCAST, SL,
                            EltVT.changeTypeToInteger(), Elt0);
@@ -3287,17 +3299,13 @@ static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
     return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
   }

-  // Because we want to eliminate extension instructions before the
-  // operation, we need to create a single user here (i.e. not the separate
-  // mul_lo + mul_hi) so that SimplifyDemandedBits will deal with it.
-
-  unsigned MulOpc = Signed ? AMDGPUISD::MUL_LOHI_I24 : AMDGPUISD::MUL_LOHI_U24;
+  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
+  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;

-  SDValue Mul = DAG.getNode(MulOpc, SL,
-                            DAG.getVTList(MVT::i32, MVT::i32), N0, N1);
+  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
+  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);

-  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64,
-                     Mul.getValue(0), Mul.getValue(1));
+  return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
 }

 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
@@ -3395,29 +3403,6 @@ SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
   return DAG.getZExtOrTrunc(Mulhi, DL, VT);
 }

-SDValue AMDGPUTargetLowering::performMulLoHi24Combine(
-  SDNode *N, DAGCombinerInfo &DCI) const {
-  SelectionDAG &DAG = DCI.DAG;
-
-  // Simplify demanded bits before splitting into multiple users.
-  if (SDValue V = simplifyI24(N, DCI))
-    return V;
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-
-  bool Signed = (N->getOpcode() == AMDGPUISD::MUL_LOHI_I24);
-
-  unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
-  unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
-
-  SDLoc SL(N);
-
-  SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
-  SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
-  return DAG.getMergeValues({ MulLo, MulHi }, SL);
-}
-
 static bool isNegativeOne(SDValue Val) {
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val))
     return C->isAllOnesValue();
@@ -3730,6 +3715,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   }
   case ISD::FMA:
   case ISD::FMAD: {
+    // TODO: handle llvm.amdgcn.fma.legacy
     if (!mayIgnoreSignedZero(N0))
       return SDValue();

@@ -3795,8 +3781,15 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
     SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
     if (Res.getOpcode() != AMDGPUISD::FMED3)
       return SDValue(); // Op got folded away.
-    if (!N0.hasOneUse())
-      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+
+    if (!N0.hasOneUse()) {
+      SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
+      DAG.ReplaceAllUsesWith(N0, Neg);
+
+      for (SDNode *U : Neg->uses())
+        DCI.AddToWorklist(U);
+    }
+
     return Res;
   }
   case ISD::FP_EXTEND:
@@ -3933,7 +3926,7 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
       }
     }

-    if (DestVT.getSizeInBits() != 64 && !DestVT.isVector())
+    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
       break;

     // Fold bitcasts of constants.
@@ -3942,14 +3935,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     // TODO: Generalize and move to DAGCombiner
     SDValue Src = N->getOperand(0);
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
-      if (Src.getValueType() == MVT::i64) {
-        SDLoc SL(N);
-        uint64_t CVal = C->getZExtValue();
-        SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
-                                 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
-                                 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
-      }
+      SDLoc SL(N);
+      uint64_t CVal = C->getZExtValue();
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
+                               DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
+                               DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
+      return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
     }

     if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
@@ -3999,9 +3990,6 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
       return V;
     return SDValue();
   }
-  case AMDGPUISD::MUL_LOHI_I24:
-  case AMDGPUISD::MUL_LOHI_U24:
-    return performMulLoHi24Combine(N, DCI);
   case ISD::SELECT:
     return performSelectCombine(N, DCI);
   case ISD::FNEG:
@@ -4159,9 +4147,9 @@ SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
   auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
   SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);

-  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, 4,
+  return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
                      MachineMemOperand::MODereferenceable |
-                     MachineMemOperand::MOInvariant);
+                         MachineMemOperand::MOInvariant);
 }

 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
@@ -4173,7 +4161,7 @@
   MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
   SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);

-  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, 4,
+  SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
                                MachineMemOperand::MODereferenceable);
   return Store;
 }
@@ -4285,8 +4273,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(MUL_I24)
   NODE_NAME_CASE(MULHI_U24)
   NODE_NAME_CASE(MULHI_I24)
-  NODE_NAME_CASE(MUL_LOHI_U24)
-  NODE_NAME_CASE(MUL_LOHI_I24)
   NODE_NAME_CASE(MAD_U24)
   NODE_NAME_CASE(MAD_I24)
   NODE_NAME_CASE(MAD_I64_I32)
@@ -4336,7 +4322,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
-  NODE_NAME_CASE(ATOMIC_LOAD_CSUB)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
   NODE_NAME_CASE(BUFFER_LOAD_USHORT)
@@ -4365,8 +4350,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
   NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
-  NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
-  NODE_NAME_CASE(ATOMIC_PK_FADD)
   case AMDGPUISD::LAST_AMDGPU_ISD_NUMBER: break;
   }
@@ -4718,6 +4701,12 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
     case Intrinsic::amdgcn_fdot2:
       // TODO: Refine on operand
       return SNaN;
+    case Intrinsic::amdgcn_fma_legacy:
+      if (SNaN)
+        return true;
+      return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
+             DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
+             DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
     default:
       return false;
     }
