Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 265
1 file changed, 158 insertions, 107 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 56ebf9c06741..e73d87cd66af 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -90,16 +90,21 @@ using namespace llvm;
 STATISTIC(NumTailCalls, "Number of tail calls");
 
-static cl::opt<bool> EnableVGPRIndexMode(
-  "amdgpu-vgpr-index-mode",
-  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
-  cl::init(false));
-
 static cl::opt<bool> DisableLoopAlignment(
   "amdgpu-disable-loop-alignment",
   cl::desc("Do not align and prefetch loops"),
   cl::init(false));
 
+static bool hasFP32Denormals(const MachineFunction &MF) {
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  return Info->getMode().FP32Denormals;
+}
+
+static bool hasFP64FP16Denormals(const MachineFunction &MF) {
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  return Info->getMode().FP64FP16Denormals;
+}
+
 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
   for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -160,6 +165,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   computeRegisterProperties(Subtarget->getRegisterInfo());
 
+  // The boolean content concept here is too inflexible. Compares only ever
+  // really produce a 1-bit result. Any copy/extend from these will turn into a
+  // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
+  // it's what most targets use.
+  setBooleanContents(ZeroOrOneBooleanContent);
+  setBooleanVectorContents(ZeroOrOneBooleanContent);
+
   // We need to custom lower vector stores from local memory
   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
@@ -358,14 +370,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
 
   if (Subtarget->has16BitInsts()) {
+    setOperationAction(ISD::FPOW, MVT::f16, Promote);
     setOperationAction(ISD::FLOG, MVT::f16, Custom);
     setOperationAction(ISD::FEXP, MVT::f16, Custom);
     setOperationAction(ISD::FLOG10, MVT::f16, Custom);
   }
 
-  // v_mad_f32 does not support denormals according to some sources.
-  if (!Subtarget->hasFP32Denormals())
-    setOperationAction(ISD::FMAD, MVT::f32, Legal);
+  // v_mad_f32 does not support denormals. We report it as unconditionally
+  // legal, and the context where it is formed will disallow it when fp32
+  // denormals are enabled.
+  setOperationAction(ISD::FMAD, MVT::f32, Legal);
 
   if (!Subtarget->hasBFI()) {
     // fcopysign can be done in a single instruction with BFI.
@@ -473,8 +487,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
     setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
 
-    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
-    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
 
     // F16 - Constant Actions.
     setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
@@ -489,6 +501,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
     setOperationAction(ISD::FCOS, MVT::f16, Promote);
     setOperationAction(ISD::FSIN, MVT::f16, Promote);
+
+    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);
+
     setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
     setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
     setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
@@ -503,7 +519,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
     // F16 - VOP3 Actions.
     setOperationAction(ISD::FMA, MVT::f16, Legal);
-    if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
+    if (STI.hasMadF16())
       setOperationAction(ISD::FMAD, MVT::f16, Legal);
 
     for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -761,12 +777,13 @@ const GCNSubtarget *SITargetLowering::getSubtarget() const {
 //
 // There is only one special case when denormals are enabled we don't currently,
 // where this is OK to use.
-bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
-                                       EVT DestVT, EVT SrcVT) const {
+bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
+                                       EVT DestVT, EVT SrcVT) const {
   return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
           (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
-         DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
-         SrcVT.getScalarType() == MVT::f16;
+         DestVT.getScalarType() == MVT::f32 &&
+         SrcVT.getScalarType() == MVT::f16 &&
+         !hasFP32Denormals(DAG.getMachineFunction());
 }
 
 bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
@@ -1069,23 +1086,18 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
     return AM.BaseOffs == 0 && AM.Scale == 0;
   }
 
-  // GFX9 added a 13-bit signed offset. When using regular flat instructions,
-  // the sign bit is ignored and is treated as a 12-bit unsigned offset.
-
-  // GFX10 shrinked signed offset to 12 bits. When using regular flat
-  // instructions, the sign bit is also ignored and is treated as 11-bit
-  // unsigned offset.
-
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
-    return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;
-
-  // Just r + i
-  return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
+  return AM.Scale == 0 &&
+         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
+                                  AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS,
+                                  /*Signed=*/false));
 }
 
 bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
   if (Subtarget->hasFlatGlobalInsts())
-    return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
+    return AM.Scale == 0 &&
+           (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
+                                    AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS,
+                                    /*Signed=*/true));
 
   if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
     // Assume the we will use FLAT for all global memory accesses
@@ -1326,13 +1338,6 @@ EVT SITargetLowering::getOptimalMemOpType(
   return MVT::Other;
 }
 
-static bool isFlatGlobalAddrSpace(unsigned AS) {
-  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
-         AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::CONSTANT_ADDRESS ||
-         AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
-}
-
 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
   return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
@@ -1466,9 +1471,7 @@ SDValue SITargetLowering::lowerKernargMemParameter(
   const SDLoc &SL, SDValue Chain,
   uint64_t Offset, unsigned Align, bool Signed,
   const ISD::InputArg *Arg) const {
-  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
-  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
-  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
 
   // Try to avoid using an extload by loading earlier than the argument address,
   // and extracting the relevant bits. The load should hopefully be merged with
@@ -2666,9 +2669,7 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
   const Function *ParentFn = CI->getParent()->getParent();
   if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
     return false;
-
-  auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
-  return (Attr.getValueAsString() != "true");
+  return true;
 }
 
 // The wave scratch offset register is used as the global base pointer.
@@ -2787,10 +2788,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   MVT PtrVT = MVT::i32;
 
   // Walk the register/memloc assignments, inserting copies/loads.
-  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
-       ++i, ++realArgIdx) {
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-    SDValue Arg = OutVals[realArgIdx];
+    SDValue Arg = OutVals[i];
 
     // Promote the value if needed.
     switch (VA.getLocInfo()) {
@@ -2830,7 +2830,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
       MaybeAlign Alignment;
 
       if (IsTailCall) {
-        ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+        ISD::ArgFlagsTy Flags = Outs[i].Flags;
         unsigned OpSize = Flags.isByVal() ?
           Flags.getByValSize() : VA.getValVT().getStoreSize();
@@ -2868,8 +2868,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
             Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
             /*isVol = */ false, /*AlwaysInline = */ true,
             /*isTailCall = */ false, DstInfo,
-            MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
-                *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
+            MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
 
         MemOpChains.push_back(Cpy);
       } else {
@@ -2986,7 +2985,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                          IsThisReturn ? OutVals[0] : SDValue());
 }
 
-Register SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
+Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
                                              const MachineFunction &MF) const {
   Register Reg = StringSwitch<Register>(RegName)
     .Case("m0", AMDGPU::M0)
@@ -3411,7 +3410,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
   std::tie(SubReg, Offset)
     = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
 
-  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
+  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
 
   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
     MachineBasicBlock::iterator I(&MI);
@@ -3506,7 +3505,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   std::tie(SubReg, Offset)
     = computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
 
-  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
+  const bool UseGPRIdxMode = ST.useVGPRIndexMode();
 
   if (Idx->getReg() == AMDGPU::NoRegister) {
     MachineBasicBlock::iterator I(&MI);
@@ -3920,7 +3919,8 @@ MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
 // however does not support denormals, so we do report fma as faster if we have
 // a fast fma device and require denormals.
 //
-bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+                                                  EVT VT) const {
   VT = VT.getScalarType();
 
   switch (VT.getSimpleVT().SimpleTy) {
@@ -3929,7 +3929,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
     // mad available which returns the same result as the separate operations
    // which we should prefer over fma. We can't use this if we want to support
     // denormals, so only report this in these cases.
-    if (Subtarget->hasFP32Denormals())
+    if (hasFP32Denormals(MF))
       return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
 
     // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
@@ -3938,7 +3938,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   case MVT::f64:
     return true;
   case MVT::f16:
-    return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
+    return Subtarget->has16BitInsts() && hasFP64FP16Denormals(MF);
   default:
     break;
   }
@@ -3946,6 +3946,21 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
   return false;
 }
 
+bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG,
+                                              const SDNode *N) const {
+  // TODO: Check future ftz flag
+  // v_mad_f32/v_mac_f32 do not support denormals.
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::f32)
+    return !hasFP32Denormals(DAG.getMachineFunction());
+  if (VT == MVT::f16) {
+    return Subtarget->hasMadF16() &&
+           !hasFP64FP16Denormals(DAG.getMachineFunction());
+  }
+
+  return false;
+}
+
 //===----------------------------------------------------------------------===//
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//
@@ -4416,8 +4431,8 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
 
 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
   const Triple &TT = getTargetMachine().getTargetTriple();
-  return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
-          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
+  return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+          GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
          AMDGPU::shouldEmitConstantsToTextSection(TT);
 }
 
@@ -4425,9 +4440,9 @@ bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
   // FIXME: Either avoid relying on address space here or change the default
   // address space for functions to avoid the explicit check.
   return (GV->getValueType()->isFunctionTy() ||
-          GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
-          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
-          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
+          GV->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+          GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+          GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
          !shouldEmitFixup(GV) &&
          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
 }
@@ -4694,10 +4709,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
   // TODO: Use custom target PseudoSourceValue.
   // TODO: We should use the value from the IR intrinsic call, but it might not
   // be available and how do we get it?
-  Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
-                                              AMDGPUAS::CONSTANT_ADDRESS));
-
-  MachinePointerInfo PtrInfo(V, StructOffset);
+  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
   return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                      MinAlign(64, StructOffset),
                      MachineMemOperand::MODereferenceable |
@@ -5646,11 +5658,16 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
                                        SDValue Offset, SDValue GLC, SDValue DLC,
                                        SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
+
+  const DataLayout &DataLayout = DAG.getDataLayout();
+  unsigned Align =
+      DataLayout.getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       MachinePointerInfo(),
       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
           MachineMemOperand::MOInvariant,
-      VT.getStoreSize(), VT.getStoreSize());
+      VT.getStoreSize(), Align);
 
   if (!Offset->isDivergent()) {
     SDValue Ops[] = {
@@ -5659,6 +5676,20 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
         GLC,
         DLC,
     };
+
+    // Widen vec3 load to vec4.
+    if (VT.isVector() && VT.getVectorNumElements() == 3) {
+      EVT WidenedVT =
+          EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
+      auto WidenedOp = DAG.getMemIntrinsicNode(
+          AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
+          MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
+      auto Subvector = DAG.getNode(
+          ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
+          DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+      return Subvector;
+    }
+
     return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
                                    DAG.getVTList(VT), Ops, VT, MMO);
   }
@@ -5670,11 +5701,10 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
   MVT LoadVT = VT.getSimpleVT();
   unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
   assert((LoadVT.getScalarType() == MVT::i32 ||
-          LoadVT.getScalarType() == MVT::f32) &&
-         isPowerOf2_32(NumElts));
+          LoadVT.getScalarType() == MVT::f32));
 
   if (NumElts == 8 || NumElts == 16) {
-    NumLoads = NumElts == 16 ? 4 : 2;
+    NumLoads = NumElts / 4;
     LoadVT = MVT::v4i32;
   }
@@ -5698,8 +5728,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
   uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
   for (unsigned i = 0; i < NumLoads; ++i) {
     Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
-    Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
-                                            Ops, LoadVT, MMO));
+    Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
+                                        LoadVT, MMO, DAG));
   }
 
   if (VT == MVT::v8i32 || VT == MVT::v16i32)
@@ -5918,22 +5948,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
     }
   }
-  case Intrinsic::amdgcn_interp_p2_f16: {
-    SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
-                                    Op.getOperand(6), SDValue());
-    SDValue Ops[] = {
-      Op.getOperand(2), // Src0
-      Op.getOperand(3), // Attrchan
-      Op.getOperand(4), // Attr
-      DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
-      Op.getOperand(1), // Src2
-      DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
-      Op.getOperand(5), // high
-      DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
-      ToM0.getValue(1)
-    };
-    return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
-  }
   case Intrinsic::amdgcn_sin:
     return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
@@ -7088,13 +7102,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     EVT VT = Op.getOperand(3).getValueType();
 
     auto *M = cast<MemSDNode>(Op);
-    unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD
-                                    : AMDGPUISD::ATOMIC_FADD;
+    if (VT.isVector()) {
+      return DAG.getMemIntrinsicNode(
+        AMDGPUISD::ATOMIC_PK_FADD, DL, Op->getVTList(), Ops, VT,
+        M->getMemOperand());
+    }
 
-    return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
-                                   M->getMemOperand());
+    return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
+                         DAG.getVTList(VT, MVT::Other), Ops,
+                         M->getMemOperand()).getValue(1);
   }
-
   case Intrinsic::amdgcn_end_cf:
     return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                       Op->getOperand(2), Chain), 0);
@@ -7451,8 +7468,11 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     // resource descriptor, we can only make private accesses up to a certain
     // size.
     switch (Subtarget->getMaxPrivateElementSize()) {
-    case 4:
-      return scalarizeVectorLoad(Load, DAG);
+    case 4: {
+      SDValue Ops[2];
+      std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
+      return DAG.getMergeValues(Ops, DL);
+    }
     case 8:
       if (NumElements > 2)
         return SplitVectorLoad(Op, DAG);
@@ -7529,7 +7549,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
   const SDNodeFlags Flags = Op->getFlags();
   bool Unsafe = DAG.getTarget().Options.UnsafeFPMath ||
                 Flags.hasAllowReciprocal();
 
-  if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
+  if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
     return SDValue();
 
   if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
@@ -7672,7 +7692,7 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
 static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
                                           const SDLoc &SL,
                                           const GCNSubtarget *ST) {
   assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
-  int DPDenormModeDefault = ST->hasFP64Denormals()
+  int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
                                 ? FP_DENORM_FLUSH_NONE
                                 : FP_DENORM_FLUSH_IN_FLUSH_OUT;
@@ -7708,7 +7728,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
                                (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
   const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
 
-  if (!Subtarget->hasFP32Denormals()) {
+  const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
+
+  if (!HasFP32Denormals) {
     SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
 
     SDValue EnableDenorm;
@@ -7752,8 +7774,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
   SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
                              NumeratorScaled, Fma3);
 
-  if (!Subtarget->hasFP32Denormals()) {
-
+  if (!HasFP32Denormals) {
     SDValue DisableDenorm;
     if (Subtarget->hasDenormModeInst()) {
       const SDValue DisableDenormValue =
@@ -8727,7 +8748,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
     auto F = CFP->getValueAPF();
     if (F.isNaN() && F.isSignaling())
      return false;
-    return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
+    return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType());
   }
 
   // If source is a result of another standard FP operation it is already in
@@ -8796,7 +8817,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
 
     // snans will be quieted, so we only need to worry about denormals.
     if (Subtarget->supportsMinMaxDenormModes() ||
-        denormalsEnabledForType(Op.getValueType()))
+        denormalsEnabledForType(DAG, Op.getValueType()))
       return true;
 
     // Flushing may be required.
@@ -8868,7 +8889,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
     LLVM_FALLTHROUGH;
   }
   default:
-    return denormalsEnabledForType(Op.getValueType()) &&
+    return denormalsEnabledForType(DAG, Op.getValueType()) &&
           DAG.isKnownNeverSNaN(Op);
   }
@@ -8879,7 +8900,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
 SDValue SITargetLowering::getCanonicalConstantFP(
   SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
   // Flush denormals to 0 if not enabled.
-  if (C.isDenormal() && !denormalsEnabledForType(VT))
+  if (C.isDenormal() && !denormalsEnabledForType(DAG, VT))
     return DAG.getConstantFP(0.0, SL, VT);
 
   if (C.isNaN()) {
@@ -9417,8 +9438,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
 
   // Only do this if we are not trying to support denormals. v_mad_f32 does not
   // support denormals ever.
-  if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
-       (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
+  if (((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) ||
+       (VT == MVT::f16 && !hasFP64FP16Denormals(DAG.getMachineFunction()) &&
         getSubtarget()->hasMadF16())) &&
        isOperationLegal(ISD::FMAD, VT))
     return ISD::FMAD;
@@ -9427,7 +9448,7 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
   if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
        (N0->getFlags().hasAllowContract() &&
        N1->getFlags().hasAllowContract())) &&
-      isFMAFasterThanFMulAndFAdd(VT)) {
+      isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
     return ISD::FMA;
   }
@@ -9543,6 +9564,8 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
   case ISD::SIGN_EXTEND:
   case ISD::ANY_EXTEND: {
     auto Cond = RHS.getOperand(0);
+    // If this won't be a real VOPC output, we would still need to insert an
+    // extra instruction anyway.
    if (!isBoolSGPR(Cond))
      break;
    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
@@ -9573,6 +9596,26 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
+  // sub x, zext (setcc) => subcarry x, 0, setcc
+  // sub x, sext (setcc) => addcarry x, 0, setcc
+  unsigned Opc = RHS.getOpcode();
+  switch (Opc) {
+  default: break;
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+  case ISD::ANY_EXTEND: {
+    auto Cond = RHS.getOperand(0);
+    // If this won't be a real VOPC output, we would still need to insert an
+    // extra instruction anyway.
+    if (!isBoolSGPR(Cond))
+      break;
+    SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
+    SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
+    Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY;
+    return DAG.getNode(Opc, SL, VTList, Args);
+  }
+  }
+
   if (LHS.getOpcode() == ISD::SUBCARRY) {
     // sub (subcarry x, 0, cc), y => subcarry x, y, cc
     auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
@@ -10884,14 +10927,14 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
   return false;
 }
 
-bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
+bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
+                                               EVT VT) const {
   switch (VT.getScalarType().getSimpleVT().SimpleTy) {
   case MVT::f32:
-    return Subtarget->hasFP32Denormals();
+    return hasFP32Denormals(DAG.getMachineFunction());
   case MVT::f64:
-    return Subtarget->hasFP64Denormals();
   case MVT::f16:
-    return Subtarget->hasFP16Denormals();
+    return hasFP64FP16Denormals(DAG.getMachineFunction());
   default:
     return false;
   }
@@ -10930,6 +10973,12 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
     // TODO: Do have these for flat. Older targets also had them for buffers.
     unsigned AS = RMW->getPointerAddressSpace();
+
+    if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
+      return RMW->use_empty() ? AtomicExpansionKind::None :
+                                AtomicExpansionKind::CmpXChg;
+    }
+
     return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
       AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
   }
@@ -10956,6 +11005,8 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
 }
 
 static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+  if (!isa<Instruction>(V))
+    return false;
   if (!Visited.insert(V).second)
     return false;
   bool Result = false;