Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp')
-rw-r--r--   llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 824
1 file changed, 703 insertions(+), 121 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 645d05aa9238..01a3e78ea48c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -26,6 +26,7 @@
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsR600.h"
 
 #define DEBUG_TYPE "amdgpu-legalinfo"
 
@@ -134,7 +135,6 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
 static LLT getBitcastRegisterType(const LLT Ty) {
   const unsigned Size = Ty.getSizeInBits();
 
-  LLT CoercedTy;
   if (Size <= 32) {
     // <2 x s8> -> s16
     // <4 x s8> -> s32
@@ -530,13 +530,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
 
   if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
     // Full set of gfx9 features.
-    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+    getActionDefinitionsBuilder({G_ADD, G_SUB})
       .legalFor({S32, S16, V2S16})
+      .clampMaxNumElementsStrict(0, S16, 2)
+      .scalarize(0)
       .minScalar(0, S16)
+      .widenScalarToNextMultipleOf(0, 32)
+      .maxScalar(0, S32);
+
+    getActionDefinitionsBuilder(G_MUL)
+      .legalFor({S32, S16, V2S16})
       .clampMaxNumElementsStrict(0, S16, 2)
+      .scalarize(0)
+      .minScalar(0, S16)
       .widenScalarToNextMultipleOf(0, 32)
-      .maxScalar(0, S32)
-      .scalarize(0);
+      .custom();
+    assert(ST.hasMad64_32());
 
     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
       .legalFor({S32, S16, V2S16}) // Clamp modifier
@@ -546,13 +555,21 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .widenScalarToNextPow2(0, 32)
       .lower();
   } else if (ST.has16BitInsts()) {
-    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+    getActionDefinitionsBuilder({G_ADD, G_SUB})
       .legalFor({S32, S16})
       .minScalar(0, S16)
       .widenScalarToNextMultipleOf(0, 32)
       .maxScalar(0, S32)
       .scalarize(0);
 
+    getActionDefinitionsBuilder(G_MUL)
+      .legalFor({S32, S16})
+      .scalarize(0)
+      .minScalar(0, S16)
+      .widenScalarToNextMultipleOf(0, 32)
+      .custom();
+    assert(ST.hasMad64_32());
+
     // Technically the saturating operations require clamp bit support, but this
     // was introduced at the same time as 16-bit operations.
     getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
@@ -569,12 +586,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .scalarize(0)
       .lower();
   } else {
-    getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+    getActionDefinitionsBuilder({G_ADD, G_SUB})
       .legalFor({S32})
       .widenScalarToNextMultipleOf(0, 32)
       .clampScalar(0, S32, S32)
       .scalarize(0);
 
+    auto &Mul = getActionDefinitionsBuilder(G_MUL)
+      .legalFor({S32})
+      .scalarize(0)
+      .minScalar(0, S32)
+      .widenScalarToNextMultipleOf(0, 32);
+
+    if (ST.hasMad64_32())
+      Mul.custom();
+    else
+      Mul.maxScalar(0, S32);
+
     if (ST.hasIntClamp()) {
       getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
         .legalFor({S32}) // Clamp modifier.
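[Annotation] The new .custom() action above routes every scalar multiply wider than 32 bits (after widening to a multiple of 32) to legalizeMul, which appears later in this diff and decomposes the operation into MAD_64_32 chains. As a standalone illustration of why that primitive suffices, here is a plain C++ model of it and of a truncating 64x64 multiply built from it; this is a behavioral sketch under the assumption that V_MAD_U64_U32 computes a 32x32->64 multiply plus a 64-bit accumulate, not the MIR the legalizer actually emits.

#include <cstdint>

// Model of V_MAD_U64_U32 / G_AMDGPU_MAD_U64_U32: 32x32 -> 64 multiply,
// then a full 64-bit accumulate (the carry-out bit is ignored here).
static uint64_t mad_u64_u32(uint32_t A, uint32_t B, uint64_t Acc) {
  return (uint64_t)A * B + Acc;
}

// Truncating s64 = G_MUL s64, s64 from 32-bit parts: three mads cover all
// partial products that land in the low 64 bits (X1*Y1 is discarded).
static uint64_t mul_s64(uint64_t X, uint64_t Y) {
  uint32_t X0 = (uint32_t)X, X1 = (uint32_t)(X >> 32);
  uint32_t Y0 = (uint32_t)Y, Y1 = (uint32_t)(Y >> 32);
  uint64_t Lo = mad_u64_u32(X0, Y0, 0);       // bits [0, 64) of X0*Y0
  uint32_t Hi = (uint32_t)mad_u64_u32(        // only low 32 bits survive
      X1, Y0, mad_u64_u32(X0, Y1, Lo >> 32));
  return ((uint64_t)Hi << 32) | (uint32_t)Lo;
}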
@@ -632,7 +660,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
                                G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
     .legalFor({{S32, S1}, {S32, S32}})
     .minScalar(0, S32)
-    // TODO: .scalarize(0)
+    .scalarize(0)
     .lower();
 
   getActionDefinitionsBuilder(G_BITCAST)
@@ -767,13 +795,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
     .scalarize(0);
 
-  getActionDefinitionsBuilder(G_FSUB)
+  auto &FSubActions = getActionDefinitionsBuilder(G_FSUB);
+  if (ST.has16BitInsts()) {
+    FSubActions
+      // Use actual fsub instruction
+      .legalFor({S32, S16})
+      // Must use fadd + fneg
+      .lowerFor({S64, V2S16});
+  } else {
+    FSubActions
       // Use actual fsub instruction
       .legalFor({S32})
       // Must use fadd + fneg
-      .lowerFor({S64, S16, V2S16})
-      .scalarize(0)
-      .clampScalar(0, S32, S64);
+      .lowerFor({S64, S16, V2S16});
+  }
+
+  FSubActions
+    .scalarize(0)
+    .clampScalar(0, S32, S64);
 
   // Whether this is legal depends on the floating point mode for the function.
   auto &FMad = getActionDefinitionsBuilder(G_FMAD);
@@ -839,6 +878,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .scalarize(0)
     .lower();
 
+  getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
+    .customFor({S16, S32})
+    .scalarize(0)
+    .lower();
+
   // Lower roundeven into G_FRINT
   getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
     .scalarize(0)
@@ -1292,6 +1336,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
       if (ST.hasGFX90AInsts())
         Atomic.legalFor({{S64, LocalPtr}});
+      if (ST.hasGFX940Insts())
+        Atomic.legalFor({{V2S16, LocalPtr}});
     }
     if (ST.hasAtomicFaddInsts())
       Atomic.legalFor({{S32, GlobalPtr}});
@@ -1505,7 +1551,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
       .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
       .clampMaxNumElements(0, S16, 64);
 
-  // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse
+  // TODO: Don't fully scalarize v2s16 pieces? Or combine out those
   // pre-legalize.
   if (ST.hasVOP3PInsts()) {
     getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
@@ -1756,9 +1802,13 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
     return legalizeFFloor(MI, MRI, B);
   case TargetOpcode::G_BUILD_VECTOR:
    return legalizeBuildVector(MI, MRI, B);
+  case TargetOpcode::G_MUL:
+    return legalizeMul(Helper, MI);
   case TargetOpcode::G_CTLZ:
   case TargetOpcode::G_CTTZ:
     return legalizeCTLZ_CTTZ(MI, MRI, B);
+  case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
+    return legalizeFPTruncRound(MI, B);
   default:
     return false;
   }
@@ -1801,6 +1851,39 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
     return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
   }
 
+  // TODO: can we be smarter about machine pointer info?
+  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+  Register LoadAddr = MRI.createGenericVirtualRegister(
+      LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+  // For code object version 5, private_base and shared_base are passed through
+  // implicit kernargs.
+  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+    AMDGPUTargetLowering::ImplicitParameter Param =
+        AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE
+                                      : AMDGPUTargetLowering::PRIVATE_BASE;
+    uint64_t Offset =
+        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
+
+    Register KernargPtrReg = MRI.createGenericVirtualRegister(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+
+    if (!loadInputValue(KernargPtrReg, B,
+                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
+      return Register();
+
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        PtrInfo,
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        LLT::scalar(32), commonAlignment(Align(64), Offset));
+
+    // Pointer address
+    B.buildPtrAdd(LoadAddr, KernargPtrReg,
+                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+    // Load address
+    return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
+  }
+
   Register QueuePtr = MRI.createGenericVirtualRegister(
       LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
 
@@ -1811,17 +1894,14 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
   // private_segment_aperture_base_hi.
   uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
 
-  // TODO: can we be smarter about machine pointer info?
-  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
   MachineMemOperand *MMO = MF.getMachineMemOperand(
       PtrInfo,
       MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
          MachineMemOperand::MOInvariant,
       LLT::scalar(32), commonAlignment(Align(64), StructOffset));
 
-  Register LoadAddr;
-
-  B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
+  B.buildPtrAdd(LoadAddr, QueuePtr,
+                B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
   return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
 }
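[Annotation] getSegmentAperture now has two sources for the 32-bit aperture value: for code object v5 it is read from the implicit kernarg area at an offset supplied by getImplicitParameterOffset, otherwise from the queue descriptor at 0x40/0x44 as before. A host-side sketch of that selection; the byte offsets for the v5 path are deliberately left symbolic because this sketch does not reproduce the ABI layout.

#include <cstdint>
#include <cstring>

static uint32_t load_u32(const unsigned char *Base, uint64_t Offset) {
  uint32_t V; // 4-byte dereferenceable, invariant load in the real code
  std::memcpy(&V, Base + Offset, sizeof(V));
  return V;
}

static uint32_t segment_aperture(bool IsCodeObjectV5, bool IsLocal,
                                 const unsigned char *ImplicitKernarg,
                                 uint64_t SharedBaseOff,  // symbolic offsets,
                                 uint64_t PrivateBaseOff, // from the TLI query
                                 const unsigned char *QueueDescriptor) {
  if (IsCodeObjectV5) // new path: implicit kernarg block
    return load_u32(ImplicitKernarg, IsLocal ? SharedBaseOff : PrivateBaseOff);
  // pre-v5 path: aperture fields in the queue descriptor
  return load_u32(QueueDescriptor, IsLocal ? 0x40 : 0x44);
}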
@@ -1872,31 +1952,9 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
     return true;
   }
 
-  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
-    // Truncate.
-    B.buildExtract(Dst, Src, 0);
-    MI.eraseFromParent();
-    return true;
-  }
-
-  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
-    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
-
-    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
-    // another. Merge operands are required to be the same type, but creating an
-    // extra ptrtoint would be kind of pointless.
-    auto HighAddr = B.buildConstant(
-        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
-    B.buildMerge(Dst, {Src, HighAddr});
-    MI.eraseFromParent();
-    return true;
-  }
-
-  if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
-    assert(DestAS == AMDGPUAS::LOCAL_ADDRESS ||
-           DestAS == AMDGPUAS::PRIVATE_ADDRESS);
-
+  if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
+      (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
+       DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
     if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
       // Extract low 32-bits of the pointer.
       B.buildExtract(Dst, Src, 0);
@@ -1920,37 +1978,70 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
     return true;
   }
 
-  if (SrcAS != AMDGPUAS::LOCAL_ADDRESS && SrcAS != AMDGPUAS::PRIVATE_ADDRESS)
-    return false;
+  if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
+      (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
+       SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
+    if (!ST.hasFlatAddressSpace())
+      return false;
 
-  if (!ST.hasFlatAddressSpace())
-    return false;
+    Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
+    if (!ApertureReg.isValid())
+      return false;
 
-  Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
-  if (!ApertureReg.isValid())
-    return false;
+    // Coerce the type of the low half of the result so we can use merge_values.
+    Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+
+    // TODO: Should we allow mismatched types but matching sizes in merges to
+    // avoid the ptrtoint?
+    auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
+
+    if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
+      B.buildCopy(Dst, BuildPtr);
+      MI.eraseFromParent();
+      return true;
+    }
+
+    auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
+    auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
 
-  // Coerce the type of the low half of the result so we can use merge_values.
-  Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
+    auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
+                              SegmentNull.getReg(0));
 
-  // TODO: Should we allow mismatched types but matching sizes in merges to
-  // avoid the ptrtoint?
-  auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
+    B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
 
-  if (isKnownNonNull(Src, MRI, TM, SrcAS)) {
-    B.buildCopy(Dst, BuildPtr);
     MI.eraseFromParent();
     return true;
   }
 
-  auto SegmentNull = B.buildConstant(SrcTy, TM.getNullPointerValue(SrcAS));
-  auto FlatNull = B.buildConstant(DstTy, TM.getNullPointerValue(DestAS));
+  if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+      SrcTy.getSizeInBits() == 64) {
+    // Truncate.
+    B.buildExtract(Dst, Src, 0);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+      DstTy.getSizeInBits() == 64) {
+    const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+    uint32_t AddrHiVal = Info->get32BitAddressHighBits();
 
-  auto CmpRes =
-      B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
+    // FIXME: This is a bit ugly due to creating a merge of 2 pointers to
+    // another. Merge operands are required to be the same type, but creating an
+    // extra ptrtoint would be kind of pointless.
+    auto HighAddr = B.buildConstant(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
+    B.buildMerge(Dst, {Src, HighAddr});
+    MI.eraseFromParent();
+    return true;
+  }
 
-  B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
+  DiagnosticInfoUnsupported InvalidAddrSpaceCast(
+      MF.getFunction(), "invalid addrspacecast", B.getDebugLoc());
+  LLVMContext &Ctx = MF.getFunction().getContext();
+  Ctx.diagnose(InvalidAddrSpaceCast);
 
+  B.buildUndef(Dst);
   MI.eraseFromParent();
   return true;
 }
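[Annotation] The reordered local/private -> flat path above builds the flat pointer with a merge and only emits the null-guarding compare/select when the source is not provably non-null. A self-contained model of that data flow; the constants encode the convention visible in the getNullPointerValue calls above (segment null is -1, flat null is 0) and should be treated as assumptions of this sketch.

#include <cstdint>

static uint64_t cast_segment_to_flat(uint32_t SegPtr, uint32_t Aperture,
                                     bool KnownNonNull) {
  // G_MERGE_VALUES: low 32 bits = segment offset, high 32 bits = aperture.
  uint64_t BuildPtr = ((uint64_t)Aperture << 32) | SegPtr;
  if (KnownNonNull)
    return BuildPtr; // buildCopy; no compare/select emitted
  const uint32_t SegmentNull = 0xffffffffu; // getNullPointerValue(LOCAL/PRIVATE)
  const uint64_t FlatNull = 0;              // getNullPointerValue(FLAT)
  return SegPtr != SegmentNull ? BuildPtr : FlatNull; // G_ICMP + G_SELECT
}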
@@ -2811,6 +2902,298 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
   return true;
 }
 
+// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
+//
+// Source and accumulation registers must all be 32-bits.
+//
+// TODO: When the multiply is uniform, we should produce a code sequence
+// that is better suited to instruction selection on the SALU. Instead of
+// the outer loop going over parts of the result, the outer loop should go
+// over parts of one of the factors. This should result in instruction
+// selection that makes full use of S_ADDC_U32 instructions.
+void AMDGPULegalizerInfo::buildMultiply(
+    LegalizerHelper &Helper, MutableArrayRef<Register> Accum,
+    ArrayRef<Register> Src0, ArrayRef<Register> Src1,
+    bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const {
+  // Use (possibly empty) vectors of S1 registers to represent the set of
+  // carries from one pair of positions to the next.
+  using Carry = SmallVector<Register, 2>;
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+
+  const LLT S1 = LLT::scalar(1);
+  const LLT S32 = LLT::scalar(32);
+  const LLT S64 = LLT::scalar(64);
+
+  Register Zero32;
+  Register Zero64;
+
+  auto getZero32 = [&]() -> Register {
+    if (!Zero32)
+      Zero32 = B.buildConstant(S32, 0).getReg(0);
+    return Zero32;
+  };
+  auto getZero64 = [&]() -> Register {
+    if (!Zero64)
+      Zero64 = B.buildConstant(S64, 0).getReg(0);
+    return Zero64;
+  };
+
+  // Merge the given carries into the 32-bit LocalAccum, which is modified
+  // in-place.
+  //
+  // Returns the carry-out, which is a single S1 register or null.
+  auto mergeCarry =
+      [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
+        if (CarryIn.empty())
+          return Register();
+
+        bool HaveCarryOut = true;
+        Register CarryAccum;
+        if (CarryIn.size() == 1) {
+          if (!LocalAccum) {
+            LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+            return Register();
+          }
+
+          CarryAccum = getZero32();
+        } else {
+          CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
+          for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
+            CarryAccum =
+                B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
+                    .getReg(0);
+          }
+
+          if (!LocalAccum) {
+            LocalAccum = getZero32();
+            HaveCarryOut = false;
+          }
+        }
+
+        auto Add =
+            B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
+        LocalAccum = Add.getReg(0);
+        return HaveCarryOut ? Add.getReg(1) : Register();
+      };
+
+  // Build a multiply-add chain to compute
+  //
+  //   LocalAccum + (partial products at DstIndex)
+  //       + (opportunistic subset of CarryIn)
+  //
+  // LocalAccum is an array of one or two 32-bit registers that are updated
+  // in-place. The incoming registers may be null.
+  //
+  // In some edge cases, carry-ins can be consumed "for free". In that case,
+  // the consumed carry bits are removed from CarryIn in-place.
+  auto buildMadChain =
+      [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex,
+          Carry &CarryIn) -> Carry {
+        assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
+               (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
+
+        Carry CarryOut;
+        unsigned j0 = 0;
+
+        // Use plain 32-bit multiplication for the most significant part of the
+        // result by default.
+        if (LocalAccum.size() == 1 &&
+            (!UsePartialMad64_32 || !CarryIn.empty())) {
+          do {
+            unsigned j1 = DstIndex - j0;
+            auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
+            if (!LocalAccum[0]) {
+              LocalAccum[0] = Mul.getReg(0);
+            } else {
+              if (CarryIn.empty()) {
+                LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
+              } else {
+                LocalAccum[0] =
+                    B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
+                        .getReg(0);
+                CarryIn.pop_back();
+              }
+            }
+            ++j0;
+          } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
+        }
+
+        // Build full 64-bit multiplies.
+        if (j0 <= DstIndex) {
+          bool HaveSmallAccum = false;
+          Register Tmp;
+
+          if (LocalAccum[0]) {
+            if (LocalAccum.size() == 1) {
+              Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
+              HaveSmallAccum = true;
+            } else if (LocalAccum[1]) {
+              Tmp = B.buildMerge(S64, LocalAccum).getReg(0);
+              HaveSmallAccum = false;
+            } else {
+              Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
+              HaveSmallAccum = true;
+            }
+          } else {
+            assert(LocalAccum.size() == 1 || !LocalAccum[1]);
+            Tmp = getZero64();
+            HaveSmallAccum = true;
+          }
+
+          do {
+            unsigned j1 = DstIndex - j0;
+            auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
+                                    {Src0[j0], Src1[j1], Tmp});
+            Tmp = Mad.getReg(0);
+            if (!HaveSmallAccum)
+              CarryOut.push_back(Mad.getReg(1));
+            HaveSmallAccum = false;
+            ++j0;
+          } while (j0 <= DstIndex);
+
+          auto Unmerge = B.buildUnmerge(S32, Tmp);
+          LocalAccum[0] = Unmerge.getReg(0);
+          if (LocalAccum.size() > 1)
+            LocalAccum[1] = Unmerge.getReg(1);
+        }
+
+        return CarryOut;
+      };
+
+  // Outer multiply loop, iterating over destination parts from least
+  // significant to most significant parts.
+  //
+  // The columns of the following diagram correspond to the destination parts
+  // affected by one iteration of the outer loop (ignoring boundary
+  // conditions).
+  //
+  //   Dest index relative to 2 * i:      1 0 -1
+  //                                      ------
+  //   Carries from previous iteration:   e o
+  //   Even-aligned partial product sum:  E E .
+  //   Odd-aligned partial product sum:     O O
+  //
+  // 'o' is OddCarry, 'e' is EvenCarry.
+  // EE and OO are computed from partial products via buildMadChain and use
+  // accumulation where possible and appropriate.
+  //
+  Register SeparateOddCarry;
+  Carry EvenCarry;
+  Carry OddCarry;
+
+  for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
+    Carry OddCarryIn = std::move(OddCarry);
+    Carry EvenCarryIn = std::move(EvenCarry);
+    OddCarry.clear();
+    EvenCarry.clear();
+
+    // Partial products at offset 2 * i.
+    if (2 * i < Accum.size()) {
+      auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
+      EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
+    }
+
+    // Partial products at offset 2 * i - 1.
+    if (i > 0) {
+      if (!SeparateOddAlignedProducts) {
+        auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
+        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
+      } else {
+        bool IsHighest = 2 * i >= Accum.size();
+        Register SeparateOddOut[2];
+        auto LocalAccum = makeMutableArrayRef(SeparateOddOut)
+                              .take_front(IsHighest ? 1 : 2);
+        OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
+
+        MachineInstr *Lo;
+
+        if (i == 1) {
+          if (!IsHighest)
+            Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
+          else
+            Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
+        } else {
+          Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
+                            SeparateOddCarry);
+        }
+        Accum[2 * i - 1] = Lo->getOperand(0).getReg();
+
+        if (!IsHighest) {
+          auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
+                                 Lo->getOperand(1).getReg());
+          Accum[2 * i] = Hi.getReg(0);
+          SeparateOddCarry = Hi.getReg(1);
+        }
+      }
+    }
+
+    // Add in the carries from the previous iteration
+    if (i > 0) {
+      if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
+        EvenCarryIn.push_back(CarryOut);
+
+      if (2 * i < Accum.size()) {
+        if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
+          OddCarry.push_back(CarryOut);
+      }
+    }
+  }
+}
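[Annotation] As a reference for what the even/odd chains above must add up to, here is the plain schoolbook algorithm over 32-bit limbs that buildMultiply implements, written as standalone C++ so the arithmetic can be unit-tested. This is a behavioral sketch only; the real code's interleaved chains, carry vectors and MAD_64_32 usage change which instructions compute the sums, not their value.

#include <cassert>
#include <cstdint>
#include <vector>

// Truncating wide multiply: Dst, Src0 and Src1 each have N little-endian
// 32-bit limbs; only the low N limbs of the product are kept, matching
// G_MUL semantics after the unmerge/merge in legalizeMul below.
static void wide_mul(std::vector<uint32_t> &Dst,
                     const std::vector<uint32_t> &Src0,
                     const std::vector<uint32_t> &Src1) {
  const size_t N = Dst.size();
  assert(Src0.size() == N && Src1.size() == N);
  std::vector<uint32_t> Acc(N, 0);
  for (size_t J0 = 0; J0 < N; ++J0) {
    uint32_t Carry = 0;
    for (size_t J1 = 0; J0 + J1 < N; ++J1) {
      // One partial product folded in with its running carry: exactly one
      // mad_u64_u32 plus a carry add. This cannot overflow uint64_t:
      // (2^32-1)^2 + 2*(2^32-1) == 2^64 - 1.
      uint64_t T = (uint64_t)Src0[J0] * Src1[J1] + Acc[J0 + J1] + Carry;
      Acc[J0 + J1] = (uint32_t)T;
      Carry = (uint32_t)(T >> 32);
    }
    // Carry out of the top limb falls off: the product is truncated.
  }
  Dst = Acc;
}

With N == 2 this reproduces the mul_s64 sketch given earlier; the UsePartialMad64_32 and SeparateOddAlignedProducts flags chosen in legalizeMul below only affect scheduling and register-alignment trade-offs, not the result.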
+
+// Custom narrowing of wide multiplies using wide multiply-add instructions.
+//
+// TODO: If the multiply is followed by an addition, we should attempt to
+// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
+bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
+                                      MachineInstr &MI) const {
+  assert(ST.hasMad64_32());
+  assert(MI.getOpcode() == TargetOpcode::G_MUL);
+
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
+  Register DstReg = MI.getOperand(0).getReg();
+  Register Src0 = MI.getOperand(1).getReg();
+  Register Src1 = MI.getOperand(2).getReg();
+
+  LLT Ty = MRI.getType(DstReg);
+  assert(Ty.isScalar());
+
+  unsigned Size = Ty.getSizeInBits();
+  unsigned NumParts = Size / 32;
+  assert((Size % 32) == 0);
+  assert(NumParts >= 2);
+
+  // Whether to use MAD_64_32 for partial products whose high half is
+  // discarded. This avoids some ADD instructions but risks false dependency
+  // stalls on some subtargets in some cases.
+  const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
+
+  // Whether to compute odd-aligned partial products separately. This is
+  // advisable on subtargets where the accumulator of MAD_64_32 must be placed
+  // in an even-aligned VGPR.
+  const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
+
+  LLT S32 = LLT::scalar(32);
+  SmallVector<Register, 2> Src0Parts, Src1Parts;
+  for (unsigned i = 0; i < NumParts; ++i) {
+    Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
+    Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
+  }
+  B.buildUnmerge(Src0Parts, Src0);
+  B.buildUnmerge(Src1Parts, Src1);
+
+  SmallVector<Register, 2> AccumRegs(NumParts);
+  buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
+                SeparateOddAlignedProducts);
+
+  B.buildMerge(DstReg, AccumRegs);
+  MI.eraseFromParent();
+  return true;
+}
+
 // Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
 // ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
 // case with a single min instruction instead of a compare+select.
@@ -2954,6 +3337,89 @@ bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
   return true;
 }
 
+static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
+                                int64_t C) {
+  B.buildConstant(MI.getOperand(0).getReg(), C);
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+    unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+  unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
+  if (MaxID == 0)
+    return replaceWithConstant(B, MI, 0);
+
+  const SIMachineFunctionInfo *MFI =
+      B.getMF().getInfo<SIMachineFunctionInfo>();
+  const ArgDescriptor *Arg;
+  const TargetRegisterClass *ArgRC;
+  LLT ArgTy;
+  std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
+
+  Register DstReg = MI.getOperand(0).getReg();
+  if (!Arg) {
+    // It's undefined behavior if a function marked with the amdgpu-no-*
+    // attributes uses the corresponding intrinsic.
+    B.buildUndef(DstReg);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  if (Arg->isMasked()) {
+    // Don't bother inserting AssertZext for packed IDs since we're emitting the
+    // masking operations anyway.
+    //
+    // TODO: We could assert the top bit is 0 for the source copy.
+    if (!loadInputValue(DstReg, B, ArgType))
+      return false;
+  } else {
+    Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
+    if (!loadInputValue(TmpReg, B, ArgType))
+      return false;
+    B.buildAssertZExt(DstReg, TmpReg, 32 - countLeadingZeros(MaxID));
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
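[Annotation] The G_ASSERT_ZEXT width chosen above, 32 - countLeadingZeros(MaxID), is simply the number of bits needed to represent the largest possible workitem ID. A standalone check of that arithmetic, with __builtin_clz standing in for llvm::countLeadingZeros on a 32-bit value:

#include <cassert>

// Number of low bits that can be nonzero in a value known to be <= MaxID.
static unsigned known_zext_bits(unsigned MaxID) {
  assert(MaxID != 0 && "MaxID == 0 is folded to a constant above");
  return 32 - __builtin_clz(MaxID);
}

// E.g. a launch bound of 1024 threads gives MaxID = 1023, so the ID is
// asserted to fit in known_zext_bits(1023) == 10 bits.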
+
+Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
+                                                     int64_t Offset) const {
+  LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+  Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
+
+  // TODO: If we passed in the base kernel offset we could have a better
+  // alignment than 4, but we don't really need it.
+  if (!loadInputValue(KernArgReg, B,
+                      AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
+    llvm_unreachable("failed to find kernarg segment ptr");
+
+  auto COffset = B.buildConstant(LLT::scalar(64), Offset);
+  // TODO: Should get nuw
+  return B.buildPtrAdd(PtrTy, KernArgReg, COffset).getReg(0);
+}
+
+/// Legalize a value that's loaded from kernel arguments. This is only used by
+/// legacy intrinsics.
+bool AMDGPULegalizerInfo::legalizeKernargMemParameter(MachineInstr &MI,
+                                                      MachineIRBuilder &B,
+                                                      uint64_t Offset,
+                                                      Align Alignment) const {
+  Register DstReg = MI.getOperand(0).getReg();
+
+  assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
+         "unexpected kernarg parameter type");
+
+  Register Ptr = getKernargParameterPtr(B, Offset);
+  MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+  B.buildLoad(DstReg, Ptr, PtrInfo, Align(4),
+              MachineMemOperand::MODereferenceable |
+                  MachineMemOperand::MOInvariant);
+  MI.eraseFromParent();
+  return true;
+}
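[Annotation] legalizeKernargMemParameter reduces each legacy r600 intrinsic handled later in this diff to a 4-byte aligned, invariant s32 load at a fixed offset from the kernarg segment pointer. Behaviorally, under the assumption that the SI::KernelInputOffsets constants name byte offsets into that segment (they are not reproduced here):

#include <cstdint>
#include <cstring>

static uint32_t load_kernarg_u32(const unsigned char *KernargSegment,
                                 uint64_t Offset) {
  uint32_t Value; // s32 result, matching the asserted destination type
  std::memcpy(&Value, KernargSegment + Offset, sizeof(Value));
  return Value;   // G_PTR_ADD + G_LOAD (dereferenceable, invariant)
}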
 
 bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
                                        MachineRegisterInfo &MRI,
                                        MachineIRBuilder &B) const {
@@ -3688,9 +4154,9 @@ bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
   // The remaining operands were used to set fields in the MemOperand on
   // construction.
   for (int I = 6; I > 3; --I)
-    MI.RemoveOperand(I);
+    MI.removeOperand(I);
 
-  MI.RemoveOperand(1); // Remove the intrinsic ID.
+  MI.removeOperand(1); // Remove the intrinsic ID.
   Observer.changedInstr(MI);
   return true;
 }
@@ -4359,7 +4825,7 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
 ///
 /// We don't want to directly select image instructions just yet, but also want
 /// to exposes all register repacking to the legalizer/combiners. We also don't
-/// want a selected instrution entering RegBankSelect. In order to avoid
+/// want a selected instruction entering RegBankSelect. In order to avoid
 /// defining a multitude of intermediate image instructions, directly hack on
 /// the intrinsic's arguments. In cases like a16 addresses, this requires
 /// padding now unnecessary arguments with $noreg.
@@ -4508,6 +4974,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
   //
   // SIShrinkInstructions will convert NSA encodings to non-NSA after register
   // allocation when possible.
+  //
+  // TODO: we can actually allow partial NSA where the final register is a
+  // contiguous set of the remaining addresses.
+  // This could help where there are more addresses than supported.
   const bool UseNSA = ST.hasNSAEncoding() && CorrectedNumVAddrs >= 3 &&
                       CorrectedNumVAddrs <= ST.getNSAMaxSize();
 
@@ -4607,7 +5077,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
       return false;
 
     // TODO: Make sure the TFE operand bit is set.
-    MI.RemoveOperand(1);
+    MI.removeOperand(1);
 
     // Handle the easy case that requires no repack instructions.
     if (Ty == S32) {
@@ -4737,7 +5207,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
   // should be fixed to have a memory operand. Since it's readnone, we're not
   // allowed to add one.
   MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
-  MI.RemoveOperand(1); // Remove intrinsic ID
+  MI.removeOperand(1); // Remove intrinsic ID
 
   // FIXME: When intrinsic definition is fixed, this should have an MMO already.
   // TODO: Should this use datalayout alignment?
@@ -4797,6 +5267,47 @@ bool AMDGPULegalizerInfo::legalizeTrapEndpgm(
 
 bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
     MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+  MachineFunction &MF = B.getMF();
+  const LLT S64 = LLT::scalar(64);
+
+  Register SGPR01(AMDGPU::SGPR0_SGPR1);
+  // For code object version 5, queue_ptr is passed through implicit kernarg.
+  if (AMDGPU::getAmdhsaCodeObjectVersion() == 5) {
+    AMDGPUTargetLowering::ImplicitParameter Param =
+        AMDGPUTargetLowering::QUEUE_PTR;
+    uint64_t Offset =
+        ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
+
+    Register KernargPtrReg = MRI.createGenericVirtualRegister(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+
+    if (!loadInputValue(KernargPtrReg, B,
+                        AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
+      return false;
+
+    // TODO: can we be smarter about machine pointer info?
+    MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        PtrInfo,
+        MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+            MachineMemOperand::MOInvariant,
+        LLT::scalar(64), commonAlignment(Align(64), Offset));
+
+    // Pointer address
+    Register LoadAddr = MRI.createGenericVirtualRegister(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+    B.buildPtrAdd(LoadAddr, KernargPtrReg,
+                  B.buildConstant(LLT::scalar(64), Offset).getReg(0));
+    // Load address
+    Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
+    B.buildCopy(SGPR01, Temp);
+    B.buildInstr(AMDGPU::S_TRAP)
+        .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
+        .addReg(SGPR01, RegState::Implicit);
+    MI.eraseFromParent();
+    return true;
+  }
+
   // Pass queue pointer to trap handler as input, and insert trap instruction
   // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
   Register LiveIn =
@@ -4804,7 +5315,6 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr(
   if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
     return false;
 
-  Register SGPR01(AMDGPU::SGPR0_SGPR1);
   B.buildCopy(SGPR01, LiveIn);
   B.buildInstr(AMDGPU::S_TRAP)
       .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
@@ -4848,6 +5358,8 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
   MachineRegisterInfo &MRI = *B.getMRI();
   const LLT S16 = LLT::scalar(16);
   const LLT S32 = LLT::scalar(32);
+  const LLT V2S16 = LLT::fixed_vector(2, 16);
+  const LLT V3S32 = LLT::fixed_vector(3, 32);
 
   Register DstReg = MI.getOperand(0).getReg();
   Register NodePtr = MI.getOperand(2).getReg();
@@ -4865,61 +5377,98 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
     return false;
   }
 
+  const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
   const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
   const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
   const unsigned NumVDataDwords = 4;
   const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
-  const bool UseNSA =
-      ST.hasNSAEncoding() && NumVAddrDwords <= ST.getNSAMaxSize();
+  const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
+  const bool UseNSA = ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize();
 
   const unsigned BaseOpcodes[2][2] = {
       {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
       {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
        AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
   int Opcode;
   if (UseNSA) {
-    Opcode =
-        AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10NSA,
-                              NumVDataDwords, NumVAddrDwords);
-  } else {
     Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
-                                   AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
-                                   PowerOf2Ceil(NumVAddrDwords));
+                                   IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
+                                               : AMDGPU::MIMGEncGfx10NSA,
+                                   NumVDataDwords, NumVAddrDwords);
+  } else {
+    Opcode = AMDGPU::getMIMGOpcode(
+        BaseOpcodes[Is64][IsA16],
+        IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default : AMDGPU::MIMGEncGfx10Default,
+        NumVDataDwords, PowerOf2Ceil(NumVAddrDwords));
   }
   assert(Opcode != -1);
 
   SmallVector<Register, 12> Ops;
-  if (Is64) {
-    auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
-    Ops.push_back(Unmerge.getReg(0));
-    Ops.push_back(Unmerge.getReg(1));
-  } else {
-    Ops.push_back(NodePtr);
-  }
-  Ops.push_back(RayExtent);
+  if (UseNSA && IsGFX11Plus) {
+    auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
+      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
+      auto Merged = B.buildMerge(
+          V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
+      Ops.push_back(Merged.getReg(0));
+    };
 
-  auto packLanes = [&Ops, &S32, &B](Register Src) {
-    auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
-    Ops.push_back(Unmerge.getReg(0));
-    Ops.push_back(Unmerge.getReg(1));
-    Ops.push_back(Unmerge.getReg(2));
-  };
+    Ops.push_back(NodePtr);
+    Ops.push_back(RayExtent);
+    packLanes(RayOrigin);
 
-  packLanes(RayOrigin);
-  if (IsA16) {
-    auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
-    auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
-    Register R1 = MRI.createGenericVirtualRegister(S32);
-    Register R2 = MRI.createGenericVirtualRegister(S32);
-    Register R3 = MRI.createGenericVirtualRegister(S32);
-    B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
-    B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
-    B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
-    Ops.push_back(R1);
-    Ops.push_back(R2);
-    Ops.push_back(R3);
+    if (IsA16) {
+      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
+      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
+      auto MergedDir = B.buildMerge(
+          V3S32,
+          {B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(0),
+                                                    UnmergeRayDir.getReg(0)}))
+               .getReg(0),
+           B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(1),
+                                                    UnmergeRayDir.getReg(1)}))
+               .getReg(0),
+           B.buildBitcast(S32, B.buildMerge(V2S16, {UnmergeRayInvDir.getReg(2),
+                                                    UnmergeRayDir.getReg(2)}))
+               .getReg(0)});
+      Ops.push_back(MergedDir.getReg(0));
+    } else {
-    packLanes(RayDir);
-    packLanes(RayInvDir);
+      packLanes(RayDir);
+      packLanes(RayInvDir);
+    }
   } else {
+    if (Is64) {
+      auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
+      Ops.push_back(Unmerge.getReg(0));
+      Ops.push_back(Unmerge.getReg(1));
+    } else {
+      Ops.push_back(NodePtr);
+    }
+    Ops.push_back(RayExtent);
+
+    auto packLanes = [&Ops, &S32, &B](Register Src) {
+      auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
+      Ops.push_back(Unmerge.getReg(0));
+      Ops.push_back(Unmerge.getReg(1));
+      Ops.push_back(Unmerge.getReg(2));
+    };
+
+    packLanes(RayOrigin);
+    if (IsA16) {
+      auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
+      auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
+      Register R1 = MRI.createGenericVirtualRegister(S32);
+      Register R2 = MRI.createGenericVirtualRegister(S32);
+      Register R3 = MRI.createGenericVirtualRegister(S32);
+      B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
+      B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
+      B.buildMerge(R3,
+                   {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
+      Ops.push_back(R1);
+      Ops.push_back(R2);
+      Ops.push_back(R3);
+    } else {
+      packLanes(RayDir);
+      packLanes(RayInvDir);
+    }
   }
 
   if (!UseNSA) {
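[Annotation] In the GFX11 A16 path above, each of the three direction address dwords packs one inverse-direction component and one direction component into a <2 x s16> that is then bitcast to s32. Since G_MERGE_VALUES places its first source operand in the least significant bits, the inverse-direction half lands low and the direction half lands high. In plain bit arithmetic (a sketch over the raw half-precision bit patterns):

#include <cstdint>

// One of the three packed address dwords built above:
// bitcast(<2 x s16>{InvDir[i], Dir[i]}) -> s32
static uint32_t pack_a16_component(uint16_t DirBits, uint16_t InvDirBits) {
  return (uint32_t)DirBits << 16 | InvDirBits;
}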
@@ -4946,9 +5495,24 @@ bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
   return true;
 }
 
-static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C) {
-  B.buildConstant(MI.getOperand(0).getReg(), C);
+bool AMDGPULegalizerInfo::legalizeFPTruncRound(MachineInstr &MI,
+                                               MachineIRBuilder &B) const {
+  unsigned Opc;
+  int RoundMode = MI.getOperand(2).getImm();
+
+  if (RoundMode == (int)RoundingMode::TowardPositive)
+    Opc = AMDGPU::G_FPTRUNC_ROUND_UPWARD;
+  else if (RoundMode == (int)RoundingMode::TowardNegative)
+    Opc = AMDGPU::G_FPTRUNC_ROUND_DOWNWARD;
+  else
+    return false;
+
+  B.buildInstr(Opc)
+      .addDef(MI.getOperand(0).getReg())
+      .addUse(MI.getOperand(1).getReg());
+
   MI.eraseFromParent();
   return true;
 }
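[Annotation] legalizeFPTruncRound only accepts the two directed rounding modes for which dedicated pseudo opcodes exist; anything else returns false and fails legalization. The same dispatch in isolation, using a local stand-in enum rather than llvm::RoundingMode so the sketch stays self-contained:

#include <optional>

enum class Mode { TowardZero, NearestTiesToEven, TowardPositive, TowardNegative };
enum class Pseudo { FPTruncRoundUpward, FPTruncRoundDownward };

static std::optional<Pseudo> select_pseudo(Mode RoundMode) {
  if (RoundMode == Mode::TowardPositive)
    return Pseudo::FPTruncRoundUpward;   // G_FPTRUNC_ROUND_UPWARD
  if (RoundMode == Mode::TowardNegative)
    return Pseudo::FPTruncRoundDownward; // G_FPTRUNC_ROUND_DOWNWARD
  return std::nullopt;                   // legalization fails
}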
@@ -5055,22 +5619,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_implicitarg_ptr:
     return legalizeImplicitArgPtr(MI, MRI, B);
   case Intrinsic::amdgcn_workitem_id_x:
-    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 0) == 0)
-      return replaceWithConstant(B, MI, 0);
-
-    return legalizePreloadedArgIntrin(MI, MRI, B,
-                                      AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
+                                       AMDGPUFunctionArgInfo::WORKITEM_ID_X);
   case Intrinsic::amdgcn_workitem_id_y:
-    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 1) == 0)
-      return replaceWithConstant(B, MI, 0);
-
-    return legalizePreloadedArgIntrin(MI, MRI, B,
-                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
+                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
   case Intrinsic::amdgcn_workitem_id_z:
-    if (ST.getMaxWorkitemID(B.getMF().getFunction(), 2) == 0)
-      return replaceWithConstant(B, MI, 0);
-
-    return legalizePreloadedArgIntrin(MI, MRI, B,
-                                      AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+    return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
+                                       AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
   case Intrinsic::amdgcn_workgroup_id_x:
     return legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
@@ -5092,6 +5648,31 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_dispatch_id:
     return legalizePreloadedArgIntrin(MI, MRI, B,
                                       AMDGPUFunctionArgInfo::DISPATCH_ID);
+  case Intrinsic::r600_read_ngroups_x:
+    // TODO: Emit error for hsa
+    return legalizeKernargMemParameter(MI, B,
+                                       SI::KernelInputOffsets::NGROUPS_X);
+  case Intrinsic::r600_read_ngroups_y:
+    return legalizeKernargMemParameter(MI, B,
+                                       SI::KernelInputOffsets::NGROUPS_Y);
+  case Intrinsic::r600_read_ngroups_z:
+    return legalizeKernargMemParameter(MI, B,
+                                       SI::KernelInputOffsets::NGROUPS_Z);
+  case Intrinsic::r600_read_local_size_x:
+    // TODO: Could insert G_ASSERT_ZEXT from s16
+    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_X);
+  case Intrinsic::r600_read_local_size_y:
+    // TODO: Could insert G_ASSERT_ZEXT from s16
+    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Y);
+  // TODO: Could insert G_ASSERT_ZEXT from s16
+  case Intrinsic::r600_read_local_size_z:
+    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::LOCAL_SIZE_Z);
+  case Intrinsic::r600_read_global_size_x:
+    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_X);
+  case Intrinsic::r600_read_global_size_y:
+    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Y);
+  case Intrinsic::r600_read_global_size_z:
+    return legalizeKernargMemParameter(MI, B, SI::KernelInputOffsets::GLOBAL_SIZE_Z);
   case Intrinsic::amdgcn_fdiv_fast:
     return legalizeFDIVFastIntrin(MI, MRI, B);
   case Intrinsic::amdgcn_is_shared:
@@ -5157,7 +5738,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
   case Intrinsic::amdgcn_struct_buffer_atomic_fadd: {
     Register DstReg = MI.getOperand(0).getReg();
-    if (!MRI.use_empty(DstReg) && !ST.hasGFX90AInsts()) {
+    if (!MRI.use_empty(DstReg) &&
+        !AMDGPU::hasAtomicFaddRtnForTy(ST, MRI.getType(DstReg))) {
       Function &F = B.getMF().getFunction();
       DiagnosticInfoUnsupported NoFpRet(
           F, "return versions of fp atomics not supported", B.getDebugLoc(),
