| author | Dimitry Andric <dim@FreeBSD.org> | 2021-02-16 20:13:02 +0000 |
|---|---|---|
| committer | Dimitry Andric <dim@FreeBSD.org> | 2021-02-16 20:13:02 +0000 |
| commit | b60736ec1405bb0a8dd40989f67ef4c93da068ab (patch) | |
| tree | 5c43fbb7c9fc45f0f87e0e6795a86267dbd12f9d /llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | |
| parent | cfca06d7963fa0909f90483b42a6d7d194d01e08 (diff) | |
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp')
| -rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1259 |
1 file changed, 876 insertions(+), 383 deletions(-)
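
Most of this patch adds or tightens rules in the AMDGPU GlobalISel legalizer. Those rules are built from small composable predicates over a `LegalityQuery` (`isSmallOddVector`, `sizeIsMultipleOf32`, and friends in the hunks below). The following sketch models that pattern on plain C++ types; it is not LLVM's actual API (the real `LLT` and `LegalityPredicate` are richer), but it shows how the predicates in this diff compose:

```cpp
// Standalone model of the predicate composition used by the legalizer
// rules in this diff. The real LLVM types (LLT, LegalityQuery,
// LegalityPredicate) carry much more; this only mirrors the pattern.
#include <functional>
#include <iostream>

struct Ty {
  bool IsVector = false;
  unsigned NumElts = 1;
  unsigned EltBits = 32;
  unsigned sizeInBits() const { return NumElts * EltBits; }
};

using Predicate = std::function<bool(const Ty &)>;

// Mirrors the updated isSmallOddVector: odd element count, element size
// strictly between 1 and 32, total size not a multiple of 32. So
// <3 x s16> matches (widen to <4 x s16>), <3 x s1> and <3 x s32> do not.
Predicate isSmallOddVector() {
  return [](const Ty &T) {
    return T.IsVector && T.NumElts % 2 != 0 && T.EltBits > 1 &&
           T.EltBits < 32 && T.sizeInBits() % 32 != 0;
  };
}

// Mirrors the new sizeIsMultipleOf32 helper.
Predicate sizeIsMultipleOf32() {
  return [](const Ty &T) { return T.sizeInBits() % 32 == 0; };
}

// Combinator analogous to LegalityPredicates::all.
Predicate all(Predicate A, Predicate B) {
  return [=](const Ty &T) { return A(T) && B(T); };
}

int main() {
  Ty V3S16{true, 3, 16}, V4S16{true, 4, 16};
  std::cout << isSmallOddVector()(V3S16) << '\n'; // 1: widen to <4 x s16>
  std::cout << isSmallOddVector()(V4S16) << '\n'; // 0: already register-sized
  std::cout << all(sizeIsMultipleOf32(), isSmallOddVector())(V3S16) << '\n'; // 0
}
```
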
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 2976794b49c3..9f359c232981 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -15,18 +15,15 @@ #include "AMDGPU.h" #include "AMDGPUGlobalISelUtils.h" +#include "AMDGPUInstrInfo.h" #include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" -#include "llvm/CodeGen/TargetOpcodes.h" -#include "llvm/CodeGen/ValueTypes.h" -#include "llvm/IR/DerivedTypes.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/IR/DiagnosticInfo.h" -#include "llvm/IR/Type.h" -#include "llvm/Support/Debug.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" #define DEBUG_TYPE "amdgpu-legalinfo" @@ -60,16 +57,30 @@ static LLT getPow2ScalarType(LLT Ty) { return LLT::scalar(Pow2Bits); } +/// \returs true if this is an odd sized vector which should widen by adding an +/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This +/// excludes s1 vectors, which should always be scalarized. static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; - return Ty.isVector() && - Ty.getNumElements() % 2 != 0 && - Ty.getElementType().getSizeInBits() < 32 && + if (!Ty.isVector()) + return false; + + const LLT EltTy = Ty.getElementType(); + const unsigned EltSize = EltTy.getSizeInBits(); + return Ty.getNumElements() % 2 != 0 && + EltSize > 1 && EltSize < 32 && Ty.getSizeInBits() % 32 != 0; }; } +static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + return Ty.getSizeInBits() % 32 == 0; + }; +} + static LegalityPredicate isWideVec16(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; @@ -115,20 +126,32 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { }; } +static LLT getBitcastRegisterType(const LLT Ty) { + const unsigned Size = Ty.getSizeInBits(); + + LLT CoercedTy; + if (Size <= 32) { + // <2 x s8> -> s16 + // <4 x s8> -> s32 + return LLT::scalar(Size); + } + + return LLT::scalarOrVector(Size / 32, 32); +} + static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { const LLT Ty = Query.Types[TypeIdx]; - unsigned Size = Ty.getSizeInBits(); - - LLT CoercedTy; - if (Size <= 32) { - // <2 x s8> -> s16 - // <4 x s8> -> s32 - CoercedTy = LLT::scalar(Size); - } else - CoercedTy = LLT::scalarOrVector(Size / 32, 32); + return std::make_pair(TypeIdx, getBitcastRegisterType(Ty)); + }; +} - return std::make_pair(TypeIdx, CoercedTy); +static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + unsigned Size = Ty.getSizeInBits(); + assert(Size % 32 == 0); + return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32)); }; } @@ -213,7 +236,7 @@ static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, switch (AS) { case AMDGPUAS::PRIVATE_ADDRESS: // FIXME: Private element size. - return 32; + return ST.enableFlatScratch() ? 128 : 32; case AMDGPUAS::LOCAL_ADDRESS: return ST.useDS128() ? 
128 : 64; case AMDGPUAS::GLOBAL_ADDRESS: @@ -243,7 +266,7 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, unsigned RegSize = Ty.getSizeInBits(); unsigned MemSize = Query.MMODescrs[0].SizeInBits; - unsigned Align = Query.MMODescrs[0].AlignInBits; + unsigned AlignBits = Query.MMODescrs[0].AlignInBits; unsigned AS = Query.Types[1].getAddressSpace(); // All of these need to be custom lowered to cast the pointer operand. @@ -286,9 +309,10 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, assert(RegSize >= MemSize); - if (Align < MemSize) { + if (AlignBits < MemSize) { const SITargetLowering *TLI = ST.getTargetLowering(); - if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) + if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, + Align(AlignBits / 8))) return false; } @@ -308,7 +332,12 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) { return false; if (!Ty.isVector()) return true; - unsigned EltSize = Ty.getElementType().getSizeInBits(); + + LLT EltTy = Ty.getElementType(); + if (EltTy.isPointer()) + return true; + + unsigned EltSize = EltTy.getSizeInBits(); return EltSize != 32 && EltSize != 64; } @@ -319,6 +348,66 @@ static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, !loadStoreBitcastWorkaround(Ty); } +/// Return true if a load or store of the type should be lowered with a bitcast +/// to a different type. +static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, + const unsigned MemSizeInBits) { + const unsigned Size = Ty.getSizeInBits(); + if (Size != MemSizeInBits) + return Size <= 32 && Ty.isVector(); + + if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) + return true; + return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && + !isRegisterVectorElementType(Ty.getElementType()); +} + +/// Return true if we should legalize a load by widening an odd sized memory +/// access up to the alignment. Note this case when the memory access itself +/// changes, not the size of the result register. +static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits, + unsigned AlignInBits, unsigned AddrSpace, + unsigned Opcode) { + // We don't want to widen cases that are naturally legal. + if (isPowerOf2_32(SizeInBits)) + return false; + + // If we have 96-bit memory operations, we shouldn't touch them. Note we may + // end up widening these for a scalar load during RegBankSelect, since there + // aren't 96-bit scalar loads. + if (SizeInBits == 96 && ST.hasDwordx3LoadStores()) + return false; + + if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode)) + return false; + + // A load is known dereferenceable up to the alignment, so it's legal to widen + // to it. + // + // TODO: Could check dereferenceable for less aligned cases. + unsigned RoundedSize = NextPowerOf2(SizeInBits); + if (AlignInBits < RoundedSize) + return false; + + // Do not widen if it would introduce a slow unaligned load. 
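
The new `shouldWidenLoad` overloads above reduce to simple size and alignment arithmetic: only non-power-of-two loads are candidates, and the next power of two must already be covered by the known alignment, so the extra bytes are dereferenceable. A freestanding model of just that arithmetic, with the subtarget, address-space, and slow-unaligned checks from the real code omitted (the 96-bit dwordx3 exception is likewise left out):

```cpp
#include <bit>
#include <iostream>

// Model of the size/alignment decision in shouldWidenLoad: only
// non-power-of-2 sizes are candidates, and the rounded-up size must be
// covered by the known alignment. Subtarget, address-space, and
// fast-unaligned-access checks from the real code are omitted here.
bool shouldWidenLoadSketch(unsigned SizeInBits, unsigned AlignInBits) {
  if (std::has_single_bit(SizeInBits))
    return false;                        // already naturally legal
  unsigned RoundedSize = std::bit_ceil(SizeInBits);
  return AlignInBits >= RoundedSize;     // alignment covers the extra bytes
}

int main() {
  std::cout << shouldWidenLoadSketch(96, 128) << '\n'; // 1: widen to 128 bits
  std::cout << shouldWidenLoadSketch(96, 32) << '\n';  // 0: not enough alignment
  std::cout << shouldWidenLoadSketch(64, 64) << '\n';  // 0: power of 2 already
}
```
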
+ const SITargetLowering *TLI = ST.getTargetLowering(); + bool Fast = false; + return TLI->allowsMisalignedMemoryAccessesImpl( + RoundedSize, AddrSpace, Align(AlignInBits / 8), + MachineMemOperand::MOLoad, &Fast) && + Fast; +} + +static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query, + unsigned Opcode) { + if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic) + return false; + + return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits, + Query.MMODescrs[0].AlignInBits, + Query.Types[1].getAddressSpace(), Opcode); +} + AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const GCNTargetMachine &TM) : ST(ST_) { @@ -329,6 +418,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }; const LLT S1 = LLT::scalar(1); + const LLT S8 = LLT::scalar(8); const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); @@ -337,6 +427,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT S512 = LLT::scalar(512); const LLT MaxScalar = LLT::scalar(MaxRegisterSize); + const LLT V2S8 = LLT::vector(2, 8); const LLT V2S16 = LLT::vector(2, 16); const LLT V4S16 = LLT::vector(4, 16); @@ -410,48 +501,103 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more // elements for v3s16 getActionDefinitionsBuilder(G_PHI) - .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256}) + .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256}) .legalFor(AllS32Vectors) .legalFor(AllS64Vectors) .legalFor(AddrSpaces64) .legalFor(AddrSpaces32) - .clampScalar(0, S32, S256) + .legalIf(isPointer(0)) + .clampScalar(0, S16, S256) .widenScalarToNextPow2(0, 32) .clampMaxNumElements(0, S32, 16) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .legalIf(isPointer(0)); + .scalarize(0); - if (ST.hasVOP3PInsts()) { + if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) { + // Full set of gfx9 features. getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16, V2S16}) .clampScalar(0, S16, S32) .clampMaxNumElements(0, S16, 2) .scalarize(0) .widenScalarToNextPow2(0, 32); + + getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT}) + .legalFor({S32, S16, V2S16}) // Clamp modifier + .minScalarOrElt(0, S16) + .clampMaxNumElements(0, S16, 2) + .scalarize(0) + .widenScalarToNextPow2(0, 32) + .lower(); } else if (ST.has16BitInsts()) { getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16}) .clampScalar(0, S16, S32) .scalarize(0) - .widenScalarToNextPow2(0, 32); + .widenScalarToNextPow2(0, 32); // FIXME: min should be 16 + + // Technically the saturating operations require clamp bit support, but this + // was introduced at the same time as 16-bit operations. + getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) + .legalFor({S32, S16}) // Clamp modifier + .minScalar(0, S16) + .scalarize(0) + .widenScalarToNextPow2(0, 16) + .lower(); + + // We're just lowering this, but it helps get a better result to try to + // coerce to the desired type first. + getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) + .minScalar(0, S16) + .scalarize(0) + .lower(); } else { getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32}) .clampScalar(0, S32, S32) .scalarize(0); + + if (ST.hasIntClamp()) { + getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) + .legalFor({S32}) // Clamp modifier. 
+ .scalarize(0) + .minScalarOrElt(0, S32) + .lower(); + } else { + // Clamp bit support was added in VI, along with 16-bit operations. + getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT}) + .minScalar(0, S32) + .scalarize(0) + .lower(); + } + + // FIXME: DAG expansion gets better results. The widening uses the smaller + // range values and goes for the min/max lowering directly. + getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT}) + .minScalar(0, S32) + .scalarize(0) + .lower(); } - // FIXME: Not really legal. Placeholder for custom lowering. getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) .customFor({S32, S64}) .clampScalar(0, S32, S64) .widenScalarToNextPow2(0, 32) .scalarize(0); - getActionDefinitionsBuilder({G_UMULH, G_SMULH}) - .legalFor({S32}) - .clampScalar(0, S32, S32) - .scalarize(0); + auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH}) + .legalFor({S32}) + .maxScalarOrElt(0, S32); + + if (ST.hasVOP3PInsts()) { + Mulh + .clampMaxNumElements(0, S8, 2) + .lowerFor({V2S8}); + } + + Mulh + .scalarize(0) + .lower(); // Report legal for any types we can handle anywhere. For the cases only legal // on the SALU, RegBankSelect will be able to re-legalize. @@ -479,9 +625,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_CONSTANT) .legalFor({S1, S32, S64, S16, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) + .legalIf(isPointer(0)) .clampScalar(0, S32, S64) - .widenScalarToNextPow2(0) - .legalIf(isPointer(0)); + .widenScalarToNextPow2(0); getActionDefinitionsBuilder(G_FCONSTANT) .legalFor({S32, S64, S16}) @@ -505,8 +651,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor({{PrivatePtr, S32}}); getActionDefinitionsBuilder(G_GLOBAL_VALUE) - .unsupportedFor({PrivatePtr}) - .custom(); + .customIf(typeIsNot(0, PrivatePtr)); + setAction({G_BLOCK_ADDR, CodePtr}, Legal); auto &FPOpActions = getActionDefinitionsBuilder( @@ -599,7 +745,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_FPEXT) .legalFor({{S64, S32}, {S32, S16}}) - .lowerFor({{S64, S16}}) // FIXME: Implement + .narrowScalarFor({{S64, S16}}, changeTo(0, S32)) .scalarize(0); getActionDefinitionsBuilder(G_FSUB) @@ -621,6 +767,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, FMad.scalarize(0) .lower(); + auto &FRem = getActionDefinitionsBuilder(G_FREM); + if (ST.has16BitInsts()) { + FRem.customFor({S16, S32, S64}); + } else { + FRem.minScalar(0, S32) + .customFor({S32, S64}); + } + FRem.scalarize(0); + // TODO: Do we need to clamp maximum bitwidth? 
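
The saturating add/subtract rules above either select the hardware clamp modifier or fall back to `.lower()`. What that lowering computes is plain compare-and-clamp arithmetic; a scalar model of the unsigned cases:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>

// Scalar model of what G_UADDSAT / G_USUBSAT compute once lowered.
// With the clamp modifier (gfx9+ paths above) the hardware does this
// in a single instruction instead.
uint32_t uaddsat32(uint32_t A, uint32_t B) {
  uint32_t Sum = A + B;
  return Sum < A ? std::numeric_limits<uint32_t>::max() : Sum; // clamp on wrap
}

uint32_t usubsat32(uint32_t A, uint32_t B) {
  return A > B ? A - B : 0; // clamp at zero instead of wrapping
}

int main() {
  std::cout << uaddsat32(0xFFFFFFF0u, 0x20u) << '\n'; // 4294967295
  std::cout << usubsat32(0x10u, 0x20u) << '\n';       // 0
}
```
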
getActionDefinitionsBuilder(G_TRUNC) .legalIf(isScalar(0)) @@ -648,12 +803,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.has16BitInsts()) IToFP.legalFor({{S16, S16}}); IToFP.clampScalar(1, S32, S64) + .minScalar(0, S32) .scalarize(0) .widenScalarToNextPow2(1); auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) - .customFor({{S64, S64}}); + .customFor({{S64, S64}}) + .narrowScalarFor({{S64, S16}}, changeTo(0, S32)); if (ST.has16BitInsts()) FPToI.legalFor({{S16, S16}}); else @@ -663,7 +820,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .lower(); - getActionDefinitionsBuilder(G_INTRINSIC_ROUND) + // Lower roundeven into G_FRINT + getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) .scalarize(0) .lower(); @@ -685,16 +843,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } - // FIXME: Clamp offset operand. getActionDefinitionsBuilder(G_PTR_ADD) - .legalIf(isPointer(0)) - .scalarize(0); + .legalIf(all(isPointer(0), sameSize(0, 1))) + .scalarize(0) + .scalarSameSizeAs(1, 0); getActionDefinitionsBuilder(G_PTRMASK) - .legalIf(typeInSet(1, {S64, S32})) - .minScalar(1, S32) - .maxScalarIf(sizeIs(0, 32), 1, S32) - .maxScalarIf(sizeIs(0, 64), 1, S64) + .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32}))) + .scalarSameSizeAs(1, 0) .scalarize(0); auto &CmpBuilder = @@ -746,6 +902,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, ExpOps.clampScalar(0, MinScalarFPTy, S32) .scalarize(0); + getActionDefinitionsBuilder(G_FPOWI) + .clampScalar(0, MinScalarFPTy, S32) + .lower(); + // The 64-bit versions produce 32-bit results, but only on the SALU. getActionDefinitionsBuilder(G_CTPOP) .legalFor({{S32, S32}, {S32, S64}}) @@ -870,10 +1030,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Split vector extloads. unsigned MemSize = Query.MMODescrs[0].SizeInBits; - unsigned Align = Query.MMODescrs[0].AlignInBits; + unsigned AlignBits = Query.MMODescrs[0].AlignInBits; if (MemSize < DstTy.getSizeInBits()) - MemSize = std::max(MemSize, Align); + MemSize = std::max(MemSize, AlignBits); if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize) return true; @@ -895,35 +1055,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return true; } - if (Align < MemSize) { + if (AlignBits < MemSize) { const SITargetLowering *TLI = ST.getTargetLowering(); - return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8); + return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, + Align(AlignBits / 8)); } return false; }; - const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, - unsigned Opc) -> bool { - unsigned Size = Query.Types[0].getSizeInBits(); - if (isPowerOf2_32(Size)) - return false; - - if (Size == 96 && ST.hasDwordx3LoadStores()) - return false; - - unsigned AddrSpace = Query.Types[1].getAddressSpace(); - if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) - return false; - - unsigned Align = Query.MMODescrs[0].AlignInBits; - unsigned RoundedSize = NextPowerOf2(Size); - return (Align >= RoundedSize); - }; - - unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; - unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; - unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; + unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32; + unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 
0 : 16; + unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8; // TODO: Refine based on subtargets which support unaligned access or 128-bit // LDS @@ -981,31 +1124,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // 16-bit vector parts. Actions.bitcastIf( [=](const LegalityQuery &Query) -> bool { - const LLT Ty = Query.Types[0]; - const unsigned Size = Ty.getSizeInBits(); - - if (Size != Query.MMODescrs[0].SizeInBits) - return Size <= 32 && Ty.isVector(); - - if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) - return true; - return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && - !isRegisterVectorElementType(Ty.getElementType()); + return shouldBitcastLoadStoreType(ST, Query.Types[0], + Query.MMODescrs[0].SizeInBits); }, bitcastToRegisterType(0)); + if (!IsStore) { + // Widen suitably aligned loads by loading extra bytes. The standard + // legalization actions can't properly express widening memory operands. + Actions.customIf([=](const LegalityQuery &Query) -> bool { + return shouldWidenLoad(ST, Query, G_LOAD); + }); + } + + // FIXME: load/store narrowing should be moved to lower action Actions - .customIf(typeIs(1, Constant32Ptr)) - // Widen suitably aligned loads by loading extra elements. - .moreElementsIf([=](const LegalityQuery &Query) { - const LLT Ty = Query.Types[0]; - return Op == G_LOAD && Ty.isVector() && - shouldWidenLoadResult(Query, Op); - }, moreElementsToNextPow2(0)) - .widenScalarIf([=](const LegalityQuery &Query) { - const LLT Ty = Query.Types[0]; - return Op == G_LOAD && !Ty.isVector() && - shouldWidenLoadResult(Query, Op); - }, widenScalarOrEltToNextPow2(0)) .narrowScalarIf( [=](const LegalityQuery &Query) -> bool { return !Query.Types[0].isVector() && @@ -1111,15 +1243,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // May need relegalization for the scalars. return std::make_pair(0, EltTy); }) - .minScalar(0, S32); + .lowerIfMemSizeNotPow2() + .minScalar(0, S32); if (IsStore) Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32)); - // TODO: Need a bitcast lower option? 
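
The `bitcastIf` action above, through `getBitcastRegisterType` defined earlier in the patch, coerces loaded and stored values into register-friendly shapes: anything up to 32 bits becomes a single scalar, and larger sizes become vectors of `s32`. A standalone model of that mapping, printing LLT-style names:

```cpp
#include <iostream>
#include <string>

// Standalone model of getBitcastRegisterType from this diff:
// sizes <= 32 coerce to one scalar (<2 x s8> -> s16, <4 x s8> -> s32),
// anything larger to a vector of 32-bit elements. Assumes Size is a
// multiple of 32 in the large case, as the real callers guarantee.
std::string bitcastRegisterType(unsigned SizeInBits) {
  if (SizeInBits <= 32)
    return "s" + std::to_string(SizeInBits);
  return "<" + std::to_string(SizeInBits / 32) + " x s32>";
}

int main() {
  std::cout << bitcastRegisterType(16) << '\n'; // s16       (from <2 x s8>)
  std::cout << bitcastRegisterType(96) << '\n'; // <3 x s32> (from <6 x s16>)
}
```
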
Actions .widenScalarToNextPow2(0) - .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); + .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)) + .lower(); } auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD}) @@ -1147,14 +1280,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX, G_ATOMICRMW_UMIN}) .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, - {S64, GlobalPtr}, {S64, LocalPtr}}); + {S64, GlobalPtr}, {S64, LocalPtr}, + {S32, RegionPtr}, {S64, RegionPtr}}); if (ST.hasFlatAddressSpace()) { Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } if (ST.hasLDSFPAtomics()) { getActionDefinitionsBuilder(G_ATOMICRMW_FADD) - .legalFor({{S32, LocalPtr}}); + .legalFor({{S32, LocalPtr}, {S32, RegionPtr}}); } // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output @@ -1207,6 +1341,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Shifts.clampScalar(1, S32, S32); Shifts.clampScalar(0, S16, S64); Shifts.widenScalarToNextPow2(0, 16); + + getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) + .minScalar(0, S16) + .scalarize(0) + .lower(); } else { // Make sure we legalize the shift amount type first, as the general // expansion for the shifted type will produce much worse code if it hasn't @@ -1214,6 +1353,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Shifts.clampScalar(1, S32, S32); Shifts.clampScalar(0, S32, S64); Shifts.widenScalarToNextPow2(0, 32); + + getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT}) + .minScalar(0, S32) + .scalarize(0) + .lower(); } Shifts.scalarize(0); @@ -1227,15 +1371,38 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT EltTy = Query.Types[EltTypeIdx]; const LLT VecTy = Query.Types[VecTypeIdx]; const LLT IdxTy = Query.Types[IdxTypeIdx]; - return (EltTy.getSizeInBits() == 16 || - EltTy.getSizeInBits() % 32 == 0) && - VecTy.getSizeInBits() % 32 == 0 && - VecTy.getSizeInBits() <= MaxRegisterSize && - IdxTy.getSizeInBits() == 32; + const unsigned EltSize = EltTy.getSizeInBits(); + return (EltSize == 32 || EltSize == 64) && + VecTy.getSizeInBits() % 32 == 0 && + VecTy.getSizeInBits() <= MaxRegisterSize && + IdxTy.getSizeInBits() == 32; + }) + .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)), + bitcastToVectorElement32(VecTypeIdx)) + //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1)) + .bitcastIf( + all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)), + [=](const LegalityQuery &Query) { + // For > 64-bit element types, try to turn this into a 64-bit + // element vector since we may be able to do better indexing + // if this is scalar. If not, fall back to 32. + const LLT EltTy = Query.Types[EltTypeIdx]; + const LLT VecTy = Query.Types[VecTypeIdx]; + const unsigned DstEltSize = EltTy.getSizeInBits(); + const unsigned VecSize = VecTy.getSizeInBits(); + + const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32; + return std::make_pair( + VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize)); }) .clampScalar(EltTypeIdx, S32, S64) .clampScalar(VecTypeIdx, S32, S64) - .clampScalar(IdxTypeIdx, S32, S32); + .clampScalar(IdxTypeIdx, S32, S32) + .clampMaxNumElements(VecTypeIdx, S32, 32) + // TODO: Clamp elements for 64-bit vectors? + // It should only be necessary with variable indexes. 
+ // As a last resort, lower to the stack + .lower(); } getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT) @@ -1306,7 +1473,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Clamp maximum size getActionDefinitionsBuilder(G_CONCAT_VECTORS) - .legalIf(isRegisterType(0)); + .legalIf(all(isRegisterType(0), isRegisterType(1))) + .clampMaxNumElements(0, S32, 32) + .clampMaxNumElements(1, S16, 2) // TODO: Make 4? + .clampMaxNumElements(0, S16, 64); // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse // pre-legalize. @@ -1335,6 +1505,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }; auto &Builder = getActionDefinitionsBuilder(Op) + .legalIf(all(isRegisterType(0), isRegisterType(1))) .lowerFor({{S16, V2S16}}) .lowerIf([=](const LegalityQuery &Query) { const LLT BigTy = Query.Types[BigTyIdx]; @@ -1390,19 +1561,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, } return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits)); }) - .legalIf([=](const LegalityQuery &Query) { - const LLT &BigTy = Query.Types[BigTyIdx]; - const LLT &LitTy = Query.Types[LitTyIdx]; - - if (BigTy.isVector() && BigTy.getSizeInBits() < 32) - return false; - if (LitTy.isVector() && LitTy.getSizeInBits() < 32) - return false; - - return BigTy.getSizeInBits() % 16 == 0 && - LitTy.getSizeInBits() % 16 == 0 && - BigTy.getSizeInBits() <= MaxRegisterSize; - }) // Any vectors left are the wrong size. Scalarize them. .scalarize(0) .scalarize(1); @@ -1427,12 +1585,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, SextInReg.lowerFor({{S32}, {S64}}); } - // FIXME: Placeholder rule. Really depends on whether the clamp modifier is - // available, and is selectively legal for s16, s32, v2s16. - getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT}) - .scalarize(0) - .clampScalar(0, S16, S32); - SextInReg .scalarize(0) .clampScalar(0, S32, S64) @@ -1446,11 +1598,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder(G_READCYCLECOUNTER) .legalFor({S64}); + getActionDefinitionsBuilder(G_FENCE) + .alwaysLegal(); + getActionDefinitionsBuilder({ // TODO: Verify V_BFI_B32 is generated from expanded bit ops G_FCOPYSIGN, G_ATOMIC_CMPXCHG_WITH_SUCCESS, + G_ATOMICRMW_NAND, + G_ATOMICRMW_FSUB, G_READ_REGISTER, G_WRITE_REGISTER, @@ -1474,7 +1631,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &B = Helper.MIRBuilder; MachineRegisterInfo &MRI = *B.getMRI(); - GISelChangeObserver &Observer = Helper.Observer; switch (MI.getOpcode()) { case TargetOpcode::G_ADDRSPACE_CAST: @@ -1483,6 +1639,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeFrint(MI, MRI, B); case TargetOpcode::G_FCEIL: return legalizeFceil(MI, MRI, B); + case TargetOpcode::G_FREM: + return legalizeFrem(MI, MRI, B); case TargetOpcode::G_INTRINSIC_TRUNC: return legalizeIntrinsicTrunc(MI, MRI, B); case TargetOpcode::G_SITOFP: @@ -1510,7 +1668,7 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, case TargetOpcode::G_GLOBAL_VALUE: return legalizeGlobalValue(MI, MRI, B); case TargetOpcode::G_LOAD: - return legalizeLoad(MI, MRI, B, Observer); + return legalizeLoad(Helper, MI); case TargetOpcode::G_FMAD: return legalizeFMad(MI, MRI, B); case TargetOpcode::G_FDIV: @@ -1580,8 +1738,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Register QueuePtr = MRI.createGenericVirtualRegister( 
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); - const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr)) + if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return Register(); // Offset into amd_queue_t for group_segment_aperture_base_hi / @@ -1623,8 +1780,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( const AMDGPUTargetMachine &TM = static_cast<const AMDGPUTargetMachine &>(MF.getTarget()); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) { + if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) { MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST)); return true; } @@ -1721,6 +1877,7 @@ bool AMDGPULegalizerInfo::legalizeFrint( auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2); B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2); + MI.eraseFromParent(); return true; } @@ -1752,7 +1909,24 @@ bool AMDGPULegalizerInfo::legalizeFceil( return true; } -static MachineInstrBuilder extractF64Exponent(unsigned Hi, +bool AMDGPULegalizerInfo::legalizeFrem( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register DstReg = MI.getOperand(0).getReg(); + Register Src0Reg = MI.getOperand(1).getReg(); + Register Src1Reg = MI.getOperand(2).getReg(); + auto Flags = MI.getFlags(); + LLT Ty = MRI.getType(DstReg); + + auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags); + auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags); + auto Neg = B.buildFNeg(Ty, Trunc, Flags); + B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags); + MI.eraseFromParent(); + return true; +} + +static MachineInstrBuilder extractF64Exponent(Register Hi, MachineIRBuilder &B) { const unsigned FractBits = 52; const unsigned ExpBits = 11; @@ -1762,6 +1936,7 @@ static MachineInstrBuilder extractF64Exponent(unsigned Hi, auto Const1 = B.buildConstant(S32, ExpBits); auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false) + .addUse(Hi) .addUse(Const0.getReg(0)) .addUse(Const1.getReg(0)); @@ -1809,6 +1984,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0); B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1); + MI.eraseFromParent(); return true; } @@ -1907,10 +2083,11 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt( // FIXME: Artifact combiner probably should have replaced the truncated // constant before this, so we shouldn't need // getConstantVRegValWithLookThrough. - Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( - MI.getOperand(2).getReg(), MRI); - if (!IdxVal) // Dynamic case will be selected to register indexing. + Optional<ValueAndVReg> MaybeIdxVal = + getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI); + if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. 
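
`legalizeFrem` above expands `G_FREM` as `fma(-trunc(x/y), y, x)`, i.e. `x - trunc(x/y) * y` with the multiply and subtract fused. A scalar model for comparison against `fmod`; since the expansion divides first, it can differ from `fmod` in the last ulp for extreme operand ratios:

```cpp
#include <cmath>
#include <iostream>

// Scalar model of the new legalizeFrem expansion:
//   frem(x, y) = fma(-trunc(x / y), y, x)
// which is x - trunc(x/y)*y with the multiply and subtract fused.
double fremExpansion(double X, double Y) {
  return std::fma(-std::trunc(X / Y), Y, X);
}

int main() {
  std::cout << fremExpansion(5.5, 2.0) << " vs " << std::fmod(5.5, 2.0) << '\n';
  std::cout << fremExpansion(-7.0, 3.0) << " vs " << std::fmod(-7.0, 3.0) << '\n';
}
```
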
return true; + const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); Register Dst = MI.getOperand(0).getReg(); Register Vec = MI.getOperand(1).getReg(); @@ -1919,8 +2096,8 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt( LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Dst)); - if (IdxVal->Value < VecTy.getNumElements()) - B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); + if (IdxVal < VecTy.getNumElements()) + B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits()); else B.buildUndef(Dst); @@ -1938,11 +2115,12 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( // FIXME: Artifact combiner probably should have replaced the truncated // constant before this, so we shouldn't need // getConstantVRegValWithLookThrough. - Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( - MI.getOperand(3).getReg(), MRI); - if (!IdxVal) // Dynamic case will be selected to register indexing. + Optional<ValueAndVReg> MaybeIdxVal = + getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); + if (!MaybeIdxVal) // Dynamic case will be selected to register indexing. return true; + int64_t IdxVal = MaybeIdxVal->Value.getSExtValue(); Register Dst = MI.getOperand(0).getReg(); Register Vec = MI.getOperand(1).getReg(); Register Ins = MI.getOperand(2).getReg(); @@ -1951,8 +2129,8 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Ins)); - if (IdxVal->Value < VecTy.getNumElements()) - B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); + if (IdxVal < VecTy.getNumElements()) + B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits()); else B.buildUndef(Dst); @@ -2043,7 +2221,9 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, // variable, but since the encoding of $symbol starts 4 bytes after the start // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to - // compute the correct address. + // compute the correct address. Similarly for the s_addc_u32 instruction, the + // encoding of $symbol starts 12 bytes after the start of the s_add_u32 + // instruction. LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); @@ -2057,7 +2237,7 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, if (GAFlags == SIInstrInfo::MO_NONE) MIB.addImm(0); else - MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1); + MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1); B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass); @@ -2078,7 +2258,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { - if (!MFI->isEntryFunction()) { + if (!MFI->isModuleEntryFunction()) { const Function &Fn = MF.getFunction(); DiagnosticInfoUnsupported BadLDSDecl( Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), @@ -2104,6 +2284,25 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( return true; // Leave in place; } + if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) { + Type *Ty = GV->getValueType(); + // HIP uses an unsized array `extern __shared__ T s[]` or similar + // zero-sized type in other languages to declare the dynamic shared + // memory which size is not known at the compile time. 
They will be + // allocated by the runtime and placed directly after the static + // allocated ones. They all share the same offset. + if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) { + // Adjust alignment for that dynamic shared memory array. + MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV)); + LLT S32 = LLT::scalar(32); + auto Sz = + B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false); + B.buildIntToPtr(DstReg, Sz); + MI.eraseFromParent(); + return true; + } + } + B.buildConstant( DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); @@ -2154,15 +2353,90 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( return true; } -bool AMDGPULegalizerInfo::legalizeLoad( - MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B, GISelChangeObserver &Observer) const { - LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); - auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); - Observer.changingInstr(MI); - MI.getOperand(1).setReg(Cast.getReg(0)); - Observer.changedInstr(MI); - return true; +static LLT widenToNextPowerOf2(LLT Ty) { + if (Ty.isVector()) + return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements())); + return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits())); +} + +bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineIRBuilder &B = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *B.getMRI(); + GISelChangeObserver &Observer = Helper.Observer; + + Register PtrReg = MI.getOperand(1).getReg(); + LLT PtrTy = MRI.getType(PtrReg); + unsigned AddrSpace = PtrTy.getAddressSpace(); + + if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { + LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); + auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg); + Observer.changingInstr(MI); + MI.getOperand(1).setReg(Cast.getReg(0)); + Observer.changedInstr(MI); + return true; + } + + Register ValReg = MI.getOperand(0).getReg(); + LLT ValTy = MRI.getType(ValReg); + + MachineMemOperand *MMO = *MI.memoperands_begin(); + const unsigned ValSize = ValTy.getSizeInBits(); + const unsigned MemSize = 8 * MMO->getSize(); + const Align MemAlign = MMO->getAlign(); + const unsigned AlignInBits = 8 * MemAlign.value(); + + // Widen non-power-of-2 loads to the alignment if needed + if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) { + const unsigned WideMemSize = PowerOf2Ceil(MemSize); + + // This was already the correct extending load result type, so just adjust + // the memory type. + if (WideMemSize == ValSize) { + MachineFunction &MF = B.getMF(); + + MachineMemOperand *WideMMO = + MF.getMachineMemOperand(MMO, 0, WideMemSize / 8); + Observer.changingInstr(MI); + MI.setMemRefs(MF, {WideMMO}); + Observer.changedInstr(MI); + return true; + } + + // Don't bother handling edge case that should probably never be produced. + if (ValSize > WideMemSize) + return false; + + LLT WideTy = widenToNextPowerOf2(ValTy); + + Register WideLoad; + if (!WideTy.isVector()) { + WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); + B.buildTrunc(ValReg, WideLoad).getReg(0); + } else { + // Extract the subvector. + + if (isRegisterType(ValTy)) { + // If this a case where G_EXTRACT is legal, use it. + // (e.g. <3 x s32> -> <4 x s32>) + WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0); + B.buildExtract(ValReg, WideLoad, 0); + } else { + // For cases where the widened type isn't a nice register value, unmerge + // from a widened register (e.g. 
<3 x s16> -> <4 x s16>) + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + WideLoad = Helper.widenWithUnmerge(WideTy, ValReg); + B.setInsertPt(B.getMBB(), MI.getIterator()); + B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0); + } + } + + MI.eraseFromParent(); + return true; + } + + return false; } bool AMDGPULegalizerInfo::legalizeFMad( @@ -2194,8 +2468,7 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( Register CmpVal = MI.getOperand(2).getReg(); Register NewVal = MI.getOperand(3).getReg(); - assert(SITargetLowering::isFlatGlobalAddrSpace( - MRI.getType(PtrReg).getAddressSpace()) && + assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) && "this should not have been custom lowered"); LLT ValTy = MRI.getType(CmpVal); @@ -2364,23 +2637,42 @@ bool AMDGPULegalizerInfo::legalizeBuildVector( return true; } +// Check that this is a G_XOR x, -1 +static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) { + if (MI.getOpcode() != TargetOpcode::G_XOR) + return false; + auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI); + return ConstVal && *ConstVal == -1; +} + // Return the use branch instruction, otherwise null if the usage is invalid. -static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineInstr *&Br, - MachineBasicBlock *&UncondBrTarget) { +static MachineInstr * +verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, + MachineBasicBlock *&UncondBrTarget, bool &Negated) { Register CondDef = MI.getOperand(0).getReg(); if (!MRI.hasOneNonDBGUse(CondDef)) return nullptr; MachineBasicBlock *Parent = MI.getParent(); - MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); - if (UseMI.getParent() != Parent || - UseMI.getOpcode() != AMDGPU::G_BRCOND) + MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef); + + if (isNot(MRI, *UseMI)) { + Register NegatedCond = UseMI->getOperand(0).getReg(); + if (!MRI.hasOneNonDBGUse(NegatedCond)) + return nullptr; + + // We're deleting the def of this value, so we need to remove it. + UseMI->eraseFromParent(); + + UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond); + Negated = true; + } + + if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND) return nullptr; // Make sure the cond br is followed by a G_BR, or is the last instruction. - MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); + MachineBasicBlock::iterator Next = std::next(UseMI->getIterator()); if (Next == Parent->end()) { MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. @@ -2393,84 +2685,19 @@ static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, UncondBrTarget = Br->getOperand(0).getMBB(); } - return &UseMI; -} - -Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, - MachineRegisterInfo &MRI, - Register LiveIn, - Register PhyReg) const { - assert(PhyReg.isPhysical() && "Physical register expected"); - - // Insert the live-in copy, if required, by defining destination virtual - // register. - // FIXME: It seems EmitLiveInCopies isn't called anywhere? 
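
The dynamic-LDS branch added to `legalizeGlobalValue` above keys off zero-sized external LDS globals, which is how HIP expresses dynamically sized shared memory. A HIP-side illustration of the construct it handles; the kernel and launch parameters here are made up for the example:

```cpp
// HIP source construct that produces the zero-sized external LDS global
// handled above: the array's size is supplied at launch time, so the
// compiler lowers its address to the end of the statically allocated
// LDS (the amdgcn_groupstaticsize intrinsic in the legalizer code).
#include <hip/hip_runtime.h>

extern __shared__ float dynSmem[]; // unsized: size chosen per launch

__global__ void scaleRow(const float *In, float *Out, int N, float K) {
  int I = threadIdx.x;
  if (I < N) {
    dynSmem[I] = In[I] * K; // lives in the dynamic LDS block
    Out[I] = dynSmem[I];
  }
}
// Launched with the dynamic LDS byte count as the third launch argument:
//   hipLaunchKernelGGL(scaleRow, dim3(1), dim3(N),
//                      N * sizeof(float) /* dynamic LDS bytes */, 0,
//                      In, Out, N, K);
```
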
- if (!MRI.getVRegDef(LiveIn)) { - // FIXME: Should have scoped insert pt - MachineBasicBlock &OrigInsBB = B.getMBB(); - auto OrigInsPt = B.getInsertPt(); - - MachineBasicBlock &EntryMBB = B.getMF().front(); - EntryMBB.addLiveIn(PhyReg); - B.setInsertPt(EntryMBB, EntryMBB.begin()); - B.buildCopy(LiveIn, PhyReg); - - B.setInsertPt(OrigInsBB, OrigInsPt); - } - - return LiveIn; -} - -Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, - MachineRegisterInfo &MRI, - Register PhyReg, LLT Ty, - bool InsertLiveInCopy) const { - assert(PhyReg.isPhysical() && "Physical register expected"); - - // Get or create virtual live-in regester - Register LiveIn = MRI.getLiveInVirtReg(PhyReg); - if (!LiveIn) { - LiveIn = MRI.createGenericVirtualRegister(Ty); - MRI.addLiveIn(PhyReg, LiveIn); - } - - // When the actual true copy required is from virtual register to physical - // register (to be inserted later), live-in copy insertion from physical - // to register virtual register is not required - if (!InsertLiveInCopy) - return LiveIn; - - return insertLiveInCopy(B, MRI, LiveIn, PhyReg); -} - -const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( - MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { - const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); - const ArgDescriptor *Arg; - const TargetRegisterClass *RC; - LLT ArgTy; - std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType); - if (!Arg) { - LLVM_DEBUG(dbgs() << "Required arg register missing\n"); - return nullptr; - } - return Arg; + return UseMI; } bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, - const ArgDescriptor *Arg) const { - if (!Arg->isRegister() || !Arg->getRegister().isValid()) - return false; // TODO: Handle these - - Register SrcReg = Arg->getRegister(); - assert(SrcReg.isPhysical() && "Physical register expected"); + const ArgDescriptor *Arg, + const TargetRegisterClass *ArgRC, + LLT ArgTy) const { + MCRegister SrcReg = Arg->getRegister(); + assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected"); assert(DstReg.isVirtual() && "Virtual register expected"); - MachineRegisterInfo &MRI = *B.getMRI(); - - LLT Ty = MRI.getType(DstReg); - Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); - + Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC, + ArgTy); if (Arg->isMasked()) { // TODO: Should we try to emit this once in the entry block? 
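
When an `ArgDescriptor` is masked, the value occupies a contiguous bitfield of the incoming physical register (as with packed workitem IDs), and the shift-and-mask that `loadInputValue` emits amounts to the following; the mask value used here is hypothetical:

```cpp
#include <bit>
#include <cstdint>
#include <iostream>

// Model of what loadInputValue emits for a masked ArgDescriptor: the
// value sits in a contiguous bitfield of the incoming register, so it
// is recovered with a shift and a mask.
uint32_t extractMaskedArg(uint32_t Reg, uint32_t Mask) {
  unsigned Shift = std::countr_zero(Mask);
  return (Reg & Mask) >> Shift;
}

int main() {
  const uint32_t Mask = 0x000FFC00u;     // hypothetical bits 10..19
  uint32_t Reg = (37u << 10) | 0x3FFu;   // 37 in the field, noise below it
  std::cout << extractMaskedArg(Reg, Mask) << '\n'; // 37
}
```
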
const LLT S32 = LLT::scalar(32); @@ -2492,15 +2719,24 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, return true; } -bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( - MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, +bool AMDGPULegalizerInfo::loadInputValue( + Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + const ArgDescriptor *Arg; + const TargetRegisterClass *ArgRC; + LLT ArgTy; + std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); - const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); - if (!Arg) - return false; + if (!Arg->isRegister() || !Arg->getRegister().isValid()) + return false; // TODO: Handle these + return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy); +} - if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) +bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { + if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType)) return false; MI.eraseFromParent(); @@ -2516,9 +2752,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, LLT S32 = LLT::scalar(32); LLT S64 = LLT::scalar(64); - if (legalizeFastUnsafeFDIV(MI, MRI, B)) - return true; - if (DstTy == S16) return legalizeFDIV16(MI, MRI, B); if (DstTy == S32) @@ -2813,22 +3046,14 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); - uint16_t Flags = MI.getFlags(); - LLT ResTy = MRI.getType(Res); - LLT S32 = LLT::scalar(32); - LLT S64 = LLT::scalar(64); const MachineFunction &MF = B.getMF(); - bool Unsafe = - MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp); - - if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64) - return false; + bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || + MI.getFlag(MachineInstr::FmAfn); - if (!Unsafe && ResTy == S32 && - MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) + if (!AllowInaccurateRcp) return false; if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { @@ -2855,22 +3080,58 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, } // x / y -> x * (1.0 / y) - if (Unsafe) { - auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) - .addUse(RHS) - .setMIFlags(Flags); - B.buildFMul(Res, LHS, RCP, Flags); + auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) + .addUse(RHS) + .setMIFlags(Flags); + B.buildFMul(Res, LHS, RCP, Flags); - MI.eraseFromParent(); - return true; - } + MI.eraseFromParent(); + return true; +} - return false; +bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + Register Res = MI.getOperand(0).getReg(); + Register X = MI.getOperand(1).getReg(); + Register Y = MI.getOperand(2).getReg(); + uint16_t Flags = MI.getFlags(); + LLT ResTy = MRI.getType(Res); + + const MachineFunction &MF = B.getMF(); + bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath || + MI.getFlag(MachineInstr::FmAfn); + + if (!AllowInaccurateRcp) + return false; + + auto NegY = B.buildFNeg(ResTy, Y); + auto One = B.buildFConstant(ResTy, 1.0); + + auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false) + .addUse(Y) + .setMIFlags(Flags); + + auto Tmp0 = 
B.buildFMA(ResTy, NegY, R, One); + R = B.buildFMA(ResTy, Tmp0, R, R); + + auto Tmp1 = B.buildFMA(ResTy, NegY, R, One); + R = B.buildFMA(ResTy, Tmp1, R, R); + + auto Ret = B.buildFMul(ResTy, X, R); + auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X); + + B.buildFMA(Res, Tmp2, R, Ret); + MI.eraseFromParent(); + return true; } bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + if (legalizeFastUnsafeFDIV(MI, MRI, B)) + return true; + Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); @@ -2933,6 +3194,9 @@ static void toggleSPDenormMode(bool Enable, bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + if (legalizeFastUnsafeFDIV(MI, MRI, B)) + return true; + Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); @@ -2999,6 +3263,9 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + if (legalizeFastUnsafeFDIV64(MI, MRI, B)) + return true; + Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); @@ -3109,35 +3376,118 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, return true; } -bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { +// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction. +// FIXME: Why do we handle this one but not other removed instructions? +// +// Reciprocal square root. The clamp prevents infinite results, clamping +// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to +// +-max_float. +bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) + return true; + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(2).getReg(); + auto Flags = MI.getFlags(); + + LLT Ty = MRI.getType(Dst); + + const fltSemantics *FltSemantics; + if (Ty == LLT::scalar(32)) + FltSemantics = &APFloat::IEEEsingle(); + else if (Ty == LLT::scalar(64)) + FltSemantics = &APFloat::IEEEdouble(); + else + return false; + + auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false) + .addUse(Src) + .setMIFlags(Flags); + + // We don't need to concern ourselves with the snan handling difference, since + // the rsq quieted (or not) so use the one which will directly select. const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); - if (!MFI->isEntryFunction()) { - return legalizePreloadedArgIntrin(MI, MRI, B, - AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); + const bool UseIEEE = MFI->getMode().IEEE; + + auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics)); + auto ClampMax = UseIEEE ? 
B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) : + B.buildFMinNum(Ty, Rsq, MaxFlt, Flags); + + auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true)); + + if (UseIEEE) + B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags); + else + B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags); + MI.eraseFromParent(); + return true; +} + +static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) { + switch (IID) { + case Intrinsic::amdgcn_ds_fadd: + return AMDGPU::G_ATOMICRMW_FADD; + case Intrinsic::amdgcn_ds_fmin: + return AMDGPU::G_AMDGPU_ATOMIC_FMIN; + case Intrinsic::amdgcn_ds_fmax: + return AMDGPU::G_AMDGPU_ATOMIC_FMAX; + default: + llvm_unreachable("not a DS FP intrinsic"); } +} + +bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper, + MachineInstr &MI, + Intrinsic::ID IID) const { + GISelChangeObserver &Observer = Helper.Observer; + Observer.changingInstr(MI); + + MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID))); + + // The remaining operands were used to set fields in the MemOperand on + // construction. + for (int I = 6; I > 3; --I) + MI.RemoveOperand(I); + + MI.RemoveOperand(1); // Remove the intrinsic ID. + Observer.changedInstr(MI); + return true; +} +bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { uint64_t Offset = ST.getTargetLowering()->getImplicitParameterOffset( B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); - Register DstReg = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(DstReg); LLT IdxTy = LLT::scalar(DstTy.getSizeInBits()); - const ArgDescriptor *Arg; - const TargetRegisterClass *RC; - LLT ArgTy; - std::tie(Arg, RC, ArgTy) = - MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); - if (!Arg) - return false; - Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy); - if (!loadInputValue(KernargPtrReg, B, Arg)) + if (!loadInputValue(KernargPtrReg, B, + AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR)) return false; + // FIXME: This should be nuw B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0)); + return true; +} + +bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + if (!MFI->isEntryFunction()) { + return legalizePreloadedArgIntrin(MI, MRI, B, + AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); + } + + Register DstReg = MI.getOperand(0).getReg(); + if (!getImplicitArgPtr(DstReg, MRI, B)) + return false; + MI.eraseFromParent(); return true; } @@ -3147,7 +3497,9 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, MachineIRBuilder &B, unsigned AddrSpace) const { Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); - auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); + auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg()); + Register Hi32 = Unmerge.getReg(1); + B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); MI.eraseFromParent(); return true; @@ -3165,11 +3517,10 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, const unsigned MaxImm = 4095; Register BaseReg; unsigned TotalConstOffset; - MachineInstr *OffsetDef; const LLT S32 = LLT::scalar(32); - std::tie(BaseReg, TotalConstOffset, OffsetDef) - = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); + std::tie(BaseReg, TotalConstOffset) = + AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); unsigned 
ImmOffset = TotalConstOffset; @@ -3205,24 +3556,58 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, /// Handle register layout difference for f16 images for some subtargets. Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, - Register Reg) const { - if (!ST.hasUnpackedD16VMem()) - return Reg; - + Register Reg, + bool ImageStore) const { const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); LLT StoreVT = MRI.getType(Reg); assert(StoreVT.isVector() && StoreVT.getElementType() == S16); - auto Unmerge = B.buildUnmerge(S16, Reg); + if (ST.hasUnpackedD16VMem()) { + auto Unmerge = B.buildUnmerge(S16, Reg); - SmallVector<Register, 4> WideRegs; - for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) - WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); + SmallVector<Register, 4> WideRegs; + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0)); + + int NumElts = StoreVT.getNumElements(); + + return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); + } + + if (ImageStore && ST.hasImageStoreD16Bug()) { + if (StoreVT.getNumElements() == 2) { + SmallVector<Register, 4> PackedRegs; + Reg = B.buildBitcast(S32, Reg).getReg(0); + PackedRegs.push_back(Reg); + PackedRegs.resize(2, B.buildUndef(S32).getReg(0)); + return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0); + } + + if (StoreVT.getNumElements() == 3) { + SmallVector<Register, 4> PackedRegs; + auto Unmerge = B.buildUnmerge(S16, Reg); + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + PackedRegs.push_back(Unmerge.getReg(I)); + PackedRegs.resize(6, B.buildUndef(S16).getReg(0)); + Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0); + return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0); + } + + if (StoreVT.getNumElements() == 4) { + SmallVector<Register, 4> PackedRegs; + Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0); + auto Unmerge = B.buildUnmerge(S32, Reg); + for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I) + PackedRegs.push_back(Unmerge.getReg(I)); + PackedRegs.resize(4, B.buildUndef(S32).getReg(0)); + return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0); + } - int NumElts = StoreVT.getNumElements(); + llvm_unreachable("invalid data type"); + } - return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); + return Reg; } Register AMDGPULegalizerInfo::fixStoreSourceType( @@ -3513,6 +3898,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD; default: llvm_unreachable("unhandled atomic opcode"); } @@ -3523,12 +3911,20 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, Intrinsic::ID IID) const { const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; + const bool HasReturn = MI.getNumExplicitDefs() != 0; - Register Dst = MI.getOperand(0).getReg(); - Register VData = MI.getOperand(2).getReg(); + Register Dst; - Register CmpVal; int OpOffset = 0; + if (HasReturn) { + // A few FP atomics do not support return values. 
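
Stepping back to `legalizeFastUnsafeFDIV64`, added earlier in this diff: it computes `x / y` from an approximate `rcp(y)` refined by two Newton-Raphson steps expressed as FMAs, plus a final residual correction. A double-precision model of the same sequence, with a single-precision reciprocal standing in for the hardware estimate:

```cpp
#include <cmath>
#include <cstdio>

// Scalar model of the FMA sequence emitted by legalizeFastUnsafeFDIV64:
// refine an approximate reciprocal of Y with two Newton-Raphson steps,
// multiply by X, then apply one residual correction.
double fastFDiv64(double X, double Y) {
  double NegY = -Y;
  double R = (double)(1.0f / (float)Y);          // approximate rcp(y) seed
  double Tmp0 = std::fma(NegY, R, 1.0);          // e0 = 1 - y*r
  R = std::fma(Tmp0, R, R);                      // r += r*e0
  double Tmp1 = std::fma(NegY, R, 1.0);          // e1 = 1 - y*r
  R = std::fma(Tmp1, R, R);                      // r += r*e1
  double Ret = X * R;                            // q ~= x/y
  double Tmp2 = std::fma(NegY, Ret, X);          // residual x - y*q
  return std::fma(Tmp2, R, Ret);                 // q + residual*r
}

int main() {
  std::printf("%.17g vs %.17g\n", fastFDiv64(1.0, 3.0), 1.0 / 3.0);
}
```

Each Newton-Raphson step roughly squares the error of the estimate, which is why two steps plus one residual FMA recover near-full double precision from a single-precision-quality seed.
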
+ Dst = MI.getOperand(0).getReg(); + } else { + OpOffset = -1; + } + + Register VData = MI.getOperand(2 + OpOffset).getReg(); + Register CmpVal; if (IsCmpSwap) { CmpVal = MI.getOperand(3 + OpOffset).getReg(); @@ -3536,7 +3932,7 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, } Register RSrc = MI.getOperand(3 + OpOffset).getReg(); - const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; + const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn; // The struct intrinsic variants add one additional operand over raw. const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; @@ -3561,9 +3957,12 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, if (!VIndex) VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); - auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) - .addDef(Dst) - .addUse(VData); // vdata + auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)); + + if (HasReturn) + MIB.addDef(Dst); + + MIB.addUse(VData); // vdata if (IsCmpSwap) MIB.addReg(CmpVal); @@ -3583,38 +3982,41 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, /// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized /// vector with s16 typed elements. -static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, - SmallVectorImpl<Register> &PackedAddrs, - int AddrIdx, int DimIdx, int EndIdx, - int NumGradients) { +static void packImageA16AddressToDwords( + MachineIRBuilder &B, MachineInstr &MI, + SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset, + const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) { const LLT S16 = LLT::scalar(16); const LLT V2S16 = LLT::vector(2, 16); - for (int I = AddrIdx; I < EndIdx; ++I) { - MachineOperand &SrcOp = MI.getOperand(I); + for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) { + MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); if (!SrcOp.isReg()) continue; // _L to _LZ may have eliminated this. Register AddrReg = SrcOp.getReg(); - if (I < DimIdx) { + if (I < Intr->GradientStart) { AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); PackedAddrs.push_back(AddrReg); } else { // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, // derivatives dx/dh and dx/dv are packed with undef. if (((I + 1) >= EndIdx) || - ((NumGradients / 2) % 2 == 1 && - (I == DimIdx + (NumGradients / 2) - 1 || - I == DimIdx + NumGradients - 1)) || + ((Intr->NumGradients / 2) % 2 == 1 && + (I == static_cast<unsigned>(Intr->GradientStart + + (Intr->NumGradients / 2) - 1) || + I == static_cast<unsigned>(Intr->GradientStart + + Intr->NumGradients - 1))) || // Check for _L to _LZ optimization - !MI.getOperand(I + 1).isReg()) { + !MI.getOperand(ArgOffset + I + 1).isReg()) { PackedAddrs.push_back( B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) .getReg(0)); } else { PackedAddrs.push_back( - B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) + B.buildBuildVector( + V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()}) .getReg(0)); ++I; } @@ -3673,43 +4075,37 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, /// the intrinsic's arguments. In cases like a16 addreses, this requires padding /// now unnecessary arguments with $noreg. 
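
`packImageA16AddressToDwords` above pairs 16-bit address components into 32-bit dwords, padding the odd tail (and, in the real code, the gradient boundaries) with undef. The core pairing, modelled on plain integers with zero standing in for undef:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

// Model of the pairing done by packImageA16AddressToDwords: each two
// s16 address components become one dword (V2S16); an odd tail is
// padded. Zero stands in for the undef padding the real code builds,
// and the gradient-boundary special cases are omitted.
std::vector<uint32_t> packA16(const std::vector<uint16_t> &Addrs) {
  std::vector<uint32_t> Packed;
  for (size_t I = 0; I < Addrs.size(); I += 2) {
    uint16_t Lo = Addrs[I];
    uint16_t Hi = (I + 1 < Addrs.size()) ? Addrs[I + 1] : 0; // pad odd tail
    Packed.push_back((uint32_t)Hi << 16 | Lo);
  }
  return Packed;
}

int main() {
  auto P = packA16({0x0001, 0x0002, 0x0003}); // 3 coords -> 2 dwords
  for (uint32_t D : P)
    std::cout << std::hex << D << '\n'; // 20001, then 3
}
```
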
bool AMDGPULegalizerInfo::legalizeImageIntrinsic( - MachineInstr &MI, MachineIRBuilder &B, - GISelChangeObserver &Observer, - const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { + MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, + const AMDGPU::ImageDimIntrinsicInfo *Intr) const { - const int NumDefs = MI.getNumExplicitDefs(); + const unsigned NumDefs = MI.getNumExplicitDefs(); + const unsigned ArgOffset = NumDefs + 1; bool IsTFE = NumDefs == 2; // We are only processing the operands of d16 image operations on subtargets // that use the unpacked register layout, or need to repack the TFE result. // TODO: Do we need to guard against already legalized intrinsics? const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = - AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); + AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); MachineRegisterInfo *MRI = B.getMRI(); const LLT S32 = LLT::scalar(32); const LLT S16 = LLT::scalar(16); const LLT V2S16 = LLT::vector(2, 16); - // Index of first address argument - const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); - - int NumVAddrs, NumGradients; - std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); - const int DMaskIdx = BaseOpcode->Atomic ? -1 : - getDMaskIdx(BaseOpcode, NumDefs); unsigned DMask = 0; // Check for 16 bit addresses and pack if true. - int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; - LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); - LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); + LLT GradTy = + MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg()); + LLT AddrTy = + MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg()); const bool IsG16 = GradTy == S16; const bool IsA16 = AddrTy == S16; int DMaskLanes = 0; if (!BaseOpcode->Atomic) { - DMask = MI.getOperand(DMaskIdx).getImm(); + DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm(); if (BaseOpcode->Gather4) { DMaskLanes = 4; } else if (DMask != 0) { @@ -3736,7 +4132,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( if (IsTFE && DMask == 0) { DMask = 0x1; DMaskLanes = 1; - MI.getOperand(DMaskIdx).setImm(DMask); + MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask); } if (BaseOpcode->Atomic) { @@ -3757,41 +4153,41 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( } } - int CorrectedNumVAddrs = NumVAddrs; + unsigned CorrectedNumVAddrs = Intr->NumVAddrs; // Optimize _L to _LZ when _L is zero if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = - AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { + AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) { const ConstantFP *ConstantLod; - const int LodIdx = AddrIdx + NumVAddrs - 1; - if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { + if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI, + m_GFCst(ConstantLod))) { if (ConstantLod->isZero() || ConstantLod->isNegative()) { // Set new opcode to _lz variant of _l, and change the intrinsic ID. - ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( - LZMappingInfo->LZ, ImageDimIntr->Dim); + const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr = + AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ, + Intr->Dim); // The starting indexes should remain in the same place. 
- --NumVAddrs; --CorrectedNumVAddrs; - MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( - static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); - MI.RemoveOperand(LodIdx); + MI.getOperand(MI.getNumExplicitDefs()) + .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr)); + MI.RemoveOperand(ArgOffset + Intr->LodIndex); + Intr = NewImageDimIntr; } } } // Optimize _mip away when 'lod' is zero - if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { + if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) { int64_t ConstantLod; - const int LodIdx = AddrIdx + NumVAddrs - 1; - - if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { + if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI, + m_ICst(ConstantLod))) { if (ConstantLod == 0) { // TODO: Change intrinsic opcode and remove operand instead of replacing // it with 0, as the _L to _LZ handling is done above. - MI.getOperand(LodIdx).ChangeToImmediate(0); + MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0); --CorrectedNumVAddrs; } } @@ -3806,18 +4202,17 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( } else if (!ST.hasG16()) return false; - if (NumVAddrs > 1) { + if (Intr->NumVAddrs > 1) { SmallVector<Register, 4> PackedRegs; // Don't compress addresses for G16 - const int PackEndIdx = - IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients); - packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, - PackEndIdx, NumGradients); + const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart; + packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr, + PackEndIdx); if (!IsA16) { // Add uncompressed address - for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { - int AddrReg = MI.getOperand(I).getReg(); + for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) { + int AddrReg = MI.getOperand(ArgOffset + I).getReg(); assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); PackedRegs.push_back(AddrReg); } @@ -3833,9 +4228,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( PackedRegs.resize(1); } - const int NumPacked = PackedRegs.size(); - for (int I = 0; I != NumVAddrs; ++I) { - MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); + const unsigned NumPacked = PackedRegs.size(); + for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) { + MachineOperand &SrcOp = MI.getOperand(ArgOffset + I); if (!SrcOp.isReg()) { assert(SrcOp.isImm() && SrcOp.getImm() == 0); continue; @@ -3843,8 +4238,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( assert(SrcOp.getReg() != AMDGPU::NoRegister); - if (I < NumPacked) - SrcOp.setReg(PackedRegs[I]); + if (I - Intr->VAddrStart < NumPacked) + SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]); else SrcOp.setReg(AMDGPU::NoRegister); } @@ -3863,8 +4258,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( // allocation when possible. 
const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); - if (!UseNSA && NumVAddrs > 1) - convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); + if (!UseNSA && Intr->NumVAddrs > 1) + convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart, + Intr->NumVAddrs); } int Flags = 0; @@ -3881,7 +4277,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( if (!Ty.isVector() || Ty.getElementType() != S16) return true; - Register RepackedReg = handleD16VData(B, *MRI, VData); + Register RepackedReg = handleD16VData(B, *MRI, VData, true); if (RepackedReg != VData) { MI.getOperand(1).setReg(RepackedReg); } @@ -4053,8 +4449,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic( } bool AMDGPULegalizerInfo::legalizeSBufferLoad( - MachineInstr &MI, MachineIRBuilder &B, - GISelChangeObserver &Observer) const { + LegalizerHelper &Helper, MachineInstr &MI) const { + MachineIRBuilder &B = Helper.MIRBuilder; + GISelChangeObserver &Observer = Helper.Observer; + Register Dst = MI.getOperand(0).getReg(); LLT Ty = B.getMRI()->getType(Dst); unsigned Size = Ty.getSizeInBits(); @@ -4062,6 +4460,13 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( Observer.changingInstr(MI); + if (shouldBitcastLoadStoreType(ST, Ty, Size)) { + Ty = getBitcastRegisterType(Ty); + Helper.bitcastDst(MI, Ty, 0); + Dst = MI.getOperand(0).getReg(); + B.setInsertPt(B.getMBB(), MI); + } + // FIXME: We don't really need this intermediate instruction. The intrinsic // should be fixed to have a memory operand. Since it's readnone, we're not // allowed to add one. @@ -4083,8 +4488,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( // always be legal. We may need to restore this to a 96-bit result if it turns // out this needs to be converted to a vector load during RegBankSelect. if (!isPowerOf2_32(Size)) { - LegalizerHelper Helper(MF, *this, Observer, B); - if (Ty.isVector()) Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); else @@ -4095,6 +4498,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad( return true; } +// TODO: Move to selection bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -4105,17 +4509,14 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, } else { // Pass queue pointer to trap handler as input, and insert trap instruction // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi - const ArgDescriptor *Arg = - getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR); - if (!Arg) - return false; MachineRegisterInfo &MRI = *B.getMRI(); - Register SGPR01(AMDGPU::SGPR0_SGPR1); - Register LiveIn = getLiveInRegister( - B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64), - /*InsertLiveInCopy=*/false); - if (!loadInputValue(LiveIn, B, Arg)) + + Register LiveIn = + MRI.createGenericVirtualRegister(LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); + if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR)) return false; + + Register SGPR01(AMDGPU::SGPR0_SGPR1); B.buildCopy(SGPR01, LiveIn); B.buildInstr(AMDGPU::S_TRAP) .addImm(GCNSubtarget::TrapIDLLVMTrap) @@ -4146,6 +4547,78 @@ bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic( return true; } +bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI, + MachineIRBuilder &B) const { + MachineRegisterInfo &MRI = *B.getMRI(); + const LLT S16 = LLT::scalar(16); + const LLT S32 = LLT::scalar(32); + + Register DstReg = MI.getOperand(0).getReg(); + Register NodePtr = MI.getOperand(2).getReg(); + Register RayExtent = MI.getOperand(3).getReg(); + 
Register RayOrigin = MI.getOperand(4).getReg(); + Register RayDir = MI.getOperand(5).getReg(); + Register RayInvDir = MI.getOperand(6).getReg(); + Register TDescr = MI.getOperand(7).getReg(); + + bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16; + bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64; + unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa + : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa + : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa + : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa; + + SmallVector<Register, 12> Ops; + if (Is64) { + auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr); + Ops.push_back(Unmerge.getReg(0)); + Ops.push_back(Unmerge.getReg(1)); + } else { + Ops.push_back(NodePtr); + } + Ops.push_back(RayExtent); + + auto packLanes = [&Ops, &S32, &B] (Register Src) { + auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src); + Ops.push_back(Unmerge.getReg(0)); + Ops.push_back(Unmerge.getReg(1)); + Ops.push_back(Unmerge.getReg(2)); + }; + + packLanes(RayOrigin); + if (IsA16) { + auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir); + auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir); + Register R1 = MRI.createGenericVirtualRegister(S32); + Register R2 = MRI.createGenericVirtualRegister(S32); + Register R3 = MRI.createGenericVirtualRegister(S32); + B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)}); + B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)}); + B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)}); + Ops.push_back(R1); + Ops.push_back(R2); + Ops.push_back(R3); + } else { + packLanes(RayDir); + packLanes(RayInvDir); + } + + auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY) + .addDef(DstReg) + .addImm(Opcode); + + for (Register R : Ops) { + MIB.addUse(R); + } + + MIB.addUse(TDescr) + .addImm(IsA16 ? 
1 : 0) + .cloneMemRefs(MI); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const { MachineIRBuilder &B = Helper.MIRBuilder; @@ -4158,7 +4631,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_else: { MachineInstr *Br = nullptr; MachineBasicBlock *UncondBrTarget = nullptr; - if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { + bool Negated = false; + if (MachineInstr *BrCond = + verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); @@ -4166,6 +4641,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, Register Use = MI.getOperand(3).getReg(); MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); + + if (Negated) + std::swap(CondBrTarget, UncondBrTarget); + B.setInsertPt(B.getMBB(), BrCond->getIterator()); if (IntrID == Intrinsic::amdgcn_if) { B.buildInstr(AMDGPU::SI_IF) @@ -4174,10 +4653,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, .addMBB(UncondBrTarget); } else { B.buildInstr(AMDGPU::SI_ELSE) - .addDef(Def) - .addUse(Use) - .addMBB(UncondBrTarget) - .addImm(0); + .addDef(Def) + .addUse(Use) + .addMBB(UncondBrTarget); } if (Br) { @@ -4201,13 +4679,18 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_loop: { MachineInstr *Br = nullptr; MachineBasicBlock *UncondBrTarget = nullptr; - if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { + bool Negated = false; + if (MachineInstr *BrCond = + verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) { const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); Register Reg = MI.getOperand(2).getReg(); + if (Negated) + std::swap(CondBrTarget, UncondBrTarget); + B.setInsertPt(B.getMBB(), BrCond->getIterator()); B.buildInstr(AMDGPU::SI_LOOP) .addUse(Reg) @@ -4280,7 +4763,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return true; } case Intrinsic::amdgcn_s_buffer_load: - return legalizeSBufferLoad(MI, B, Helper.Observer); + return legalizeSBufferLoad(Helper, MI); case Intrinsic::amdgcn_raw_buffer_store: case Intrinsic::amdgcn_struct_buffer_store: return legalizeBufferStore(MI, MRI, B, false, false); @@ -4323,6 +4806,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_struct_buffer_atomic_inc: case Intrinsic::amdgcn_raw_buffer_atomic_dec: case Intrinsic::amdgcn_struct_buffer_atomic_dec: + case Intrinsic::amdgcn_raw_buffer_atomic_fadd: + case Intrinsic::amdgcn_struct_buffer_atomic_fadd: case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: return legalizeBufferAtomic(MI, B, IntrID); @@ -4334,6 +4819,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeTrapIntrinsic(MI, MRI, B); case Intrinsic::debugtrap: return legalizeDebugTrapIntrinsic(MI, MRI, B); + case Intrinsic::amdgcn_rsq_clamp: + return legalizeRsqClampIntrinsic(MI, MRI, B); + case Intrinsic::amdgcn_ds_fadd: + case Intrinsic::amdgcn_ds_fmin: + case Intrinsic::amdgcn_ds_fmax: + return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); + case Intrinsic::amdgcn_image_bvh_intersect_ray: + return legalizeBVHIntrinsic(MI, B); default: { if (const 
AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrID))
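
The handleD16VData hunk above now covers two distinct register-layout fixups for 16-bit image data: any-extension on subtargets with unpacked D16 VMEM, and undef-padding on subtargets with the image-store D16 bug. A minimal sketch of the unpacked path, using the same MachineIRBuilder calls as the hunk (the free-standing helper name is invented for illustration):

// Unpacked D16: every s16 element must occupy its own 32-bit lane, so
// <N x s16> data is rewritten as <N x s32> by any-extending each element.
// The image-store-bug path in the hunk instead pads to a dword shape:
//   <2 x s16> -> bitcast to s32,     pad to <2 x s32>
//   <3 x s16> -> pad to <6 x s16>,   bitcast to <3 x s32>
//   <4 x s16> -> bitcast <2 x s32>,  unmerge, pad to <4 x s32>
static Register sketchUnpackD16(MachineIRBuilder &B, Register Reg) {
  const LLT S16 = LLT::scalar(16);
  const LLT S32 = LLT::scalar(32);
  const LLT StoreVT = B.getMRI()->getType(Reg);   // e.g. <4 x s16>
  auto Unmerge = B.buildUnmerge(S16, Reg);        // split the elements
  SmallVector<Register, 4> WideRegs;
  for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
    WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
  return B.buildBuildVector(LLT::vector(StoreVT.getNumElements(), S32),
                            WideRegs).getReg(0);
}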
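
In the legalizeBufferAtomic hunk, every source-operand index is now derived from whether the intrinsic returns a value, because the newly handled raw/struct buffer fadd intrinsics have no-return forms. A sketch of the index arithmetic under the G_INTRINSIC operand layout the hunk relies on:

// With a return value:  op0 = dst, op1 = intrinsic ID, op2 = vdata, ...
// Without one (fadd):   op0 = intrinsic ID, op1 = vdata, ...
// so OpOffset = -1 shifts every source operand index down by one.
const bool HasReturn = MI.getNumExplicitDefs() != 0;
const int OpOffset = HasReturn ? 0 : -1;
Register VData = MI.getOperand(2 + OpOffset).getReg();
// A raw atomic with a return has 7 operands; cmpswap adds a compare
// value, the struct variants add a vindex, and HasReturn adds the def:
const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;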
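
packImageA16AddressToDwords emits one <2 x s16> register per address dword. Stripped of the gradient-boundary special cases (dz/dh, dz/dv and odd-sized gradient blocks), the core pairing loop reduces to the following sketch; sketchPackPairs is an invented name, and the real function rewrites machine operands in place rather than consuming an ArrayRef:

static void sketchPackPairs(MachineIRBuilder &B, ArrayRef<Register> Addrs,
                            SmallVectorImpl<Register> &Packed) {
  const LLT S16 = LLT::scalar(16);
  const LLT V2S16 = LLT::vector(2, 16);
  for (unsigned I = 0, E = Addrs.size(); I < E; I += 2) {
    // A trailing odd element gets an undef high half, exactly like the
    // last odd coordinate in the hunk above.
    Register Hi = I + 1 < E ? Addrs[I + 1] : B.buildUndef(S16).getReg(0);
    Packed.push_back(B.buildBuildVector(V2S16, {Addrs[I], Hi}).getReg(0));
  }
}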
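
The legalizeImageIntrinsic rewrite drops the hand-computed AddrIdx/DimIdx/DMaskIdx bookkeeping in favor of fields from the generated AMDGPU::ImageDimIntrinsicInfo table, so every operand access becomes a fixed offset from ArgOffset. Schematically (the layout comment summarizes the hunks above; it is not new behavior):

// Operand layout of the G_INTRINSIC form handled above:
//   operands [0, NumDefs)             : results (NumDefs == 2 with TFE)
//   operand  NumDefs                  : intrinsic ID
//   ArgOffset + Intr->DMaskIndex      : dmask (non-atomic only)
//   ArgOffset + Intr->GradientStart.. : derivative vaddr operands
//   ArgOffset + Intr->CoordStart..    : coordinate vaddr operands
//   ArgOffset + Intr->LodIndex        : lod operand of the _l variants
const unsigned NumDefs = MI.getNumExplicitDefs();
const unsigned ArgOffset = NumDefs + 1;
unsigned DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();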
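
legalizeSBufferLoad now funnels the result type through two fixups before the intermediate G_AMDGPU_S_BUFFER_LOAD is built: a bitcast to a 32-bit register layout when shouldBitcastLoadStoreType asks for one, and power-of-two widening for sizes such as 96 bits. A sketch of the combined flow; the example type mappings follow from getBitcastRegisterType earlier in this file and are illustrative:

//   <4 x s8> (32 bits) -> s32;  <8 x s8> (64 bits) -> <2 x s32>
//   s96 or <3 x s32>   -> widened afterwards to s128 / <4 x s32>
LLT Ty = B.getMRI()->getType(Dst);
if (shouldBitcastLoadStoreType(ST, Ty, Ty.getSizeInBits())) {
  Ty = getBitcastRegisterType(Ty); // scalar if <= 32 bits, else <N x s32>
  Helper.bitcastDst(MI, Ty, 0);    // rewrite def 0 through a bitcast
  B.setInsertPt(B.getMBB(), MI);   // re-point the builder at the load
}
if (!isPowerOf2_32(Ty.getSizeInBits())) {
  if (Ty.isVector())
    Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
  else
    Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
}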
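
The llvm.trap hunk encodes the trap-handler ABI: the queue pointer is materialized through the generic loadInputValue path and copied into the fixed register pair SGPR0_SGPR1 before S_TRAP executes. In sequence (a sketch; the diff is cut off right after the S_TRAP opcode, so the final implicit-use line is an assumption about the unchanged context that keeps the handler input alive):

Register LiveIn = MRI.createGenericVirtualRegister(
    LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));  // p4 queue pointer
if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
  return false;                                     // no queue pointer
B.buildCopy(Register(AMDGPU::SGPR0_SGPR1), LiveIn); // ABI-fixed input
B.buildInstr(AMDGPU::S_TRAP)
    .addImm(GCNSubtarget::TrapIDLLVMTrap)
    .addReg(AMDGPU::SGPR0_SGPR1, RegState::Implicit); // assumed context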
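
legalizeBVHIntrinsic flattens the ray operands into the scalar dword list expected by the NSA form of IMAGE_BVH*_INTERSECT_RAY. Reconstructing the operand order of G_AMDGPU_INTRIN_BVH_INTERSECT_RAY from the Ops pushes above (64-bit node pointer, full-precision rays):

//   node_lo, node_hi,                // G_UNMERGE_VALUES of NodePtr
//   ray_extent,
//   origin.x, origin.y, origin.z,    // packLanes(RayOrigin)
//   dir.x, dir.y, dir.z,             // packLanes(RayDir)
//   inv_dir.x, inv_dir.y, inv_dir.z, // packLanes(RayInvDir)
//   texture_descriptor, a16_flag
auto packLanes = [&Ops, &S32, &B](Register Src) {
  auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
  Ops.push_back(Unmerge.getReg(0)); // .x
  Ops.push_back(Unmerge.getReg(1)); // .y
  Ops.push_back(Unmerge.getReg(2)); // .z (lane 3 is unused padding)
};

In the A16 case the six half-precision direction values collapse into three dwords instead: {dir.x, dir.y}, {dir.z, inv_dir.x}, {inv_dir.y, inv_dir.z}.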
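
verifyCFIntrinsic gains a Negated out-parameter to report when the G_BRCOND consumes the inverted intrinsic condition (for example through a xor with -1 introduced during translation); the legalizer compensates with a pure target swap before emitting SI_IF, SI_ELSE or SI_LOOP:

// "br (xor cond, true), A, B" is equivalent to "br cond, B, A", so
// swapping the two targets lets one emission path serve both forms.
if (Negated)
  std::swap(CondBrTarget, UncondBrTarget);
B.setInsertPt(B.getMBB(), BrCond->getIterator());
B.buildInstr(AMDGPU::SI_LOOP)
    .addUse(Reg)             // exec mask operand of amdgcn.loop
    .addMBB(UncondBrTarget); // back-edge target while lanes remain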
