Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 2747 |
1 file changed, 2311 insertions, 436 deletions
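
Much of this patch replaces one-off legality predicates with reusable helpers such as isRegisterSize, isRegisterType, and the power-of-two rounding utilities getPow2ScalarType/getPow2VectorType introduced near the top of the diff. Below is a minimal standalone sketch of the arithmetic those helpers perform, not the LLVM code itself; roundUpToPow2 stands in for LLVM's Log2_32_Ceil-based rounding, and the checks in main are illustrative assumptions only.

#include <cassert>

// Stand-in for the "1 << Log2_32_Ceil(Bits)" rounding used by
// getPow2ScalarType() / getPow2VectorType() in the patch.
static unsigned roundUpToPow2(unsigned Bits) {
  unsigned Pow2 = 1;
  while (Pow2 < Bits)
    Pow2 <<= 1;
  return Pow2;
}

// The register-size test added by the patch: any multiple of 32 bits up to
// MaxRegisterSize (1024) fits in a tuple of 32-bit registers.
static constexpr unsigned MaxRegisterSize = 1024;
static bool isRegisterSize(unsigned Size) {
  return Size % 32 == 0 && Size <= MaxRegisterSize;
}

int main() {
  assert(roundUpToPow2(24) == 32);   // e.g. an s24 operand widens to s32
  assert(roundUpToPow2(96) == 128);  // s96 rounds up to s128
  assert(isRegisterSize(96) && !isRegisterSize(40));
  return 0;
}
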
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 3f99d5cfb7f9a..2976794b49c3b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -11,19 +11,16 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// -#if defined(_MSC_VER) || defined(__MINGW32__) -// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI -// from the Visual C++ cmath / math.h headers: -// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019 -#define _USE_MATH_DEFINES -#endif +#include "AMDGPULegalizerInfo.h" #include "AMDGPU.h" -#include "AMDGPULegalizerInfo.h" +#include "AMDGPUGlobalISelUtils.h" #include "AMDGPUTargetMachine.h" #include "SIMachineFunctionInfo.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" @@ -37,21 +34,30 @@ using namespace llvm; using namespace LegalizeActions; using namespace LegalizeMutations; using namespace LegalityPredicates; - - -static LegalityPredicate isMultiple32(unsigned TypeIdx, - unsigned MaxSize = 1024) { - return [=](const LegalityQuery &Query) { - const LLT Ty = Query.Types[TypeIdx]; - const LLT EltTy = Ty.getScalarType(); - return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0; - }; +using namespace MIPatternMatch; + +// Hack until load/store selection patterns support any tuple of legal types. +static cl::opt<bool> EnableNewLegality( + "amdgpu-global-isel-new-legality", + cl::desc("Use GlobalISel desired legality, rather than try to use" + "rules compatible with selection patterns"), + cl::init(false), + cl::ReallyHidden); + +static constexpr unsigned MaxRegisterSize = 1024; + +// Round the number of elements to the next power of two elements +static LLT getPow2VectorType(LLT Ty) { + unsigned NElts = Ty.getNumElements(); + unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts); + return Ty.changeNumElements(Pow2NElts); } -static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) { - return [=](const LegalityQuery &Query) { - return Query.Types[TypeIdx].getSizeInBits() == Size; - }; +// Round the number of bits to the next power of two bits +static LLT getPow2ScalarType(LLT Ty) { + unsigned Bits = Ty.getSizeInBits(); + unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits); + return LLT::scalar(Pow2Bits); } static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { @@ -109,6 +115,23 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) { }; } +static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) { + return [=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[TypeIdx]; + unsigned Size = Ty.getSizeInBits(); + + LLT CoercedTy; + if (Size <= 32) { + // <2 x s8> -> s16 + // <4 x s8> -> s32 + CoercedTy = LLT::scalar(Size); + } else + CoercedTy = LLT::scalarOrVector(Size / 32, 32); + + return std::make_pair(TypeIdx, CoercedTy); + }; +} + static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) { return [=](const LegalityQuery &Query) { const LLT QueryTy = Query.Types[TypeIdx]; @@ -130,25 +153,47 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) { }; } -// Any combination of 32 or 64-bit elements up to 1024 bits, 
and multiples of -// v2s16. +static bool isRegisterSize(unsigned Size) { + return Size % 32 == 0 && Size <= MaxRegisterSize; +} + +static bool isRegisterVectorElementType(LLT EltTy) { + const int EltSize = EltTy.getSizeInBits(); + return EltSize == 16 || EltSize % 32 == 0; +} + +static bool isRegisterVectorType(LLT Ty) { + const int EltSize = Ty.getElementType().getSizeInBits(); + return EltSize == 32 || EltSize == 64 || + (EltSize == 16 && Ty.getNumElements() % 2 == 0) || + EltSize == 128 || EltSize == 256; +} + +static bool isRegisterType(LLT Ty) { + if (!isRegisterSize(Ty.getSizeInBits())) + return false; + + if (Ty.isVector()) + return isRegisterVectorType(Ty); + + return true; +} + +// Any combination of 32 or 64-bit elements up the maximum register size, and +// multiples of v2s16. static LegalityPredicate isRegisterType(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { - const LLT Ty = Query.Types[TypeIdx]; - if (Ty.isVector()) { - const int EltSize = Ty.getElementType().getSizeInBits(); - return EltSize == 32 || EltSize == 64 || - (EltSize == 16 && Ty.getNumElements() % 2 == 0) || - EltSize == 128 || EltSize == 256; - } - - return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024; + return isRegisterType(Query.Types[TypeIdx]); }; } -static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) { +static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) { return [=](const LegalityQuery &Query) { - return Query.Types[TypeIdx].getElementType() == Type; + const LLT QueryTy = Query.Types[TypeIdx]; + if (!QueryTy.isVector()) + return false; + const LLT EltTy = QueryTy.getElementType(); + return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32; }; } @@ -160,6 +205,120 @@ static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) { }; } +// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we +// handle some operations by just promoting the register during +// selection. There are also d16 loads on GFX9+ which preserve the high bits. +static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, + bool IsLoad) { + switch (AS) { + case AMDGPUAS::PRIVATE_ADDRESS: + // FIXME: Private element size. + return 32; + case AMDGPUAS::LOCAL_ADDRESS: + return ST.useDS128() ? 128 : 64; + case AMDGPUAS::GLOBAL_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS: + case AMDGPUAS::CONSTANT_ADDRESS_32BIT: + // Treat constant and global as identical. SMRD loads are sometimes usable for + // global loads (ideally constant address space should be eliminated) + // depending on the context. Legality cannot be context dependent, but + // RegBankSelect can split the load as necessary depending on the pointer + // register bank/uniformity and if the memory is invariant or not written in a + // kernel. + return IsLoad ? 512 : 128; + default: + // Flat addresses may contextually need to be split to 32-bit parts if they + // may alias scratch depending on the subtarget. + return 128; + } +} + +static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, + const LegalityQuery &Query, + unsigned Opcode) { + const LLT Ty = Query.Types[0]; + + // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD + const bool IsLoad = Opcode != AMDGPU::G_STORE; + + unsigned RegSize = Ty.getSizeInBits(); + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + unsigned Align = Query.MMODescrs[0].AlignInBits; + unsigned AS = Query.Types[1].getAddressSpace(); + + // All of these need to be custom lowered to cast the pointer operand. 
+ if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) + return false; + + // TODO: We should be able to widen loads if the alignment is high enough, but + // we also need to modify the memory access size. +#if 0 + // Accept widening loads based on alignment. + if (IsLoad && MemSize < Size) + MemSize = std::max(MemSize, Align); +#endif + + // Only 1-byte and 2-byte to 32-bit extloads are valid. + if (MemSize != RegSize && RegSize != 32) + return false; + + if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) + return false; + + switch (MemSize) { + case 8: + case 16: + case 32: + case 64: + case 128: + break; + case 96: + if (!ST.hasDwordx3LoadStores()) + return false; + break; + case 256: + case 512: + // These may contextually need to be broken down. + break; + default: + return false; + } + + assert(RegSize >= MemSize); + + if (Align < MemSize) { + const SITargetLowering *TLI = ST.getTargetLowering(); + if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8)) + return false; + } + + return true; +} + +// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so +// workaround this. Eventually it should ignore the type for loads and only care +// about the size. Return true in cases where we will workaround this for now by +// bitcasting. +static bool loadStoreBitcastWorkaround(const LLT Ty) { + if (EnableNewLegality) + return false; + + const unsigned Size = Ty.getSizeInBits(); + if (Size <= 64) + return false; + if (!Ty.isVector()) + return true; + unsigned EltSize = Ty.getElementType().getSizeInBits(); + return EltSize != 32 && EltSize != 64; +} + +static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query, + unsigned Opcode) { + const LLT Ty = Query.Types[0]; + return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) && + !loadStoreBitcastWorkaround(Ty); +} + AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const GCNTargetMachine &TM) : ST(ST_) { @@ -170,14 +329,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }; const LLT S1 = LLT::scalar(1); - const LLT S8 = LLT::scalar(8); const LLT S16 = LLT::scalar(16); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); - const LLT S96 = LLT::scalar(96); const LLT S128 = LLT::scalar(128); const LLT S256 = LLT::scalar(256); - const LLT S1024 = LLT::scalar(1024); + const LLT S512 = LLT::scalar(512); + const LLT MaxScalar = LLT::scalar(MaxRegisterSize); const LLT V2S16 = LLT::vector(2, 16); const LLT V4S16 = LLT::vector(4, 16); @@ -244,6 +402,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, S32, S64, S16, V2S16 }; + const LLT MinScalarFPTy = ST.has16BitInsts() ? 
S16 : S32; + setAction({G_BRCOND, S1}, Legal); // VCC branches setAction({G_BRCOND, S32}, Legal); // SCC branches @@ -261,11 +421,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .legalIf(isPointer(0)); - if (ST.has16BitInsts()) { + if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) + .legalFor({S32, S16, V2S16}) + .clampScalar(0, S16, S32) + .clampMaxNumElements(0, S16, 2) + .scalarize(0) + .widenScalarToNextPow2(0, 32); + } else if (ST.has16BitInsts()) { getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32, S16}) .clampScalar(0, S16, S32) - .scalarize(0); + .scalarize(0) + .widenScalarToNextPow2(0, 32); } else { getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL}) .legalFor({S32}) @@ -275,7 +443,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // FIXME: Not really legal. Placeholder for custom lowering. getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM}) - .legalFor({S32, S64}) + .customFor({S32, S64}) .clampScalar(0, S32, S64) .widenScalarToNextPow2(0, 32) .scalarize(0); @@ -298,35 +466,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, getActionDefinitionsBuilder({G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE}) .legalFor({{S32, S1}, {S32, S32}}) - .clampScalar(0, S32, S32) - .scalarize(0); // TODO: Implement. - - getActionDefinitionsBuilder({G_SADDO, G_SSUBO}) + .minScalar(0, S32) + // TODO: .scalarize(0) .lower(); getActionDefinitionsBuilder(G_BITCAST) // Don't worry about the size constraint. .legalIf(all(isRegisterType(0), isRegisterType(1))) - // FIXME: Testing hack - .legalForCartesianProduct({S16, LLT::vector(2, 8), }); - - getActionDefinitionsBuilder(G_FCONSTANT) - .legalFor({S32, S64, S16}) - .clampScalar(0, S16, S64); - - getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr, - ConstantPtr, LocalPtr, FlatPtr, PrivatePtr}) - .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .clampScalarOrElt(0, S32, S1024) - .legalIf(isMultiple32(0)) - .widenScalarToNextPow2(0, 32) - .clampMaxNumElements(0, S32, 16); + .lower(); - // FIXME: i1 operands to intrinsics should always be legal, but other i1 - // values may not be legal. We need to figure out how to distinguish - // between these two scenarios. getActionDefinitionsBuilder(G_CONSTANT) .legalFor({S1, S32, S64, S16, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr }) @@ -334,10 +483,31 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0) .legalIf(isPointer(0)); + getActionDefinitionsBuilder(G_FCONSTANT) + .legalFor({S32, S64, S16}) + .clampScalar(0, S16, S64); + + getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE}) + .legalIf(isRegisterType(0)) + // s1 and s16 are special cases because they have legal operations on + // them, but don't really occupy registers in the normal way. + .legalFor({S1, S16}) + .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) + .clampScalarOrElt(0, S32, MaxScalar) + .widenScalarToNextPow2(0, 32) + .clampMaxNumElements(0, S32, 16); + setAction({G_FRAME_INDEX, PrivatePtr}, Legal); - getActionDefinitionsBuilder(G_GLOBAL_VALUE) - .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr}); + // If the amount is divergent, we have to do a wave reduction to get the + // maximum value, so this is expanded during RegBankSelect. 
+ getActionDefinitionsBuilder(G_DYN_STACKALLOC) + .legalFor({{PrivatePtr, S32}}); + + getActionDefinitionsBuilder(G_GLOBAL_VALUE) + .unsupportedFor({PrivatePtr}) + .custom(); + setAction({G_BLOCK_ADDR, CodePtr}, Legal); auto &FPOpActions = getActionDefinitionsBuilder( { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE}) @@ -397,33 +567,41 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .clampScalar(0, S16, S64); - // TODO: Implement - getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower(); - if (ST.has16BitInsts()) { getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) .legalFor({S32, S64, S16}) .scalarize(0) .clampScalar(0, S16, S64); } else { - getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR}) + getActionDefinitionsBuilder(G_FSQRT) .legalFor({S32, S64}) .scalarize(0) .clampScalar(0, S32, S64); + + if (ST.hasFractBug()) { + getActionDefinitionsBuilder(G_FFLOOR) + .customFor({S64}) + .legalFor({S32, S64}) + .scalarize(0) + .clampScalar(0, S32, S64); + } else { + getActionDefinitionsBuilder(G_FFLOOR) + .legalFor({S32, S64}) + .scalarize(0) + .clampScalar(0, S32, S64); + } } getActionDefinitionsBuilder(G_FPTRUNC) .legalFor({{S32, S64}, {S16, S32}}) - .scalarize(0); + .scalarize(0) + .lower(); getActionDefinitionsBuilder(G_FPEXT) .legalFor({{S64, S32}, {S32, S16}}) .lowerFor({{S64, S16}}) // FIXME: Implement .scalarize(0); - // TODO: Verify V_BFI_B32 is generated from expanded bit ops. - getActionDefinitionsBuilder(G_FCOPYSIGN).lower(); - getActionDefinitionsBuilder(G_FSUB) // Use actual fsub instruction .legalFor({S32}) @@ -434,22 +612,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // Whether this is legal depends on the floating point mode for the function. auto &FMad = getActionDefinitionsBuilder(G_FMAD); - if (ST.hasMadF16()) + if (ST.hasMadF16() && ST.hasMadMacF32Insts()) FMad.customFor({S32, S16}); - else + else if (ST.hasMadMacF32Insts()) FMad.customFor({S32}); + else if (ST.hasMadF16()) + FMad.customFor({S16}); FMad.scalarize(0) .lower(); + // TODO: Do we need to clamp maximum bitwidth? + getActionDefinitionsBuilder(G_TRUNC) + .legalIf(isScalar(0)) + .legalFor({{V2S16, V2S32}}) + .clampMaxNumElements(0, S16, 2) + // Avoid scalarizing in cases that should be truly illegal. In unresolvable + // situations (like an invalid implicit use), we don't want to infinite loop + // in the legalizer. + .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0)) + .alwaysLegal(); + getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT}) .legalFor({{S64, S32}, {S32, S16}, {S64, S16}, - {S32, S1}, {S64, S1}, {S16, S1}, - {S96, S32}, - // FIXME: Hack - {S64, LLT::scalar(33)}, - {S32, S8}, {S32, LLT::scalar(24)}}) + {S32, S1}, {S64, S1}, {S16, S1}}) .scalarize(0) - .clampScalar(0, S32, S64); + .clampScalar(0, S32, S64) + .widenScalarToNextPow2(1, 32); // TODO: Split s1->s64 during regbankselect for VALU. 
auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP}) @@ -460,17 +648,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (ST.has16BitInsts()) IToFP.legalFor({{S16, S16}}); IToFP.clampScalar(1, S32, S64) - .scalarize(0); + .scalarize(0) + .widenScalarToNextPow2(1); auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI}) - .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}); + .legalFor({{S32, S32}, {S32, S64}, {S32, S16}}) + .customFor({{S64, S64}}); if (ST.has16BitInsts()) FPToI.legalFor({{S16, S16}}); else FPToI.minScalar(1, S32); FPToI.minScalar(0, S32) - .scalarize(0); + .scalarize(0) + .lower(); getActionDefinitionsBuilder(G_INTRINSIC_ROUND) .scalarize(0) @@ -494,16 +685,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); } + // FIXME: Clamp offset operand. getActionDefinitionsBuilder(G_PTR_ADD) - .legalForCartesianProduct(AddrSpaces64, {S64}) - .legalForCartesianProduct(AddrSpaces32, {S32}) + .legalIf(isPointer(0)) .scalarize(0); - getActionDefinitionsBuilder(G_PTR_MASK) - .scalarize(0) - .alwaysLegal(); - - setAction({G_BLOCK_ADDR, CodePtr}, Legal); + getActionDefinitionsBuilder(G_PTRMASK) + .legalIf(typeInSet(1, {S64, S32})) + .minScalar(1, S32) + .maxScalarIf(sizeIs(0, 32), 1, S32) + .maxScalarIf(sizeIs(0, 64), 1, S64) + .scalarize(0); auto &CmpBuilder = getActionDefinitionsBuilder(G_ICMP) @@ -537,16 +729,45 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampScalar(1, S32, S64) .scalarize(0); - // FIXME: fexp, flog2, flog10 needs to be custom lowered. - getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2, - G_FLOG, G_FLOG2, G_FLOG10}) - .legalFor({S32}) - .scalarize(0); + // FIXME: fpow has a selection pattern that should move to custom lowering. + auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2}); + if (ST.has16BitInsts()) + Exp2Ops.legalFor({S32, S16}); + else + Exp2Ops.legalFor({S32}); + Exp2Ops.clampScalar(0, MinScalarFPTy, S32); + Exp2Ops.scalarize(0); + + auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW}); + if (ST.has16BitInsts()) + ExpOps.customFor({{S32}, {S16}}); + else + ExpOps.customFor({S32}); + ExpOps.clampScalar(0, MinScalarFPTy, S32) + .scalarize(0); + + // The 64-bit versions produce 32-bit results, but only on the SALU. + getActionDefinitionsBuilder(G_CTPOP) + .legalFor({{S32, S32}, {S32, S64}}) + .clampScalar(0, S32, S32) + .clampScalar(1, S32, S64) + .scalarize(0) + .widenScalarToNextPow2(0, 32) + .widenScalarToNextPow2(1, 32); + + // The hardware instructions return a different result on 0 than the generic + // instructions expect. The hardware produces -1, but these produce the + // bitwidth. + getActionDefinitionsBuilder({G_CTLZ, G_CTTZ}) + .scalarize(0) + .clampScalar(0, S32, S32) + .clampScalar(1, S32, S64) + .widenScalarToNextPow2(0, 32) + .widenScalarToNextPow2(1, 32) + .lower(); // The 64-bit versions produce 32-bit results, but only on the SALU. 
- getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF, - G_CTTZ, G_CTTZ_ZERO_UNDEF, - G_CTPOP}) + getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF}) .legalFor({{S32, S32}, {S32, S64}}) .clampScalar(0, S32, S32) .clampScalar(1, S32, S64) @@ -554,50 +775,58 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .widenScalarToNextPow2(0, 32) .widenScalarToNextPow2(1, 32); - // TODO: Expand for > s32 - getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE}) + getActionDefinitionsBuilder(G_BITREVERSE) .legalFor({S32}) .clampScalar(0, S32, S32) .scalarize(0); if (ST.has16BitInsts()) { + getActionDefinitionsBuilder(G_BSWAP) + .legalFor({S16, S32, V2S16}) + .clampMaxNumElements(0, S16, 2) + // FIXME: Fixing non-power-of-2 before clamp is workaround for + // narrowScalar limitation. + .widenScalarToNextPow2(0) + .clampScalar(0, S16, S32) + .scalarize(0); + if (ST.hasVOP3PInsts()) { getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) .legalFor({S32, S16, V2S16}) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .clampMaxNumElements(0, S16, 2) - .clampScalar(0, S16, S32) + .minScalar(0, S16) .widenScalarToNextPow2(0) - .scalarize(0); + .scalarize(0) + .lower(); } else { getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) .legalFor({S32, S16}) .widenScalarToNextPow2(0) - .clampScalar(0, S16, S32) - .scalarize(0); + .minScalar(0, S16) + .scalarize(0) + .lower(); } } else { + // TODO: Should have same legality without v_perm_b32 + getActionDefinitionsBuilder(G_BSWAP) + .legalFor({S32}) + .lowerIf(scalarNarrowerThan(0, 32)) + // FIXME: Fixing non-power-of-2 before clamp is workaround for + // narrowScalar limitation. + .widenScalarToNextPow2(0) + .maxScalar(0, S32) + .scalarize(0) + .lower(); + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) .legalFor({S32}) - .clampScalar(0, S32, S32) + .minScalar(0, S32) .widenScalarToNextPow2(0) - .scalarize(0); + .scalarize(0) + .lower(); } - auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { - return [=](const LegalityQuery &Query) { - return Query.Types[TypeIdx0].getSizeInBits() < - Query.Types[TypeIdx1].getSizeInBits(); - }; - }; - - auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) { - return [=](const LegalityQuery &Query) { - return Query.Types[TypeIdx0].getSizeInBits() > - Query.Types[TypeIdx1].getSizeInBits(); - }; - }; - getActionDefinitionsBuilder(G_INTTOPTR) // List the common cases .legalForCartesianProduct(AddrSpaces64, {S64}) @@ -609,7 +838,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, [](const LegalityQuery &Query) { return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); }) - .narrowScalarIf(greaterThan(1, 0), + .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) { return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits())); }); @@ -626,7 +855,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); }) .narrowScalarIf( - greaterThan(0, 1), + largerThan(0, 1), [](const LegalityQuery &Query) { return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits())); }); @@ -635,33 +864,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0) .custom(); - // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we - // handle some operations by just promoting the register during - // selection. There are also d16 loads on GFX9+ which preserve the high bits. 
- auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned { - switch (AS) { - // FIXME: Private element size. - case AMDGPUAS::PRIVATE_ADDRESS: - return 32; - // FIXME: Check subtarget - case AMDGPUAS::LOCAL_ADDRESS: - return ST.useDS128() ? 128 : 64; - - // Treat constant and global as identical. SMRD loads are sometimes usable - // for global loads (ideally constant address space should be eliminated) - // depending on the context. Legality cannot be context dependent, but - // RegBankSelect can split the load as necessary depending on the pointer - // register bank/uniformity and if the memory is invariant or not written in - // a kernel. - case AMDGPUAS::CONSTANT_ADDRESS: - case AMDGPUAS::GLOBAL_ADDRESS: - return 512; - default: - return 128; - } - }; - - const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool { + const auto needToSplitMemOp = [=](const LegalityQuery &Query, + bool IsLoad) -> bool { const LLT DstTy = Query.Types[0]; // Split vector extloads. @@ -676,14 +880,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const LLT PtrTy = Query.Types[1]; unsigned AS = PtrTy.getAddressSpace(); - if (MemSize > maxSizeForAddrSpace(AS)) + if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad)) return true; // Catch weird sized loads that don't evenly divide into the access sizes // TODO: May be able to widen depending on alignment etc. - unsigned NumRegs = MemSize / 32; - if (NumRegs == 3 && !ST.hasDwordx3LoadStores()) - return true; + unsigned NumRegs = (MemSize + 31) / 32; + if (NumRegs == 3) { + if (!ST.hasDwordx3LoadStores()) + return true; + } else { + // If the alignment allows, these should have been widened. + if (!isPowerOf2_32(NumRegs)) + return true; + } if (Align < MemSize) { const SITargetLowering *TLI = ST.getTargetLowering(); @@ -693,6 +903,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return false; }; + const auto shouldWidenLoadResult = [=](const LegalityQuery &Query, + unsigned Opc) -> bool { + unsigned Size = Query.Types[0].getSizeInBits(); + if (isPowerOf2_32(Size)) + return false; + + if (Size == 96 && ST.hasDwordx3LoadStores()) + return false; + + unsigned AddrSpace = Query.Types[1].getAddressSpace(); + if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc)) + return false; + + unsigned Align = Query.MMODescrs[0].AlignInBits; + unsigned RoundedSize = NextPowerOf2(Size); + return (Align >= RoundedSize); + }; + unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32; unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16; unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8; @@ -705,17 +933,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, const bool IsStore = Op == G_STORE; auto &Actions = getActionDefinitionsBuilder(Op); - // Whitelist the common cases. - // TODO: Pointer loads - // TODO: Wide constant loads - // TODO: Only CI+ has 3x loads - // TODO: Loads to s16 on gfx9 + // Explicitly list some common cases. + // TODO: Does this help compile time at all? 
Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32}, {V2S32, GlobalPtr, 64, GlobalAlign32}, - {V3S32, GlobalPtr, 96, GlobalAlign32}, - {S96, GlobalPtr, 96, GlobalAlign32}, {V4S32, GlobalPtr, 128, GlobalAlign32}, - {S128, GlobalPtr, 128, GlobalAlign32}, {S64, GlobalPtr, 64, GlobalAlign32}, {V2S64, GlobalPtr, 128, GlobalAlign32}, {V2S16, GlobalPtr, 32, GlobalAlign32}, @@ -734,23 +956,60 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, {S32, PrivatePtr, 16, 16}, {V2S16, PrivatePtr, 32, 32}, - {S32, FlatPtr, 32, GlobalAlign32}, - {S32, FlatPtr, 16, GlobalAlign16}, - {S32, FlatPtr, 8, GlobalAlign8}, - {V2S16, FlatPtr, 32, GlobalAlign32}, - {S32, ConstantPtr, 32, GlobalAlign32}, {V2S32, ConstantPtr, 64, GlobalAlign32}, - {V3S32, ConstantPtr, 96, GlobalAlign32}, {V4S32, ConstantPtr, 128, GlobalAlign32}, {S64, ConstantPtr, 64, GlobalAlign32}, - {S128, ConstantPtr, 128, GlobalAlign32}, {V2S32, ConstantPtr, 32, GlobalAlign32}}); + Actions.legalIf( + [=](const LegalityQuery &Query) -> bool { + return isLoadStoreLegal(ST, Query, Op); + }); + + // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to + // 64-bits. + // + // TODO: Should generalize bitcast action into coerce, which will also cover + // inserting addrspacecasts. + Actions.customIf(typeIs(1, Constant32Ptr)); + + // Turn any illegal element vectors into something easier to deal + // with. These will ultimately produce 32-bit scalar shifts to extract the + // parts anyway. + // + // For odd 16-bit element vectors, prefer to split those into pieces with + // 16-bit vector parts. + Actions.bitcastIf( + [=](const LegalityQuery &Query) -> bool { + const LLT Ty = Query.Types[0]; + const unsigned Size = Ty.getSizeInBits(); + + if (Size != Query.MMODescrs[0].SizeInBits) + return Size <= 32 && Ty.isVector(); + + if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty)) + return true; + return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) && + !isRegisterVectorElementType(Ty.getElementType()); + }, bitcastToRegisterType(0)); + Actions .customIf(typeIs(1, Constant32Ptr)) + // Widen suitably aligned loads by loading extra elements. + .moreElementsIf([=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[0]; + return Op == G_LOAD && Ty.isVector() && + shouldWidenLoadResult(Query, Op); + }, moreElementsToNextPow2(0)) + .widenScalarIf([=](const LegalityQuery &Query) { + const LLT Ty = Query.Types[0]; + return Op == G_LOAD && !Ty.isVector() && + shouldWidenLoadResult(Query, Op); + }, widenScalarOrEltToNextPow2(0)) .narrowScalarIf( [=](const LegalityQuery &Query) -> bool { - return !Query.Types[0].isVector() && needToSplitLoad(Query); + return !Query.Types[0].isVector() && + needToSplitMemOp(Query, Op == G_LOAD); }, [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { const LLT DstTy = Query.Types[0]; @@ -763,13 +1022,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, if (DstSize > MemSize) return std::make_pair(0, LLT::scalar(MemSize)); + if (!isPowerOf2_32(DstSize)) { + // We're probably decomposing an odd sized store. Try to split + // to the widest type. TODO: Account for alignment. As-is it + // should be OK, since the new parts will be further legalized. + unsigned FloorSize = PowerOf2Floor(DstSize); + return std::make_pair(0, LLT::scalar(FloorSize)); + } + if (DstSize > 32 && (DstSize % 32 != 0)) { // FIXME: Need a way to specify non-extload of larger size if // suitably aligned. 
return std::make_pair(0, LLT::scalar(32 * (DstSize / 32))); } - unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); + unsigned MaxSize = maxSizeForAddrSpace(ST, + PtrTy.getAddressSpace(), + Op == G_LOAD); if (MemSize > MaxSize) return std::make_pair(0, LLT::scalar(MaxSize)); @@ -778,18 +1047,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }) .fewerElementsIf( [=](const LegalityQuery &Query) -> bool { - return Query.Types[0].isVector() && needToSplitLoad(Query); + return Query.Types[0].isVector() && + needToSplitMemOp(Query, Op == G_LOAD); }, [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> { const LLT DstTy = Query.Types[0]; const LLT PtrTy = Query.Types[1]; LLT EltTy = DstTy.getElementType(); - unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace()); + unsigned MaxSize = maxSizeForAddrSpace(ST, + PtrTy.getAddressSpace(), + Op == G_LOAD); + + // FIXME: Handle widened to power of 2 results better. This ends + // up scalarizing. + // FIXME: 3 element stores scalarized on SI // Split if it's too large for the address space. if (Query.MMODescrs[0].SizeInBits > MaxSize) { unsigned NumElts = DstTy.getNumElements(); + unsigned EltSize = EltTy.getSizeInBits(); + + if (MaxSize % EltSize == 0) { + return std::make_pair( + 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy)); + } + unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize; // FIXME: Refine when odd breakdowns handled @@ -802,9 +1085,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, LLT::vector(NumElts / NumPieces, EltTy)); } + // FIXME: We could probably handle weird extending loads better. + unsigned MemSize = Query.MMODescrs[0].SizeInBits; + if (DstTy.getSizeInBits() > MemSize) + return std::make_pair(0, EltTy); + + unsigned EltSize = EltTy.getSizeInBits(); + unsigned DstSize = DstTy.getSizeInBits(); + if (!isPowerOf2_32(DstSize)) { + // We're probably decomposing an odd sized store. Try to split + // to the widest type. TODO: Account for alignment. As-is it + // should be OK, since the new parts will be further legalized. + unsigned FloorSize = PowerOf2Floor(DstSize); + return std::make_pair( + 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy)); + } + // Need to split because of alignment. unsigned Align = Query.MMODescrs[0].AlignInBits; - unsigned EltSize = EltTy.getSizeInBits(); if (EltSize > Align && (EltSize / Align < DstTy.getNumElements())) { return std::make_pair(0, LLT::vector(EltSize / Align, EltTy)); @@ -820,39 +1118,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, // TODO: Need a bitcast lower option? Actions - .legalIf([=](const LegalityQuery &Query) { - const LLT Ty0 = Query.Types[0]; - unsigned Size = Ty0.getSizeInBits(); - unsigned MemSize = Query.MMODescrs[0].SizeInBits; - unsigned Align = Query.MMODescrs[0].AlignInBits; - - // FIXME: Widening store from alignment not valid. - if (MemSize < Size) - MemSize = std::max(MemSize, Align); - - // No extending vector loads. 
- if (Size > MemSize && Ty0.isVector()) - return false; - - switch (MemSize) { - case 8: - case 16: - return Size == 32; - case 32: - case 64: - case 128: - return true; - case 96: - return ST.hasDwordx3LoadStores(); - case 256: - case 512: - return true; - default: - return false; - } - }) .widenScalarToNextPow2(0) - // TODO: v3s32->v4s32 with alignment .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0)); } @@ -886,8 +1152,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}}); } - getActionDefinitionsBuilder(G_ATOMICRMW_FADD) - .legalFor({{S32, LocalPtr}}); + if (ST.hasLDSFPAtomics()) { + getActionDefinitionsBuilder(G_ATOMICRMW_FADD) + .legalFor({{S32, LocalPtr}}); + } // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output // demarshalling @@ -896,10 +1164,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, {S32, FlatPtr}, {S64, FlatPtr}}) .legalFor({{S32, LocalPtr}, {S64, LocalPtr}, {S32, RegionPtr}, {S64, RegionPtr}}); - - getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS) - .lower(); - // TODO: Pointer types, any 32-bit or 64-bit vector // Condition should be s32 for scalar, s1 for vector. @@ -908,9 +1172,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, GlobalPtr, LocalPtr, FlatPtr, PrivatePtr, LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32}) .clampScalar(0, S16, S64) + .scalarize(1) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) .fewerElementsIf(numElementsNotEven(0), scalarize(0)) - .scalarize(1) .clampMaxNumElements(0, S32, 2) .clampMaxNumElements(0, LocalPtr, 2) .clampMaxNumElements(0, PrivatePtr, 2) @@ -924,12 +1188,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor({{S32, S32}, {S64, S32}}); if (ST.has16BitInsts()) { if (ST.hasVOP3PInsts()) { - Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}}) + Shifts.legalFor({{S16, S16}, {V2S16, V2S16}}) .clampMaxNumElements(0, S16, 2); } else - Shifts.legalFor({{S16, S32}, {S16, S16}}); + Shifts.legalFor({{S16, S16}}); - // TODO: Support 16-bit shift amounts + // TODO: Support 16-bit shift amounts for all types + Shifts.widenScalarIf( + [=](const LegalityQuery &Query) { + // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a + // 32-bit amount. 
+ const LLT ValTy = Query.Types[0]; + const LLT AmountTy = Query.Types[1]; + return ValTy.getSizeInBits() <= 16 && + AmountTy.getSizeInBits() < 16; + }, changeTo(1, S16)); + Shifts.maxScalarIf(typeIs(0, S16), 1, S16); Shifts.clampScalar(1, S32, S32); Shifts.clampScalar(0, S16, S64); Shifts.widenScalarToNextPow2(0, 16); @@ -956,7 +1230,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return (EltTy.getSizeInBits() == 16 || EltTy.getSizeInBits() % 32 == 0) && VecTy.getSizeInBits() % 32 == 0 && - VecTy.getSizeInBits() <= 1024 && + VecTy.getSizeInBits() <= MaxRegisterSize && IdxTy.getSizeInBits() == 32; }) .clampScalar(EltTypeIdx, S32, S64) @@ -1008,28 +1282,40 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .clampNumElements(0, V2S64, V16S64) .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16)); - if (ST.hasScalarPackInsts()) - BuildVector.legalFor({V2S16, S32}); - - BuildVector - .minScalarSameAs(1, 0) - .legalIf(isRegisterType(0)) - .minScalarOrElt(0, S32); - if (ST.hasScalarPackInsts()) { + BuildVector + // FIXME: Should probably widen s1 vectors straight to s32 + .minScalarOrElt(0, S16) + // Widen source elements and produce a G_BUILD_VECTOR_TRUNC + .minScalar(1, S32); + getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) .legalFor({V2S16, S32}) .lower(); + BuildVector.minScalarOrElt(0, S32); } else { + BuildVector.customFor({V2S16, S16}); + BuildVector.minScalarOrElt(0, S32); + getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC) + .customFor({V2S16, S32}) .lower(); } + BuildVector.legalIf(isRegisterType(0)); + + // FIXME: Clamp maximum size getActionDefinitionsBuilder(G_CONCAT_VECTORS) .legalIf(isRegisterType(0)); - // TODO: Don't fully scalarize v2s16 pieces - getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); + // TODO: Don't fully scalarize v2s16 pieces? Or combine out thosse + // pre-legalize. + if (ST.hasVOP3PInsts()) { + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) + .customFor({V2S16, V2S16}) + .lower(); + } else + getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower(); // Merge/Unmerge for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) { @@ -1037,10 +1323,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0; auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) { - const LLT &Ty = Query.Types[TypeIdx]; + const LLT Ty = Query.Types[TypeIdx]; if (Ty.isVector()) { const LLT &EltTy = Ty.getElementType(); - if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64) + if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512) return true; if (!isPowerOf2_32(EltTy.getSizeInBits())) return true; @@ -1049,25 +1335,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, }; auto &Builder = getActionDefinitionsBuilder(Op) + .lowerFor({{S16, V2S16}}) + .lowerIf([=](const LegalityQuery &Query) { + const LLT BigTy = Query.Types[BigTyIdx]; + return BigTy.getSizeInBits() == 32; + }) + // Try to widen to s16 first for small types. + // TODO: Only do this on targets with legal s16 shifts + .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16) .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16) - // Clamp the little scalar to s8-s256 and make it a power of 2. It's not - // worth considering the multiples of 64 since 2*192 and 2*384 are not - // valid. 
- .clampScalar(LitTyIdx, S16, S256) - .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx)) .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32), elementTypeIs(1, S16)), changeTo(1, V2S16)) + // Clamp the little scalar to s8-s256 and make it a power of 2. It's not + // worth considering the multiples of 64 since 2*192 and 2*384 are not + // valid. + .clampScalar(LitTyIdx, S32, S512) + .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32) // Break up vectors with weird elements into scalars .fewerElementsIf( - [=](const LegalityQuery &Query) { return notValidElt(Query, 0); }, + [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); }, scalarize(0)) .fewerElementsIf( - [=](const LegalityQuery &Query) { return notValidElt(Query, 1); }, + [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); }, scalarize(1)) - .clampScalar(BigTyIdx, S32, S1024) - .lowerFor({{S16, V2S16}}); + .clampScalar(BigTyIdx, S32, MaxScalar); if (Op == G_MERGE_VALUES) { Builder.widenScalarIf( @@ -1108,22 +1401,68 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, return BigTy.getSizeInBits() % 16 == 0 && LitTy.getSizeInBits() % 16 == 0 && - BigTy.getSizeInBits() <= 1024; + BigTy.getSizeInBits() <= MaxRegisterSize; }) // Any vectors left are the wrong size. Scalarize them. .scalarize(0) .scalarize(1); } - getActionDefinitionsBuilder(G_SEXT_INREG).lower(); + // S64 is only legal on SALU, and needs to be broken into 32-bit elements in + // RegBankSelect. + auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG) + .legalFor({{S32}, {S64}}); + + if (ST.hasVOP3PInsts()) { + SextInReg.lowerFor({{V2S16}}) + // Prefer to reduce vector widths for 16-bit vectors before lowering, to + // get more vector shift opportunities, since we'll get those when + // expanded. + .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16)); + } else if (ST.has16BitInsts()) { + SextInReg.lowerFor({{S32}, {S64}, {S16}}); + } else { + // Prefer to promote to s32 before lowering if we don't have 16-bit + // shifts. This avoid a lot of intermediate truncate and extend operations. + SextInReg.lowerFor({{S32}, {S64}}); + } + + // FIXME: Placeholder rule. Really depends on whether the clamp modifier is + // available, and is selectively legal for s16, s32, v2s16. 
+ getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT}) + .scalarize(0) + .clampScalar(0, S16, S32); - getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower(); + SextInReg + .scalarize(0) + .clampScalar(0, S32, S64) + .lower(); + + getActionDefinitionsBuilder(G_FSHR) + .legalFor({{S32, S32}}) + .scalarize(0) + .lower(); getActionDefinitionsBuilder(G_READCYCLECOUNTER) .legalFor({S64}); + getActionDefinitionsBuilder({ + // TODO: Verify V_BFI_B32 is generated from expanded bit ops + G_FCOPYSIGN, + + G_ATOMIC_CMPXCHG_WITH_SUCCESS, + G_READ_REGISTER, + G_WRITE_REGISTER, + + G_SADDO, G_SSUBO, + + // TODO: Implement + G_FMINIMUM, G_FMAXIMUM, + G_FSHL + }).lower(); + getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE, - G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, + G_INDEXED_LOAD, G_INDEXED_SEXTLOAD, G_INDEXED_ZEXTLOAD, G_INDEXED_STORE}) .unsupported(); @@ -1131,10 +1470,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, verify(*ST.getInstrInfo()); } -bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B, - GISelChangeObserver &Observer) const { +bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineIRBuilder &B = Helper.MIRBuilder; + MachineRegisterInfo &MRI = *B.getMRI(); + GISelChangeObserver &Observer = Helper.Observer; + switch (MI.getOpcode()) { case TargetOpcode::G_ADDRSPACE_CAST: return legalizeAddrSpaceCast(MI, MRI, B); @@ -1148,15 +1489,21 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, return legalizeITOFP(MI, MRI, B, true); case TargetOpcode::G_UITOFP: return legalizeITOFP(MI, MRI, B, false); + case TargetOpcode::G_FPTOSI: + return legalizeFPTOI(MI, MRI, B, true); + case TargetOpcode::G_FPTOUI: + return legalizeFPTOI(MI, MRI, B, false); case TargetOpcode::G_FMINNUM: case TargetOpcode::G_FMAXNUM: case TargetOpcode::G_FMINNUM_IEEE: case TargetOpcode::G_FMAXNUM_IEEE: - return legalizeMinNumMaxNum(MI, MRI, B); + return legalizeMinNumMaxNum(Helper, MI); case TargetOpcode::G_EXTRACT_VECTOR_ELT: return legalizeExtractVectorElt(MI, MRI, B); case TargetOpcode::G_INSERT_VECTOR_ELT: return legalizeInsertVectorElt(MI, MRI, B); + case TargetOpcode::G_SHUFFLE_VECTOR: + return legalizeShuffleVector(MI, MRI, B); case TargetOpcode::G_FSIN: case TargetOpcode::G_FCOS: return legalizeSinCos(MI, MRI, B); @@ -1168,8 +1515,26 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI, return legalizeFMad(MI, MRI, B); case TargetOpcode::G_FDIV: return legalizeFDIV(MI, MRI, B); + case TargetOpcode::G_UDIV: + case TargetOpcode::G_UREM: + return legalizeUDIV_UREM(MI, MRI, B); + case TargetOpcode::G_SDIV: + case TargetOpcode::G_SREM: + return legalizeSDIV_SREM(MI, MRI, B); case TargetOpcode::G_ATOMIC_CMPXCHG: return legalizeAtomicCmpXChg(MI, MRI, B); + case TargetOpcode::G_FLOG: + return legalizeFlog(MI, B, numbers::ln2f); + case TargetOpcode::G_FLOG10: + return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f); + case TargetOpcode::G_FEXP: + return legalizeFExp(MI, B); + case TargetOpcode::G_FPOW: + return legalizeFPow(MI, B); + case TargetOpcode::G_FFLOOR: + return legalizeFFloor(MI, MRI, B); + case TargetOpcode::G_BUILD_VECTOR: + return legalizeBuildVector(MI, MRI, B); default: return false; } @@ -1201,7 +1566,6 @@ Register AMDGPULegalizerInfo::getSegmentAperture( Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ | WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_; - Register ApertureReg = 
MRI.createGenericVirtualRegister(S32); Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); B.buildInstr(AMDGPU::S_GETREG_B32) @@ -1210,12 +1574,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( MRI.setType(GetReg, S32); auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1); - B.buildInstr(TargetOpcode::G_SHL) - .addDef(ApertureReg) - .addUse(GetReg) - .addUse(ShiftAmt.getReg(0)); - - return ApertureReg; + return B.buildShl(S32, GetReg, ShiftAmt).getReg(0); } Register QueuePtr = MRI.createGenericVirtualRegister( @@ -1232,19 +1591,15 @@ Register AMDGPULegalizerInfo::getSegmentAperture( // TODO: can we be smarter about machine pointer info? MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); MachineMemOperand *MMO = MF.getMachineMemOperand( - PtrInfo, - MachineMemOperand::MOLoad | - MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - 4, - MinAlign(64, StructOffset)); - - Register LoadResult = MRI.createGenericVirtualRegister(S32); + PtrInfo, + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + 4, commonAlignment(Align(64), StructOffset)); + Register LoadAddr; B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset); - B.buildLoad(LoadResult, LoadAddr, *MMO); - return LoadResult; + return B.buildLoad(S32, LoadAddr, *MMO).getReg(0); } bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( @@ -1252,8 +1607,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( MachineIRBuilder &B) const { MachineFunction &MF = B.getMF(); - B.setInstr(MI); - const LLT S32 = LLT::scalar(32); Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); @@ -1292,7 +1645,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( // extra ptrtoint would be kind of pointless. auto HighAddr = B.buildConstant( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal); - B.buildMerge(Dst, {Src, HighAddr.getReg(0)}); + B.buildMerge(Dst, {Src, HighAddr}); MI.eraseFromParent(); return true; } @@ -1305,13 +1658,11 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( auto SegmentNull = B.buildConstant(DstTy, NullVal); auto FlatNull = B.buildConstant(SrcTy, 0); - Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy); - // Extract low 32-bits of the pointer. - B.buildExtract(PtrLo32, Src, 0); + auto PtrLo32 = B.buildExtract(DstTy, Src, 0); - Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); - B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0)); + auto CmpRes = + B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0)); B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0)); MI.eraseFromParent(); @@ -1333,21 +1684,16 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( if (!ApertureReg.isValid()) return false; - Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1)); - B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0)); - - Register BuildPtr = MRI.createGenericVirtualRegister(DstTy); + auto CmpRes = + B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0)); // Coerce the type of the low half of the result so we can use merge_values. - Register SrcAsInt = MRI.createGenericVirtualRegister(S32); - B.buildInstr(TargetOpcode::G_PTRTOINT) - .addDef(SrcAsInt) - .addUse(Src); + Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0); // TODO: Should we allow mismatched types but matching sizes in merges to // avoid the ptrtoint? 
- B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg}); - B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0)); + auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg}); + B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull); MI.eraseFromParent(); return true; @@ -1356,8 +1702,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast( bool AMDGPULegalizerInfo::legalizeFrint( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); - Register Src = MI.getOperand(1).getReg(); LLT Ty = MRI.getType(Src); assert(Ty.isScalar() && Ty.getSizeInBits() == 64); @@ -1383,7 +1727,6 @@ bool AMDGPULegalizerInfo::legalizeFrint( bool AMDGPULegalizerInfo::legalizeFceil( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); const LLT S1 = LLT::scalar(1); const LLT S64 = LLT::scalar(64); @@ -1395,7 +1738,7 @@ bool AMDGPULegalizerInfo::legalizeFceil( // if (src > 0.0 && src != result) // result += 1.0 - auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src}); + auto Trunc = B.buildIntrinsicTrunc(S64, Src); const auto Zero = B.buildFConstant(S64, 0.0); const auto One = B.buildFConstant(S64, 1.0); @@ -1428,8 +1771,6 @@ static MachineInstrBuilder extractF64Exponent(unsigned Hi, bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); - const LLT S1 = LLT::scalar(1); const LLT S32 = LLT::scalar(32); const LLT S64 = LLT::scalar(64); @@ -1456,7 +1797,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( const auto Zero32 = B.buildConstant(S32, 0); // Extend back to 64-bits. - auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)}); + auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit}); auto Shr = B.buildAShr(S64, FractMask, Exp); auto Not = B.buildNot(S64, Shr); @@ -1474,7 +1815,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc( bool AMDGPULegalizerInfo::legalizeITOFP( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const { - B.setInstr(MI); Register Dst = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); @@ -1503,10 +1843,44 @@ bool AMDGPULegalizerInfo::legalizeITOFP( return true; } -bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( +// TODO: Copied from DAG implementation. Verify logic and document how this +// actually works. +bool AMDGPULegalizerInfo::legalizeFPTOI( MachineInstr &MI, MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { - MachineFunction &MF = B.getMF(); + MachineIRBuilder &B, bool Signed) const { + + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + + const LLT S64 = LLT::scalar(64); + const LLT S32 = LLT::scalar(32); + + assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64); + + unsigned Flags = MI.getFlags(); + + auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags); + auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000))); + auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000))); + + auto Mul = B.buildFMul(S64, Trunc, K0, Flags); + auto FloorMul = B.buildFFloor(S64, Mul, Flags); + auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags); + + auto Hi = Signed ? 
+ B.buildFPTOSI(S32, FloorMul) : + B.buildFPTOUI(S32, FloorMul); + auto Lo = B.buildFPTOUI(S32, Fma); + + B.buildMerge(Dst, { Lo, Hi }); + MI.eraseFromParent(); + + return true; +} + +bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper, + MachineInstr &MI) const { + MachineFunction &MF = Helper.MIRBuilder.getMF(); const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE || @@ -1520,10 +1894,6 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum( if (IsIEEEOp) return true; - MachineIRBuilder HelperBuilder(MI); - GISelObserverWrapper DummyObserver; - LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); - HelperBuilder.setInstr(MI); return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized; } @@ -1533,8 +1903,12 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt( // TODO: Should move some of this into LegalizerHelper. // TODO: Promote dynamic indexing of s16 to s32 - // TODO: Dynamic s64 indexing is only legal for SGPR. - Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI); + + // FIXME: Artifact combiner probably should have replaced the truncated + // constant before this, so we shouldn't need + // getConstantVRegValWithLookThrough. + Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( + MI.getOperand(2).getReg(), MRI); if (!IdxVal) // Dynamic case will be selected to register indexing. return true; @@ -1545,10 +1919,8 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt( LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Dst)); - B.setInstr(MI); - - if (IdxVal.getValue() < VecTy.getNumElements()) - B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits()); + if (IdxVal->Value < VecTy.getNumElements()) + B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits()); else B.buildUndef(Dst); @@ -1562,8 +1934,12 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( // TODO: Should move some of this into LegalizerHelper. // TODO: Promote dynamic indexing of s16 to s32 - // TODO: Dynamic s64 indexing is only legal for SGPR. - Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI); + + // FIXME: Artifact combiner probably should have replaced the truncated + // constant before this, so we shouldn't need + // getConstantVRegValWithLookThrough. + Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough( + MI.getOperand(3).getReg(), MRI); if (!IdxVal) // Dynamic case will be selected to register indexing. 
return true; @@ -1575,10 +1951,8 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( LLT EltTy = VecTy.getElementType(); assert(EltTy == MRI.getType(Ins)); - B.setInstr(MI); - - if (IdxVal.getValue() < VecTy.getNumElements()) - B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits()); + if (IdxVal->Value < VecTy.getNumElements()) + B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits()); else B.buildUndef(Dst); @@ -1586,10 +1960,29 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt( return true; } +bool AMDGPULegalizerInfo::legalizeShuffleVector( + MachineInstr &MI, MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const LLT V2S16 = LLT::vector(2, 16); + + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + LLT DstTy = MRI.getType(Dst); + LLT SrcTy = MRI.getType(Src0); + + if (SrcTy == V2S16 && DstTy == V2S16 && + AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask())) + return true; + + MachineIRBuilder HelperBuilder(MI); + GISelObserverWrapper DummyObserver; + LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder); + return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized; +} + bool AMDGPULegalizerInfo::legalizeSinCos( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); @@ -1597,7 +1990,7 @@ bool AMDGPULegalizerInfo::legalizeSinCos( unsigned Flags = MI.getFlags(); Register TrigVal; - auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI); + auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi); if (ST.hasTrigReducedRange()) { auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags); TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false) @@ -1615,10 +2008,12 @@ bool AMDGPULegalizerInfo::legalizeSinCos( return true; } -bool AMDGPULegalizerInfo::buildPCRelGlobalAddress( - Register DstReg, LLT PtrTy, - MachineIRBuilder &B, const GlobalValue *GV, - unsigned Offset, unsigned GAFlags) const { +bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, + MachineIRBuilder &B, + const GlobalValue *GV, + int64_t Offset, + unsigned GAFlags) const { + assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered // to the following code sequence: // @@ -1681,19 +2076,37 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( const GlobalValue *GV = MI.getOperand(1).getGlobal(); MachineFunction &MF = B.getMF(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); - B.setInstr(MI); if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { if (!MFI->isEntryFunction()) { const Function &Fn = MF.getFunction(); DiagnosticInfoUnsupported BadLDSDecl( - Fn, "local memory global used by non-kernel function", MI.getDebugLoc()); + Fn, "local memory global used by non-kernel function", MI.getDebugLoc(), + DS_Warning); Fn.getContext().diagnose(BadLDSDecl); + + // We currently don't have a way to correctly allocate LDS objects that + // aren't directly associated with a kernel. We do force inlining of + // functions that use local objects. However, if these dead functions are + // not eliminated, we don't want a compile time error. Just emit a warning + // and a trap, since there should be no callable path here. 
+ B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true); + B.buildUndef(DstReg); + MI.eraseFromParent(); + return true; } // TODO: We could emit code to handle the initialization somewhere. if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) { - B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV)); + const SITargetLowering *TLI = ST.getTargetLowering(); + if (!TLI->shouldUseLDSConstAddress(GV)) { + MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO); + return true; // Leave in place; + } + + B.buildConstant( + DstReg, + MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV))); MI.eraseFromParent(); return true; } @@ -1723,10 +2136,10 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy); MachineMemOperand *GOTMMO = MF.getMachineMemOperand( - MachinePointerInfo::getGOT(MF), - MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant, - 8 /*Size*/, 8 /*Align*/); + MachinePointerInfo::getGOT(MF), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + 8 /*Size*/, Align(8)); buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32); @@ -1744,7 +2157,6 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue( bool AMDGPULegalizerInfo::legalizeLoad( MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, GISelChangeObserver &Observer) const { - B.setInstr(MI); LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64); auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg()); Observer.changingInstr(MI); @@ -1763,16 +2175,15 @@ bool AMDGPULegalizerInfo::legalizeFMad( const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // TODO: Always legal with future ftz flag. - if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals) + // FIXME: Do we need just output? 
+ if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals()) return true; - if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals) + if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals()) return true; - MachineIRBuilder HelperBuilder(MI); GISelObserverWrapper DummyObserver; LegalizerHelper Helper(MF, DummyObserver, HelperBuilder); - HelperBuilder.setMBB(*MI.getParent()); return Helper.lowerFMad(MI) == LegalizerHelper::Legalized; } @@ -1790,7 +2201,6 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( LLT ValTy = MRI.getType(CmpVal); LLT VecTy = LLT::vector(2, ValTy); - B.setInstr(MI); Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0); B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG) @@ -1803,39 +2213,248 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg( return true; } +bool AMDGPULegalizerInfo::legalizeFlog( + MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + LLT Ty = B.getMRI()->getType(Dst); + unsigned Flags = MI.getFlags(); + + auto Log2Operand = B.buildFLog2(Ty, Src, Flags); + auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted); + + B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI, + MachineIRBuilder &B) const { + Register Dst = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + unsigned Flags = MI.getFlags(); + LLT Ty = B.getMRI()->getType(Dst); + + auto K = B.buildFConstant(Ty, numbers::log2e); + auto Mul = B.buildFMul(Ty, Src, K, Flags); + B.buildFExp2(Dst, Mul, Flags); + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI, + MachineIRBuilder &B) const { + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + unsigned Flags = MI.getFlags(); + LLT Ty = B.getMRI()->getType(Dst); + const LLT S16 = LLT::scalar(16); + const LLT S32 = LLT::scalar(32); + + if (Ty == S32) { + auto Log = B.buildFLog2(S32, Src0, Flags); + auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) + .addUse(Log.getReg(0)) + .addUse(Src1) + .setMIFlags(Flags); + B.buildFExp2(Dst, Mul, Flags); + } else if (Ty == S16) { + // There's no f16 fmul_legacy, so we need to convert for it. + auto Log = B.buildFLog2(S16, Src0, Flags); + auto Ext0 = B.buildFPExt(S32, Log, Flags); + auto Ext1 = B.buildFPExt(S32, Src1, Flags); + auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false) + .addUse(Ext0.getReg(0)) + .addUse(Ext1.getReg(0)) + .setMIFlags(Flags); + + B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags); + } else + return false; + + MI.eraseFromParent(); + return true; +} + +// Find a source register, ignoring any possible source modifiers. 
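+// For example, given %m = G_FNEG (G_FABS %x), this returns %x.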
+static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) { + Register ModSrc = OrigSrc; + if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) { + ModSrc = SrcFNeg->getOperand(1).getReg(); + if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) + ModSrc = SrcFAbs->getOperand(1).getReg(); + } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI)) + ModSrc = SrcFAbs->getOperand(1).getReg(); + return ModSrc; +} + +bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + + const LLT S1 = LLT::scalar(1); + const LLT S64 = LLT::scalar(64); + Register Dst = MI.getOperand(0).getReg(); + Register OrigSrc = MI.getOperand(1).getReg(); + unsigned Flags = MI.getFlags(); + assert(ST.hasFractBug() && MRI.getType(Dst) == S64 && + "this should not have been custom lowered"); + + // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x)) + // is used instead. However, SI doesn't have V_FLOOR_F64, so the most + // efficient way to implement it is using V_FRACT_F64. The workaround for the + // V_FRACT bug is: + // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999) + // + // Convert floor(x) to (x - fract(x)) + + auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false) + .addUse(OrigSrc) + .setMIFlags(Flags); + + // Give source modifier matching some assistance before obscuring a foldable + // pattern. + + // TODO: We can avoid the neg on the fract? The input sign to fract + // shouldn't matter? + Register ModSrc = stripAnySourceMods(OrigSrc, MRI); + + auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff)); + + Register Min = MRI.createGenericVirtualRegister(S64); + + // We don't need to concern ourselves with the snan handling difference, so + // use the one which will directly select. + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + if (MFI->getMode().IEEE) + B.buildFMinNumIEEE(Min, Fract, Const, Flags); + else + B.buildFMinNum(Min, Fract, Const, Flags); + + Register CorrectedFract = Min; + if (!MI.getFlag(MachineInstr::FmNoNans)) { + auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags); + CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0); + } + + auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags); + B.buildFAdd(Dst, OrigSrc, NegFract, Flags); + + MI.eraseFromParent(); + return true; +} + +// Turn an illegal packed v2s16 build vector into bit operations. +// TODO: This should probably be a bitcast action in LegalizerHelper. +bool AMDGPULegalizerInfo::legalizeBuildVector( + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { + Register Dst = MI.getOperand(0).getReg(); + const LLT S32 = LLT::scalar(32); + assert(MRI.getType(Dst) == LLT::vector(2, 16)); + + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); + assert(MRI.getType(Src0) == LLT::scalar(16)); + + auto Merge = B.buildMerge(S32, {Src0, Src1}); + B.buildBitcast(Dst, Merge); + + MI.eraseFromParent(); + return true; +} + // Return the use branch instruction, otherwise null if the usage is invalid. 
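+// On success, UncondBrTarget is set to the block reached when the G_BRCOND is
+// not taken: either the target of the following G_BR, or the layout
+// fallthrough block.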
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, - MachineInstr *&Br) { + MachineInstr *&Br, + MachineBasicBlock *&UncondBrTarget) { Register CondDef = MI.getOperand(0).getReg(); if (!MRI.hasOneNonDBGUse(CondDef)) return nullptr; + MachineBasicBlock *Parent = MI.getParent(); MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef); - if (UseMI.getParent() != MI.getParent() || + if (UseMI.getParent() != Parent || UseMI.getOpcode() != AMDGPU::G_BRCOND) return nullptr; - // Make sure the cond br is followed by a G_BR + // Make sure the cond br is followed by a G_BR, or is the last instruction. MachineBasicBlock::iterator Next = std::next(UseMI.getIterator()); - if (Next != MI.getParent()->end()) { + if (Next == Parent->end()) { + MachineFunction::iterator NextMBB = std::next(Parent->getIterator()); + if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use. + return nullptr; + UncondBrTarget = &*NextMBB; + } else { if (Next->getOpcode() != AMDGPU::G_BR) return nullptr; Br = &*Next; + UncondBrTarget = Br->getOperand(0).getMBB(); } return &UseMI; } -Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI, - Register Reg, LLT Ty) const { - Register LiveIn = MRI.getLiveInVirtReg(Reg); - if (LiveIn) +Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register LiveIn, + Register PhyReg) const { + assert(PhyReg.isPhysical() && "Physical register expected"); + + // Insert the live-in copy, if required, by defining destination virtual + // register. + // FIXME: It seems EmitLiveInCopies isn't called anywhere? + if (!MRI.getVRegDef(LiveIn)) { + // FIXME: Should have scoped insert pt + MachineBasicBlock &OrigInsBB = B.getMBB(); + auto OrigInsPt = B.getInsertPt(); + + MachineBasicBlock &EntryMBB = B.getMF().front(); + EntryMBB.addLiveIn(PhyReg); + B.setInsertPt(EntryMBB, EntryMBB.begin()); + B.buildCopy(LiveIn, PhyReg); + + B.setInsertPt(OrigInsBB, OrigInsPt); + } + + return LiveIn; +} + +Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B, + MachineRegisterInfo &MRI, + Register PhyReg, LLT Ty, + bool InsertLiveInCopy) const { + assert(PhyReg.isPhysical() && "Physical register expected"); + + // Get or create virtual live-in regester + Register LiveIn = MRI.getLiveInVirtReg(PhyReg); + if (!LiveIn) { + LiveIn = MRI.createGenericVirtualRegister(Ty); + MRI.addLiveIn(PhyReg, LiveIn); + } + + // When the actual true copy required is from virtual register to physical + // register (to be inserted later), live-in copy insertion from physical + // to register virtual register is not required + if (!InsertLiveInCopy) return LiveIn; - Register NewReg = MRI.createGenericVirtualRegister(Ty); - MRI.addLiveIn(Reg, NewReg); - return NewReg; + return insertLiveInCopy(B, MRI, LiveIn, PhyReg); +} + +const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor( + MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { + const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + const ArgDescriptor *Arg; + const TargetRegisterClass *RC; + LLT ArgTy; + std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType); + if (!Arg) { + LLVM_DEBUG(dbgs() << "Required arg register missing\n"); + return nullptr; + } + return Arg; } bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, @@ -1843,12 +2462,14 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, if (!Arg->isRegister() || !Arg->getRegister().isValid()) return 
false; // TODO: Handle these - assert(Arg->getRegister().isPhysical()); + Register SrcReg = Arg->getRegister(); + assert(SrcReg.isPhysical() && "Physical register expected"); + assert(DstReg.isVirtual() && "Virtual register expected"); MachineRegisterInfo &MRI = *B.getMRI(); LLT Ty = MRI.getType(DstReg); - Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty); + Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty); if (Arg->isMasked()) { // TODO: Should we try to emit this once in the entry block? @@ -1864,56 +2485,31 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B, } B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift)); - } else + } else { B.buildCopy(DstReg, LiveIn); - - // Insert the argument copy if it doens't already exist. - // FIXME: It seems EmitLiveInCopies isn't called anywhere? - if (!MRI.getVRegDef(LiveIn)) { - // FIXME: Should have scoped insert pt - MachineBasicBlock &OrigInsBB = B.getMBB(); - auto OrigInsPt = B.getInsertPt(); - - MachineBasicBlock &EntryMBB = B.getMF().front(); - EntryMBB.addLiveIn(Arg->getRegister()); - B.setInsertPt(EntryMBB, EntryMBB.begin()); - B.buildCopy(LiveIn, Arg->getRegister()); - - B.setInsertPt(OrigInsBB, OrigInsPt); } return true; } bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin( - MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B, - AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { - B.setInstr(MI); - - const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); + MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, + AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { - const ArgDescriptor *Arg; - const TargetRegisterClass *RC; - std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType); - if (!Arg) { - LLVM_DEBUG(dbgs() << "Required arg register missing\n"); + const ArgDescriptor *Arg = getArgDescriptor(B, ArgType); + if (!Arg) return false; - } - if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) { - MI.eraseFromParent(); - return true; - } + if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg)) + return false; - return false; + MI.eraseFromParent(); + return true; } bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); LLT S16 = LLT::scalar(16); @@ -1933,6 +2529,284 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI, return false; } +void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B, + Register DstReg, + Register X, + Register Y, + bool IsDiv) const { + const LLT S1 = LLT::scalar(1); + const LLT S32 = LLT::scalar(32); + + // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the + // algorithm used here. + + // Initial estimate of inv(y). + auto FloatY = B.buildUITOFP(S32, Y); + auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY}); + auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe)); + auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale); + auto Z = B.buildFPTOUI(S32, ScaledY); + + // One round of UNR. + auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y); + auto NegYZ = B.buildMul(S32, NegY, Z); + Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ)); + + // Quotient/remainder estimate. + auto Q = B.buildUMulH(S32, X, Z); + auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y)); + + // First quotient/remainder refinement. 
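+ // Whenever R is still >= Y the estimate undershot: subtract Y from R and,
+ // for division, bump Q by one. The same check is repeated once more below.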
+ auto One = B.buildConstant(S32, 1); + auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); + if (IsDiv) + Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q); + R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R); + + // Second quotient/remainder refinement. + Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y); + if (IsDiv) + B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q); + else + B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R); +} + +bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; + Register DstReg = MI.getOperand(0).getReg(); + Register Num = MI.getOperand(1).getReg(); + Register Den = MI.getOperand(2).getReg(); + legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); + MI.eraseFromParent(); + return true; +} + +// Build integer reciprocal sequence arounud V_RCP_IFLAG_F32 +// +// Return lo, hi of result +// +// %cvt.lo = G_UITOFP Val.lo +// %cvt.hi = G_UITOFP Val.hi +// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo +// %rcp = G_AMDGPU_RCP_IFLAG %mad +// %mul1 = G_FMUL %rcp, 0x5f7ffffc +// %mul2 = G_FMUL %mul1, 2**(-32) +// %trunc = G_INTRINSIC_TRUNC %mul2 +// %mad2 = G_FMAD %trunc, -(2**32), %mul1 +// return {G_FPTOUI %mad2, G_FPTOUI %trunc} +static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B, + Register Val) { + const LLT S32 = LLT::scalar(32); + auto Unmerge = B.buildUnmerge(S32, Val); + + auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0)); + auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1)); + + auto Mad = B.buildFMAD(S32, CvtHi, // 2**32 + B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo); + + auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad}); + auto Mul1 = + B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc))); + + // 2**(-32) + auto Mul2 = + B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000))); + auto Trunc = B.buildIntrinsicTrunc(S32, Mul2); + + // -(2**32) + auto Mad2 = B.buildFMAD(S32, Trunc, + B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1); + + auto ResultLo = B.buildFPTOUI(S32, Mad2); + auto ResultHi = B.buildFPTOUI(S32, Trunc); + + return {ResultLo.getReg(0), ResultHi.getReg(0)}; +} + +void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B, + Register DstReg, + Register Numer, + Register Denom, + bool IsDiv) const { + const LLT S32 = LLT::scalar(32); + const LLT S64 = LLT::scalar(64); + const LLT S1 = LLT::scalar(1); + Register RcpLo, RcpHi; + + std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom); + + auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi}); + + auto Zero64 = B.buildConstant(S64, 0); + auto NegDenom = B.buildSub(S64, Zero64, Denom); + + auto MulLo1 = B.buildMul(S64, NegDenom, Rcp); + auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1); + + auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1); + Register MulHi1_Lo = UnmergeMulHi1.getReg(0); + Register MulHi1_Hi = UnmergeMulHi1.getReg(1); + + auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo); + auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1)); + auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi); + auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi}); + + auto MulLo2 = B.buildMul(S64, NegDenom, Add1); + auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2); + auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2); + Register MulHi2_Lo = UnmergeMulHi2.getReg(0); + Register MulHi2_Hi = UnmergeMulHi2.getReg(1); + + auto Zero32 = B.buildConstant(S32, 0); + auto Add2_Lo = 
B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo); + auto Add2_HiC = + B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1)); + auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1)); + auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi}); + + auto UnmergeNumer = B.buildUnmerge(S32, Numer); + Register NumerLo = UnmergeNumer.getReg(0); + Register NumerHi = UnmergeNumer.getReg(1); + + auto MulHi3 = B.buildUMulH(S64, Numer, Add2); + auto Mul3 = B.buildMul(S64, Denom, MulHi3); + auto UnmergeMul3 = B.buildUnmerge(S32, Mul3); + Register Mul3_Lo = UnmergeMul3.getReg(0); + Register Mul3_Hi = UnmergeMul3.getReg(1); + auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo); + auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1)); + auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi); + auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi}); + + auto UnmergeDenom = B.buildUnmerge(S32, Denom); + Register DenomLo = UnmergeDenom.getReg(0); + Register DenomHi = UnmergeDenom.getReg(1); + + auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi); + auto C1 = B.buildSExt(S32, CmpHi); + + auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo); + auto C2 = B.buildSExt(S32, CmpLo); + + auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi); + auto C3 = B.buildSelect(S32, CmpEq, C2, C1); + + // TODO: Here and below portions of the code can be enclosed into if/endif. + // Currently control flow is unconditional and we have 4 selects after + // potential endif to substitute PHIs. + + // if C3 != 0 ... + auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo); + auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1)); + auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1)); + auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi}); + + auto One64 = B.buildConstant(S64, 1); + auto Add3 = B.buildAdd(S64, MulHi3, One64); + + auto C4 = + B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi)); + auto C5 = + B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo)); + auto C6 = B.buildSelect( + S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4); + + // if (C6 != 0) + auto Add4 = B.buildAdd(S64, Add3, One64); + auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo); + + auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1)); + auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1)); + auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi}); + + // endif C6 + // endif C3 + + if (IsDiv) { + auto Sel1 = B.buildSelect( + S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3); + B.buildSelect(DstReg, + B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3); + } else { + auto Sel2 = B.buildSelect( + S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2); + B.buildSelect(DstReg, + B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1); + } +} + +bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const LLT S64 = LLT::scalar(64); + const LLT S32 = LLT::scalar(32); + const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV; + Register DstReg = MI.getOperand(0).getReg(); + Register Num = MI.getOperand(1).getReg(); + Register Den = MI.getOperand(2).getReg(); + LLT Ty = MRI.getType(DstReg); + + if (Ty == S32) + legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv); + else if (Ty == S64) + legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv); + else + return false; + + 
MI.eraseFromParent(); + return true; + +} + +bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + const LLT S64 = LLT::scalar(64); + const LLT S32 = LLT::scalar(32); + + Register DstReg = MI.getOperand(0).getReg(); + const LLT Ty = MRI.getType(DstReg); + if (Ty != S32 && Ty != S64) + return false; + + const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV; + + Register LHS = MI.getOperand(1).getReg(); + Register RHS = MI.getOperand(2).getReg(); + + auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1); + auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset); + auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset); + + LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0); + RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0); + + LHS = B.buildXor(Ty, LHS, LHSign).getReg(0); + RHS = B.buildXor(Ty, RHS, RHSign).getReg(0); + + Register UDivRem = MRI.createGenericVirtualRegister(Ty); + if (Ty == S32) + legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv); + else + legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv); + + Register Sign; + if (IsDiv) + Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0); + else + Sign = LHSign.getReg(0); // Remainder sign is the same as LHS + + UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0); + B.buildSub(DstReg, UDivRem, Sign); + + MI.eraseFromParent(); + return true; +} + bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { @@ -1954,7 +2828,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, return false; if (!Unsafe && ResTy == S32 && - MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals) + MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()) return false; if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) { @@ -1997,7 +2871,6 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI, bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); @@ -2035,15 +2908,13 @@ static void toggleSPDenormMode(bool Enable, AMDGPU::SIModeRegisterDefaults Mode) { // Set SP denorm mode to this value. unsigned SPDenormMode = - Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT; + Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue(); if (ST.hasDenormModeInst()) { // Preserve default FP64FP16 denorm mode while updating FP32 mode. - unsigned DPDenormModeDefault = Mode.FP64FP16Denormals - ? 
FP_DENORM_FLUSH_NONE - : FP_DENORM_FLUSH_IN_FLUSH_OUT; + uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue(); - unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); + uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2); B.buildInstr(AMDGPU::S_DENORM_MODE) .addImm(NewDenormModeValue); @@ -2062,7 +2933,6 @@ static void toggleSPDenormMode(bool Enable, bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); @@ -2078,15 +2948,15 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, auto DenominatorScaled = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) - .addUse(RHS) .addUse(LHS) - .addImm(1) + .addUse(RHS) + .addImm(0) .setMIFlags(Flags); auto NumeratorScaled = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false) .addUse(LHS) .addUse(RHS) - .addImm(0) + .addImm(1) .setMIFlags(Flags); auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false) @@ -2096,7 +2966,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, // FIXME: Doesn't correctly model the FP mode switch, and the FP operations // aren't modeled as reading it. - if (!Mode.FP32Denormals) + if (!Mode.allFP32Denormals()) toggleSPDenormMode(true, B, ST, Mode); auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags); @@ -2106,7 +2976,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags); auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags); - if (!Mode.FP32Denormals) + if (!Mode.allFP32Denormals()) toggleSPDenormMode(false, B, ST, Mode); auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false) @@ -2129,7 +2999,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI, bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(1).getReg(); Register RHS = MI.getOperand(2).getReg(); @@ -2144,7 +3013,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) .addUse(LHS) .addUse(RHS) - .addImm(1) + .addImm(0) .setMIFlags(Flags); auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags); @@ -2160,11 +3029,11 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false) .addUse(LHS) .addUse(RHS) - .addImm(0) + .addImm(1) .setMIFlags(Flags); auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags); - auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags); + auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags); auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags); Register Scale; @@ -2172,8 +3041,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. 
- Scale = MRI.createGenericVirtualRegister(S1); - LLT S32 = LLT::scalar(32); auto NumUnmerge = B.buildUnmerge(S32, LHS); @@ -2185,7 +3052,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, Scale1Unmerge.getReg(1)); auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1), Scale0Unmerge.getReg(1)); - B.buildXor(Scale, CmpNum, CmpDen); + Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0); } else { Scale = DivScale1.getReg(1); } @@ -2210,7 +3077,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI, bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const { - B.setInstr(MI); Register Res = MI.getOperand(0).getReg(); Register LHS = MI.getOperand(2).getReg(); Register RHS = MI.getOperand(3).getReg(); @@ -2252,8 +3118,6 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); } - B.setInstr(MI); - uint64_t Offset = ST.getTargetLowering()->getImplicitParameterOffset( B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT); @@ -2263,8 +3127,9 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI, const ArgDescriptor *Arg; const TargetRegisterClass *RC; - std::tie(Arg, RC) - = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + LLT ArgTy; + std::tie(Arg, RC, ArgTy) = + MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); if (!Arg) return false; @@ -2281,7 +3146,6 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const { - B.setInstr(MI); Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B); auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32); B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg); @@ -2289,6 +3153,55 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI, return true; } +// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args: +// offset (the offset that is included in bounds checking and swizzling, to be +// split between the instruction's voffset and immoffset fields) and soffset +// (the offset that is excluded from bounds checking and swizzling, to go in +// the instruction's soffset field). This function takes the first kind of +// offset and figures out how to split it between voffset and immoffset. +std::tuple<Register, unsigned, unsigned> +AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B, + Register OrigOffset) const { + const unsigned MaxImm = 4095; + Register BaseReg; + unsigned TotalConstOffset; + MachineInstr *OffsetDef; + const LLT S32 = LLT::scalar(32); + + std::tie(BaseReg, TotalConstOffset, OffsetDef) + = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset); + + unsigned ImmOffset = TotalConstOffset; + + // If the immediate value is too big for the immoffset field, put the value + // and -4096 into the immoffset field so that the value that is copied/added + // for the voffset field is a multiple of 4096, and it stands more chance + // of being CSEd with the copy/add for another similar load/store. + // However, do not do that rounding down to a multiple of 4096 if that is a + // negative number, as it appears to be illegal to have a negative offset + // in the vgpr, even if adding the immediate offset makes it positive. 
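+ // Illustrative example: a total constant offset of 8195 becomes
+ // Overflow = 8192 (folded into the voffset add below) and ImmOffset = 3.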
+ unsigned Overflow = ImmOffset & ~MaxImm; + ImmOffset -= Overflow; + if ((int32_t)Overflow < 0) { + Overflow += ImmOffset; + ImmOffset = 0; + } + + if (Overflow != 0) { + if (!BaseReg) { + BaseReg = B.buildConstant(S32, Overflow).getReg(0); + } else { + auto OverflowVal = B.buildConstant(S32, Overflow); + BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0); + } + } + + if (!BaseReg) + BaseReg = B.buildConstant(S32, 0).getReg(0); + + return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset); +} + /// Handle register layout difference for f16 images for some subtargets. Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, @@ -2312,75 +3225,969 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B, return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0); } -bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B, - bool IsFormat) const { - // TODO: Reject f16 format on targets where unsupported. - Register VData = MI.getOperand(1).getReg(); - LLT Ty = MRI.getType(VData); +Register AMDGPULegalizerInfo::fixStoreSourceType( + MachineIRBuilder &B, Register VData, bool IsFormat) const { + MachineRegisterInfo *MRI = B.getMRI(); + LLT Ty = MRI->getType(VData); - B.setInstr(MI); - - const LLT S32 = LLT::scalar(32); const LLT S16 = LLT::scalar(16); // Fixup illegal register types for i8 stores. if (Ty == LLT::scalar(8) || Ty == S16) { Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0); - MI.getOperand(1).setReg(AnyExt); - return true; + return AnyExt; } if (Ty.isVector()) { if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) { if (IsFormat) - MI.getOperand(1).setReg(handleD16VData(B, MRI, VData)); + return handleD16VData(B, *MRI, VData); + } + } + + return VData; +} + +bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + bool IsTyped, + bool IsFormat) const { + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(VData); + LLT EltTy = Ty.getScalarType(); + const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); + const LLT S32 = LLT::scalar(32); + + VData = fixStoreSourceType(B, VData, IsFormat); + Register RSrc = MI.getOperand(2).getReg(); + + MachineMemOperand *MMO = *MI.memoperands_begin(); + const int MemSize = MMO->getSize(); + + unsigned ImmOffset; + unsigned TotalOffset; + + // The typed intrinsics add an immediate after the registers. + const unsigned NumVIndexOps = IsTyped ? 8 : 7; + + // The struct intrinsic variants add one additional operand over raw. + const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; + Register VIndex; + int OpOffset = 0; + if (HasVIndex) { + VIndex = MI.getOperand(3).getReg(); + OpOffset = 1; + } + + Register VOffset = MI.getOperand(3 + OpOffset).getReg(); + Register SOffset = MI.getOperand(4 + OpOffset).getReg(); + + unsigned Format = 0; + if (IsTyped) { + Format = MI.getOperand(5 + OpOffset).getImm(); + ++OpOffset; + } + + unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); + + std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); + if (TotalOffset != 0) + MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); + + unsigned Opc; + if (IsTyped) { + Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : + AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT; + } else if (IsFormat) { + Opc = IsD16 ? 
AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 : + AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT; + } else { + switch (MemSize) { + case 1: + Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE; + break; + case 2: + Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT; + break; + default: + Opc = AMDGPU::G_AMDGPU_BUFFER_STORE; + break; + } + } + + if (!VIndex) + VIndex = B.buildConstant(S32, 0).getReg(0); + + auto MIB = B.buildInstr(Opc) + .addUse(VData) // vdata + .addUse(RSrc) // rsrc + .addUse(VIndex) // vindex + .addUse(VOffset) // voffset + .addUse(SOffset) // soffset + .addImm(ImmOffset); // offset(imm) + + if (IsTyped) + MIB.addImm(Format); + + MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) + .addImm(HasVIndex ? -1 : 0) // idxen(imm) + .addMemOperand(MMO); + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B, + bool IsFormat, + bool IsTyped) const { + // FIXME: Verifier should enforce 1 MMO for these intrinsics. + MachineMemOperand *MMO = *MI.memoperands_begin(); + const int MemSize = MMO->getSize(); + const LLT S32 = LLT::scalar(32); + + Register Dst = MI.getOperand(0).getReg(); + Register RSrc = MI.getOperand(2).getReg(); + + // The typed intrinsics add an immediate after the registers. + const unsigned NumVIndexOps = IsTyped ? 8 : 7; + + // The struct intrinsic variants add one additional operand over raw. + const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; + Register VIndex; + int OpOffset = 0; + if (HasVIndex) { + VIndex = MI.getOperand(3).getReg(); + OpOffset = 1; + } + + Register VOffset = MI.getOperand(3 + OpOffset).getReg(); + Register SOffset = MI.getOperand(4 + OpOffset).getReg(); + + unsigned Format = 0; + if (IsTyped) { + Format = MI.getOperand(5 + OpOffset).getImm(); + ++OpOffset; + } + + unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm(); + unsigned ImmOffset; + unsigned TotalOffset; + + LLT Ty = MRI.getType(Dst); + LLT EltTy = Ty.getScalarType(); + const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16); + const bool Unpacked = ST.hasUnpackedD16VMem(); + + std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); + if (TotalOffset != 0) + MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize); + + unsigned Opc; + + if (IsTyped) { + Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : + AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT; + } else if (IsFormat) { + Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : + AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT; + } else { + switch (MemSize) { + case 1: + Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE; + break; + case 2: + Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT; + break; + default: + Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD; + break; + } + } + + Register LoadDstReg; + + bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector()); + LLT UnpackedTy = Ty.changeElementSize(32); + + if (IsExtLoad) + LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32); + else if (Unpacked && IsD16 && Ty.isVector()) + LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy); + else + LoadDstReg = Dst; + + if (!VIndex) + VIndex = B.buildConstant(S32, 0).getReg(0); + + auto MIB = B.buildInstr(Opc) + .addDef(LoadDstReg) // vdata + .addUse(RSrc) // rsrc + .addUse(VIndex) // vindex + .addUse(VOffset) // voffset + .addUse(SOffset) // soffset + .addImm(ImmOffset); // offset(imm) + + if (IsTyped) + MIB.addImm(Format); + + MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) + .addImm(HasVIndex ? 
-1 : 0) // idxen(imm) + .addMemOperand(MMO); + + if (LoadDstReg != Dst) { + B.setInsertPt(B.getMBB(), ++B.getInsertPt()); + + // Widen result for extending loads was widened. + if (IsExtLoad) + B.buildTrunc(Dst, LoadDstReg); + else { + // Repack to original 16-bit vector result + // FIXME: G_TRUNC should work, but legalization currently fails + auto Unmerge = B.buildUnmerge(S32, LoadDstReg); + SmallVector<Register, 4> Repack; + for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I) + Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0)); + B.buildMerge(Dst, Repack); + } + } + + MI.eraseFromParent(); + return true; +} + +bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI, + MachineIRBuilder &B, + bool IsInc) const { + unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC : + AMDGPU::G_AMDGPU_ATOMIC_DEC; + B.buildInstr(Opc) + .addDef(MI.getOperand(0).getReg()) + .addUse(MI.getOperand(2).getReg()) + .addUse(MI.getOperand(3).getReg()) + .cloneMemRefs(MI); + MI.eraseFromParent(); + return true; +} + +static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) { + switch (IntrID) { + case Intrinsic::amdgcn_raw_buffer_atomic_swap: + case Intrinsic::amdgcn_struct_buffer_atomic_swap: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP; + case Intrinsic::amdgcn_raw_buffer_atomic_add: + case Intrinsic::amdgcn_struct_buffer_atomic_add: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD; + case Intrinsic::amdgcn_raw_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_buffer_atomic_sub: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB; + case Intrinsic::amdgcn_raw_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_buffer_atomic_smin: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN; + case Intrinsic::amdgcn_raw_buffer_atomic_umin: + case Intrinsic::amdgcn_struct_buffer_atomic_umin: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN; + case Intrinsic::amdgcn_raw_buffer_atomic_smax: + case Intrinsic::amdgcn_struct_buffer_atomic_smax: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX; + case Intrinsic::amdgcn_raw_buffer_atomic_umax: + case Intrinsic::amdgcn_struct_buffer_atomic_umax: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX; + case Intrinsic::amdgcn_raw_buffer_atomic_and: + case Intrinsic::amdgcn_struct_buffer_atomic_and: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND; + case Intrinsic::amdgcn_raw_buffer_atomic_or: + case Intrinsic::amdgcn_struct_buffer_atomic_or: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR; + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR; + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC; + case Intrinsic::amdgcn_raw_buffer_atomic_dec: + case Intrinsic::amdgcn_struct_buffer_atomic_dec: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC; + case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap: + case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap: + return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP; + default: + llvm_unreachable("unhandled atomic opcode"); + } +} + +bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI, + MachineIRBuilder &B, + Intrinsic::ID IID) const { + const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap || + IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap; + + Register Dst = MI.getOperand(0).getReg(); + Register VData = MI.getOperand(2).getReg(); + + Register CmpVal; + int OpOffset = 0; + + if (IsCmpSwap) { + CmpVal = MI.getOperand(3 + OpOffset).getReg(); + 
++OpOffset; + } + + Register RSrc = MI.getOperand(3 + OpOffset).getReg(); + const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8; + + // The struct intrinsic variants add one additional operand over raw. + const bool HasVIndex = MI.getNumOperands() == NumVIndexOps; + Register VIndex; + if (HasVIndex) { + VIndex = MI.getOperand(4 + OpOffset).getReg(); + ++OpOffset; + } + + Register VOffset = MI.getOperand(4 + OpOffset).getReg(); + Register SOffset = MI.getOperand(5 + OpOffset).getReg(); + unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm(); + + MachineMemOperand *MMO = *MI.memoperands_begin(); + + unsigned ImmOffset; + unsigned TotalOffset; + std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset); + if (TotalOffset != 0) + MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize()); + + if (!VIndex) + VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0); + + auto MIB = B.buildInstr(getBufferAtomicPseudo(IID)) + .addDef(Dst) + .addUse(VData); // vdata + + if (IsCmpSwap) + MIB.addReg(CmpVal); + + MIB.addUse(RSrc) // rsrc + .addUse(VIndex) // vindex + .addUse(VOffset) // voffset + .addUse(SOffset) // soffset + .addImm(ImmOffset) // offset(imm) + .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm) + .addImm(HasVIndex ? -1 : 0) // idxen(imm) + .addMemOperand(MMO); + + MI.eraseFromParent(); + return true; +} + +/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized +/// vector with s16 typed elements. +static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI, + SmallVectorImpl<Register> &PackedAddrs, + int AddrIdx, int DimIdx, int EndIdx, + int NumGradients) { + const LLT S16 = LLT::scalar(16); + const LLT V2S16 = LLT::vector(2, 16); + + for (int I = AddrIdx; I < EndIdx; ++I) { + MachineOperand &SrcOp = MI.getOperand(I); + if (!SrcOp.isReg()) + continue; // _L to _LZ may have eliminated this. + + Register AddrReg = SrcOp.getReg(); + + if (I < DimIdx) { + AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0); + PackedAddrs.push_back(AddrReg); + } else { + // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D, + // derivatives dx/dh and dx/dv are packed with undef. + if (((I + 1) >= EndIdx) || + ((NumGradients / 2) % 2 == 1 && + (I == DimIdx + (NumGradients / 2) - 1 || + I == DimIdx + NumGradients - 1)) || + // Check for _L to _LZ optimization + !MI.getOperand(I + 1).isReg()) { + PackedAddrs.push_back( + B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)}) + .getReg(0)); + } else { + PackedAddrs.push_back( + B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()}) + .getReg(0)); + ++I; + } + } + } +} + +/// Convert from separate vaddr components to a single vector address register, +/// and replace the remaining operands with $noreg. +static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, + int DimIdx, int NumVAddrs) { + const LLT S32 = LLT::scalar(32); + + SmallVector<Register, 8> AddrRegs; + for (int I = 0; I != NumVAddrs; ++I) { + MachineOperand &SrcOp = MI.getOperand(DimIdx + I); + if (SrcOp.isReg()) { + AddrRegs.push_back(SrcOp.getReg()); + assert(B.getMRI()->getType(SrcOp.getReg()) == S32); + } + } + + int NumAddrRegs = AddrRegs.size(); + if (NumAddrRegs != 1) { + // Round up to 8 elements for v5-v7 + // FIXME: Missing intermediate sized register classes and instructions. 
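+ // e.g. 5, 6 or 7 address registers are padded with undefs up to 8 before
+ // the build_vector.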
+ if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) { + const int RoundedNumRegs = NextPowerOf2(NumAddrRegs); + auto Undef = B.buildUndef(S32); + AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0)); + NumAddrRegs = RoundedNumRegs; + } + + auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs); + MI.getOperand(DimIdx).setReg(VAddr.getReg(0)); + } + + for (int I = 1; I != NumVAddrs; ++I) { + MachineOperand &SrcOp = MI.getOperand(DimIdx + I); + if (SrcOp.isReg()) + MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister); + } +} + +/// Rewrite image intrinsics to use register layouts expected by the subtarget. +/// +/// Depending on the subtarget, load/store with 16-bit element data need to be +/// rewritten to use the low half of 32-bit registers, or directly use a packed +/// layout. 16-bit addresses should also sometimes be packed into 32-bit +/// registers. +/// +/// We don't want to directly select image instructions just yet, but also want +/// to exposes all register repacking to the legalizer/combiners. We also don't +/// want a selected instrution entering RegBankSelect. In order to avoid +/// defining a multitude of intermediate image instructions, directly hack on +/// the intrinsic's arguments. In cases like a16 addreses, this requires padding +/// now unnecessary arguments with $noreg. +bool AMDGPULegalizerInfo::legalizeImageIntrinsic( + MachineInstr &MI, MachineIRBuilder &B, + GISelChangeObserver &Observer, + const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const { + + const int NumDefs = MI.getNumExplicitDefs(); + bool IsTFE = NumDefs == 2; + // We are only processing the operands of d16 image operations on subtargets + // that use the unpacked register layout, or need to repack the TFE result. + + // TODO: Do we need to guard against already legalized intrinsics? + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode); + + MachineRegisterInfo *MRI = B.getMRI(); + const LLT S32 = LLT::scalar(32); + const LLT S16 = LLT::scalar(16); + const LLT V2S16 = LLT::vector(2, 16); + + // Index of first address argument + const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs); + + int NumVAddrs, NumGradients; + std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode); + const int DMaskIdx = BaseOpcode->Atomic ? -1 : + getDMaskIdx(BaseOpcode, NumDefs); + unsigned DMask = 0; + + // Check for 16 bit addresses and pack if true. + int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; + LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg()); + LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg()); + const bool IsG16 = GradTy == S16; + const bool IsA16 = AddrTy == S16; + + int DMaskLanes = 0; + if (!BaseOpcode->Atomic) { + DMask = MI.getOperand(DMaskIdx).getImm(); + if (BaseOpcode->Gather4) { + DMaskLanes = 4; + } else if (DMask != 0) { + DMaskLanes = countPopulation(DMask); + } else if (!IsTFE && !BaseOpcode->Store) { + // If dmask is 0, this is a no-op load. This can be eliminated. + B.buildUndef(MI.getOperand(0)); + MI.eraseFromParent(); return true; } + } + + Observer.changingInstr(MI); + auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); }); + + unsigned NewOpcode = NumDefs == 0 ? 
+ AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD; + + // Track that we legalized this + MI.setDesc(B.getTII().get(NewOpcode)); + + // Expecting to get an error flag since TFC is on - and dmask is 0 Force + // dmask to be at least 1 otherwise the instruction will fail + if (IsTFE && DMask == 0) { + DMask = 0x1; + DMaskLanes = 1; + MI.getOperand(DMaskIdx).setImm(DMask); + } + + if (BaseOpcode->Atomic) { + Register VData0 = MI.getOperand(2).getReg(); + LLT Ty = MRI->getType(VData0); + + // TODO: Allow atomic swap and bit ops for v2s16/v4s16 + if (Ty.isVector()) + return false; + + if (BaseOpcode->AtomicX2) { + Register VData1 = MI.getOperand(3).getReg(); + // The two values are packed in one register. + LLT PackedTy = LLT::vector(2, Ty); + auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1}); + MI.getOperand(2).setReg(Concat.getReg(0)); + MI.getOperand(3).setReg(AMDGPU::NoRegister); + } + } - return Ty.getElementType() == S32 && Ty.getNumElements() <= 4; + int CorrectedNumVAddrs = NumVAddrs; + + // Optimize _L to _LZ when _L is zero + if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = + AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) { + const ConstantFP *ConstantLod; + const int LodIdx = AddrIdx + NumVAddrs - 1; + + if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) { + if (ConstantLod->isZero() || ConstantLod->isNegative()) { + // Set new opcode to _lz variant of _l, and change the intrinsic ID. + ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode( + LZMappingInfo->LZ, ImageDimIntr->Dim); + + // The starting indexes should remain in the same place. + --NumVAddrs; + --CorrectedNumVAddrs; + + MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID( + static_cast<Intrinsic::ID>(ImageDimIntr->Intr)); + MI.RemoveOperand(LodIdx); + } + } } - return Ty == S32; + // Optimize _mip away, when 'lod' is zero + if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) { + int64_t ConstantLod; + const int LodIdx = AddrIdx + NumVAddrs - 1; + + if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) { + if (ConstantLod == 0) { + // TODO: Change intrinsic opcode and remove operand instead or replacing + // it with 0, as the _L to _LZ handling is done above. + MI.getOperand(LodIdx).ChangeToImmediate(0); + --CorrectedNumVAddrs; + } + } + } + + // Rewrite the addressing register layout before doing anything else. + if (IsA16 || IsG16) { + if (IsA16) { + // Target must support the feature and gradients need to be 16 bit too + if (!ST.hasA16() || !IsG16) + return false; + } else if (!ST.hasG16()) + return false; + + if (NumVAddrs > 1) { + SmallVector<Register, 4> PackedRegs; + // Don't compress addresses for G16 + const int PackEndIdx = + IsA16 ? 
(AddrIdx + NumVAddrs) : (DimIdx + NumGradients); + packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx, + PackEndIdx, NumGradients); + + if (!IsA16) { + // Add uncompressed address + for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) { + int AddrReg = MI.getOperand(I).getReg(); + assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32)); + PackedRegs.push_back(AddrReg); + } + } + + // See also below in the non-a16 branch + const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding(); + + if (!UseNSA && PackedRegs.size() > 1) { + LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16); + auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs); + PackedRegs[0] = Concat.getReg(0); + PackedRegs.resize(1); + } + + const int NumPacked = PackedRegs.size(); + for (int I = 0; I != NumVAddrs; ++I) { + MachineOperand &SrcOp = MI.getOperand(AddrIdx + I); + if (!SrcOp.isReg()) { + assert(SrcOp.isImm() && SrcOp.getImm() == 0); + continue; + } + + assert(SrcOp.getReg() != AMDGPU::NoRegister); + + if (I < NumPacked) + SrcOp.setReg(PackedRegs[I]); + else + SrcOp.setReg(AMDGPU::NoRegister); + } + } + } else { + // If the register allocator cannot place the address registers contiguously + // without introducing moves, then using the non-sequential address encoding + // is always preferable, since it saves VALU instructions and is usually a + // wash in terms of code size or even better. + // + // However, we currently have no way of hinting to the register allocator + // that MIMG addresses should be placed contiguously when it is possible to + // do so, so force non-NSA for the common 2-address case as a heuristic. + // + // SIShrinkInstructions will convert NSA encodings to non-NSA after register + // allocation when possible. + const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding(); + + if (!UseNSA && NumVAddrs > 1) + convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs); + } + + int Flags = 0; + if (IsA16) + Flags |= 1; + if (IsG16) + Flags |= 2; + MI.addOperand(MachineOperand::CreateImm(Flags)); + + if (BaseOpcode->Store) { // No TFE for stores? + // TODO: Handle dmask trim + Register VData = MI.getOperand(1).getReg(); + LLT Ty = MRI->getType(VData); + if (!Ty.isVector() || Ty.getElementType() != S16) + return true; + + Register RepackedReg = handleD16VData(B, *MRI, VData); + if (RepackedReg != VData) { + MI.getOperand(1).setReg(RepackedReg); + } + + return true; + } + + Register DstReg = MI.getOperand(0).getReg(); + LLT Ty = MRI->getType(DstReg); + const LLT EltTy = Ty.getScalarType(); + const bool IsD16 = Ty.getScalarType() == S16; + const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1; + + // Confirm that the return type is large enough for the dmask specified + if (NumElts < DMaskLanes) + return false; + + if (NumElts > 4 || DMaskLanes > 4) + return false; + + const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes; + const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts); + + // The raw dword aligned data component of the load. The only legal cases + // where this matters should be when using the packed D16 format, for + // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>, + LLT RoundedTy; + + // S32 vector to to cover all data, plus TFE result element. + LLT TFETy; + + // Register type to use for each loaded component. Will be S32 or V2S16. 
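+ // (V2S16 is only used for packed d16 results without TFE; everything else
+ // is repacked through S32 pieces.)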
+ LLT RegTy; + + if (IsD16 && ST.hasUnpackedD16VMem()) { + RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32); + TFETy = LLT::vector(AdjustedNumElts + 1, 32); + RegTy = S32; + } else { + unsigned EltSize = EltTy.getSizeInBits(); + unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32; + unsigned RoundedSize = 32 * RoundedElts; + RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize); + TFETy = LLT::vector(RoundedSize / 32 + 1, S32); + RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32; + } + + // The return type does not need adjustment. + // TODO: Should we change s16 case to s32 or <2 x s16>? + if (!IsTFE && (RoundedTy == Ty || !Ty.isVector())) + return true; + + Register Dst1Reg; + + // Insert after the instruction. + B.setInsertPt(*MI.getParent(), ++MI.getIterator()); + + // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x + // s16> instead of s32, we would only need 1 bitcast instead of multiple. + const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy; + const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32; + + Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy); + + MI.getOperand(0).setReg(NewResultReg); + + // In the IR, TFE is supposed to be used with a 2 element struct return + // type. The intruction really returns these two values in one contiguous + // register, with one additional dword beyond the loaded data. Rewrite the + // return type to use a single register result. + + if (IsTFE) { + Dst1Reg = MI.getOperand(1).getReg(); + if (MRI->getType(Dst1Reg) != S32) + return false; + + // TODO: Make sure the TFE operand bit is set. + MI.RemoveOperand(1); + + // Handle the easy case that requires no repack instructions. + if (Ty == S32) { + B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg); + return true; + } + } + + // Now figure out how to copy the new result register back into the old + // result. + SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg); + + const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs; + + if (ResultNumRegs == 1) { + assert(!IsTFE); + ResultRegs[0] = NewResultReg; + } else { + // We have to repack into a new vector of some kind. + for (int I = 0; I != NumDataRegs; ++I) + ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy); + B.buildUnmerge(ResultRegs, NewResultReg); + + // Drop the final TFE element to get the data part. The TFE result is + // directly written to the right place already. + if (IsTFE) + ResultRegs.resize(NumDataRegs); + } + + // For an s16 scalar result, we form an s32 result with a truncate regardless + // of packed vs. unpacked. + if (IsD16 && !Ty.isVector()) { + B.buildTrunc(DstReg, ResultRegs[0]); + return true; + } + + // Avoid a build/concat_vector of 1 entry. + if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) { + B.buildBitcast(DstReg, ResultRegs[0]); + return true; + } + + assert(Ty.isVector()); + + if (IsD16) { + // For packed D16 results with TFE enabled, all the data components are + // S32. Cast back to the expected type. + // + // TODO: We don't really need to use load s32 elements. We would only need one + // cast for the TFE result if a multiple of v2s16 was used. 
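+ // Packed targets bitcast each s32 piece back to <2 x s16>; unpacked targets
+ // return one s16 per s32, so truncate instead.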
+ if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) { + for (Register &Reg : ResultRegs) + Reg = B.buildBitcast(V2S16, Reg).getReg(0); + } else if (ST.hasUnpackedD16VMem()) { + for (Register &Reg : ResultRegs) + Reg = B.buildTrunc(S16, Reg).getReg(0); + } + } + + auto padWithUndef = [&](LLT Ty, int NumElts) { + if (NumElts == 0) + return; + Register Undef = B.buildUndef(Ty).getReg(0); + for (int I = 0; I != NumElts; ++I) + ResultRegs.push_back(Undef); + }; + + // Pad out any elements eliminated due to the dmask. + LLT ResTy = MRI->getType(ResultRegs[0]); + if (!ResTy.isVector()) { + padWithUndef(ResTy, NumElts - ResultRegs.size()); + B.buildBuildVector(DstReg, ResultRegs); + return true; + } + + assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16); + const int RegsToCover = (Ty.getSizeInBits() + 31) / 32; + + // Deal with the one annoying legal case. + const LLT V3S16 = LLT::vector(3, 16); + if (Ty == V3S16) { + padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1); + auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs); + B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat); + return true; + } + + padWithUndef(ResTy, RegsToCover - ResultRegs.size()); + B.buildConcatVectors(DstReg, ResultRegs); + return true; } -bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, - MachineRegisterInfo &MRI, - MachineIRBuilder &B) const { +bool AMDGPULegalizerInfo::legalizeSBufferLoad( + MachineInstr &MI, MachineIRBuilder &B, + GISelChangeObserver &Observer) const { + Register Dst = MI.getOperand(0).getReg(); + LLT Ty = B.getMRI()->getType(Dst); + unsigned Size = Ty.getSizeInBits(); + MachineFunction &MF = B.getMF(); + + Observer.changingInstr(MI); + + // FIXME: We don't really need this intermediate instruction. The intrinsic + // should be fixed to have a memory operand. Since it's readnone, we're not + // allowed to add one. + MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD)); + MI.RemoveOperand(1); // Remove intrinsic ID + + // FIXME: When intrinsic definition is fixed, this should have an MMO already. + // TODO: Should this use datalayout alignment? + const unsigned MemSize = (Size + 7) / 8; + const Align MemAlign(4); + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo(), + MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | + MachineMemOperand::MOInvariant, + MemSize, MemAlign); + MI.addMemOperand(MF, MMO); + + // There are no 96-bit result scalar loads, but widening to 128-bit should + // always be legal. We may need to restore this to a 96-bit result if it turns + // out this needs to be converted to a vector load during RegBankSelect. + if (!isPowerOf2_32(Size)) { + LegalizerHelper Helper(MF, *this, Observer, B); + + if (Ty.isVector()) + Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0); + else + Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0); + } + + Observer.changedInstr(MI); + return true; +} + +bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI, + MachineRegisterInfo &MRI, + MachineIRBuilder &B) const { + // Is non-HSA path or trap-handler disabled? 
+  // If so, insert an s_endpgm instruction.
+  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+      !ST.isTrapHandlerEnabled()) {
+    B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+  } else {
+    // Pass the queue pointer to the trap handler as input, and insert a trap
+    // instruction.
+    // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
+    const ArgDescriptor *Arg =
+        getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
+    if (!Arg)
+      return false;
+    MachineRegisterInfo &MRI = *B.getMRI();
+    Register SGPR01(AMDGPU::SGPR0_SGPR1);
+    Register LiveIn = getLiveInRegister(
+        B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
+        /*InsertLiveInCopy=*/false);
+    if (!loadInputValue(LiveIn, B, Arg))
+      return false;
+    B.buildCopy(SGPR01, LiveIn);
+    B.buildInstr(AMDGPU::S_TRAP)
+        .addImm(GCNSubtarget::TrapIDLLVMTrap)
+        .addReg(SGPR01, RegState::Implicit);
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
+    MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+  // If this is a non-HSA path or the trap handler is disabled, report a
+  // warning instead of trapping.
+  if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+      !ST.isTrapHandlerEnabled()) {
+    DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
+                                     "debugtrap handler not supported",
+                                     MI.getDebugLoc(), DS_Warning);
+    LLVMContext &Ctx = B.getMF().getFunction().getContext();
+    Ctx.diagnose(NoTrap);
+  } else {
+    // Insert the debug-trap instruction.
+    B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
+                                            MachineInstr &MI) const {
+  MachineIRBuilder &B = Helper.MIRBuilder;
+  MachineRegisterInfo &MRI = *B.getMRI();
+
   // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
   auto IntrID = MI.getIntrinsicID();
   switch (IntrID) {
   case Intrinsic::amdgcn_if:
   case Intrinsic::amdgcn_else: {
     MachineInstr *Br = nullptr;
-    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
+    MachineBasicBlock *UncondBrTarget = nullptr;
+    if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
       const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
 
-      B.setInstr(*BrCond);
       Register Def = MI.getOperand(1).getReg();
       Register Use = MI.getOperand(3).getReg();
 
-      MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
-      if (Br)
-        BrTarget = Br->getOperand(0).getMBB();
-
+      MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
+      B.setInsertPt(B.getMBB(), BrCond->getIterator());
       if (IntrID == Intrinsic::amdgcn_if) {
         B.buildInstr(AMDGPU::SI_IF)
           .addDef(Def)
           .addUse(Use)
-          .addMBB(BrTarget);
+          .addMBB(UncondBrTarget);
       } else {
         B.buildInstr(AMDGPU::SI_ELSE)
           .addDef(Def)
           .addUse(Use)
-          .addMBB(BrTarget)
+          .addMBB(UncondBrTarget)
           .addImm(0);
       }
 
-      if (Br)
-        Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
+      if (Br) {
+        Br->getOperand(0).setMBB(CondBrTarget);
+      } else {
+        // The IRTranslator skips inserting the G_BR for fallthrough cases, but
+        // since we're swapping branch targets it needs to be reinserted.
+ // FIXME: IRTranslator should probably not do this + B.buildBr(*CondBrTarget); + } MRI.setRegClass(Def, TRI->getWaveMaskRegClass()); MRI.setRegClass(Use, TRI->getWaveMaskRegClass()); @@ -2393,17 +4200,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, } case Intrinsic::amdgcn_loop: { MachineInstr *Br = nullptr; - if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) { + MachineBasicBlock *UncondBrTarget = nullptr; + if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) { const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo()); - B.setInstr(*BrCond); - - // FIXME: Need to adjust branch targets based on unconditional branch. + MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB(); Register Reg = MI.getOperand(2).getReg(); + + B.setInsertPt(B.getMBB(), BrCond->getIterator()); B.buildInstr(AMDGPU::SI_LOOP) .addUse(Reg) - .addMBB(BrCond->getOperand(1).getMBB()); + .addMBB(UncondBrTarget); + + if (Br) + Br->getOperand(0).setMBB(CondBrTarget); + else + B.buildBr(*CondBrTarget); + MI.eraseFromParent(); BrCond->eraseFromParent(); MRI.setRegClass(Reg, TRI->getWaveMaskRegClass()); @@ -2413,6 +4227,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, return false; } case Intrinsic::amdgcn_kernarg_segment_ptr: + if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) { + // This only makes sense to call in a kernel, so just lower to null. + B.buildConstant(MI.getOperand(0).getReg(), 0); + MI.eraseFromParent(); + return true; + } + return legalizePreloadedArgIntrin( MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); case Intrinsic::amdgcn_implicitarg_ptr: @@ -2454,18 +4275,72 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI, case Intrinsic::amdgcn_is_private: return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS); case Intrinsic::amdgcn_wavefrontsize: { - B.setInstr(MI); B.buildConstant(MI.getOperand(0), ST.getWavefrontSize()); MI.eraseFromParent(); return true; } + case Intrinsic::amdgcn_s_buffer_load: + return legalizeSBufferLoad(MI, B, Helper.Observer); case Intrinsic::amdgcn_raw_buffer_store: - return legalizeRawBufferStore(MI, MRI, B, false); + case Intrinsic::amdgcn_struct_buffer_store: + return legalizeBufferStore(MI, MRI, B, false, false); case Intrinsic::amdgcn_raw_buffer_store_format: - return legalizeRawBufferStore(MI, MRI, B, true); - default: + case Intrinsic::amdgcn_struct_buffer_store_format: + return legalizeBufferStore(MI, MRI, B, false, true); + case Intrinsic::amdgcn_raw_tbuffer_store: + case Intrinsic::amdgcn_struct_tbuffer_store: + return legalizeBufferStore(MI, MRI, B, true, true); + case Intrinsic::amdgcn_raw_buffer_load: + case Intrinsic::amdgcn_struct_buffer_load: + return legalizeBufferLoad(MI, MRI, B, false, false); + case Intrinsic::amdgcn_raw_buffer_load_format: + case Intrinsic::amdgcn_struct_buffer_load_format: + return legalizeBufferLoad(MI, MRI, B, true, false); + case Intrinsic::amdgcn_raw_tbuffer_load: + case Intrinsic::amdgcn_struct_tbuffer_load: + return legalizeBufferLoad(MI, MRI, B, true, true); + case Intrinsic::amdgcn_raw_buffer_atomic_swap: + case Intrinsic::amdgcn_struct_buffer_atomic_swap: + case Intrinsic::amdgcn_raw_buffer_atomic_add: + case Intrinsic::amdgcn_struct_buffer_atomic_add: + case Intrinsic::amdgcn_raw_buffer_atomic_sub: + case Intrinsic::amdgcn_struct_buffer_atomic_sub: + case Intrinsic::amdgcn_raw_buffer_atomic_smin: + case Intrinsic::amdgcn_struct_buffer_atomic_smin: + case 
Intrinsic::amdgcn_raw_buffer_atomic_umin:
+  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+  case Intrinsic::amdgcn_raw_buffer_atomic_and:
+  case Intrinsic::amdgcn_struct_buffer_atomic_and:
+  case Intrinsic::amdgcn_raw_buffer_atomic_or:
+  case Intrinsic::amdgcn_struct_buffer_atomic_or:
+  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+    return legalizeBufferAtomic(MI, B, IntrID);
+  case Intrinsic::amdgcn_atomic_inc:
+    return legalizeAtomicIncDec(MI, B, true);
+  case Intrinsic::amdgcn_atomic_dec:
+    return legalizeAtomicIncDec(MI, B, false);
+  case Intrinsic::trap:
+    return legalizeTrapIntrinsic(MI, MRI, B);
+  case Intrinsic::debugtrap:
+    return legalizeDebugTrapIntrinsic(MI, MRI, B);
+  default: {
+    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+            AMDGPU::getImageDimIntrinsicInfo(IntrID))
+      return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
     return true;
   }
+  }
 
   return true;
 }
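
Note on the d16/TFE type rounding in legalizeImageIntrinsic above: the loaded payload is always rounded up to whole 32-bit dwords, and TFE appends one extra status dword beyond the data. The standalone sketch below (not part of the patch; EltSize and AdjustedNumElts are made-up example inputs that mirror the variable names in the packed-D16 branch) just reproduces that arithmetic for a packed d16 load of three elements.

    // Standalone sketch of the dword rounding done for a packed d16 image load.
    // Not LLVM code; the inputs are example values, not derived from an MI.
    #include <cstdio>

    int main() {
      const unsigned EltSize = 16;        // d16 element width in bits
      const unsigned AdjustedNumElts = 3; // elements remaining after dmask trimming
      const unsigned AdjustedBits = EltSize * AdjustedNumElts;

      // Round the payload up to a whole number of 32-bit dwords.
      const unsigned RoundedElts = (AdjustedBits + 31) / 32;
      const unsigned RoundedSize = 32 * RoundedElts;

      // With TFE, the hardware writes one extra status dword after the data.
      const unsigned TFEDwords = RoundedSize / 32 + 1;

      std::printf("RoundedTy: <%u x s%u>\n", RoundedSize / EltSize, EltSize); // <4 x s16>
      std::printf("TFETy:     <%u x s32>\n", TFEDwords);                      // <3 x s32>
      return 0;
    }

This corresponds to the path where hasUnpackedD16VMem() is false; on the unpacked path the patch instead gives every d16 element its own dword (RoundedTy = <AdjustedNumElts x s32>, TFETy = <AdjustedNumElts + 1 x s32>).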