author     Dimitry Andric <dim@FreeBSD.org>  2021-02-16 20:13:02 +0000
committer  Dimitry Andric <dim@FreeBSD.org>  2021-02-16 20:13:02 +0000
commit     b60736ec1405bb0a8dd40989f67ef4c93da068ab (patch)
tree       5c43fbb7c9fc45f0f87e0e6795a86267dbd12f9d /llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
parent     cfca06d7963fa0909f90483b42a6d7d194d01e08 (diff)
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp  1259
1 file changed, 876 insertions(+), 383 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 2976794b49c3..9f359c232981 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -15,18 +15,15 @@
#include "AMDGPU.h"
#include "AMDGPUGlobalISelUtils.h"
+#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
-#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
-#include "llvm/CodeGen/TargetOpcodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/DerivedTypes.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#define DEBUG_TYPE "amdgpu-legalinfo"
@@ -60,16 +57,30 @@ static LLT getPow2ScalarType(LLT Ty) {
return LLT::scalar(Pow2Bits);
}
+/// \returns true if this is an odd-sized vector which should be widened by
+/// adding an additional element. This is mostly to handle <3 x s16> -> <4 x
+/// s16>. This excludes s1 vectors, which should always be scalarized.
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
- return Ty.isVector() &&
- Ty.getNumElements() % 2 != 0 &&
- Ty.getElementType().getSizeInBits() < 32 &&
+ if (!Ty.isVector())
+ return false;
+
+ const LLT EltTy = Ty.getElementType();
+ const unsigned EltSize = EltTy.getSizeInBits();
+ return Ty.getNumElements() % 2 != 0 &&
+ EltSize > 1 && EltSize < 32 &&
Ty.getSizeInBits() % 32 != 0;
};
}
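The predicate above is small enough to check by hand. A standalone mirror of its logic, with LLT replaced by a plain struct (an illustrative sketch, not compiler code):

struct SimpleTy { unsigned NumElts; unsigned EltBits; }; // stand-in for LLT

// Mirrors the widening test above: odd element count, element size strictly
// between 1 and 32, and a total size that is not a multiple of 32 bits.
static bool isSmallOddVectorSketch(SimpleTy Ty) {
  return Ty.NumElts % 2 != 0 && Ty.EltBits > 1 && Ty.EltBits < 32 &&
         (Ty.NumElts * Ty.EltBits) % 32 != 0;
}
// isSmallOddVectorSketch({3, 16}) -> true  (<3 x s16> widens to <4 x s16>)
// isSmallOddVectorSketch({3, 1})  -> false (s1 vectors are scalarized instead)
// isSmallOddVectorSketch({3, 32}) -> false (32-bit elements are not "small")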
+static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ return Ty.getSizeInBits() % 32 == 0;
+ };
+}
+
static LegalityPredicate isWideVec16(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
@@ -115,20 +126,32 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
};
}
+static LLT getBitcastRegisterType(const LLT Ty) {
+ const unsigned Size = Ty.getSizeInBits();
+
+ LLT CoercedTy;
+ if (Size <= 32) {
+ // <2 x s8> -> s16
+ // <4 x s8> -> s32
+ return LLT::scalar(Size);
+ }
+
+ return LLT::scalarOrVector(Size / 32, 32);
+}
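For reference, the mapping computed by getBitcastRegisterType, restated over (element count, element bits) pairs (a hypothetical helper, for illustration only):

#include <utility>

// Returns {num elts, elt bits} of the coerced register type: total sizes of
// 32 bits or less become a single scalar of that width, larger sizes become
// a vector of 32-bit elements.
static std::pair<unsigned, unsigned>
bitcastRegisterTypeSketch(unsigned SizeInBits) {
  if (SizeInBits <= 32)
    return {1, SizeInBits};       // <2 x s8> -> s16, <4 x s8> -> s32
  return {SizeInBits / 32, 32};   // <8 x s8> -> <2 x s32>, <6 x s16> -> <3 x s32>
}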
+
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT Ty = Query.Types[TypeIdx];
- unsigned Size = Ty.getSizeInBits();
-
- LLT CoercedTy;
- if (Size <= 32) {
- // <2 x s8> -> s16
- // <4 x s8> -> s32
- CoercedTy = LLT::scalar(Size);
- } else
- CoercedTy = LLT::scalarOrVector(Size / 32, 32);
+ return std::make_pair(TypeIdx, getBitcastRegisterType(Ty));
+ };
+}
- return std::make_pair(TypeIdx, CoercedTy);
+static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ unsigned Size = Ty.getSizeInBits();
+ assert(Size % 32 == 0);
+ return std::make_pair(TypeIdx, LLT::scalarOrVector(Size / 32, 32));
};
}
@@ -213,7 +236,7 @@ static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
switch (AS) {
case AMDGPUAS::PRIVATE_ADDRESS:
// FIXME: Private element size.
- return 32;
+ return ST.enableFlatScratch() ? 128 : 32;
case AMDGPUAS::LOCAL_ADDRESS:
return ST.useDS128() ? 128 : 64;
case AMDGPUAS::GLOBAL_ADDRESS:
@@ -243,7 +266,7 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
unsigned RegSize = Ty.getSizeInBits();
unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- unsigned Align = Query.MMODescrs[0].AlignInBits;
+ unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
unsigned AS = Query.Types[1].getAddressSpace();
// All of these need to be custom lowered to cast the pointer operand.
@@ -286,9 +309,10 @@ static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
assert(RegSize >= MemSize);
- if (Align < MemSize) {
+ if (AlignBits < MemSize) {
const SITargetLowering *TLI = ST.getTargetLowering();
- if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
+ if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
+ Align(AlignBits / 8)))
return false;
}
@@ -308,7 +332,12 @@ static bool loadStoreBitcastWorkaround(const LLT Ty) {
return false;
if (!Ty.isVector())
return true;
- unsigned EltSize = Ty.getElementType().getSizeInBits();
+
+ LLT EltTy = Ty.getElementType();
+ if (EltTy.isPointer())
+ return true;
+
+ unsigned EltSize = EltTy.getSizeInBits();
return EltSize != 32 && EltSize != 64;
}
@@ -319,6 +348,66 @@ static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
!loadStoreBitcastWorkaround(Ty);
}
+/// Return true if a load or store of the type should be lowered with a bitcast
+/// to a different type.
+static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
+ const unsigned MemSizeInBits) {
+ const unsigned Size = Ty.getSizeInBits();
+ if (Size != MemSizeInBits)
+ return Size <= 32 && Ty.isVector();
+
+ if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
+ return true;
+ return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
+ !isRegisterVectorElementType(Ty.getElementType());
+}
+
+/// Return true if we should legalize a load by widening an odd-sized memory
+/// access up to the alignment. Note this is the case when the memory access
+/// itself changes, not the size of the result register.
+static bool shouldWidenLoad(const GCNSubtarget &ST, unsigned SizeInBits,
+ unsigned AlignInBits, unsigned AddrSpace,
+ unsigned Opcode) {
+ // We don't want to widen cases that are naturally legal.
+ if (isPowerOf2_32(SizeInBits))
+ return false;
+
+ // If we have 96-bit memory operations, we shouldn't touch them. Note we may
+ // end up widening these for a scalar load during RegBankSelect, since there
+ // aren't 96-bit scalar loads.
+ if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
+ return false;
+
+ if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode))
+ return false;
+
+ // A load is known dereferenceable up to the alignment, so it's legal to widen
+ // to it.
+ //
+ // TODO: Could check dereferenceable for less aligned cases.
+ unsigned RoundedSize = NextPowerOf2(SizeInBits);
+ if (AlignInBits < RoundedSize)
+ return false;
+
+ // Do not widen if it would introduce a slow unaligned load.
+ const SITargetLowering *TLI = ST.getTargetLowering();
+ bool Fast = false;
+ return TLI->allowsMisalignedMemoryAccessesImpl(
+ RoundedSize, AddrSpace, Align(AlignInBits / 8),
+ MachineMemOperand::MOLoad, &Fast) &&
+ Fast;
+}
+
+static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
+ unsigned Opcode) {
+ if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
+ return false;
+
+ return shouldWidenLoad(ST, Query.MMODescrs[0].SizeInBits,
+ Query.MMODescrs[0].AlignInBits,
+ Query.Types[1].getAddressSpace(), Opcode);
+}
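A constant-folded walk through the policy above, with the subtarget and fast-access checks stubbed out (assumed values; sketch only):

#include <cassert>

// Size/alignment core of shouldWidenLoad: widen only a non-power-of-2 access
// whose next power of two is still covered by the known alignment.
static bool shouldWidenLoadSketch(unsigned SizeInBits, unsigned AlignInBits,
                                  unsigned MaxSizeForAS) {
  if ((SizeInBits & (SizeInBits - 1)) == 0)
    return false; // already naturally legal
  if (SizeInBits >= MaxSizeForAS)
    return false;
  unsigned Rounded = 1;
  while (Rounded < SizeInBits)
    Rounded <<= 1; // NextPowerOf2
  return AlignInBits >= Rounded; // dereferenceable up to the alignment
}

int main() {
  assert(shouldWidenLoadSketch(96, 128, 512));  // s96 at 16-byte align -> s128
  assert(!shouldWidenLoadSketch(96, 32, 512));  // under-aligned: leave alone
}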
+
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const GCNTargetMachine &TM)
: ST(ST_) {
@@ -329,6 +418,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
const LLT S1 = LLT::scalar(1);
+ const LLT S8 = LLT::scalar(8);
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
@@ -337,6 +427,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT S512 = LLT::scalar(512);
const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
+ const LLT V2S8 = LLT::vector(2, 8);
const LLT V2S16 = LLT::vector(2, 16);
const LLT V4S16 = LLT::vector(4, 16);
@@ -410,48 +501,103 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
// elements for v3s16
getActionDefinitionsBuilder(G_PHI)
- .legalFor({S32, S64, V2S16, V4S16, S1, S128, S256})
+ .legalFor({S32, S64, V2S16, S16, V4S16, S1, S128, S256})
.legalFor(AllS32Vectors)
.legalFor(AllS64Vectors)
.legalFor(AddrSpaces64)
.legalFor(AddrSpaces32)
- .clampScalar(0, S32, S256)
+ .legalIf(isPointer(0))
+ .clampScalar(0, S16, S256)
.widenScalarToNextPow2(0, 32)
.clampMaxNumElements(0, S32, 16)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .legalIf(isPointer(0));
+ .scalarize(0);
- if (ST.hasVOP3PInsts()) {
+ if (ST.hasVOP3PInsts() && ST.hasAddNoCarry() && ST.hasIntClamp()) {
+ // Full set of gfx9 features.
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16, V2S16})
.clampScalar(0, S16, S32)
.clampMaxNumElements(0, S16, 2)
.scalarize(0)
.widenScalarToNextPow2(0, 32);
+
+ getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
+ .legalFor({S32, S16, V2S16}) // Clamp modifier
+ .minScalarOrElt(0, S16)
+ .clampMaxNumElements(0, S16, 2)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32)
+ .lower();
} else if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16})
.clampScalar(0, S16, S32)
.scalarize(0)
- .widenScalarToNextPow2(0, 32);
+ .widenScalarToNextPow2(0, 32); // FIXME: min should be 16
+
+ // Technically the saturating operations require clamp bit support, but this
+ // was introduced at the same time as 16-bit operations.
+ getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
+ .legalFor({S32, S16}) // Clamp modifier
+ .minScalar(0, S16)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 16)
+ .lower();
+
+ // We're just lowering this, but it helps get a better result to try to
+ // coerce to the desired type first.
+ getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
+ .minScalar(0, S16)
+ .scalarize(0)
+ .lower();
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32})
.clampScalar(0, S32, S32)
.scalarize(0);
+
+ if (ST.hasIntClamp()) {
+ getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
+ .legalFor({S32}) // Clamp modifier.
+ .scalarize(0)
+ .minScalarOrElt(0, S32)
+ .lower();
+ } else {
+ // Clamp bit support was added in VI, along with 16-bit operations.
+ getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
+ .minScalar(0, S32)
+ .scalarize(0)
+ .lower();
+ }
+
+ // FIXME: DAG expansion gets better results. The widening uses the smaller
+ // range values and goes for the min/max lowering directly.
+ getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
+ .minScalar(0, S32)
+ .scalarize(0)
+ .lower();
}
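When none of the clamp-modifier paths apply, .lower() expands the saturating operations generically. A host-side sketch of what the unsigned expansion computes (one possible form; the exact GISel sequence may differ):

#include <cassert>
#include <cstdint>

// G_UADDSAT / G_USUBSAT semantics: the result clamps instead of wrapping.
static uint32_t uaddsat32(uint32_t A, uint32_t B) {
  uint32_t Sum = A + B;
  return Sum < A ? UINT32_MAX : Sum; // overflow saturates to the maximum
}

static uint32_t usubsat32(uint32_t A, uint32_t B) {
  return A < B ? 0 : A - B; // underflow saturates to zero
}

int main() {
  assert(uaddsat32(0xFFFFFFF0u, 0x20u) == 0xFFFFFFFFu);
  assert(usubsat32(0x10u, 0x20u) == 0u);
}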
- // FIXME: Not really legal. Placeholder for custom lowering.
getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
.customFor({S32, S64})
.clampScalar(0, S32, S64)
.widenScalarToNextPow2(0, 32)
.scalarize(0);
- getActionDefinitionsBuilder({G_UMULH, G_SMULH})
- .legalFor({S32})
- .clampScalar(0, S32, S32)
- .scalarize(0);
+ auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
+ .legalFor({S32})
+ .maxScalarOrElt(0, S32);
+
+ if (ST.hasVOP3PInsts()) {
+ Mulh
+ .clampMaxNumElements(0, S8, 2)
+ .lowerFor({V2S8});
+ }
+
+ Mulh
+ .scalarize(0)
+ .lower();
// Report legal for any types we can handle anywhere. For the cases only legal
// on the SALU, RegBankSelect will be able to re-legalize.
@@ -479,9 +625,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_CONSTANT)
.legalFor({S1, S32, S64, S16, GlobalPtr,
LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
+ .legalIf(isPointer(0))
.clampScalar(0, S32, S64)
- .widenScalarToNextPow2(0)
- .legalIf(isPointer(0));
+ .widenScalarToNextPow2(0);
getActionDefinitionsBuilder(G_FCONSTANT)
.legalFor({S32, S64, S16})
@@ -505,8 +651,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor({{PrivatePtr, S32}});
getActionDefinitionsBuilder(G_GLOBAL_VALUE)
- .unsupportedFor({PrivatePtr})
- .custom();
+ .customIf(typeIsNot(0, PrivatePtr));
+
setAction({G_BLOCK_ADDR, CodePtr}, Legal);
auto &FPOpActions = getActionDefinitionsBuilder(
@@ -599,7 +745,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_FPEXT)
.legalFor({{S64, S32}, {S32, S16}})
- .lowerFor({{S64, S16}}) // FIXME: Implement
+ .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
.scalarize(0);
getActionDefinitionsBuilder(G_FSUB)
@@ -621,6 +767,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
FMad.scalarize(0)
.lower();
+ auto &FRem = getActionDefinitionsBuilder(G_FREM);
+ if (ST.has16BitInsts()) {
+ FRem.customFor({S16, S32, S64});
+ } else {
+ FRem.minScalar(0, S32)
+ .customFor({S32, S64});
+ }
+ FRem.scalarize(0);
+
// TODO: Do we need to clamp maximum bitwidth?
getActionDefinitionsBuilder(G_TRUNC)
.legalIf(isScalar(0))
@@ -648,12 +803,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.has16BitInsts())
IToFP.legalFor({{S16, S16}});
IToFP.clampScalar(1, S32, S64)
+ .minScalar(0, S32)
.scalarize(0)
.widenScalarToNextPow2(1);
auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
.legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
- .customFor({{S64, S64}});
+ .customFor({{S64, S64}})
+ .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
if (ST.has16BitInsts())
FPToI.legalFor({{S16, S16}});
else
@@ -663,7 +820,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.lower();
- getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
+ // Lower roundeven into G_FRINT
+ getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN})
.scalarize(0)
.lower();
@@ -685,16 +843,14 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
}
- // FIXME: Clamp offset operand.
getActionDefinitionsBuilder(G_PTR_ADD)
- .legalIf(isPointer(0))
- .scalarize(0);
+ .legalIf(all(isPointer(0), sameSize(0, 1)))
+ .scalarize(0)
+ .scalarSameSizeAs(1, 0);
getActionDefinitionsBuilder(G_PTRMASK)
- .legalIf(typeInSet(1, {S64, S32}))
- .minScalar(1, S32)
- .maxScalarIf(sizeIs(0, 32), 1, S32)
- .maxScalarIf(sizeIs(0, 64), 1, S64)
+ .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
+ .scalarSameSizeAs(1, 0)
.scalarize(0);
auto &CmpBuilder =
@@ -746,6 +902,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
ExpOps.clampScalar(0, MinScalarFPTy, S32)
.scalarize(0);
+ getActionDefinitionsBuilder(G_FPOWI)
+ .clampScalar(0, MinScalarFPTy, S32)
+ .lower();
+
// The 64-bit versions produce 32-bit results, but only on the SALU.
getActionDefinitionsBuilder(G_CTPOP)
.legalFor({{S32, S32}, {S32, S64}})
@@ -870,10 +1030,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Split vector extloads.
unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- unsigned Align = Query.MMODescrs[0].AlignInBits;
+ unsigned AlignBits = Query.MMODescrs[0].AlignInBits;
if (MemSize < DstTy.getSizeInBits())
- MemSize = std::max(MemSize, Align);
+ MemSize = std::max(MemSize, AlignBits);
if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
return true;
@@ -895,35 +1055,18 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return true;
}
- if (Align < MemSize) {
+ if (AlignBits < MemSize) {
const SITargetLowering *TLI = ST.getTargetLowering();
- return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8);
+ return !TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
+ Align(AlignBits / 8));
}
return false;
};
- const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
- unsigned Opc) -> bool {
- unsigned Size = Query.Types[0].getSizeInBits();
- if (isPowerOf2_32(Size))
- return false;
-
- if (Size == 96 && ST.hasDwordx3LoadStores())
- return false;
-
- unsigned AddrSpace = Query.Types[1].getAddressSpace();
- if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
- return false;
-
- unsigned Align = Query.MMODescrs[0].AlignInBits;
- unsigned RoundedSize = NextPowerOf2(Size);
- return (Align >= RoundedSize);
- };
-
- unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
- unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
- unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
+ unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
+ unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
+ unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
// TODO: Refine based on subtargets which support unaligned access or 128-bit
// LDS
@@ -981,31 +1124,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// 16-bit vector parts.
Actions.bitcastIf(
[=](const LegalityQuery &Query) -> bool {
- const LLT Ty = Query.Types[0];
- const unsigned Size = Ty.getSizeInBits();
-
- if (Size != Query.MMODescrs[0].SizeInBits)
- return Size <= 32 && Ty.isVector();
-
- if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
- return true;
- return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
- !isRegisterVectorElementType(Ty.getElementType());
+ return shouldBitcastLoadStoreType(ST, Query.Types[0],
+ Query.MMODescrs[0].SizeInBits);
}, bitcastToRegisterType(0));
+ if (!IsStore) {
+ // Widen suitably aligned loads by loading extra bytes. The standard
+ // legalization actions can't properly express widening memory operands.
+ Actions.customIf([=](const LegalityQuery &Query) -> bool {
+ return shouldWidenLoad(ST, Query, G_LOAD);
+ });
+ }
+
+ // FIXME: load/store narrowing should be moved to lower action
Actions
- .customIf(typeIs(1, Constant32Ptr))
- // Widen suitably aligned loads by loading extra elements.
- .moreElementsIf([=](const LegalityQuery &Query) {
- const LLT Ty = Query.Types[0];
- return Op == G_LOAD && Ty.isVector() &&
- shouldWidenLoadResult(Query, Op);
- }, moreElementsToNextPow2(0))
- .widenScalarIf([=](const LegalityQuery &Query) {
- const LLT Ty = Query.Types[0];
- return Op == G_LOAD && !Ty.isVector() &&
- shouldWidenLoadResult(Query, Op);
- }, widenScalarOrEltToNextPow2(0))
.narrowScalarIf(
[=](const LegalityQuery &Query) -> bool {
return !Query.Types[0].isVector() &&
@@ -1111,15 +1243,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// May need relegalization for the scalars.
return std::make_pair(0, EltTy);
})
- .minScalar(0, S32);
+ .lowerIfMemSizeNotPow2()
+ .minScalar(0, S32);
if (IsStore)
Actions.narrowScalarIf(isWideScalarTruncStore(0), changeTo(0, S32));
- // TODO: Need a bitcast lower option?
Actions
.widenScalarToNextPow2(0)
- .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
+ .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
+ .lower();
}
auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
@@ -1147,14 +1280,15 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
G_ATOMICRMW_UMIN})
.legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
- {S64, GlobalPtr}, {S64, LocalPtr}});
+ {S64, GlobalPtr}, {S64, LocalPtr},
+ {S32, RegionPtr}, {S64, RegionPtr}});
if (ST.hasFlatAddressSpace()) {
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
if (ST.hasLDSFPAtomics()) {
getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
- .legalFor({{S32, LocalPtr}});
+ .legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
}
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
@@ -1207,6 +1341,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Shifts.clampScalar(1, S32, S32);
Shifts.clampScalar(0, S16, S64);
Shifts.widenScalarToNextPow2(0, 16);
+
+ getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
+ .minScalar(0, S16)
+ .scalarize(0)
+ .lower();
} else {
// Make sure we legalize the shift amount type first, as the general
// expansion for the shifted type will produce much worse code if it hasn't
@@ -1214,6 +1353,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Shifts.clampScalar(1, S32, S32);
Shifts.clampScalar(0, S32, S64);
Shifts.widenScalarToNextPow2(0, 32);
+
+ getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
+ .minScalar(0, S32)
+ .scalarize(0)
+ .lower();
}
Shifts.scalarize(0);
@@ -1227,15 +1371,38 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT EltTy = Query.Types[EltTypeIdx];
const LLT VecTy = Query.Types[VecTypeIdx];
const LLT IdxTy = Query.Types[IdxTypeIdx];
- return (EltTy.getSizeInBits() == 16 ||
- EltTy.getSizeInBits() % 32 == 0) &&
- VecTy.getSizeInBits() % 32 == 0 &&
- VecTy.getSizeInBits() <= MaxRegisterSize &&
- IdxTy.getSizeInBits() == 32;
+ const unsigned EltSize = EltTy.getSizeInBits();
+ return (EltSize == 32 || EltSize == 64) &&
+ VecTy.getSizeInBits() % 32 == 0 &&
+ VecTy.getSizeInBits() <= MaxRegisterSize &&
+ IdxTy.getSizeInBits() == 32;
+ })
+ .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltNarrowerThan(VecTypeIdx, 32)),
+ bitcastToVectorElement32(VecTypeIdx))
+ //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
+ .bitcastIf(
+ all(sizeIsMultipleOf32(VecTypeIdx), scalarOrEltWiderThan(VecTypeIdx, 64)),
+ [=](const LegalityQuery &Query) {
+ // For > 64-bit element types, try to turn this into a 64-bit
+ // element vector since we may be able to do better indexing
+ // if this is scalar. If not, fall back to 32.
+ const LLT EltTy = Query.Types[EltTypeIdx];
+ const LLT VecTy = Query.Types[VecTypeIdx];
+ const unsigned DstEltSize = EltTy.getSizeInBits();
+ const unsigned VecSize = VecTy.getSizeInBits();
+
+ const unsigned TargetEltSize = DstEltSize % 64 == 0 ? 64 : 32;
+ return std::make_pair(
+ VecTypeIdx, LLT::vector(VecSize / TargetEltSize, TargetEltSize));
})
.clampScalar(EltTypeIdx, S32, S64)
.clampScalar(VecTypeIdx, S32, S64)
- .clampScalar(IdxTypeIdx, S32, S32);
+ .clampScalar(IdxTypeIdx, S32, S32)
+ .clampMaxNumElements(VecTypeIdx, S32, 32)
+ // TODO: Clamp elements for 64-bit vectors?
+ // It should only be necessary with variable indexes.
+ // As a last resort, lower to the stack
+ .lower();
}
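The two bitcastIf rules above retarget awkward element sizes before indexing. Restated as a sketch (illustrative only; 32- and 64-bit elements are already handled as legal):

// Picks the element size the vector is bitcast to before G_*_VECTOR_ELT:
// sub-32-bit elements are grouped into 32-bit lanes; over-64-bit elements
// split into 64-bit lanes when evenly divisible, otherwise 32-bit lanes.
static unsigned targetEltSizeSketch(unsigned EltSizeInBits) {
  if (EltSizeInBits < 32)
    return 32; // e.g. <8 x s16> (128 bits) -> <4 x s32>
  return EltSizeInBits % 64 == 0 ? 64 : 32; // e.g. <2 x s128> -> <4 x s64>
}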
getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
@@ -1306,7 +1473,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Clamp maximum size
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
- .legalIf(isRegisterType(0));
+ .legalIf(all(isRegisterType(0), isRegisterType(1)))
+ .clampMaxNumElements(0, S32, 32)
+ .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
+ .clampMaxNumElements(0, S16, 64);
// TODO: Don't fully scalarize v2s16 pieces? Or combine out those
// pre-legalize.
@@ -1335,6 +1505,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
auto &Builder = getActionDefinitionsBuilder(Op)
+ .legalIf(all(isRegisterType(0), isRegisterType(1)))
.lowerFor({{S16, V2S16}})
.lowerIf([=](const LegalityQuery &Query) {
const LLT BigTy = Query.Types[BigTyIdx];
@@ -1390,19 +1561,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
}
return std::make_pair(BigTyIdx, LLT::scalar(NewSizeInBits));
})
- .legalIf([=](const LegalityQuery &Query) {
- const LLT &BigTy = Query.Types[BigTyIdx];
- const LLT &LitTy = Query.Types[LitTyIdx];
-
- if (BigTy.isVector() && BigTy.getSizeInBits() < 32)
- return false;
- if (LitTy.isVector() && LitTy.getSizeInBits() < 32)
- return false;
-
- return BigTy.getSizeInBits() % 16 == 0 &&
- LitTy.getSizeInBits() % 16 == 0 &&
- BigTy.getSizeInBits() <= MaxRegisterSize;
- })
// Any vectors left are the wrong size. Scalarize them.
.scalarize(0)
.scalarize(1);
@@ -1427,12 +1585,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
SextInReg.lowerFor({{S32}, {S64}});
}
- // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
- // available, and is selectively legal for s16, s32, v2s16.
- getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
- .scalarize(0)
- .clampScalar(0, S16, S32);
-
SextInReg
.scalarize(0)
.clampScalar(0, S32, S64)
@@ -1446,11 +1598,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
.legalFor({S64});
+ getActionDefinitionsBuilder(G_FENCE)
+ .alwaysLegal();
+
getActionDefinitionsBuilder({
// TODO: Verify V_BFI_B32 is generated from expanded bit ops
G_FCOPYSIGN,
G_ATOMIC_CMPXCHG_WITH_SUCCESS,
+ G_ATOMICRMW_NAND,
+ G_ATOMICRMW_FSUB,
G_READ_REGISTER,
G_WRITE_REGISTER,
@@ -1474,7 +1631,6 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
MachineRegisterInfo &MRI = *B.getMRI();
- GISelChangeObserver &Observer = Helper.Observer;
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
@@ -1483,6 +1639,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
return legalizeFrint(MI, MRI, B);
case TargetOpcode::G_FCEIL:
return legalizeFceil(MI, MRI, B);
+ case TargetOpcode::G_FREM:
+ return legalizeFrem(MI, MRI, B);
case TargetOpcode::G_INTRINSIC_TRUNC:
return legalizeIntrinsicTrunc(MI, MRI, B);
case TargetOpcode::G_SITOFP:
@@ -1510,7 +1668,7 @@ bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
case TargetOpcode::G_GLOBAL_VALUE:
return legalizeGlobalValue(MI, MRI, B);
case TargetOpcode::G_LOAD:
- return legalizeLoad(MI, MRI, B, Observer);
+ return legalizeLoad(Helper, MI);
case TargetOpcode::G_FMAD:
return legalizeFMad(MI, MRI, B);
case TargetOpcode::G_FDIV:
@@ -1580,8 +1738,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
Register QueuePtr = MRI.createGenericVirtualRegister(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
- const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- if (!loadInputValue(QueuePtr, B, &MFI->getArgInfo().QueuePtr))
+ if (!loadInputValue(QueuePtr, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
return Register();
// Offset into amd_queue_t for group_segment_aperture_base_hi /
@@ -1623,8 +1780,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
const AMDGPUTargetMachine &TM
= static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- if (ST.getTargetLowering()->isNoopAddrSpaceCast(SrcAS, DestAS)) {
+ if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
return true;
}
@@ -1721,6 +1877,7 @@ bool AMDGPULegalizerInfo::legalizeFrint(
auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
+ MI.eraseFromParent();
return true;
}
@@ -1752,7 +1909,24 @@ bool AMDGPULegalizerInfo::legalizeFceil(
return true;
}
-static MachineInstrBuilder extractF64Exponent(unsigned Hi,
+bool AMDGPULegalizerInfo::legalizeFrem(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0Reg = MI.getOperand(1).getReg();
+ Register Src1Reg = MI.getOperand(2).getReg();
+ auto Flags = MI.getFlags();
+ LLT Ty = MRI.getType(DstReg);
+
+ auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
+ auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
+ auto Neg = B.buildFNeg(Ty, Trunc, Flags);
+ B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
+ MI.eraseFromParent();
+ return true;
+}
+
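legalizeFrem builds the standard trunc-based remainder, r = x - trunc(x/y) * y, fused as fma(-trunc(x/y), y, x). A host-side sketch of the same sequence (not the compiler code itself):

#include <cassert>
#include <cmath>

static double fremExpansion(double X, double Y) {
  double Q = std::trunc(X / Y); // G_FDIV + G_INTRINSIC_TRUNC
  return std::fma(-Q, Y, X);    // G_FNEG feeding G_FMA
}

int main() {
  assert(fremExpansion(5.5, 2.0) == std::fmod(5.5, 2.0)); // both exactly 1.5
}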
+static MachineInstrBuilder extractF64Exponent(Register Hi,
MachineIRBuilder &B) {
const unsigned FractBits = 52;
const unsigned ExpBits = 11;
@@ -1762,6 +1936,7 @@ static MachineInstrBuilder extractF64Exponent(unsigned Hi,
auto Const1 = B.buildConstant(S32, ExpBits);
auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32}, false)
+ .addUse(Hi)
.addUse(Const0.getReg(0))
.addUse(Const1.getReg(0));
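The amdgcn_ubfe call above extracts the 11-bit exponent field from the high dword of the f64. A plain-C++ sketch of just that bit extraction (any bias handling happens outside this excerpt):

#include <cstdint>

// f64 layout: 1 sign bit, 11 exponent bits, 52 fraction bits; the exponent
// field occupies bits [20, 31) of the high 32-bit word.
static uint32_t f64ExponentField(uint32_t HiWord) {
  const unsigned FractBits = 52, ExpBits = 11;
  return (HiWord >> (FractBits - 32)) & ((1u << ExpBits) - 1);
}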
@@ -1809,6 +1984,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
+ MI.eraseFromParent();
return true;
}
@@ -1907,10 +2083,11 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
// FIXME: Artifact combiner probably should have replaced the truncated
// constant before this, so we shouldn't need
// getConstantVRegValWithLookThrough.
- Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
- MI.getOperand(2).getReg(), MRI);
- if (!IdxVal) // Dynamic case will be selected to register indexing.
+ Optional<ValueAndVReg> MaybeIdxVal =
+ getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
return true;
+ const int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
Register Dst = MI.getOperand(0).getReg();
Register Vec = MI.getOperand(1).getReg();
@@ -1919,8 +2096,8 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Dst));
- if (IdxVal->Value < VecTy.getNumElements())
- B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
+ if (IdxVal < VecTy.getNumElements())
+ B.buildExtract(Dst, Vec, IdxVal * EltTy.getSizeInBits());
else
B.buildUndef(Dst);
@@ -1938,11 +2115,12 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
// FIXME: Artifact combiner probably should have replaced the truncated
// constant before this, so we shouldn't need
// getConstantVRegValWithLookThrough.
- Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
- MI.getOperand(3).getReg(), MRI);
- if (!IdxVal) // Dynamic case will be selected to register indexing.
+ Optional<ValueAndVReg> MaybeIdxVal =
+ getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
+ if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
return true;
+ int64_t IdxVal = MaybeIdxVal->Value.getSExtValue();
Register Dst = MI.getOperand(0).getReg();
Register Vec = MI.getOperand(1).getReg();
Register Ins = MI.getOperand(2).getReg();
@@ -1951,8 +2129,8 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Ins));
- if (IdxVal->Value < VecTy.getNumElements())
- B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
+ if (IdxVal < VecTy.getNumElements())
+ B.buildInsert(Dst, Vec, Ins, IdxVal * EltTy.getSizeInBits());
else
B.buildUndef(Dst);
@@ -2043,7 +2221,9 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
// variable, but since the encoding of $symbol starts 4 bytes after the start
// of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
// small. This requires us to add 4 to the global variable offset in order to
- // compute the correct address.
+ // compute the correct address. Similarly for the s_addc_u32 instruction, the
+ // encoding of $symbol starts 12 bytes after the start of the s_add_u32
+ // instruction.
LLT ConstPtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
@@ -2057,7 +2237,7 @@ bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
if (GAFlags == SIInstrInfo::MO_NONE)
MIB.addImm(0);
else
- MIB.addGlobalAddress(GV, Offset + 4, GAFlags + 1);
+ MIB.addGlobalAddress(GV, Offset + 12, GAFlags + 1);
B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
@@ -2078,7 +2258,7 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
- if (!MFI->isEntryFunction()) {
+ if (!MFI->isModuleEntryFunction()) {
const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
@@ -2104,6 +2284,25 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
return true; // Leave in place.
}
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
+ Type *Ty = GV->getValueType();
+ // HIP uses an unsized array `extern __shared__ T s[]` or similar
+ // zero-sized type in other languages to declare the dynamic shared
+ // memory which size is not known at the compile time. They will be
+ // allocated by the runtime and placed directly after the static
+ // allocated ones. They all share the same offset.
+ if (B.getDataLayout().getTypeAllocSize(Ty).isZero()) {
+ // Adjust alignment for that dynamic shared memory array.
+ MFI->setDynLDSAlign(B.getDataLayout(), *cast<GlobalVariable>(GV));
+ LLT S32 = LLT::scalar(32);
+ auto Sz =
+ B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32}, false);
+ B.buildIntToPtr(DstReg, Sz);
+ MI.eraseFromParent();
+ return true;
+ }
+ }
+
B.buildConstant(
DstReg,
MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
@@ -2154,15 +2353,90 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
return true;
}
-bool AMDGPULegalizerInfo::legalizeLoad(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B, GISelChangeObserver &Observer) const {
- LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
- auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
- Observer.changingInstr(MI);
- MI.getOperand(1).setReg(Cast.getReg(0));
- Observer.changedInstr(MI);
- return true;
+static LLT widenToNextPowerOf2(LLT Ty) {
+ if (Ty.isVector())
+ return Ty.changeNumElements(PowerOf2Ceil(Ty.getNumElements()));
+ return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
+}
+
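PowerOf2Ceil drives both the scalar and the vector case of widenToNextPowerOf2. A quick standalone check of the values involved (sketch):

#include <cassert>

static unsigned powerOf2CeilSketch(unsigned V) {
  unsigned R = 1;
  while (R < V)
    R <<= 1;
  return R;
}

int main() {
  assert(powerOf2CeilSketch(96) == 128); // s96 load widens to s128
  assert(powerOf2CeilSketch(3) == 4);    // <3 x sN> widens to <4 x sN>
}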
+bool AMDGPULegalizerInfo::legalizeLoad(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+ GISelChangeObserver &Observer = Helper.Observer;
+
+ Register PtrReg = MI.getOperand(1).getReg();
+ LLT PtrTy = MRI.getType(PtrReg);
+ unsigned AddrSpace = PtrTy.getAddressSpace();
+
+ if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
+ auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
+ Observer.changingInstr(MI);
+ MI.getOperand(1).setReg(Cast.getReg(0));
+ Observer.changedInstr(MI);
+ return true;
+ }
+
+ Register ValReg = MI.getOperand(0).getReg();
+ LLT ValTy = MRI.getType(ValReg);
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ const unsigned ValSize = ValTy.getSizeInBits();
+ const unsigned MemSize = 8 * MMO->getSize();
+ const Align MemAlign = MMO->getAlign();
+ const unsigned AlignInBits = 8 * MemAlign.value();
+
+ // Widen non-power-of-2 loads to the alignment if needed
+ if (shouldWidenLoad(ST, MemSize, AlignInBits, AddrSpace, MI.getOpcode())) {
+ const unsigned WideMemSize = PowerOf2Ceil(MemSize);
+
+ // This was already the correct extending load result type, so just adjust
+ // the memory type.
+ if (WideMemSize == ValSize) {
+ MachineFunction &MF = B.getMF();
+
+ MachineMemOperand *WideMMO =
+ MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
+ Observer.changingInstr(MI);
+ MI.setMemRefs(MF, {WideMMO});
+ Observer.changedInstr(MI);
+ return true;
+ }
+
+ // Don't bother handling an edge case that should probably never be produced.
+ if (ValSize > WideMemSize)
+ return false;
+
+ LLT WideTy = widenToNextPowerOf2(ValTy);
+
+ Register WideLoad;
+ if (!WideTy.isVector()) {
+ WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
+ B.buildTrunc(ValReg, WideLoad).getReg(0);
+ } else {
+ // Extract the subvector.
+
+ if (isRegisterType(ValTy)) {
+ // If this a case where G_EXTRACT is legal, use it.
+ // (e.g. <3 x s32> -> <4 x s32>)
+ WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
+ B.buildExtract(ValReg, WideLoad, 0);
+ } else {
+ // For cases where the widened type isn't a nice register value, unmerge
+ // from a widened register (e.g. <3 x s16> -> <4 x s16>)
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+ WideLoad = Helper.widenWithUnmerge(WideTy, ValReg);
+ B.setInsertPt(B.getMBB(), MI.getIterator());
+ B.buildLoadFromOffset(WideLoad, PtrReg, *MMO, 0);
+ }
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ return false;
}
bool AMDGPULegalizerInfo::legalizeFMad(
@@ -2194,8 +2468,7 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
Register CmpVal = MI.getOperand(2).getReg();
Register NewVal = MI.getOperand(3).getReg();
- assert(SITargetLowering::isFlatGlobalAddrSpace(
- MRI.getType(PtrReg).getAddressSpace()) &&
+ assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
"this should not have been custom lowered");
LLT ValTy = MRI.getType(CmpVal);
@@ -2364,23 +2637,42 @@ bool AMDGPULegalizerInfo::legalizeBuildVector(
return true;
}
+// Check that this is a G_XOR x, -1
+static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
+ if (MI.getOpcode() != TargetOpcode::G_XOR)
+ return false;
+ auto ConstVal = getConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
+ return ConstVal && *ConstVal == -1;
+}
+
// Return the use branch instruction, otherwise null if the usage is invalid.
-static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineInstr *&Br,
- MachineBasicBlock *&UncondBrTarget) {
+static MachineInstr *
+verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
+ MachineBasicBlock *&UncondBrTarget, bool &Negated) {
Register CondDef = MI.getOperand(0).getReg();
if (!MRI.hasOneNonDBGUse(CondDef))
return nullptr;
MachineBasicBlock *Parent = MI.getParent();
- MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
- if (UseMI.getParent() != Parent ||
- UseMI.getOpcode() != AMDGPU::G_BRCOND)
+ MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
+
+ if (isNot(MRI, *UseMI)) {
+ Register NegatedCond = UseMI->getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(NegatedCond))
+ return nullptr;
+
+ // We're deleting the def of this value, so we need to remove it.
+ UseMI->eraseFromParent();
+
+ UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
+ Negated = true;
+ }
+
+ if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
return nullptr;
// Make sure the cond br is followed by a G_BR, or is the last instruction.
- MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
+ MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
if (Next == Parent->end()) {
MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
@@ -2393,84 +2685,19 @@ static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
UncondBrTarget = Br->getOperand(0).getMBB();
}
- return &UseMI;
-}
-
-Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
- MachineRegisterInfo &MRI,
- Register LiveIn,
- Register PhyReg) const {
- assert(PhyReg.isPhysical() && "Physical register expected");
-
- // Insert the live-in copy, if required, by defining destination virtual
- // register.
- // FIXME: It seems EmitLiveInCopies isn't called anywhere?
- if (!MRI.getVRegDef(LiveIn)) {
- // FIXME: Should have scoped insert pt
- MachineBasicBlock &OrigInsBB = B.getMBB();
- auto OrigInsPt = B.getInsertPt();
-
- MachineBasicBlock &EntryMBB = B.getMF().front();
- EntryMBB.addLiveIn(PhyReg);
- B.setInsertPt(EntryMBB, EntryMBB.begin());
- B.buildCopy(LiveIn, PhyReg);
-
- B.setInsertPt(OrigInsBB, OrigInsPt);
- }
-
- return LiveIn;
-}
-
-Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
- MachineRegisterInfo &MRI,
- Register PhyReg, LLT Ty,
- bool InsertLiveInCopy) const {
- assert(PhyReg.isPhysical() && "Physical register expected");
-
- // Get or create virtual live-in regester
- Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
- if (!LiveIn) {
- LiveIn = MRI.createGenericVirtualRegister(Ty);
- MRI.addLiveIn(PhyReg, LiveIn);
- }
-
- // When the actual true copy required is from virtual register to physical
- // register (to be inserted later), live-in copy insertion from physical
- // to register virtual register is not required
- if (!InsertLiveInCopy)
- return LiveIn;
-
- return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
-}
-
-const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
- MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
- const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
- const ArgDescriptor *Arg;
- const TargetRegisterClass *RC;
- LLT ArgTy;
- std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
- if (!Arg) {
- LLVM_DEBUG(dbgs() << "Required arg register missing\n");
- return nullptr;
- }
- return Arg;
+ return UseMI;
}
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
- const ArgDescriptor *Arg) const {
- if (!Arg->isRegister() || !Arg->getRegister().isValid())
- return false; // TODO: Handle these
-
- Register SrcReg = Arg->getRegister();
- assert(SrcReg.isPhysical() && "Physical register expected");
+ const ArgDescriptor *Arg,
+ const TargetRegisterClass *ArgRC,
+ LLT ArgTy) const {
+ MCRegister SrcReg = Arg->getRegister();
+ assert(Register::isPhysicalRegister(SrcReg) && "Physical register expected");
assert(DstReg.isVirtual() && "Virtual register expected");
- MachineRegisterInfo &MRI = *B.getMRI();
-
- LLT Ty = MRI.getType(DstReg);
- Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
-
+ Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg, *ArgRC,
+ ArgTy);
if (Arg->isMasked()) {
// TODO: Should we try to emit this once in the entry block?
const LLT S32 = LLT::scalar(32);
@@ -2492,15 +2719,24 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
return true;
}
-bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
- MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+bool AMDGPULegalizerInfo::loadInputValue(
+ Register DstReg, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ const ArgDescriptor *Arg;
+ const TargetRegisterClass *ArgRC;
+ LLT ArgTy;
+ std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
- const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
- if (!Arg)
- return false;
+ if (!Arg->isRegister() || !Arg->getRegister().isValid())
+ return false; // TODO: Handle these
+ return loadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
+}
- if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
+bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+ if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
return false;
MI.eraseFromParent();
@@ -2516,9 +2752,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
LLT S32 = LLT::scalar(32);
LLT S64 = LLT::scalar(64);
- if (legalizeFastUnsafeFDIV(MI, MRI, B))
- return true;
-
if (DstTy == S16)
return legalizeFDIV16(MI, MRI, B);
if (DstTy == S32)
@@ -2813,22 +3046,14 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
-
uint16_t Flags = MI.getFlags();
-
LLT ResTy = MRI.getType(Res);
- LLT S32 = LLT::scalar(32);
- LLT S64 = LLT::scalar(64);
const MachineFunction &MF = B.getMF();
- bool Unsafe =
- MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
-
- if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
- return false;
+ bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
+ MI.getFlag(MachineInstr::FmAfn);
- if (!Unsafe && ResTy == S32 &&
- MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
+ if (!AllowInaccurateRcp)
return false;
if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
@@ -2855,22 +3080,58 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
}
// x / y -> x * (1.0 / y)
- if (Unsafe) {
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
- .addUse(RHS)
- .setMIFlags(Flags);
- B.buildFMul(Res, LHS, RCP, Flags);
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+ .addUse(RHS)
+ .setMIFlags(Flags);
+ B.buildFMul(Res, LHS, RCP, Flags);
- MI.eraseFromParent();
- return true;
- }
+ MI.eraseFromParent();
+ return true;
+}
- return false;
+bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV64(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register Res = MI.getOperand(0).getReg();
+ Register X = MI.getOperand(1).getReg();
+ Register Y = MI.getOperand(2).getReg();
+ uint16_t Flags = MI.getFlags();
+ LLT ResTy = MRI.getType(Res);
+
+ const MachineFunction &MF = B.getMF();
+ bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
+ MI.getFlag(MachineInstr::FmAfn);
+
+ if (!AllowInaccurateRcp)
+ return false;
+
+ auto NegY = B.buildFNeg(ResTy, Y);
+ auto One = B.buildFConstant(ResTy, 1.0);
+
+ auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+ .addUse(Y)
+ .setMIFlags(Flags);
+
+ auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
+ R = B.buildFMA(ResTy, Tmp0, R, R);
+
+ auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
+ R = B.buildFMA(ResTy, Tmp1, R, R);
+
+ auto Ret = B.buildFMul(ResTy, X, R);
+ auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
+
+ B.buildFMA(Res, Tmp2, R, Ret);
+ MI.eraseFromParent();
+ return true;
}
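The f64 fast path above is two Newton-Raphson refinements of the hardware reciprocal estimate followed by one residual correction. A host-side sketch of the same arithmetic, with the rcp estimate passed in (assumed to be a coarse 1/y approximation):

#include <cmath>

static double fdiv64FastExpansion(double X, double Y, double RcpEstimate) {
  double NegY = -Y;
  double R = RcpEstimate;
  R = std::fma(std::fma(NegY, R, 1.0), R, R); // Newton-Raphson step 1
  R = std::fma(std::fma(NegY, R, 1.0), R, R); // Newton-Raphson step 2
  double Ret = X * R;
  return std::fma(std::fma(NegY, Ret, X), R, Ret); // residual correction
}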
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
+ if (legalizeFastUnsafeFDIV(MI, MRI, B))
+ return true;
+
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -2933,6 +3194,9 @@ static void toggleSPDenormMode(bool Enable,
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
+ if (legalizeFastUnsafeFDIV(MI, MRI, B))
+ return true;
+
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -2999,6 +3263,9 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
+ if (legalizeFastUnsafeFDIV64(MI, MRI, B))
+ return true;
+
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -3109,35 +3376,118 @@ bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
return true;
}
-bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
+// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
+// FIXME: Why do we handle this one but not other removed instructions?
+//
+// Reciprocal square root. The clamp prevents infinite results, clamping
+// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
+// +-max_float.
+bool AMDGPULegalizerInfo::legalizeRsqClampIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ return true;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(2).getReg();
+ auto Flags = MI.getFlags();
+
+ LLT Ty = MRI.getType(Dst);
+
+ const fltSemantics *FltSemantics;
+ if (Ty == LLT::scalar(32))
+ FltSemantics = &APFloat::IEEEsingle();
+ else if (Ty == LLT::scalar(64))
+ FltSemantics = &APFloat::IEEEdouble();
+ else
+ return false;
+
+ auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty}, false)
+ .addUse(Src)
+ .setMIFlags(Flags);
+
+ // We don't need to concern ourselves with the snan handling difference,
+ // since the rsq already quieted the input (or didn't), so use the variant
+ // that will select directly.
const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
- if (!MFI->isEntryFunction()) {
- return legalizePreloadedArgIntrin(MI, MRI, B,
- AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+ const bool UseIEEE = MFI->getMode().IEEE;
+
+ auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
+ auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
+ B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
+
+ auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
+
+ if (UseIEEE)
+ B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
+ else
+ B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
+ MI.eraseFromParent();
+ return true;
+}
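Numerically, the expansion above is clamp(rsq(x), -max_float, +max_float), with the IEEE mode bit only choosing between the fminnum/fmaxnum flavors. A sketch with a plain 1/sqrt standing in for amdgcn_rsq:

#include <algorithm>
#include <cfloat>
#include <cmath>

static float rsqClampExpansion(float X) {
  float Rsq = 1.0f / std::sqrt(X);                   // amdgcn_rsq stand-in
  return std::max(std::min(Rsq, FLT_MAX), -FLT_MAX); // fminnum then fmaxnum
}
// rsqClampExpansion(0.0f) yields FLT_MAX instead of +infinity.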
+
+static unsigned getDSFPAtomicOpcode(Intrinsic::ID IID) {
+ switch (IID) {
+ case Intrinsic::amdgcn_ds_fadd:
+ return AMDGPU::G_ATOMICRMW_FADD;
+ case Intrinsic::amdgcn_ds_fmin:
+ return AMDGPU::G_AMDGPU_ATOMIC_FMIN;
+ case Intrinsic::amdgcn_ds_fmax:
+ return AMDGPU::G_AMDGPU_ATOMIC_FMAX;
+ default:
+ llvm_unreachable("not a DS FP intrinsic");
}
+}
+
+bool AMDGPULegalizerInfo::legalizeDSAtomicFPIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI,
+ Intrinsic::ID IID) const {
+ GISelChangeObserver &Observer = Helper.Observer;
+ Observer.changingInstr(MI);
+
+ MI.setDesc(ST.getInstrInfo()->get(getDSFPAtomicOpcode(IID)));
+
+ // The remaining operands were used to set fields in the MemOperand on
+ // construction.
+ for (int I = 6; I > 3; --I)
+ MI.RemoveOperand(I);
+
+ MI.RemoveOperand(1); // Remove the intrinsic ID.
+ Observer.changedInstr(MI);
+ return true;
+}
+bool AMDGPULegalizerInfo::getImplicitArgPtr(Register DstReg,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
uint64_t Offset =
ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
- Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
- const ArgDescriptor *Arg;
- const TargetRegisterClass *RC;
- LLT ArgTy;
- std::tie(Arg, RC, ArgTy) =
- MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
- if (!Arg)
- return false;
-
Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
- if (!loadInputValue(KernargPtrReg, B, Arg))
+ if (!loadInputValue(KernargPtrReg, B,
+ AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
return false;
+ // FIXME: This should be nuw
B.buildPtrAdd(DstReg, KernargPtrReg, B.buildConstant(IdxTy, Offset).getReg(0));
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ if (!MFI->isEntryFunction()) {
+ return legalizePreloadedArgIntrin(MI, MRI, B,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ if (!getImplicitArgPtr(DstReg, MRI, B))
+ return false;
+
MI.eraseFromParent();
return true;
}
@@ -3147,7 +3497,9 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
- auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
+ auto Unmerge = B.buildUnmerge(LLT::scalar(32), MI.getOperand(2).getReg());
+ Register Hi32 = Unmerge.getReg(1);
+
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
MI.eraseFromParent();
return true;
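The unmerge + icmp built above implements the usual flat-pointer segment test: compare the pointer's high dword against the segment's aperture base. As a sketch:

#include <cstdint>

static bool isSegmentSketch(uint64_t FlatPtr, uint32_t ApertureBase) {
  return uint32_t(FlatPtr >> 32) == ApertureBase; // hi dword vs. aperture
}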
@@ -3165,11 +3517,10 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
const unsigned MaxImm = 4095;
Register BaseReg;
unsigned TotalConstOffset;
- MachineInstr *OffsetDef;
const LLT S32 = LLT::scalar(32);
- std::tie(BaseReg, TotalConstOffset, OffsetDef)
- = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
+ std::tie(BaseReg, TotalConstOffset) =
+ AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
unsigned ImmOffset = TotalConstOffset;
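splitBufferOffsets divides a buffer offset between the base register and the 12-bit immediate field (MaxImm = 4095 above). A deliberately simplified sketch of that split; the in-tree logic handles more cases, so treat this as the basic idea only:

#include <utility>

// Returns {base-register part, immediate part} with the immediate <= 4095.
static std::pair<unsigned, unsigned> splitOffsetSketch(unsigned Total) {
  const unsigned MaxImm = 4095;
  if (Total <= MaxImm)
    return {0, Total};           // everything fits the immediate
  unsigned Imm = Total % 4096;   // keep the low 12 bits in the immediate
  return {Total - Imm, Imm};
}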
@@ -3205,24 +3556,58 @@ AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
MachineRegisterInfo &MRI,
- Register Reg) const {
- if (!ST.hasUnpackedD16VMem())
- return Reg;
-
+ Register Reg,
+ bool ImageStore) const {
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
LLT StoreVT = MRI.getType(Reg);
assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
- auto Unmerge = B.buildUnmerge(S16, Reg);
+ if (ST.hasUnpackedD16VMem()) {
+ auto Unmerge = B.buildUnmerge(S16, Reg);
- SmallVector<Register, 4> WideRegs;
- for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
- WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
+ SmallVector<Register, 4> WideRegs;
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
+
+ int NumElts = StoreVT.getNumElements();
+
+ return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+ }
+
+ if (ImageStore && ST.hasImageStoreD16Bug()) {
+ if (StoreVT.getNumElements() == 2) {
+ SmallVector<Register, 4> PackedRegs;
+ Reg = B.buildBitcast(S32, Reg).getReg(0);
+ PackedRegs.push_back(Reg);
+ PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
+ return B.buildBuildVector(LLT::vector(2, S32), PackedRegs).getReg(0);
+ }
+
+ if (StoreVT.getNumElements() == 3) {
+ SmallVector<Register, 4> PackedRegs;
+ auto Unmerge = B.buildUnmerge(S16, Reg);
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ PackedRegs.push_back(Unmerge.getReg(I));
+ PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
+ Reg = B.buildBuildVector(LLT::vector(6, S16), PackedRegs).getReg(0);
+ return B.buildBitcast(LLT::vector(3, S32), Reg).getReg(0);
+ }
+
+ if (StoreVT.getNumElements() == 4) {
+ SmallVector<Register, 4> PackedRegs;
+ Reg = B.buildBitcast(LLT::vector(2, S32), Reg).getReg(0);
+ auto Unmerge = B.buildUnmerge(S32, Reg);
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ PackedRegs.push_back(Unmerge.getReg(I));
+ PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
+ return B.buildBuildVector(LLT::vector(4, S32), PackedRegs).getReg(0);
+ }
- int NumElts = StoreVT.getNumElements();
+ llvm_unreachable("invalid data type");
+ }
- return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
+ return Reg;
}
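On unpacked-d16 subtargets the first branch above gives every 16-bit element its own 32-bit lane. A host-side sketch of that layout change (any-extension modeled as zero-extension, since only the low half is meaningful):

#include <array>
#include <cstddef>
#include <cstdint>

static std::array<uint32_t, 4> unpackD16(const std::array<uint16_t, 4> &V) {
  std::array<uint32_t, 4> Wide{};
  for (std::size_t I = 0; I < V.size(); ++I)
    Wide[I] = V[I]; // each s16 any-extended into its own s32 lane
  return Wide;
}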
Register AMDGPULegalizerInfo::fixStoreSourceType(
@@ -3513,6 +3898,9 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
default:
llvm_unreachable("unhandled atomic opcode");
}
@@ -3523,12 +3911,20 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
Intrinsic::ID IID) const {
const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
+ const bool HasReturn = MI.getNumExplicitDefs() != 0;
- Register Dst = MI.getOperand(0).getReg();
- Register VData = MI.getOperand(2).getReg();
+ Register Dst;
- Register CmpVal;
int OpOffset = 0;
+ if (HasReturn) {
+ // A few FP atomics do not support return values.
+ Dst = MI.getOperand(0).getReg();
+ } else {
+ OpOffset = -1;
+ }
+
+ Register VData = MI.getOperand(2 + OpOffset).getReg();
+ Register CmpVal;
if (IsCmpSwap) {
CmpVal = MI.getOperand(3 + OpOffset).getReg();
@@ -3536,7 +3932,7 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
}
Register RSrc = MI.getOperand(3 + OpOffset).getReg();
- const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
+ const unsigned NumVIndexOps = (IsCmpSwap ? 8 : 7) + HasReturn;
// The struct intrinsic variants add one additional operand over raw.
const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
@@ -3561,9 +3957,12 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
if (!VIndex)
VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
- auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
- .addDef(Dst)
- .addUse(VData); // vdata
+ auto MIB = B.buildInstr(getBufferAtomicPseudo(IID));
+
+ if (HasReturn)
+ MIB.addDef(Dst);
+
+ MIB.addUse(VData); // vdata
if (IsCmpSwap)
MIB.addReg(CmpVal);
@@ -3583,38 +3982,41 @@ bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
/// vector with s16 typed elements.
-static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
- SmallVectorImpl<Register> &PackedAddrs,
- int AddrIdx, int DimIdx, int EndIdx,
- int NumGradients) {
+static void packImageA16AddressToDwords(
+ MachineIRBuilder &B, MachineInstr &MI,
+ SmallVectorImpl<Register> &PackedAddrs, unsigned ArgOffset,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr, unsigned EndIdx) {
const LLT S16 = LLT::scalar(16);
const LLT V2S16 = LLT::vector(2, 16);
- for (int I = AddrIdx; I < EndIdx; ++I) {
- MachineOperand &SrcOp = MI.getOperand(I);
+ for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
+ MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
if (!SrcOp.isReg())
continue; // _L to _LZ may have eliminated this.
Register AddrReg = SrcOp.getReg();
- if (I < DimIdx) {
+ if (I < Intr->GradientStart) {
AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
PackedAddrs.push_back(AddrReg);
} else {
// Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
// derivatives dx/dh and dx/dv are packed with undef.
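      // (e.g. for 3D, NumGradients is 6 and each half is odd-sized, so dz/dh
      // and dz/dv both take the undef path below).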
if (((I + 1) >= EndIdx) ||
- ((NumGradients / 2) % 2 == 1 &&
- (I == DimIdx + (NumGradients / 2) - 1 ||
- I == DimIdx + NumGradients - 1)) ||
+ ((Intr->NumGradients / 2) % 2 == 1 &&
+ (I == static_cast<unsigned>(Intr->GradientStart +
+ (Intr->NumGradients / 2) - 1) ||
+ I == static_cast<unsigned>(Intr->GradientStart +
+ Intr->NumGradients - 1))) ||
// Check for _L to _LZ optimization
- !MI.getOperand(I + 1).isReg()) {
+ !MI.getOperand(ArgOffset + I + 1).isReg()) {
PackedAddrs.push_back(
B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
.getReg(0));
} else {
PackedAddrs.push_back(
- B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
+ B.buildBuildVector(
+ V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
.getReg(0));
++I;
}
@@ -3673,43 +4075,37 @@ static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
/// now unnecessary arguments with $noreg.
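/// (e.g. once four s16 coordinates are packed into two V2S16 registers, the
/// two trailing address operands become $noreg).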
bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
- MachineInstr &MI, MachineIRBuilder &B,
- GISelChangeObserver &Observer,
- const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
+ MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
- const int NumDefs = MI.getNumExplicitDefs();
+ const unsigned NumDefs = MI.getNumExplicitDefs();
+ const unsigned ArgOffset = NumDefs + 1;
bool IsTFE = NumDefs == 2;
// We are only processing the operands of d16 image operations on subtargets
// that use the unpacked register layout, or need to repack the TFE result.
// TODO: Do we need to guard against already legalized intrinsics?
const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
- AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
MachineRegisterInfo *MRI = B.getMRI();
const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
const LLT V2S16 = LLT::vector(2, 16);
- // Index of first address argument
- const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
-
- int NumVAddrs, NumGradients;
- std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
- const int DMaskIdx = BaseOpcode->Atomic ? -1 :
- getDMaskIdx(BaseOpcode, NumDefs);
unsigned DMask = 0;
  // Check for 16-bit addresses; these get packed into dwords below.
- int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
- LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
- LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
+ LLT GradTy =
+ MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
+ LLT AddrTy =
+ MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
const bool IsG16 = GradTy == S16;
const bool IsA16 = AddrTy == S16;
int DMaskLanes = 0;
if (!BaseOpcode->Atomic) {
- DMask = MI.getOperand(DMaskIdx).getImm();
+ DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
if (BaseOpcode->Gather4) {
DMaskLanes = 4;
} else if (DMask != 0) {
@@ -3736,7 +4132,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
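  // When only the TFE status result is wanted, still force a single data lane.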
if (IsTFE && DMask == 0) {
DMask = 0x1;
DMaskLanes = 1;
- MI.getOperand(DMaskIdx).setImm(DMask);
+ MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
}
if (BaseOpcode->Atomic) {
@@ -3757,41 +4153,41 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
}
- int CorrectedNumVAddrs = NumVAddrs;
+ unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
// Optimize _L to _LZ when _L is zero
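  // (e.g. an image_sample_l with a constant zero (or negative) lod becomes
  // image_sample_lz and the lod operand is removed).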
if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
- AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
+ AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode)) {
const ConstantFP *ConstantLod;
- const int LodIdx = AddrIdx + NumVAddrs - 1;
- if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
+ if (mi_match(MI.getOperand(ArgOffset + Intr->LodIndex).getReg(), *MRI,
+ m_GFCst(ConstantLod))) {
if (ConstantLod->isZero() || ConstantLod->isNegative()) {
// Set new opcode to _lz variant of _l, and change the intrinsic ID.
- ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
- LZMappingInfo->LZ, ImageDimIntr->Dim);
+ const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
+ AMDGPU::getImageDimInstrinsicByBaseOpcode(LZMappingInfo->LZ,
+ Intr->Dim);
// The starting indexes should remain in the same place.
- --NumVAddrs;
--CorrectedNumVAddrs;
- MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
- static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
- MI.RemoveOperand(LodIdx);
+ MI.getOperand(MI.getNumExplicitDefs())
+ .setIntrinsicID(static_cast<Intrinsic::ID>(NewImageDimIntr->Intr));
+ MI.RemoveOperand(ArgOffset + Intr->LodIndex);
+ Intr = NewImageDimIntr;
}
}
}
  // Optimize _mip away when 'lod' is zero
- if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
+ if (AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode)) {
int64_t ConstantLod;
- const int LodIdx = AddrIdx + NumVAddrs - 1;
-
- if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
+ if (mi_match(MI.getOperand(ArgOffset + Intr->MipIndex).getReg(), *MRI,
+ m_ICst(ConstantLod))) {
if (ConstantLod == 0) {
        // TODO: Change intrinsic opcode and remove operand instead of replacing
        // it with 0, as the _L to _LZ handling is done above.
- MI.getOperand(LodIdx).ChangeToImmediate(0);
+ MI.getOperand(ArgOffset + Intr->MipIndex).ChangeToImmediate(0);
--CorrectedNumVAddrs;
}
}
@@ -3806,18 +4202,17 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
} else if (!ST.hasG16())
return false;
- if (NumVAddrs > 1) {
+ if (Intr->NumVAddrs > 1) {
SmallVector<Register, 4> PackedRegs;
// Don't compress addresses for G16
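      // (for G16 only the gradient operands, up to Intr->CoordStart, are
      // packed; the 32-bit coordinates are appended unpacked below).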
- const int PackEndIdx =
- IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
- packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
- PackEndIdx, NumGradients);
+ const int PackEndIdx = IsA16 ? Intr->VAddrEnd : Intr->CoordStart;
+ packImageA16AddressToDwords(B, MI, PackedRegs, ArgOffset, Intr,
+ PackEndIdx);
if (!IsA16) {
// Add uncompressed address
- for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
- int AddrReg = MI.getOperand(I).getReg();
+ for (unsigned I = Intr->CoordStart; I < Intr->VAddrEnd; I++) {
+        Register AddrReg = MI.getOperand(ArgOffset + I).getReg();
assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
PackedRegs.push_back(AddrReg);
}
@@ -3833,9 +4228,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
PackedRegs.resize(1);
}
- const int NumPacked = PackedRegs.size();
- for (int I = 0; I != NumVAddrs; ++I) {
- MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
+ const unsigned NumPacked = PackedRegs.size();
+ for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
+ MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
if (!SrcOp.isReg()) {
assert(SrcOp.isImm() && SrcOp.getImm() == 0);
continue;
@@ -3843,8 +4238,8 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
assert(SrcOp.getReg() != AMDGPU::NoRegister);
- if (I < NumPacked)
- SrcOp.setReg(PackedRegs[I]);
+ if (I - Intr->VAddrStart < NumPacked)
+ SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
else
SrcOp.setReg(AMDGPU::NoRegister);
}
@@ -3863,8 +4258,9 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
// allocation when possible.
const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
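  // Without NSA, the remaining address operands must be merged into a single
  // contiguous vector value.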
- if (!UseNSA && NumVAddrs > 1)
- convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
+ if (!UseNSA && Intr->NumVAddrs > 1)
+ convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
+ Intr->NumVAddrs);
}
int Flags = 0;
@@ -3881,7 +4277,7 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
if (!Ty.isVector() || Ty.getElementType() != S16)
return true;
- Register RepackedReg = handleD16VData(B, *MRI, VData);
+ Register RepackedReg = handleD16VData(B, *MRI, VData, true);
if (RepackedReg != VData) {
MI.getOperand(1).setReg(RepackedReg);
}
@@ -4053,8 +4449,10 @@ bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
}
bool AMDGPULegalizerInfo::legalizeSBufferLoad(
- MachineInstr &MI, MachineIRBuilder &B,
- GISelChangeObserver &Observer) const {
+ LegalizerHelper &Helper, MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ GISelChangeObserver &Observer = Helper.Observer;
+
Register Dst = MI.getOperand(0).getReg();
LLT Ty = B.getMRI()->getType(Dst);
unsigned Size = Ty.getSizeInBits();
@@ -4062,6 +4460,13 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
Observer.changingInstr(MI);
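+  // Coerce small-element result types onto 32-bit based registers first (e.g.
+  // a <8 x s8> result is loaded as <2 x s32> and bitcast back by the helper).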
+ if (shouldBitcastLoadStoreType(ST, Ty, Size)) {
+ Ty = getBitcastRegisterType(Ty);
+ Helper.bitcastDst(MI, Ty, 0);
+ Dst = MI.getOperand(0).getReg();
+ B.setInsertPt(B.getMBB(), MI);
+ }
+
// FIXME: We don't really need this intermediate instruction. The intrinsic
// should be fixed to have a memory operand. Since it's readnone, we're not
// allowed to add one.
@@ -4083,8 +4488,6 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
// always be legal. We may need to restore this to a 96-bit result if it turns
// out this needs to be converted to a vector load during RegBankSelect.
if (!isPowerOf2_32(Size)) {
- LegalizerHelper Helper(MF, *this, Observer, B);
-
if (Ty.isVector())
Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
else
@@ -4095,6 +4498,7 @@ bool AMDGPULegalizerInfo::legalizeSBufferLoad(
return true;
}
+// TODO: Move to selection
bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -4105,17 +4509,14 @@ bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
} else {
// Pass queue pointer to trap handler as input, and insert trap instruction
// Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
- const ArgDescriptor *Arg =
- getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
- if (!Arg)
- return false;
MachineRegisterInfo &MRI = *B.getMRI();
- Register SGPR01(AMDGPU::SGPR0_SGPR1);
- Register LiveIn = getLiveInRegister(
- B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
- /*InsertLiveInCopy=*/false);
- if (!loadInputValue(LiveIn, B, Arg))
+
+    Register LiveIn = MRI.createGenericVirtualRegister(
+        LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
+ if (!loadInputValue(LiveIn, B, AMDGPUFunctionArgInfo::QUEUE_PTR))
return false;
+
+ Register SGPR01(AMDGPU::SGPR0_SGPR1);
B.buildCopy(SGPR01, LiveIn);
B.buildInstr(AMDGPU::S_TRAP)
.addImm(GCNSubtarget::TrapIDLLVMTrap)
@@ -4146,6 +4547,78 @@ bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
return true;
}
+bool AMDGPULegalizerInfo::legalizeBVHIntrinsic(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+
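+  // Source operands: the intrinsic ID at index 1, then node_ptr, ray_extent,
+  // ray_origin, ray_dir, ray_inv_dir and the texture descriptor.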
+ Register DstReg = MI.getOperand(0).getReg();
+ Register NodePtr = MI.getOperand(2).getReg();
+ Register RayExtent = MI.getOperand(3).getReg();
+ Register RayOrigin = MI.getOperand(4).getReg();
+ Register RayDir = MI.getOperand(5).getReg();
+ Register RayInvDir = MI.getOperand(6).getReg();
+ Register TDescr = MI.getOperand(7).getReg();
+
+ bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
+ bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
+ unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
+ : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
+ : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
+ : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
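+  // Four NSA variants: {32,64}-bit node pointer crossed with {16,32}-bit ray
+  // direction components.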
+
+ SmallVector<Register, 12> Ops;
+ if (Is64) {
+ auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
+ Ops.push_back(Unmerge.getReg(0));
+ Ops.push_back(Unmerge.getReg(1));
+ } else {
+ Ops.push_back(NodePtr);
+ }
+ Ops.push_back(RayExtent);
+
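+  // Push the x, y and z lanes of a <4 x s32> value; the fourth lane is
+  // padding and is dropped.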
+  auto packLanes = [&Ops, &S32, &B](Register Src) {
+ auto Unmerge = B.buildUnmerge({S32, S32, S32, S32}, Src);
+ Ops.push_back(Unmerge.getReg(0));
+ Ops.push_back(Unmerge.getReg(1));
+ Ops.push_back(Unmerge.getReg(2));
+ };
+
+ packLanes(RayOrigin);
+ if (IsA16) {
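+    // Repack the six used s16 components into three dwords:
+    // (dir.x, dir.y), (dir.z, inv_dir.x), (inv_dir.y, inv_dir.z).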
+ auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16, S16}, RayDir);
+ auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16, S16}, RayInvDir);
+ Register R1 = MRI.createGenericVirtualRegister(S32);
+ Register R2 = MRI.createGenericVirtualRegister(S32);
+ Register R3 = MRI.createGenericVirtualRegister(S32);
+ B.buildMerge(R1, {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
+ B.buildMerge(R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
+ B.buildMerge(R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
+ Ops.push_back(R1);
+ Ops.push_back(R2);
+ Ops.push_back(R3);
+ } else {
+ packLanes(RayDir);
+ packLanes(RayInvDir);
+ }
+
+ auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_INTRIN_BVH_INTERSECT_RAY)
+ .addDef(DstReg)
+ .addImm(Opcode);
+
+ for (Register R : Ops) {
+ MIB.addUse(R);
+ }
+
+ MIB.addUse(TDescr)
+ .addImm(IsA16 ? 1 : 0)
+ .cloneMemRefs(MI);
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
MachineInstr &MI) const {
MachineIRBuilder &B = Helper.MIRBuilder;
@@ -4158,7 +4631,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_else: {
MachineInstr *Br = nullptr;
MachineBasicBlock *UncondBrTarget = nullptr;
- if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
+ bool Negated = false;
+ if (MachineInstr *BrCond =
+ verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
const SIRegisterInfo *TRI
= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
@@ -4166,6 +4641,10 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
Register Use = MI.getOperand(3).getReg();
MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
+
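+      // A negated condition (detected by verifyCFIntrinsic) means the two
+      // branch destinations trade places.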
+ if (Negated)
+ std::swap(CondBrTarget, UncondBrTarget);
+
B.setInsertPt(B.getMBB(), BrCond->getIterator());
if (IntrID == Intrinsic::amdgcn_if) {
B.buildInstr(AMDGPU::SI_IF)
@@ -4174,10 +4653,9 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
.addMBB(UncondBrTarget);
} else {
B.buildInstr(AMDGPU::SI_ELSE)
- .addDef(Def)
- .addUse(Use)
- .addMBB(UncondBrTarget)
- .addImm(0);
+ .addDef(Def)
+ .addUse(Use)
+ .addMBB(UncondBrTarget);
}
if (Br) {
@@ -4201,13 +4679,18 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_loop: {
MachineInstr *Br = nullptr;
MachineBasicBlock *UncondBrTarget = nullptr;
- if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
+ bool Negated = false;
+ if (MachineInstr *BrCond =
+ verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
const SIRegisterInfo *TRI
= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
Register Reg = MI.getOperand(2).getReg();
+ if (Negated)
+ std::swap(CondBrTarget, UncondBrTarget);
+
B.setInsertPt(B.getMBB(), BrCond->getIterator());
B.buildInstr(AMDGPU::SI_LOOP)
.addUse(Reg)
@@ -4280,7 +4763,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return true;
}
case Intrinsic::amdgcn_s_buffer_load:
- return legalizeSBufferLoad(MI, B, Helper.Observer);
+ return legalizeSBufferLoad(Helper, MI);
case Intrinsic::amdgcn_raw_buffer_store:
case Intrinsic::amdgcn_struct_buffer_store:
return legalizeBufferStore(MI, MRI, B, false, false);
@@ -4323,6 +4806,8 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_struct_buffer_atomic_inc:
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
+ case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
return legalizeBufferAtomic(MI, B, IntrID);
@@ -4334,6 +4819,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
return legalizeTrapIntrinsic(MI, MRI, B);
case Intrinsic::debugtrap:
return legalizeDebugTrapIntrinsic(MI, MRI, B);
+ case Intrinsic::amdgcn_rsq_clamp:
+ return legalizeRsqClampIntrinsic(MI, MRI, B);
+ case Intrinsic::amdgcn_ds_fadd:
+ case Intrinsic::amdgcn_ds_fmin:
+ case Intrinsic::amdgcn_ds_fmax:
+ return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID);
+ case Intrinsic::amdgcn_image_bvh_intersect_ray:
+ return legalizeBVHIntrinsic(MI, B);
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrID))