Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp  | 2070
1 file changed, 1693 insertions(+), 377 deletions(-)
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b7b90e23e895..34826809c1a6 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetMachine.h"
+#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
@@ -28,6 +29,7 @@
#include "llvm/CodeGen/ByteProvider.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -146,8 +148,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024));
if (Subtarget->has16BitInsts()) {
- addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
- addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
+ if (Subtarget->useRealTrue16Insts()) {
+ addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
+ addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
+ } else {
+ addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
+ addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
+ }
// Unless there are also VOP3P operations, no operations are really legal.
addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
@@ -158,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
+ addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
+ addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
}
addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
@@ -219,7 +228,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::f64, Promote);
AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
- setOperationAction(ISD::FSQRT, MVT::f64, Custom);
+ setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
setOperationAction(ISD::SELECT_CC,
{MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
@@ -262,13 +271,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// We only support LOAD/STORE and vector manipulation ops for vectors
// with > 4 elements.
for (MVT VT :
- {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
- MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
- MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
- MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
- MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
- MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
- MVT::v32i32, MVT::v32f32}) {
+ {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
+ MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
+ MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
+ MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32,
+ MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16,
+ MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64,
+ MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -420,6 +429,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->has16BitInsts()) {
setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote);
setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom);
+ } else {
+ setOperationAction(ISD::FSQRT, MVT::f16, Custom);
}
if (Subtarget->hasMadMacF32Insts())
@@ -470,9 +481,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::f32, MVT::f64}, Legal);
if (Subtarget->haveRoundOpsF64())
- setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal);
+ setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64,
+ Legal);
else
- setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR},
+ setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR},
MVT::f64, Custom);
setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
@@ -544,8 +556,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
- for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
- MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+ for (MVT VT :
+ {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16,
+ MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -631,6 +644,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v16f16, Promote);
AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
+ setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
+ setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::v32i16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
+ setOperationAction(ISD::STORE, MVT::v32f16, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
+
setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND},
MVT::v2i32, Expand);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
@@ -653,12 +676,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal);
setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE},
- {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom);
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Custom);
setOperationAction({ISD::FMINNUM, ISD::FMAXNUM},
- {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand);
+ {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
+ Expand);
- for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) {
+ for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+ MVT::v32i16, MVT::v32f16}) {
setOperationAction(
{ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR},
Vec16, Custom);
@@ -681,10 +707,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE,
{MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16,
- MVT::v16f16, MVT::v16i16},
+ MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16},
Custom);
- for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16})
+ for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
// Split vector operations.
setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB,
ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX,
@@ -692,7 +718,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::SSUBSAT},
VT, Custom);
- for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16})
+ for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
// Split vector operations.
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
VT, Custom);
@@ -728,7 +754,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT,
{MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8,
- MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16},
+ MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16,
+ MVT::v32i16, MVT::v32f16},
Custom);
setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom);
@@ -736,6 +763,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (Subtarget->hasMad64_32())
setOperationAction({ISD::SMUL_LOHI, ISD::UMUL_LOHI}, MVT::i32, Custom);
+ if (Subtarget->hasPrefetch())
+ setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+
+ if (Subtarget->hasIEEEMinMax())
+ setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM},
+ {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN,
{MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
MVT::v2i16, MVT::v2f16, MVT::i128},
@@ -753,16 +787,28 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::i8, MVT::i128},
Custom);
+ setOperationAction(ISD::STACKSAVE, MVT::Other, Custom);
+ setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom);
+
+ // TODO: Could move this to custom lowering, could benefit from combines on
+ // extract of relevant bits.
+ setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal);
+
+ setOperationAction(ISD::MUL, MVT::i1, Promote);
+
setTargetDAGCombine({ISD::ADD,
ISD::UADDO_CARRY,
ISD::SUB,
ISD::USUBO_CARRY,
ISD::FADD,
ISD::FSUB,
+ ISD::FDIV,
ISD::FMINNUM,
ISD::FMAXNUM,
ISD::FMINNUM_IEEE,
ISD::FMAXNUM_IEEE,
+ ISD::FMINIMUM,
+ ISD::FMAXIMUM,
ISD::FMA,
ISD::SMIN,
ISD::SMAX,
@@ -772,6 +818,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::AND,
ISD::OR,
ISD::XOR,
+ ISD::FSHR,
ISD::SINT_TO_FP,
ISD::UINT_TO_FP,
ISD::FCANONICALIZE,
@@ -1002,12 +1049,20 @@ static EVT memVTFromLoadIntrReturn(Type *Ty, unsigned MaxNumLanes) {
MVT SITargetLowering::getPointerTy(const DataLayout &DL, unsigned AS) const {
if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
return MVT::v5i32;
+ if (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
+ DL.getPointerSizeInBits(AS) == 192)
+ return MVT::v6i32;
return AMDGPUTargetLowering::getPointerTy(DL, AS);
}
/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
/// v8i32 when padding is added.
+/// The in-memory representation of a p9 is {p8, i32, i32}, which is
+/// also v8i32 with padding.
MVT SITargetLowering::getPointerMemTy(const DataLayout &DL, unsigned AS) const {
- if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
+ if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
+ DL.getPointerSizeInBits(AS) == 160) ||
+ (AMDGPUAS::BUFFER_STRIDED_POINTER == AS &&
+ DL.getPointerSizeInBits(AS) == 192))
return MVT::v8i32;
return AMDGPUTargetLowering::getPointerMemTy(DL, AS);
}
@@ -1186,9 +1241,13 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_atomic_fadd:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
@@ -1271,6 +1330,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_flat_atomic_fadd:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_global_atomic_csub: {
@@ -1284,7 +1345,9 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
}
}
-bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
+bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM,
+ unsigned AddrSpace,
+ uint64_t FlatVariant) const {
if (!Subtarget->hasFlatInstOffsets()) {
// Flat instructions do not have offsets, and only have the register
// address.
@@ -1292,29 +1355,27 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
}
return AM.Scale == 0 &&
- (AM.BaseOffs == 0 ||
- Subtarget->getInstrInfo()->isLegalFLATOffset(
- AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT));
+ (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
+ AM.BaseOffs, AddrSpace, FlatVariant));
}
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
if (Subtarget->hasFlatGlobalInsts())
- return AM.Scale == 0 &&
- (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
- AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS,
- SIInstrFlags::FlatGlobal));
+ return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS,
+ SIInstrFlags::FlatGlobal);
if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
- // Assume the we will use FLAT for all global memory accesses
- // on VI.
- // FIXME: This assumption is currently wrong. On VI we still use
- // MUBUF instructions for the r + i addressing mode. As currently
- // implemented, the MUBUF instructions only work on buffer < 4GB.
- // It may be possible to support > 4GB buffers with MUBUF instructions,
- // by setting the stride value in the resource descriptor which would
- // increase the size limit to (stride * 4GB). However, this is risky,
- // because it has never been validated.
- return isLegalFlatAddressingMode(AM);
+ // Assume the we will use FLAT for all global memory accesses
+ // on VI.
+ // FIXME: This assumption is currently wrong. On VI we still use
+ // MUBUF instructions for the r + i addressing mode. As currently
+ // implemented, the MUBUF instructions only work on buffer < 4GB.
+ // It may be possible to support > 4GB buffers with MUBUF instructions,
+ // by setting the stride value in the resource descriptor which would
+ // increase the size limit to (stride * 4GB). However, this is risky,
+ // because it has never been validated.
+ return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
+ SIInstrFlags::FLAT);
}
return isLegalMUBUFAddressingMode(AM);
@@ -1330,7 +1391,8 @@ bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
// assume those use MUBUF instructions. Scratch loads / stores are currently
// implemented as mubuf instructions with offen bit set, so slightly
// different than the normal addr64.
- if (!SIInstrInfo::isLegalMUBUFImmOffset(AM.BaseOffs))
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
return false;
// FIXME: Since we can split immediate into soffset and immediate offset,
@@ -1367,7 +1429,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
- AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE) {
+ AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
+ AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
// If the offset isn't a multiple of 4, it probably isn't going to be
// correctly aligned.
// FIXME: Can we get the real alignment here?
@@ -1394,11 +1457,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// On VI, these use the SMEM format and the offset is 20-bit in bytes.
if (!isUInt<20>(AM.BaseOffs))
return false;
- } else {
+ } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
// On GFX9 the offset is signed 21-bit in bytes (but must not be negative
// for S_BUFFER_* instructions).
if (!isInt<21>(AM.BaseOffs))
return false;
+ } else {
+ // On GFX12, all offsets are signed 24-bit in bytes.
+ if (!isInt<24>(AM.BaseOffs))
+ return false;
}
if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
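
For reference, the SMEM immediate-offset ladder in the hunk above (VI: unsigned 20-bit, GFX9-GFX11: signed 21-bit, GFX12: signed 24-bit, all in bytes) can be modeled in isolation. This is only a sketch: Gen and the width-check lambdas stand in for the subtarget query and llvm::isUInt/isInt, and the pre-VI dword-scaled path is not modeled.

  #include <cstdint>

  enum class Gen { VI, GFX9, GFX12 };

  // Mirrors the AM.BaseOffs checks above: VI uses an unsigned 20-bit byte
  // offset, GFX9-GFX11 a signed 21-bit one, and GFX12 a signed 24-bit one.
  static bool isLegalSMEMImmOffset(Gen G, int64_t Off) {
    auto isUIntN = [](unsigned N, int64_t V) {
      return V >= 0 && V < (int64_t(1) << N);
    };
    auto isIntN = [](unsigned N, int64_t V) {
      return V >= -(int64_t(1) << (N - 1)) && V < (int64_t(1) << (N - 1));
    };
    if (G == Gen::VI)
      return isUIntN(20, Off);
    if (G == Gen::GFX9)
      return isIntN(21, Off);
    return isIntN(24, Off); // GFX12+
  }
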
@@ -1411,9 +1478,13 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
if (AS == AMDGPUAS::PRIVATE_ADDRESS)
- return isLegalMUBUFAddressingMode(AM);
+ return Subtarget->enableFlatScratch()
+ ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS,
+ SIInstrFlags::FlatScratch)
+ : isLegalMUBUFAddressingMode(AM);
- if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
+ if (AS == AMDGPUAS::LOCAL_ADDRESS ||
+ (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
// Basic, single offset DS instructions allow a 16-bit unsigned immediate
// field.
// XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
@@ -1436,7 +1507,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// computation. We don't have instructions that compute pointers with any
// addressing modes, so treat them as having no offset like flat
// instructions.
- return isLegalFlatAddressingMode(AM);
+ return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS,
+ SIInstrFlags::FLAT);
}
// Assume a user alias of global for unknown address spaces.
@@ -1748,13 +1820,13 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
// We may not have the kernarg segment argument if we have no kernel
// arguments.
if (!InputPtrReg)
- return DAG.getConstant(0, SL, PtrVT);
+ return DAG.getConstant(Offset, SL, PtrVT);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
- return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset));
+ return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
}
SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
@@ -2133,13 +2205,14 @@ void SITargetLowering::allocateSpecialInputSGPRs(
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const {
auto &ArgInfo = Info.getArgInfo();
+ const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
// TODO: Unify handling with private memory pointers.
- if (Info.hasDispatchPtr())
+ if (UserSGPRInfo.hasDispatchPtr())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
const Module *M = MF.getFunction().getParent();
- if (Info.hasQueuePtr() &&
+ if (UserSGPRInfo.hasQueuePtr() &&
AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5)
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
@@ -2148,7 +2221,7 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasImplicitArgPtr())
allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
- if (Info.hasDispatchID())
+ if (UserSGPRInfo.hasDispatchID())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
// flat_scratch_init is not applicable for non-kernel functions.
@@ -2171,34 +2244,35 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const {
- if (Info.hasImplicitBufferPtr()) {
+ const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
+ if (UserSGPRInfo.hasImplicitBufferPtr()) {
Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(ImplicitBufferPtrReg);
}
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
- if (Info.hasPrivateSegmentBuffer()) {
+ if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
- if (Info.hasDispatchPtr()) {
+ if (UserSGPRInfo.hasDispatchPtr()) {
Register DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
const Module *M = MF.getFunction().getParent();
- if (Info.hasQueuePtr() &&
+ if (UserSGPRInfo.hasQueuePtr() &&
AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) {
Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
- if (Info.hasKernargSegmentPtr()) {
+ if (UserSGPRInfo.hasKernargSegmentPtr()) {
MachineRegisterInfo &MRI = MF.getRegInfo();
Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
CCInfo.AllocateReg(InputPtrReg);
@@ -2207,26 +2281,100 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
}
- if (Info.hasDispatchID()) {
+ if (UserSGPRInfo.hasDispatchID()) {
Register DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
- if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
+ if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
+ // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
+ // these from the dispatch pointer.
+}
+
+// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
+// sequential starting from the first argument.
+void SITargetLowering::allocatePreloadKernArgSGPRs(
+ CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
+ const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
+ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
+ Function &F = MF.getFunction();
+ unsigned LastExplicitArgOffset =
+ MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset();
+ GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
+ bool InPreloadSequence = true;
+ unsigned InIdx = 0;
+ for (auto &Arg : F.args()) {
+ if (!InPreloadSequence || !Arg.hasInRegAttr())
+ break;
+
+ int ArgIdx = Arg.getArgNo();
+ // Don't preload non-original args or parts not in the current preload
+ // sequence.
+ if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() ||
+ (int)Ins[InIdx].getOrigArgIndex() != ArgIdx))
+ break;
+
+ for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
+ (int)Ins[InIdx].getOrigArgIndex() == ArgIdx;
+ InIdx++) {
+ assert(ArgLocs[ArgIdx].isMemLoc());
+ auto &ArgLoc = ArgLocs[InIdx];
+ const Align KernelArgBaseAlign = Align(16);
+ unsigned ArgOffset = ArgLoc.getLocMemOffset();
+ Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
+ unsigned NumAllocSGPRs =
+ alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
+
+ // Arg is preloaded into the previous SGPR.
+ if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
+ Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
+ Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
+ continue;
+ }
+
+ unsigned Padding = ArgOffset - LastExplicitArgOffset;
+ unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
+ // Check for free user SGPRs for preloading.
+ if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ >
+ SGPRInfo.getNumFreeUserSGPRs()) {
+ InPreloadSequence = false;
+ break;
+ }
+
+ // Preload this argument.
+ const TargetRegisterClass *RC =
+ TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
+ SmallVectorImpl<MCRegister> *PreloadRegs =
+ Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
+
+ if (PreloadRegs->size() > 1)
+ RC = &AMDGPU::SGPR_32RegClass;
+ for (auto &Reg : *PreloadRegs) {
+ assert(Reg);
+ MF.addLiveIn(Reg, RC);
+ CCInfo.AllocateReg(Reg);
+ }
+
+ LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
+ }
+ }
+}
+
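
As an illustration of the SGPR budgeting in allocatePreloadKernArgSGPRs above, the per-argument cost can be reproduced numerically. A sketch only: it assumes an explicit kernarg offset of 0, ignores the packed sub-dword case, and alignTo stands in for llvm::alignTo.

  #include <cstdint>
  #include <cstdio>

  static uint64_t alignTo(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

  int main() {
    struct Arg { uint64_t Offset, Bits; };
    Arg Args[] = {{0, 32}, {16, 128}};   // hypothetical kernel: i32, then v4i32
    uint64_t LastExplicitArgOffset = 0;  // assumed explicit kernarg base offset
    for (const Arg &A : Args) {
      uint64_t PaddingSGPRs = alignTo(A.Offset - LastExplicitArgOffset, 4) / 4;
      uint64_t NumDataSGPRs = alignTo(A.Bits, 32) / 32;
      // Matches the free-SGPR check above: padding + data + 1 synthetic SGPR.
      printf("arg at offset %llu needs %llu free user SGPRs\n",
             (unsigned long long)A.Offset,
             (unsigned long long)(PaddingSGPRs + NumDataSGPRs + 1));
      LastExplicitArgOffset = A.Offset + NumDataSGPRs * 4;
    }
    return 0;
  }
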
+void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const {
+ // Always allocate this last since it is a synthetic preload.
if (Info.hasLDSKernelId()) {
Register Reg = Info.addLDSKernelId();
MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
CCInfo.AllocateReg(Reg);
}
-
- // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
- // these from the dispatch pointer.
}
// Allocate special input registers that are initialized per-wave.
@@ -2331,7 +2479,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
// Everything live out of a block is spilled with fast regalloc, so it's
// almost certain that spilling will be required.
- if (TM.getOptLevel() == CodeGenOpt::None)
+ if (TM.getOptLevel() == CodeGenOptLevel::None)
HasStackObjects = true;
// For now assume stack access is needed in any callee functions, so we need
@@ -2477,12 +2625,14 @@ SDValue SITargetLowering::LowerFormalArguments(
bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
if (IsGraphics) {
- assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() &&
- !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() &&
- !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
- !Info->hasWorkItemIDZ());
+ const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
+ assert(!UserSGPRInfo.hasDispatchPtr() &&
+ !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
+ !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
+ !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
+ (void)UserSGPRInfo;
if (!Subtarget->enableFlatScratch())
- assert(!Info->hasFlatScratchInit());
+ assert(!UserSGPRInfo.hasFlatScratchInit());
if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs())
assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
!Info->hasWorkGroupIDZ());
@@ -2531,18 +2681,29 @@ SDValue SITargetLowering::LowerFormalArguments(
Splits.append(Ins.begin(), Ins.end());
}
+ if (IsKernel)
+ analyzeFormalArgumentsCompute(CCInfo, Ins);
+
if (IsEntryFunc) {
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
+ if (IsKernel && Subtarget->hasKernargPreload() &&
+ !Subtarget->needsKernargPreloadBackwardsCompatibility())
+ allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
+
+ allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
} else if (!IsGraphics) {
// For the fixed ABI, pass workitem IDs in the last argument register.
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}
- if (IsKernel) {
- analyzeFormalArgumentsCompute(CCInfo, Ins);
- } else {
+ if (!IsKernel) {
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+ if (!IsGraphics && !Subtarget->enableFlatScratch()) {
+ CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1,
+ AMDGPU::SGPR2, AMDGPU::SGPR3},
+ 4);
+ }
CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
}
@@ -2587,9 +2748,81 @@ SDValue SITargetLowering::LowerFormalArguments(
continue;
}
- SDValue Arg = lowerKernargMemParameter(
- DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
- Chains.push_back(Arg.getValue(1));
+ SDValue NewArg;
+ if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
+ if (MemVT.getStoreSize() < 4 && Alignment < 4) {
+ // In this case the argument is packed into the previous preload SGPR.
+ int64_t AlignDownOffset = alignDown(Offset, 4);
+ int64_t OffsetDiff = Offset - AlignDownOffset;
+ EVT IntVT = MemVT.changeTypeToInteger();
+
+ const SIMachineFunctionInfo *Info =
+ MF.getInfo<SIMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ Register Reg =
+ Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
+
+ assert(Reg);
+ Register VReg = MRI.getLiveInVirtReg(Reg);
+ SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
+
+ SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
+ SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
+
+ SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
+ ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
+ NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
+ Ins[i].Flags.isSExt(), &Ins[i]);
+
+ NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
+ } else {
+ const SIMachineFunctionInfo *Info =
+ MF.getInfo<SIMachineFunctionInfo>();
+ MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ const SmallVectorImpl<MCRegister> &PreloadRegs =
+ Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
+
+ SDValue Copy;
+ if (PreloadRegs.size() == 1) {
+ Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
+ const TargetRegisterClass *RC = MRI.getRegClass(VReg);
+ NewArg = DAG.getCopyFromReg(
+ Chain, DL, VReg,
+ EVT::getIntegerVT(*DAG.getContext(),
+ TRI->getRegSizeInBits(*RC)));
+
+ } else {
+ // If the kernarg alignment does not match the alignment of the SGPR
+ // tuple RC that can accommodate this argument, it will be built up
+ // via copies from the individual SGPRs that the argument was
+ // preloaded to.
+ SmallVector<SDValue, 4> Elts;
+ for (auto Reg : PreloadRegs) {
+ Register VReg = MRI.getLiveInVirtReg(Reg);
+ Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
+ Elts.push_back(Copy);
+ }
+ NewArg =
+ DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ PreloadRegs.size()),
+ DL, Elts);
+ }
+
+ SDValue CMemVT;
+ if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType()))
+ CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg);
+ else
+ CMemVT = DAG.getBitcast(MemVT, NewArg);
+ NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT,
+ Ins[i].Flags.isSExt(), &Ins[i]);
+ NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
+ }
+ } else {
+ NewArg =
+ lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
+ Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
+ }
+ Chains.push_back(NewArg.getValue(1));
auto *ParamTy =
dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
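
The packed sub-dword branch above (MemVT.getStoreSize() < 4 with under-dword alignment) recovers the argument from the SGPR holding its containing dword with a shift and truncate. A scalar model of that SRL/TRUNCATE sequence, assuming a 16-bit argument for concreteness:

  #include <cstdint>

  // SgprValue is the 32-bit value preloaded for the dword that contains the
  // argument; ArgByteOffset is the argument's byte offset in the kernarg segment.
  static uint16_t extractPackedKernArg(uint32_t SgprValue, unsigned ArgByteOffset) {
    unsigned AlignDownOffset = ArgByteOffset & ~3u;        // alignDown(Offset, 4)
    unsigned OffsetDiff = ArgByteOffset - AlignDownOffset; // byte position in dword
    return (uint16_t)(SgprValue >> (OffsetDiff * 8));      // SRL + TRUNCATE
  }
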
@@ -2599,11 +2832,11 @@ SDValue SITargetLowering::LowerFormalArguments(
// On SI local pointers are just offsets into LDS, so they are always
// less than 16-bits. On CI and newer they could potentially be
// real pointers, so we can't guarantee their size.
- Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
- DAG.getValueType(MVT::i16));
+ NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
+ DAG.getValueType(MVT::i16));
}
- InVals.push_back(Arg);
+ InVals.push_back(NewArg);
continue;
} else if (!IsEntryFunc && VA.isMemLoc()) {
SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
@@ -3084,6 +3317,9 @@ bool SITargetLowering::isEligibleForTailCallOptimization(
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
+ if (AMDGPU::isChainCC(CalleeCC))
+ return true;
+
if (!mayTailCallThisCC(CalleeCC))
return false;
@@ -3168,7 +3404,36 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// The wave scratch offset register is used as the global base pointer.
SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const {
+ CallingConv::ID CallConv = CLI.CallConv;
+ bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
+
SelectionDAG &DAG = CLI.DAG;
+
+ TargetLowering::ArgListEntry RequestedExec;
+ if (IsChainCallConv) {
+ // The last argument should be the value that we need to put in EXEC.
+ // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we
+ // don't treat it like the rest of the arguments.
+ RequestedExec = CLI.Args.back();
+ assert(RequestedExec.Node && "No node for EXEC");
+
+ if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
+ return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
+
+ assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg");
+ CLI.Outs.pop_back();
+ CLI.OutVals.pop_back();
+
+ if (RequestedExec.Ty->isIntegerTy(64)) {
+ assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up");
+ CLI.Outs.pop_back();
+ CLI.OutVals.pop_back();
+ }
+
+ assert(CLI.Outs.back().OrigArgIndex != 2 &&
+ "Haven't popped all the pieces of the EXEC mask");
+ }
+
const SDLoc &DL = CLI.DL;
SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
@@ -3176,7 +3441,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue Chain = CLI.Chain;
SDValue Callee = CLI.Callee;
bool &IsTailCall = CLI.IsTailCall;
- CallingConv::ID CallConv = CLI.CallConv;
bool IsVarArg = CLI.IsVarArg;
bool IsSibCall = false;
bool IsThisReturn = false;
@@ -3207,9 +3471,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
- if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
+ if (!IsTailCall &&
+ ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
report_fatal_error("failed to perform tail call elimination on a call "
- "site marked musttail");
+ "site marked musttail or on llvm.amdgcn.cs.chain");
}
bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
@@ -3232,7 +3497,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
- if (CallConv != CallingConv::AMDGPU_Gfx) {
+ if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) {
// With a fixed ABI, allocate fixed registers before user arguments.
passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
}
@@ -3258,16 +3523,20 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
- if (!IsSibCall) {
+ if (!IsSibCall)
Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+ if (!IsSibCall || IsChainCallConv) {
if (!Subtarget->enableFlatScratch()) {
SmallVector<SDValue, 4> CopyFromChains;
// In the HSA case, this should be an identity copy.
SDValue ScratchRSrcReg
= DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
- RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
+ RegsToPass.emplace_back(IsChainCallConv
+ ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
+ : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
+ ScratchRSrcReg);
CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
Chain = DAG.getTokenFactor(DL, CopyFromChains);
}
@@ -3412,6 +3681,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
}
+ if (IsChainCallConv)
+ Ops.push_back(RequestedExec.Node);
+
// Add argument registers to the end of the list so that they are known live
// into the call.
for (auto &RegToPass : RegsToPass) {
@@ -3420,8 +3692,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// Add a register mask operand representing the call-preserved registers.
-
- auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo());
+ auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo());
const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -3435,8 +3706,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// actual call instruction.
if (IsTailCall) {
MFI.setHasTailCall();
- unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ?
- AMDGPUISD::TC_RETURN_GFX : AMDGPUISD::TC_RETURN;
+ unsigned OPC = AMDGPUISD::TC_RETURN;
+ switch (CallConv) {
+ case CallingConv::AMDGPU_Gfx:
+ OPC = AMDGPUISD::TC_RETURN_GFX;
+ break;
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
+ OPC = AMDGPUISD::TC_RETURN_CHAIN;
+ break;
+ }
+
return DAG.getNode(OPC, DL, NodeTys, Ops);
}
@@ -3481,22 +3761,21 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const TargetFrameLowering *TFL = ST.getFrameLowering();
+ const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
unsigned Opc =
TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
ISD::ADD : ISD::SUB;
SDValue ScaledSize = DAG.getNode(
ISD::SHL, dl, VT, Size,
- DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
+ DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
Align StackAlign = TFL->getStackAlign();
Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
if (Alignment && *Alignment > StackAlign) {
Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
DAG.getConstant(-(uint64_t)Alignment->value()
- << ST.getWavefrontSizeLog2(),
+ << Subtarget->getWavefrontSizeLog2(),
dl, VT));
}
@@ -3520,6 +3799,111 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
}
+SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const {
+ if (Op.getValueType() != MVT::i32)
+ return Op; // Defer to cannot select error.
+
+ Register SP = getStackPointerRegisterToSaveRestore();
+ SDLoc SL(Op);
+
+ SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
+
+ // Convert from wave uniform to swizzled vector address. This should protect
+ // from any edge cases where the stacksave result isn't directly used with
+ // stackrestore.
+ SDValue VectorAddress =
+ DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
+ return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
+}
+
+SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ assert(Op.getValueType() == MVT::i32);
+
+ uint32_t BothRoundHwReg =
+ AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4);
+ SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
+
+ SDValue IntrinID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
+ SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
+ Op.getOperand(0), IntrinID, GetRoundBothImm);
+
+ // There are two rounding modes, one for f32 and one for f64/f16. We only
+ // report in the standard value range if both are the same.
+ //
+ // The raw values also differ from the expected FLT_ROUNDS values. Nearest
+ // ties away from zero is not supported, and the other values are rotated by
+ // 1.
+ //
+ // If the two rounding modes are not the same, report a target defined value.
+
+ // Mode register rounding mode fields:
+ //
+ // [1:0] Single-precision round mode.
+ // [3:2] Double/Half-precision round mode.
+ //
+ // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
+ //
+ //                Hardware   Spec
+ // Toward-0           3        0
+ // Nearest Even       0        1
+ // +Inf               1        2
+ // -Inf               2        3
+ // NearestAway0      N/A       4
+ //
+ // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
+ // table we can index by the raw hardware mode.
+ //
+ // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
+
+ SDValue BitTable =
+ DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64);
+
+ SDValue Two = DAG.getConstant(2, SL, MVT::i32);
+ SDValue RoundModeTimesNumBits =
+ DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
+
+ // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
+ // knew only one mode was demanded.
+ SDValue TableValue =
+ DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
+ SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
+
+ SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
+ SDValue TableEntry =
+ DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
+
+ // There's a gap in the 4-bit encoded table and actual enum values, so offset
+ // if it's an extended value.
+ SDValue Four = DAG.getConstant(4, SL, MVT::i32);
+ SDValue IsStandardValue =
+ DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
+ SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
+ SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
+ TableEntry, EnumOffset);
+
+ return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
+}
+
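
The rounding-mode table lookup built by lowerGET_ROUNDING above reduces to a shift-and-mask. A standalone sketch; RoundTable is a placeholder for AMDGPU::FltRoundConversionTable, which packs one 4-bit result per possible 4-bit MODE.fp_round value:

  #include <cstdint>

  static int decodeFltRounds(uint32_t ModeFpRound, uint64_t RoundTable) {
    // SHL by 2 (4 bits per entry), SRL of the 64-bit table, then mask, as above.
    uint32_t Entry = (uint32_t)(RoundTable >> (ModeFpRound * 4)) & 0xf;
    // Entries below 4 are standard FLT_ROUNDS values; larger ones are extended
    // target-defined values, offset by 4 to skip the gap in the encoding.
    return Entry < 4 ? (int)Entry : (int)(Entry + 4);
  }
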
+SDValue SITargetLowering::lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const {
+ if (Op->isDivergent())
+ return SDValue();
+
+ switch (cast<MemSDNode>(Op)->getAddressSpace()) {
+ case AMDGPUAS::FLAT_ADDRESS:
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ break;
+ default:
+ return SDValue();
+ }
+
+ return Op;
+}
+
Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
@@ -4217,40 +4601,51 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
- MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ // For targets older than GFX12, we emit a sequence of 32-bit operations.
+ // For GFX12, we emit s_add_u64 and s_sub_u64.
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- const SIRegisterInfo *TRI = ST.getRegisterInfo();
- const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
const DebugLoc &DL = MI.getDebugLoc();
-
MachineOperand &Dest = MI.getOperand(0);
MachineOperand &Src0 = MI.getOperand(1);
MachineOperand &Src1 = MI.getOperand(2);
-
- Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-
- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
- MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
-
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
-
- unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
- unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
- BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
+ if (Subtarget->hasScalarAddSub64()) {
+ unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
+ BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
+ .addReg(Src0.getReg())
+ .addReg(Src1.getReg());
+ } else {
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
+ unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+ .add(Src0Sub0)
+ .add(Src1Sub0);
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+ .add(Src0Sub1)
+ .add(Src1Sub1);
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ }
MI.eraseFromParent();
return BB;
}
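
For the pre-GFX12 path above, the 64-bit scalar add/sub is expanded into a 32-bit op plus a carry-consuming op. A scalar model of the add case (S_ADD_U32 producing SCC, S_ADDC_U32 consuming it, REG_SEQUENCE reassembling the halves):

  #include <cstdint>

  static uint64_t add_u64_via_u32(uint64_t A, uint64_t B) {
    uint32_t ALo = (uint32_t)A, AHi = (uint32_t)(A >> 32);
    uint32_t BLo = (uint32_t)B, BHi = (uint32_t)(B >> 32);
    uint32_t Lo = ALo + BLo;          // S_ADD_U32
    uint32_t Carry = Lo < ALo;        // carry-out held in SCC
    uint32_t Hi = AHi + BHi + Carry;  // S_ADDC_U32
    return ((uint64_t)Hi << 32) | Lo; // REG_SEQUENCE of sub0/sub1
  }
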
@@ -4463,8 +4858,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const SIRegisterInfo *TRI = ST.getRegisterInfo();
Register Dst = MI.getOperand(0).getReg();
- Register Src0 = MI.getOperand(1).getReg();
- Register Src1 = MI.getOperand(2).getReg();
+ const MachineOperand &Src0 = MI.getOperand(1);
+ const MachineOperand &Src1 = MI.getOperand(2);
const DebugLoc &DL = MI.getDebugLoc();
Register SrcCond = MI.getOperand(3).getReg();
@@ -4473,20 +4868,42 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
+ const TargetRegisterClass *Src0RC = Src0.isReg()
+ ? MRI.getRegClass(Src0.getReg())
+ : &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *Src1RC = Src1.isReg()
+ ? MRI.getRegClass(Src1.getReg())
+ : &AMDGPU::VReg_64RegClass;
+
+ const TargetRegisterClass *Src0SubRC =
+ TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *Src1SubRC =
+ TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
+
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
.addReg(SrcCond);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
- .addImm(0)
- .addReg(Src0, 0, AMDGPU::sub0)
- .addImm(0)
- .addReg(Src1, 0, AMDGPU::sub0)
- .addReg(SrcCondCopy);
+ .addImm(0)
+ .add(Src0Sub0)
+ .addImm(0)
+ .add(Src1Sub0)
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
- .addImm(0)
- .addReg(Src0, 0, AMDGPU::sub1)
- .addImm(0)
- .addReg(Src1, 0, AMDGPU::sub1)
- .addReg(SrcCondCopy);
+ .addImm(0)
+ .add(Src0Sub1)
+ .addImm(0)
+ .add(Src1Sub1)
+ .addReg(SrcCondCopy);
BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
.addReg(DstLo)
@@ -4843,7 +5260,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4866,7 +5283,7 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 ||
VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 ||
VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
- VT == MVT::v32f32);
+ VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16);
SDValue Lo0, Hi0;
std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4926,10 +5343,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
"Load should return a value and a chain");
return Result;
}
- case ISD::FSQRT:
- if (Op.getValueType() == MVT::f64)
+ case ISD::FSQRT: {
+ EVT VT = Op.getValueType();
+ if (VT == MVT::f32)
+ return lowerFSQRTF32(Op, DAG);
+ if (VT == MVT::f64)
return lowerFSQRTF64(Op, DAG);
return SDValue();
+ }
case ISD::FSIN:
case ISD::FCOS:
return LowerTrig(Op, DAG);
@@ -5027,6 +5448,12 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return lowerXMUL_LOHI(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::STACKSAVE:
+ return LowerSTACKSAVE(Op, DAG);
+ case ISD::GET_ROUNDING:
+ return lowerGET_ROUNDING(Op, DAG);
+ case ISD::PREFETCH:
+ return lowerPREFETCH(Op, DAG);
}
return SDValue();
}
@@ -5382,6 +5809,12 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
return;
}
+ case ISD::FSQRT: {
+ if (N->getValueType(0) != MVT::f16)
+ break;
+ Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
+ break;
+ }
default:
AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
break;
@@ -5433,6 +5866,9 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
}
bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
+ if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
+ return false;
+
// FIXME: Either avoid relying on address space here or change the default
// address space for functions to avoid the explicit check.
return (GV->getValueType()->isFunctionTy() ||
@@ -5616,7 +6052,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
if (IsIEEEMode)
return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
- if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16)
+ if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
+ VT == MVT::v32f16)
return splitBinaryVectorOp(Op, DAG);
return Op;
}
@@ -5711,11 +6148,6 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
return lowerTrapEndpgm(Op, DAG);
- const Module *M = DAG.getMachineFunction().getFunction().getParent();
- unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M);
- if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3)
- return lowerTrapHsaQueuePtr(Op, DAG);
-
return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) :
lowerTrapHsaQueuePtr(Op, DAG);
}
@@ -5873,7 +6305,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
SDValue Ptr =
- DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset));
+ DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
// TODO: Use custom target PseudoSourceValue.
// TODO: We should use the value from the IR intrinsic call, but it might not
@@ -6134,7 +6566,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
return Combined;
- if (VecSize == 128 || VecSize == 256) {
+ if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
SDValue Lo, Hi;
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
@@ -6147,9 +6579,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Hi = DAG.getBitcast(HiVT,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
DAG.getConstant(1, SL, MVT::i32)));
- } else {
- assert(VecSize == 256);
-
+ } else if (VecSize == 256) {
SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
SDValue Parts[4];
for (unsigned P = 0; P < 4; ++P) {
@@ -6161,6 +6591,22 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
Parts[0], Parts[1]));
Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
Parts[2], Parts[3]));
+ } else {
+ assert(VecSize == 512);
+
+ SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
+ SDValue Parts[8];
+ for (unsigned P = 0; P < 8; ++P) {
+ Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
+ DAG.getConstant(P, SL, MVT::i32));
+ }
+
+ Lo = DAG.getBitcast(LoVT,
+ DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
+ Parts[0], Parts[1], Parts[2], Parts[3]));
+ Hi = DAG.getBitcast(HiVT,
+ DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
+ Parts[4], Parts[5], Parts[6], Parts[7]));
}
EVT IdxVT = Idx.getValueType();
@@ -6326,6 +6772,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
}
+ if (VT == MVT::v32i16 || VT == MVT::v32f16) {
+ EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(),
+ VT.getVectorNumElements() / 8);
+ MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits());
+
+ SmallVector<SDValue, 8> Parts[8];
+ for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) {
+ for (unsigned P = 0; P < 8; ++P)
+ Parts[P].push_back(Op.getOperand(I + P * E));
+ }
+ SDValue Casts[8];
+ for (unsigned P = 0; P < 8; ++P) {
+ SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]);
+ Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec);
+ }
+
+ SDValue Blend =
+ DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts);
+ return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+ }
+
assert(VT == MVT::v2f16 || VT == MVT::v2i16);
assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
@@ -6391,24 +6858,12 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
// $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
// which is a 64-bit pc-relative offset from the encoding of the $symbol
// operand to the global variable.
- //
- // What we want here is an offset from the value returned by s_getpc
- // (which is the address of the s_add_u32 instruction) to the global
- // variable, but since the encoding of $symbol starts 4 bytes after the start
- // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too
- // small. This requires us to add 4 to the global variable offset in order to
- // compute the correct address. Similarly for the s_addc_u32 instruction, the
- // encoding of $symbol starts 12 bytes after the start of the s_add_u32
- // instruction.
- SDValue PtrLo =
- DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags);
+ SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
SDValue PtrHi;
- if (GAFlags == SIInstrInfo::MO_NONE) {
+ if (GAFlags == SIInstrInfo::MO_NONE)
PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
- } else {
- PtrHi =
- DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1);
- }
+ else
+ PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
}
@@ -6450,9 +6905,22 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
}
+ if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
+ SDValue AddrLo = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
+ AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
+
+ SDValue AddrHi = DAG.getTargetGlobalAddress(
+ GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
+ AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
+
+ return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
+ }
+
if (shouldEmitFixup(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
- else if (shouldEmitPCReloc(GV))
+
+ if (shouldEmitPCReloc(GV))
return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
SIInstrInfo::MO_REL32);
@@ -6699,6 +7167,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
unsigned IntrOpcode = Intr->BaseOpcode;
bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
+ bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
SmallVector<EVT, 3> ResultTypes(Op->values());
SmallVector<EVT, 3> OrigResultTypes(Op->values());
@@ -6718,7 +7187,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (BaseOpcode->Atomic) {
VData = Op.getOperand(2);
- bool Is64Bit = VData.getValueType() == MVT::i64;
+ bool Is64Bit = VData.getValueSizeInBits() == 64;
if (BaseOpcode->AtomicX2) {
SDValue VData2 = Op.getOperand(3);
VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
@@ -6878,9 +7347,9 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
// SIShrinkInstructions will convert NSA encodings to non-NSA after register
// allocation when possible.
//
- // Partial NSA is allowed on GFX11 where the final register is a contiguous
+ // Partial NSA is allowed on GFX11+ where the final register is a contiguous
// set of the remaining addresses.
- const unsigned NSAMaxSize = ST->getNSAMaxSize();
+ const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
const bool UseNSA = ST->hasNSAEncoding() &&
VAddrs.size() >= ST->getNSAThreshold(MF) &&
@@ -6957,7 +7426,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Op.getOperand(ArgOffset + Intr->CachePolicyIndex))->getZExtValue();
if (BaseOpcode->Atomic)
CPol |= AMDGPU::CPol::GLC; // TODO no-return optimization
- if (CPol & ~AMDGPU::CPol::ALL)
+ if (CPol & ~(IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12))
return Op;
SmallVector<SDValue, 26> Ops;
@@ -6977,7 +7446,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
if (IsGFX10Plus)
Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
- Ops.push_back(Unorm);
+ if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
+ Ops.push_back(Unorm);
Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
Ops.push_back(IsA16 && // r128, a16 for gfx9
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
@@ -6988,7 +7458,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
} else if (cast<ConstantSDNode>(TFE)->getZExtValue()) {
report_fatal_error("TFE is not supported on this GPU");
}
- Ops.push_back(LWE); // lwe
+ if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
+ Ops.push_back(LWE); // lwe
if (!IsGFX10Plus)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
@@ -7000,7 +7471,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
int Opcode = -1;
- if (IsGFX11Plus) {
+ if (IsGFX12Plus) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
+ NumVDataDwords, NumVAddrDwords);
+ } else if (IsGFX11Plus) {
Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
UseNSA ? AMDGPU::MIMGEncGfx11NSA
: AMDGPU::MIMGEncGfx11Default,
@@ -7071,7 +7545,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
};
// Widen vec3 load to vec4.
- if (VT.isVector() && VT.getVectorNumElements() == 3) {
+ if (VT.isVector() && VT.getVectorNumElements() == 3 &&
+ !Subtarget->hasScalarDwordx3Loads()) {
EVT WidenedVT =
EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
auto WidenedOp = DAG.getMemIntrinsicNode(
@@ -7317,7 +7792,9 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDLoc(Op), MVT::i32);
case Intrinsic::amdgcn_s_buffer_load: {
unsigned CPol = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
- if (CPol & ~AMDGPU::CPol::ALL)
+ if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
+ ? AMDGPU::CPol::ALL
+ : AMDGPU::CPol::ALL_pregfx12))
return Op;
return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
DAG);
@@ -7341,9 +7818,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitRemovedIntrinsicError(DAG, DL, VT);
}
- case Intrinsic::amdgcn_ldexp:
- return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2));
-
case Intrinsic::amdgcn_fract:
return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
@@ -7490,6 +7964,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
+// On targets that do not support a constant in the soffset field, turn a zero
+// soffset into SGPR_NULL to avoid generating an extra s_mov with zero.
+static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG,
+ const GCNSubtarget *Subtarget) {
+ if (Subtarget->hasRestrictedSOffset())
+ if (auto SOffsetConst = dyn_cast<ConstantSDNode>(SOffset)) {
+ if (SOffsetConst->isZero()) {
+ return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
+ }
+ }
+ return SOffset;
+}
+
SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SelectionDAG &DAG,
unsigned NewOpcode) const {
@@ -7498,13 +7985,14 @@ SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -7531,13 +8019,14 @@ SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
SDValue VData = Op.getOperand(2);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -7693,12 +8182,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(4), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -7717,12 +8207,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -7734,21 +8225,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
MemSDNode *M = cast<MemSDNode>(Op);
EVT LoadVT = Op.getValueType();
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
unsigned Dfmt = cast<ConstantSDNode>(Op.getOperand(7))->getZExtValue();
unsigned Nfmt = cast<ConstantSDNode>(Op.getOperand(8))->getZExtValue();
unsigned Glc = cast<ConstantSDNode>(Op.getOperand(9))->getZExtValue();
unsigned Slc = cast<ConstantSDNode>(Op.getOperand(10))->getZExtValue();
unsigned IdxEn = getIdxEn(Op.getOperand(3));
SDValue Ops[] = {
- Op.getOperand(0), // Chain
- Op.getOperand(2), // rsrc
- Op.getOperand(3), // vindex
- Op.getOperand(4), // voffset
- Op.getOperand(5), // soffset
- Op.getOperand(6), // offset
- DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
+ Op.getOperand(0), // Chain
+ Op.getOperand(2), // rsrc
+ Op.getOperand(3), // vindex
+ Op.getOperand(4), // voffset
+ SOffset, // soffset
+ Op.getOperand(6), // offset
+ DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen
};
if (LoadVT.getScalarType() == MVT::f16)
@@ -7764,13 +8256,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(4), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(5), // format
Op.getOperand(6), // cachepolicy, swizzled buffer
@@ -7790,13 +8283,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
EVT LoadVT = Op.getValueType();
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Rsrc, // rsrc
Op.getOperand(3), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -8009,6 +8503,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -8016,7 +8511,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8031,6 +8526,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(6), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
SDValue Ops[] = {
Op.getOperand(0), // Chain
Op.getOperand(2), // src
@@ -8038,7 +8534,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Rsrc, // rsrc
Op.getOperand(5), // vindex
Offsets.first, // voffset
- Op.getOperand(7), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(8), // cachepolicy
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8068,14 +8564,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return SDValue();
}
+ const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
+ const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
const bool Is64 = NodePtr.getValueType() == MVT::i64;
const unsigned NumVDataDwords = 4;
const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
- const bool UseNSA =
- Subtarget->hasNSAEncoding() && NumVAddrs <= Subtarget->getNSAMaxSize();
+ const bool UseNSA = (Subtarget->hasNSAEncoding() &&
+ NumVAddrs <= Subtarget->getNSAMaxSize()) ||
+ IsGFX12Plus;
const unsigned BaseOpcodes[2][2] = {
{AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
{AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
@@ -8083,15 +8582,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
int Opcode;
if (UseNSA) {
Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
- IsGFX11Plus ? AMDGPU::MIMGEncGfx11NSA
+ IsGFX12Plus ? AMDGPU::MIMGEncGfx12
+ : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
: AMDGPU::MIMGEncGfx10NSA,
NumVDataDwords, NumVAddrDwords);
} else {
- Opcode =
- AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
- IsGFX11Plus ? AMDGPU::MIMGEncGfx11Default
- : AMDGPU::MIMGEncGfx10Default,
- NumVDataDwords, NumVAddrDwords);
+ assert(!IsGFX12Plus);
+ Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
+ IsGFX11 ? AMDGPU::MIMGEncGfx11Default
+ : AMDGPU::MIMGEncGfx10Default,
+ NumVDataDwords, NumVAddrDwords);
}
assert(Opcode != -1);
@@ -8179,8 +8679,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
- case Intrinsic::amdgcn_flat_atomic_fmax: {
+ case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num: {
MemSDNode *M = cast<MemSDNode>(Op);
SDValue Ops[] = {
M->getOperand(0), // Chain
@@ -8190,12 +8694,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned Opcode = 0;
switch (IntrID) {
case Intrinsic::amdgcn_global_atomic_fmin:
- case Intrinsic::amdgcn_flat_atomic_fmin: {
+ case Intrinsic::amdgcn_global_atomic_fmin_num:
+ case Intrinsic::amdgcn_flat_atomic_fmin:
+ case Intrinsic::amdgcn_flat_atomic_fmin_num: {
Opcode = AMDGPUISD::ATOMIC_LOAD_FMIN;
break;
}
case Intrinsic::amdgcn_global_atomic_fmax:
- case Intrinsic::amdgcn_flat_atomic_fmax: {
+ case Intrinsic::amdgcn_global_atomic_fmax_num:
+ case Intrinsic::amdgcn_flat_atomic_fmax:
+ case Intrinsic::amdgcn_flat_atomic_fmax_num: {
Opcode = AMDGPUISD::ATOMIC_LOAD_FMAX;
break;
}
@@ -8206,6 +8714,31 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
M->getVTList(), Ops, M->getMemoryVT(),
M->getMemOperand());
}
+ case Intrinsic::amdgcn_s_get_barrier_state: {
+ SDValue Chain = Op->getOperand(0);
+ SmallVector<SDValue, 2> Ops;
+ unsigned Opc;
+ bool IsInlinableBarID = false;
+ int64_t BarID;
+
+ if (isa<ConstantSDNode>(Op->getOperand(2))) {
+ BarID = cast<ConstantSDNode>(Op->getOperand(2))->getSExtValue();
+ IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarID);
+ }
+
+ if (IsInlinableBarID) {
+ Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
+ SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
+ Ops.push_back(K);
+ } else {
+ Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
+ SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2));
+ Ops.push_back(M0Val.getValue(0));
+ }
+
+ auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ return SDValue(NewMI, 0);
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -8383,13 +8916,29 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
case Intrinsic::amdgcn_s_barrier: {
- if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) {
unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second;
if (WGSize <= ST.getWavefrontSize())
return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
Op.getOperand(0)), 0);
}
+
+ // On GFX12 lower s_barrier into s_barrier_signal_imm and s_barrier_wait
+ if (ST.hasSplitBarriers()) {
+ SDValue K =
+ DAG.getTargetConstant(AMDGPU::Barrier::WORKGROUP, DL, MVT::i32);
+ SDValue BarSignal =
+ SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
+ MVT::Other, K, Op.getOperand(0)),
+ 0);
+ SDValue BarWait =
+ SDValue(DAG.getMachineNode(AMDGPU::S_BARRIER_WAIT, DL, MVT::Other, K,
+ BarSignal.getValue(0)),
+ 0);
+ return BarWait;
+ }
+
return SDValue();
};
case Intrinsic::amdgcn_tbuffer_store: {
@@ -8429,13 +8978,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // format
Op.getOperand(8), // cachepolicy, swizzled buffer
@@ -8456,13 +9006,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
VData = handleD16VData(VData, DAG);
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData, // vdata
Rsrc, // rsrc
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // format
Op.getOperand(7), // cachepolicy, swizzled buffer
@@ -8536,13 +9087,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
DAG.getConstant(0, DL, MVT::i32), // vindex
Offsets.first, // voffset
- Op.getOperand(5), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(6), // cachepolicy, swizzled buffer
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
@@ -8586,13 +9138,14 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
+ auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
SDValue Ops[] = {
Chain,
VData,
Rsrc,
Op.getOperand(4), // vindex
Offsets.first, // voffset
- Op.getOperand(6), // soffset
+ SOffset, // soffset
Offsets.second, // offset
Op.getOperand(7), // cachepolicy, swizzled buffer
DAG.getTargetConstant(1, DL, MVT::i1), // idxen
@@ -8620,8 +9173,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds;
unsigned OpOffset = HasVIndex ? 1 : 0;
SDValue VOffset = Op.getOperand(5 + OpOffset);
- auto CVOffset = dyn_cast<ConstantSDNode>(VOffset);
- bool HasVOffset = !CVOffset || !CVOffset->isZero();
+ bool HasVOffset = !isNullConstant(VOffset);
unsigned Size = Op->getConstantOperandVal(4);
switch (Size) {
@@ -8684,12 +9236,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
auto F = LoadMMO->getFlags() &
~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
- LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
- Size, LoadMMO->getBaseAlign());
+ LoadMMO =
+ MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
+ LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
- MachineMemOperand *StoreMMO =
- MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
- sizeof(int32_t), LoadMMO->getBaseAlign());
+ MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
+ StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t),
+ LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
@@ -8760,11 +9313,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad);
- LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad,
- Size, LoadMMO->getBaseAlign());
- MachineMemOperand *StoreMMO =
- MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore,
- sizeof(int32_t), Align(4));
+ LoadMMO =
+ MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size,
+ LoadMMO->getBaseAlign(), LoadMMO->getAAInfo());
+ MachineMemOperand *StoreMMO = MF.getMachineMemOperand(
+ StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4),
+ LoadMMO->getAAInfo());
auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO});
@@ -8774,7 +9328,76 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
+ case Intrinsic::amdgcn_s_barrier_init:
+ case Intrinsic::amdgcn_s_barrier_join:
+ case Intrinsic::amdgcn_s_wakeup_barrier: {
+ SDValue Chain = Op->getOperand(0);
+ SmallVector<SDValue, 2> Ops;
+ SDValue BarOp = Op->getOperand(2);
+ unsigned Opc;
+ bool IsInlinableBarID = false;
+ int64_t BarVal;
+
+ if (isa<ConstantSDNode>(BarOp)) {
+ BarVal = cast<ConstantSDNode>(BarOp)->getSExtValue();
+ IsInlinableBarID = AMDGPU::isInlinableIntLiteral(BarVal);
+ }
+
+ if (IsInlinableBarID) {
+ switch (IntrinsicID) {
+ default:
+ return SDValue();
+ case Intrinsic::amdgcn_s_barrier_init:
+ Opc = AMDGPU::S_BARRIER_INIT_IMM;
+ break;
+ case Intrinsic::amdgcn_s_barrier_join:
+ Opc = AMDGPU::S_BARRIER_JOIN_IMM;
+ break;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
+ break;
+ }
+
+ SDValue K = DAG.getTargetConstant(BarVal, DL, MVT::i32);
+ Ops.push_back(K);
+ } else {
+ switch (IntrinsicID) {
+ default:
+ return SDValue();
+ case Intrinsic::amdgcn_s_barrier_init:
+ Opc = AMDGPU::S_BARRIER_INIT_M0;
+ break;
+ case Intrinsic::amdgcn_s_barrier_join:
+ Opc = AMDGPU::S_BARRIER_JOIN_M0;
+ break;
+ case Intrinsic::amdgcn_s_wakeup_barrier:
+ Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
+ break;
+ }
+ }
+
+ if (IntrinsicID == Intrinsic::amdgcn_s_barrier_init) {
+ SDValue M0Val;
+ // Member count will be read from M0[16:22]
+ M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, Op.getOperand(3),
+ DAG.getShiftAmountConstant(16, MVT::i32, DL));
+ if (!IsInlinableBarID) {
+      // If the reference to the barrier id is not an inline constant then it
+      // must be passed in M0[4:0]. Perform an OR with the member count to
+      // include it in M0.
+ M0Val = SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32,
+ Op.getOperand(2), M0Val),
+ 0);
+ }
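+      // i.e. M0 = (MemberCount << 16) | BarrierID when the id is not an
+      // inline immediate; otherwise M0 only carries the member count and the
+      // id is passed as an immediate operand.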
+ Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
+ } else if (!IsInlinableBarID) {
+ Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0));
+ }
+
+ auto NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
+ return SDValue(NewMI, 0);
+ }
default: {
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -8794,7 +9417,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
SDValue Offset, SelectionDAG &DAG) const {
SDLoc DL(Offset);
- const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset();
+ const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
SDValue N0 = Offset;
ConstantSDNode *C1 = nullptr;
@@ -8870,8 +9493,13 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
return;
}
}
+
+ SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
+ ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
+ : DAG.getConstant(0, DL, MVT::i32);
+
Offsets[0] = CombinedOffset;
- Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
+ Offsets[1] = SOffsetZero;
Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
}
@@ -9051,7 +9679,7 @@ static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
const SIMachineFunctionInfo &Info) {
// TODO: Should check if the address can definitely not access stack.
if (Info.isEntryFunction())
- return Info.hasFlatScratchInit();
+ return Info.getUserSGPRInfo().hasFlatScratchInit();
return true;
}
@@ -9129,7 +9757,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
if (!Op->isDivergent() && Alignment >= Align(4) && NumElements < 32) {
- if (MemVT.isPow2VectorType())
+ if (MemVT.isPow2VectorType() ||
+ (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
@@ -9145,7 +9774,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
Alignment >= Align(4) && NumElements < 32) {
- if (MemVT.isPow2VectorType())
+ if (MemVT.isPow2VectorType() ||
+ (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
return SDValue();
return WidenOrSplitVectorLoad(Op, DAG);
}
@@ -9217,7 +9847,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
- if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256)
+ if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
+ VT.getSizeInBits() == 512)
return splitTernaryVectorOp(Op, DAG);
assert(VT.getSizeInBits() == 64);
@@ -9277,11 +9908,6 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
// XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
// error seems really high at 2^29 ULP.
-
- // XXX - do we need afn for this or is arcp sufficent?
- if (RHS.getOpcode() == ISD::FSQRT)
- return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
-
// 1.0 / x -> rcp(x)
return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
}
@@ -9294,8 +9920,8 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
}
}
- // For f16 require arcp only.
- // For f32 require afn+arcp.
+ // For f16 require afn or arcp.
+ // For f32 require afn.
if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal()))
return SDValue();
@@ -9480,28 +10106,44 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const DenormalMode DenormMode = Info->getMode().FP32Denormals;
- const bool HasFP32Denormals = DenormMode == DenormalMode::getIEEE();
+ const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
+ const bool HasDynamicDenormals =
+ (DenormMode.Input == DenormalMode::Dynamic) ||
+ (DenormMode.Output == DenormalMode::Dynamic);
+
+ SDValue SavedDenormMode;
- if (!HasFP32Denormals) {
+ if (!PreservesDenormals) {
// Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
// lowering. The chain dependence is insufficient, and we need glue. We do
// not need the glue variants in a strictfp function.
SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Glue = DAG.getEntryNode();
+ if (HasDynamicDenormals) {
+ SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
+ DAG.getVTList(MVT::i32, MVT::Glue),
+ {BitField, Glue});
+ SavedDenormMode = SDValue(GetReg, 0);
+
+ Glue = DAG.getMergeValues(
+ {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
+ }
+
SDNode *EnableDenorm;
if (Subtarget->hasDenormModeInst()) {
const SDValue EnableDenormValue =
getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
- EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
- DAG.getEntryNode(), EnableDenormValue).getNode();
+ EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
+ EnableDenormValue)
+ .getNode();
} else {
const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
SL, MVT::i32);
- EnableDenorm =
- DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
- {EnableDenormValue, BitField, DAG.getEntryNode()});
+ EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
+ {EnableDenormValue, BitField, Glue});
}
SDValue Ops[3] = {
@@ -9531,12 +10173,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
NumeratorScaled, Fma3, Flags);
- if (!HasFP32Denormals) {
- // FIXME: This mishandles dynamic denormal mode. We need to query the
- // current mode and restore the original.
-
+ if (!PreservesDenormals) {
SDNode *DisableDenorm;
- if (Subtarget->hasDenormModeInst()) {
+ if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
const SDValue DisableDenormValue = getSPDenormModeValue(
FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
@@ -9544,8 +10183,11 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
Fma4.getValue(1), DisableDenormValue,
Fma4.getValue(2)).getNode();
} else {
+ assert(HasDynamicDenormals == (bool)SavedDenormMode);
const SDValue DisableDenormValue =
- DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+ HasDynamicDenormals
+ ? SavedDenormMode
+ : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
DisableDenorm = DAG.getMachineNode(
AMDGPU::S_SETREG_B32, SL, MVT::Other,
@@ -9754,6 +10396,111 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+// Avoid the full correct expansion for f32 sqrt when promoting from f16.
+SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc SL(Op);
+ assert(!Subtarget->has16BitInsts());
+ SDNodeFlags Flags = Op->getFlags();
+ SDValue Ext =
+ DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
+
+ SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
+ SDValue Sqrt =
+ DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
+
+ return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
+ DAG.getTargetConstant(0, SL, MVT::i32), Flags);
+}
+
+SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ SDNodeFlags Flags = Op->getFlags();
+ MVT VT = Op.getValueType().getSimpleVT();
+ const SDValue X = Op.getOperand(0);
+
+ if (allowApproxFunc(DAG, Flags)) {
+ // Instruction is 1ulp but ignores denormals.
+ return DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
+ }
+
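+  // Inputs below 2^-96 are scaled up by 2^32 so that the estimate and the
+  // refinement below stay in the normal range; since sqrt(x * 2^32) equals
+  // sqrt(x) * 2^16, the result is scaled back down by 2^-16 before the final
+  // select.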
+ SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
+ SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
+
+ SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
+
+ SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
+
+ SDValue SqrtX =
+ DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
+
+ SDValue SqrtS;
+ if (needsDenormHandlingF32(DAG, X, Flags)) {
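+    // Take the 1 ulp hardware estimate s, then use the FMA residuals
+    // x - s'*s for the one-ulp-down and one-ulp-up neighbours s' to decide
+    // whether to step the result down or up by one ulp.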
+ SDValue SqrtID =
+ DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
+ SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
+
+ SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
+ SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
+ DAG.getConstant(-1, DL, MVT::i32));
+ SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
+
+ SDValue NegSqrtSNextDown =
+ DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
+
+ SDValue SqrtVP =
+ DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
+
+ SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
+ DAG.getConstant(1, DL, MVT::i32));
+ SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
+
+ SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
+ SDValue SqrtVS =
+ DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
+
+ SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
+ SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
+
+ SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
+ Flags);
+
+ SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
+ SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
+ Flags);
+ } else {
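+    // Start from v_rsq: s = x * r and h = 0.5 * r, refine both with the
+    // shared error term e = 0.5 - h * s, then fold the remaining residual
+    // d = x - s * s back into s with a final FMA.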
+ SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
+
+ SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
+
+ SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
+ SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
+ SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
+
+ SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
+ SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
+ SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
+
+ SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
+ SDValue SqrtD =
+ DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
+ SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
+ }
+
+ SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
+
+ SDValue ScaledDown =
+ DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
+
+ SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
+ SDValue IsZeroOrInf =
+ DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
+ DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
+
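+  // sqrt(+/-0.0) is +/-0.0 and sqrt(+inf) is +inf, and both are preserved by
+  // the scaling above, so for those inputs simply return SqrtX.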
+ return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
+}
+
SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
// For double type, the SQRT and RSQ instructions don't have required
// precision, we apply Goldschmidt's algorithm to improve the result:
@@ -10111,9 +10858,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp(
return SDValue();
}
-// Returns true if argument is a boolean value which is not serialized into
-// memory or argument and does not require v_cndmask_b32 to be deserialized.
-static bool isBoolSGPR(SDValue V) {
+bool llvm::isBoolSGPR(SDValue V) {
if (V.getValueType() != MVT::i1)
return false;
switch (V.getOpcode()) {
@@ -10427,13 +11172,34 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
if (Depth >= 6)
return std::nullopt;
+ auto ValueSize = Op.getValueSizeInBits();
+ if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
+ return std::nullopt;
+
switch (Op->getOpcode()) {
case ISD::TRUNCATE: {
- if (Op->getOperand(0).getScalarValueSizeInBits() != 32)
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ }
+
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND_INREG: {
+ SDValue NarrowOp = Op->getOperand(0);
+ auto NarrowVT = NarrowOp.getValueType();
+ if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
+ NarrowVT = VTSign->getVT();
+ }
+ if (!NarrowVT.isByteSized())
+ return std::nullopt;
+ uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
+
+ if (SrcIndex >= NarrowByteWidth)
return std::nullopt;
return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
}
+ case ISD::SRA:
case ISD::SRL: {
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)
@@ -10450,9 +11216,6 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
}
default: {
- if (Op.getScalarValueSizeInBits() != 32)
- return std::nullopt;
-
return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
}
}
@@ -10476,7 +11239,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
unsigned BitWidth = Op.getScalarValueSizeInBits();
if (BitWidth % 8 != 0)
return std::nullopt;
- assert(Index < BitWidth / 8 && "invalid index requested");
+ if (Index > BitWidth / 8 - 1)
+ return std::nullopt;
switch (Op.getOpcode()) {
case ISD::OR: {
@@ -10519,6 +11283,31 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
}
+ case ISD::FSHR: {
+ // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
+ auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
+ if (!ShiftOp || Op.getValueType().isVector())
+ return std::nullopt;
+
+ uint64_t BitsProvided = Op.getValueSizeInBits();
+ if (BitsProvided % 8 != 0)
+ return std::nullopt;
+
+ uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
+ if (BitShift % 8)
+ return std::nullopt;
+
+ uint64_t ConcatSizeInBytes = BitsProvided / 4;
+ uint64_t ByteShift = BitShift / 8;
+
+ uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
+ uint64_t BytesProvided = BitsProvided / 8;
+ SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
+ NewIndex %= BytesProvided;
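+    // E.g. for an i32 fshr by 8, result bytes 0..2 come from bytes 1..3 of
+    // the second operand and result byte 3 comes from byte 0 of the first.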
+ return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
+ }
+
+ case ISD::SRA:
case ISD::SRL: {
auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
if (!ShiftOp)
@@ -10565,9 +11354,18 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
- case ISD::ZERO_EXTEND: {
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::AssertZext:
+ case ISD::AssertSext: {
SDValue NarrowOp = Op->getOperand(0);
- unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
+ unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
+ if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
+ Op->getOpcode() == ISD::AssertZext ||
+ Op->getOpcode() == ISD::AssertSext) {
+ auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
+ NarrowBitWidth = VTSign->getVT().getSizeInBits();
+ }
if (NarrowBitWidth % 8 != 0)
return std::nullopt;
uint64_t NarrowByteWidth = NarrowBitWidth / 8;
@@ -10581,10 +11379,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
case ISD::TRUNCATE: {
- unsigned NarrowBitWidth = Op.getScalarValueSizeInBits();
- if (NarrowBitWidth % 8 != 0)
- return std::nullopt;
- uint64_t NarrowByteWidth = NarrowBitWidth / 8;
+ uint64_t NarrowByteWidth = BitWidth / 8;
if (NarrowByteWidth >= Index) {
return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
@@ -10594,8 +11389,16 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
return std::nullopt;
}
+ case ISD::CopyFromReg: {
+ if (BitWidth / 8 > Index)
+ return calculateSrcByte(Op, StartingIndex, Index);
+
+ return std::nullopt;
+ }
+
case ISD::LOAD: {
auto L = cast<LoadSDNode>(Op.getNode());
+
unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
if (NarrowBitWidth % 8 != 0)
return std::nullopt;
@@ -10621,6 +11424,41 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
case ISD::BSWAP:
return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
Depth + 1, StartingIndex);
+
+ case ISD::EXTRACT_VECTOR_ELT: {
+ auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!IdxOp)
+ return std::nullopt;
+ auto VecIdx = IdxOp->getZExtValue();
+ auto ScalarSize = Op.getScalarValueSizeInBits();
+ if (ScalarSize != 32) {
+ if ((VecIdx + 1) * ScalarSize > 32)
+ return std::nullopt;
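+      // Translate the vector index into a byte index of the 32-bit source:
+      // i8 elements map one-to-one, i16 elements cover two bytes each.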
+ Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
+ }
+
+ return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0),
+ StartingIndex, Index);
+ }
+
+ case AMDGPUISD::PERM: {
+ auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
+ if (!PermMask)
+ return std::nullopt;
+
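+    // Extract this byte's selector from the mask. Only plain byte selects
+    // (0..7) and the constant-zero selector (0x0c) are handled; anything
+    // else bails out.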
+ auto IdxMask =
+ (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
+ if (IdxMask > 0x07 && IdxMask != 0x0c)
+ return std::nullopt;
+
+ auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
+ auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
+
+ return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
+ : ByteProvider<SDValue>(
+ ByteProvider<SDValue>::getConstantZero());
+ }
+
default: {
return std::nullopt;
}
@@ -10630,7 +11468,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
}
// Returns true if the Operand is a scalar and is 16 bits
-static bool is16BitScalarOp(SDValue &Operand) {
+static bool isExtendedFrom16Bits(SDValue &Operand) {
+
switch (Operand.getOpcode()) {
case ISD::ANY_EXTEND:
case ISD::SIGN_EXTEND:
@@ -10646,7 +11485,7 @@ static bool is16BitScalarOp(SDValue &Operand) {
auto MemVT = L->getMemoryVT();
return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
}
- return false;
+ return L->getMemoryVT().getSizeInBits() == 16;
}
default:
return false;
@@ -10674,29 +11513,118 @@ static bool addresses16Bits(int Mask) {
// Do not lower into v_perm if the operands are actually 16 bit
// and the selected bits (based on PermMask) correspond with two
// easily addressable 16 bit operands.
-static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op,
+static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
SDValue &OtherOp) {
int Low16 = PermMask & 0xffff;
int Hi16 = (PermMask & 0xffff0000) >> 16;
- // ByteProvider only accepts 32 bit operands
- assert(Op.getValueType().getSizeInBits() == 32);
- assert(OtherOp.getValueType().getSizeInBits() == 32);
+ assert(Op.getValueType().isByteSized());
+ assert(OtherOp.getValueType().isByteSized());
- auto OpIs16Bit = is16BitScalarOp(Op);
- auto OtherOpIs16Bit = is16BitScalarOp(Op);
+ auto TempOp = peekThroughBitcasts(Op);
+ auto TempOtherOp = peekThroughBitcasts(OtherOp);
- // If there is a size mismatch, then we must use masking on at least one
- // operand
- if (OpIs16Bit != OtherOpIs16Bit)
+  auto OpIs16Bit =
+      TempOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
+ if (!OpIs16Bit)
return true;
- // If both operands are 16 bit, return whether or not we cleanly address both
- if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp))
- return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
+ auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
+ isExtendedFrom16Bits(TempOtherOp);
+ if (!OtherOpIs16Bit)
+ return true;
- // Both are 32 bit operands
- return true;
+  // Do we cleanly address both 16-bit halves of the mask?
+ return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
+}
+
+static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+
+ if (VT != MVT::i32)
+ return SDValue();
+
+ // VT is known to be MVT::i32, so we need to provide 4 bytes.
+ SmallVector<ByteProvider<SDValue>, 8> PermNodes;
+ for (int i = 0; i < 4; i++) {
+ // Find the ByteProvider that provides the ith byte of the result of OR
+ std::optional<ByteProvider<SDValue>> P =
+ calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
+ // TODO support constantZero
+ if (!P || P->isConstantZero())
+ return SDValue();
+
+ PermNodes.push_back(*P);
+ }
+ if (PermNodes.size() != 4)
+ return SDValue();
+
+ int FirstSrc = 0;
+ std::optional<int> SecondSrc;
+ uint64_t PermMask = 0x00000000;
+ for (size_t i = 0; i < PermNodes.size(); i++) {
+ auto PermOp = PermNodes[i];
+ // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
+ // by sizeof(Src2) = 4
+ int SrcByteAdjust = 4;
+
+ if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
+ if (SecondSrc.has_value())
+ if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
+ return SDValue();
+
+ // Set the index of the second distinct Src node
+ SecondSrc = i;
+ assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8));
+ SrcByteAdjust = 0;
+ }
+ assert(PermOp.SrcOffset + SrcByteAdjust < 8);
+ assert(!DAG.getDataLayout().isBigEndian());
+ PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
+ }
+
+ SDValue Op = *PermNodes[FirstSrc].Src;
+ SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
+ : *PermNodes[FirstSrc].Src;
+
+ // Check that we haven't just recreated the same FSHR node.
+ if (N->getOpcode() == ISD::FSHR &&
+ (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
+ (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
+ return SDValue();
+
+ // Check that we are not just extracting the bytes in order from an op
+ if (Op == OtherOp && Op.getValueSizeInBits() == 32) {
+ int Low16 = PermMask & 0xffff;
+ int Hi16 = (PermMask & 0xffff0000) >> 16;
+
+ bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
+ bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
+
+ // The perm op would really just produce Op. So combine into Op
+ if (WellFormedLow && WellFormedHi)
+ return DAG.getBitcast(MVT::getIntegerVT(32), Op);
+ }
+
+ if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
+ SDLoc DL(N);
+ assert(Op.getValueType().isByteSized() &&
+ OtherOp.getValueType().isByteSized());
+
+    // If the ultimate src is less than 32 bits, then we will only be using
+    // bytes 0 .. size-in-bytes - 1 of it in the OR. calculateByteProvider
+    // would not have returned Op as a source if we used a byte that is
+    // outside its ValueType. Thus, we are free to ANY_EXTEND as the extended
+    // bits are don't-cares.
+ Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
+ OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
+
+ return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
+ DAG.getConstant(PermMask, DL, MVT::i32));
+ }
+
+ return SDValue();
}
SDValue SITargetLowering::performOrCombine(SDNode *N,
@@ -10812,69 +11740,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
}
if (LHSMask == ~0u || RHSMask == ~0u) {
- SmallVector<ByteProvider<SDValue>, 8> PermNodes;
-
- // VT is known to be MVT::i32, so we need to provide 4 bytes.
- assert(VT == MVT::i32);
- for (int i = 0; i < 4; i++) {
- // Find the ByteProvider that provides the ith byte of the result of OR
- std::optional<ByteProvider<SDValue>> P =
- calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
- // TODO support constantZero
- if (!P || P->isConstantZero())
- return SDValue();
-
- PermNodes.push_back(*P);
- }
- if (PermNodes.size() != 4)
- return SDValue();
-
- int FirstSrc = 0;
- std::optional<int> SecondSrc;
- uint64_t permMask = 0x00000000;
- for (size_t i = 0; i < PermNodes.size(); i++) {
- auto PermOp = PermNodes[i];
- // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
- // by sizeof(Src2) = 4
- int SrcByteAdjust = 4;
-
- if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) {
- if (SecondSrc.has_value())
- if (!PermOp.hasSameSrc(PermNodes[*SecondSrc]))
- return SDValue();
- // Set the index of the second distinct Src node
- SecondSrc = i;
- assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() ==
- 32);
- SrcByteAdjust = 0;
- }
- assert(PermOp.SrcOffset + SrcByteAdjust < 8);
- assert(!DAG.getDataLayout().isBigEndian());
- permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8);
- }
-
- SDValue Op = *PermNodes[FirstSrc].Src;
- SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src
- : *PermNodes[FirstSrc].Src;
-
- // Check that we are not just extracting the bytes in order from an op
- if (Op == OtherOp) {
- int Low16 = permMask & 0xffff;
- int Hi16 = (permMask & 0xffff0000) >> 16;
-
- bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
- bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
-
- // The perm op would really just produce Op. So combine into Op
- if (WellFormedLow && WellFormedHi)
- return Op;
- }
-
- if (hasEightBitAccesses(permMask, Op, OtherOp)) {
- SDLoc DL(N);
- return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
- DAG.getConstant(permMask, DL, MVT::i32));
- }
+ if (SDValue Perm = matchPERM(N, DCI))
+ return Perm;
}
}
@@ -11021,10 +11888,8 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
SDValue Mask = N->getOperand(1);
// fp_class x, 0 -> false
- if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
- if (CMask->isZero())
- return DAG.getConstant(0, SDLoc(N), MVT::i1);
- }
+ if (isNullConstant(Mask))
+ return DAG.getConstant(0, SDLoc(N), MVT::i1);
if (N->getOperand(0).isUndef())
return DAG.getUNDEF(MVT::i1);
@@ -11049,7 +11914,9 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
N->getFlags());
}
- if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
+ // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
+ if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
+ N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
N0.getOperand(0), N->getFlags());
}
@@ -11131,10 +11998,14 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case ISD::FMAXNUM:
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
+ case ISD::FMINIMUM:
+ case ISD::FMAXIMUM:
case AMDGPUISD::CLAMP:
case AMDGPUISD::FMED3:
case AMDGPUISD::FMAX3:
- case AMDGPUISD::FMIN3: {
+ case AMDGPUISD::FMIN3:
+ case AMDGPUISD::FMAXIMUM3:
+ case AMDGPUISD::FMINIMUM3: {
// FIXME: Shouldn't treat the generic operations different based these.
// However, we aren't really required to flush the result from
// minnum/maxnum..
@@ -11288,7 +12159,9 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
case AMDGPU::G_FMINNUM:
case AMDGPU::G_FMAXNUM:
case AMDGPU::G_FMINNUM_IEEE:
- case AMDGPU::G_FMAXNUM_IEEE: {
+ case AMDGPU::G_FMAXNUM_IEEE:
+ case AMDGPU::G_FMINIMUM:
+ case AMDGPU::G_FMAXIMUM: {
if (Subtarget->supportsMinMaxDenormModes() ||
// FIXME: denormalsEnabledForType is broken for dynamic
denormalsEnabledForType(MRI.getType(Reg), MF))
@@ -11302,7 +12175,8 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
return false;
return true;
case AMDGPU::G_INTRINSIC:
- switch (MI->getIntrinsicID()) {
+ case AMDGPU::G_INTRINSIC_CONVERGENT:
+ switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
case Intrinsic::amdgcn_fmul_legacy:
case Intrinsic::amdgcn_fmad_ftz:
case Intrinsic::amdgcn_sqrt:
@@ -11321,7 +12195,6 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF,
case Intrinsic::amdgcn_div_fmas:
case Intrinsic::amdgcn_div_fixup:
case Intrinsic::amdgcn_fract:
- case Intrinsic::amdgcn_ldexp:
case Intrinsic::amdgcn_cvt_pkrtz:
case Intrinsic::amdgcn_cubeid:
case Intrinsic::amdgcn_cubema:
@@ -11476,6 +12349,8 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
case ISD::FMAXNUM:
case ISD::FMAXNUM_IEEE:
return AMDGPUISD::FMAX3;
+ case ISD::FMAXIMUM:
+ return AMDGPUISD::FMAXIMUM3;
case ISD::SMAX:
return AMDGPUISD::SMAX3;
case ISD::UMAX:
@@ -11483,6 +12358,8 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
case ISD::FMINNUM:
case ISD::FMINNUM_IEEE:
return AMDGPUISD::FMIN3;
+ case ISD::FMINIMUM:
+ return AMDGPUISD::FMINIMUM3;
case ISD::SMIN:
return AMDGPUISD::SMIN3;
case ISD::UMIN:
@@ -11842,7 +12719,9 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
- case ISD::FMINNUM_IEEE: {
+ case ISD::FMINNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM: {
SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
Vec.getOperand(0), Idx);
SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
@@ -12203,6 +13082,256 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
return Accum;
}
+// Collect the ultimate src of each of the mul node's operands, and confirm
+// each operand provides no more than a single byte (8 bits).
+static std::optional<ByteProvider<SDValue>>
+handleMulOperand(const SDValue &MulOperand) {
+ auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
+ if (!Byte0 || Byte0->isConstantZero()) {
+ return std::nullopt;
+ }
+ auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
+ if (Byte1 && !Byte1->isConstantZero()) {
+ return std::nullopt;
+ }
+ return Byte0;
+}
+
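+// Merge two v_perm byte-select masks. A selector byte of 0x0c means
+// "constant zero"; for every byte position at least one of the two masks is
+// expected to select constant zero (the asserts below check this), so the
+// result takes the live selector where one exists and stays 0x0c only where
+// both sides request zero.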
+static unsigned addPermMasks(unsigned First, unsigned Second) {
+ unsigned FirstCs = First & 0x0c0c0c0c;
+ unsigned SecondCs = Second & 0x0c0c0c0c;
+ unsigned FirstNoCs = First & ~0x0c0c0c0c;
+ unsigned SecondNoCs = Second & ~0x0c0c0c0c;
+
+ assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
+ assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
+ assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
+ assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
+
+ return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
+}
+
+static void placeSources(ByteProvider<SDValue> &Src0,
+ ByteProvider<SDValue> &Src1,
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s,
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s,
+ int Step) {
+
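+  // Step selects which byte of the eventual 32-bit dot4 operand this
+  // Src0/Src1 pair occupies: byte 3 for Step 0 down to byte 0 for Step 3.
+  // Bytes not (yet) used are marked 0x0c (constant zero) in the accumulated
+  // perm masks.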
+ assert(Src0.Src.has_value() && Src1.Src.has_value());
+ // Src0s and Src1s are empty, just place arbitrarily.
+ if (Step == 0) {
+ Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c});
+ Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c});
+ return;
+ }
+
+ for (int BPI = 0; BPI < 2; BPI++) {
+ std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
+ if (BPI == 1) {
+ BPP = {Src1, Src0};
+ }
+ unsigned ZeroMask = 0x0c0c0c0c;
+ unsigned FMask = 0xFF << (8 * (3 - Step));
+
+ unsigned FirstMask =
+ BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+ unsigned SecondMask =
+ BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask);
+    // Attempt to find the Src vector which contains our SDValue; if found,
+    // add our perm mask to the existing one. If we are unable to find a match
+    // for the first SDValue, attempt to find a match for the second.
+ int FirstGroup = -1;
+ for (int I = 0; I < 2; I++) {
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
+ I == 0 ? Src0s : Src1s;
+ auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) {
+ return IterElt.first == *BPP.first.Src;
+ };
+
+ auto Match = llvm::find_if(Srcs, MatchesFirst);
+ if (Match != Srcs.end()) {
+ Match->second = addPermMasks(FirstMask, Match->second);
+ FirstGroup = I;
+ break;
+ }
+ }
+ if (FirstGroup != -1) {
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs =
+ FirstGroup == 1 ? Src0s : Src1s;
+ auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) {
+ return IterElt.first == *BPP.second.Src;
+ };
+ auto Match = llvm::find_if(Srcs, MatchesSecond);
+ if (Match != Srcs.end()) {
+ Match->second = addPermMasks(SecondMask, Match->second);
+ } else
+ Srcs.push_back({*BPP.second.Src, SecondMask});
+ return;
+ }
+ }
+
+ // If we have made it here, then we could not find a match in Src0s or Src1s
+ // for either Src0 or Src1, so just place them arbitrarily.
+
+ unsigned ZeroMask = 0x0c0c0c0c;
+ unsigned FMask = 0xFF << (8 * (3 - Step));
+
+ Src0s.push_back(
+ {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+ Src1s.push_back(
+ {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))});
+
+ return;
+}
+
+static SDValue
+resolveSources(SelectionDAG &DAG, SDLoc SL,
+ SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
+ bool IsSigned, bool IsAny) {
+
+  // If we have just one source, permute it accordingly.
+ if (Srcs.size() == 1) {
+ auto Elt = Srcs.begin();
+ auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32);
+
+ // v_perm will produce the original value.
+ if (Elt->second == 0x3020100)
+ return EltVal;
+
+ return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
+ DAG.getConstant(Elt->second, SL, MVT::i32));
+ }
+
+ auto FirstElt = Srcs.begin();
+ auto SecondElt = std::next(FirstElt);
+
+ SmallVector<SDValue, 2> Perms;
+
+  // If we have multiple sources in the chain, combine them via perms (using
+  // the calculated perm masks) and ORs.
+ while (true) {
+ auto FirstMask = FirstElt->second;
+ auto SecondMask = SecondElt->second;
+
+ unsigned FirstCs = FirstMask & 0x0c0c0c0c;
+ unsigned FirstPlusFour = FirstMask | 0x04040404;
+    // Live selectors are in the range 0..3, so ORing with 0x04 is the same as
+    // adding 4, which redirects them to the first operand of the PERM, while
+    // 0x0c (constant zero) bytes are kept intact via FirstCs.
+ FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
+
+ auto PermMask = addPermMasks(FirstMask, SecondMask);
+ auto FirstVal =
+ DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+ auto SecondVal =
+ DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32);
+
+ Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
+ SecondVal,
+ DAG.getConstant(PermMask, SL, MVT::i32)));
+
+ FirstElt = std::next(SecondElt);
+ if (FirstElt == Srcs.end())
+ break;
+
+ SecondElt = std::next(FirstElt);
+ // If we only have a FirstElt, then just combine that into the cumulative
+ // source node.
+ if (SecondElt == Srcs.end()) {
+ auto EltVal =
+ DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32);
+
+ Perms.push_back(
+ DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal,
+ DAG.getConstant(FirstElt->second, SL, MVT::i32)));
+ break;
+ }
+ }
+
+ assert(Perms.size() == 1 || Perms.size() == 2);
+ return Perms.size() == 2
+ ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
+ : Perms[0];
+}
+
+static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs,
+ unsigned ChainLength) {
+ for (auto &[EntryVal, EntryMask] : Srcs) {
+ EntryMask = EntryMask >> ((4 - ChainLength) * 8);
+ auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
+ EntryMask += ZeroMask;
+ }
+}
+
+static bool isMul(const SDValue Op) {
+ auto Opcode = Op.getOpcode();
+
+ return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
+ Opcode == AMDGPUISD::MUL_I24);
+}
+
+static std::optional<bool>
+checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
+ ByteProvider<SDValue> &Src1, const SDValue &S0Op,
+ const SDValue &S1Op, const SelectionDAG &DAG) {
+  // If both ops are i8s (pre legalize-dag), then the signedness semantics
+  // of the dot4 are irrelevant.
+ if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
+ return false;
+
+ auto Known0 = DAG.computeKnownBits(S0Op, 0);
+ bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
+ bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
+ auto Known1 = DAG.computeKnownBits(S1Op, 0);
+ bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
+ bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
+
+ assert(!(S0IsUnsigned && S0IsSigned));
+ assert(!(S1IsUnsigned && S1IsSigned));
+
+ // There are 9 possible permutations of
+ // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
+
+ // In two permutations, the sign bits are known to be the same for both Ops,
+ // so simply return Signed / Unsigned corresponding to the MSB
+
+ if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
+ return S0IsSigned;
+
+ // In another two permutations, the sign bits are known to be opposite. In
+ // this case return std::nullopt to indicate a bad match.
+
+ if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
+ return std::nullopt;
+
+  // In the remaining five permutations, we don't know the value of the sign
+  // bit for at least one Op. Since we have a valid ByteProvider, we know that
+  // the upper bits must be extension bits. Thus, the only way for the sign
+  // bit to be unknown is if it was sign extended from an unknown value, or if
+  // it was any extended. In either case, it is correct to use the signed
+  // version of dot4.
+
+  // In two such permutations, we know the sign bit is set for one op and
+  // unknown for the other. It is okay to use the signed version of dot4.
+ if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
+ ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
+ return true;
+
+ // In one such permutation, we don't know either of the sign bits. It is okay
+  // In one such permutation, we don't know either of the sign bits. It is
+  // okay to use the signed version of dot4.
+ return true;
+
+ // In two such permutations, we know the sign bit is unset for one op and
+ // unknown for the other. Return std::nullopt to indicate a bad match.
+ if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
+ ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
+ return std::nullopt;
+
+ llvm_unreachable("Fully covered condition");
+}
+
SDValue SITargetLowering::performAddCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12216,14 +13345,146 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
if (SDValue Folded = tryFoldToMad64_32(N, DCI))
return Folded;
}
-
- return SDValue();
}
if (SDValue V = reassociateScalarOps(N, DAG)) {
return V;
}
+ if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
+ (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
+ SDValue TempNode(N, 0);
+ std::optional<bool> IsSigned;
+ SmallVector<std::pair<SDValue, unsigned>, 4> Src0s;
+ SmallVector<std::pair<SDValue, unsigned>, 4> Src1s;
+ SmallVector<SDValue, 4> Src2s;
+
+ // Match the v_dot4 tree, while collecting src nodes.
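+ // Each iteration peels one multiply off the add chain: its operands become
+ // byte sources for the dot, and the remaining addend is recorded in Src2s
+ // and becomes the next node to inspect.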
+ int ChainLength = 0;
+ for (int I = 0; I < 4; I++) {
+ auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
+ if (MulIdx == -1)
+ break;
+ auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
+ if (!Src0)
+ break;
+ auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
+ if (!Src1)
+ break;
+
+ auto IterIsSigned = checkDot4MulSignedness(
+ TempNode->getOperand(MulIdx), *Src0, *Src1,
+ TempNode->getOperand(MulIdx)->getOperand(0),
+ TempNode->getOperand(MulIdx)->getOperand(1), DAG);
+ if (!IterIsSigned)
+ break;
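+ // Every multiply in the chain must agree on signedness, since a single
+ // sdot4/udot4 intrinsic covers the whole chain.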
+ if (!IsSigned)
+ IsSigned = *IterIsSigned;
+ if (*IterIsSigned != *IsSigned)
+ break;
+ placeSources(*Src0, *Src1, Src0s, Src1s, I);
+ auto AddIdx = 1 - MulIdx;
+ // Allow the special case where add (add (mul24, 0), mul24) has already been
+ // folded into add (mul24, mul24).
+ if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
+ Src2s.push_back(TempNode->getOperand(AddIdx));
+ auto Src0 =
+ handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
+ if (!Src0)
+ break;
+ auto Src1 =
+ handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
+ if (!Src1)
+ break;
+ auto IterIsSigned = checkDot4MulSignedness(
+ TempNode->getOperand(AddIdx), *Src0, *Src1,
+ TempNode->getOperand(AddIdx)->getOperand(0),
+ TempNode->getOperand(AddIdx)->getOperand(1), DAG);
+ if (!IterIsSigned)
+ break;
+ assert(IsSigned);
+ if (*IterIsSigned != *IsSigned)
+ break;
+ placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
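+ // There is no accumulator left in this shape, so use a zero constant as the
+ // dot's src2.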
+ Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
+ ChainLength = I + 2;
+ break;
+ }
+
+ TempNode = TempNode->getOperand(AddIdx);
+ Src2s.push_back(TempNode);
+ ChainLength = I + 1;
+ if (TempNode->getNumOperands() < 2)
+ break;
+ LHS = TempNode->getOperand(0);
+ RHS = TempNode->getOperand(1);
+ }
+
+ if (ChainLength < 2)
+ return SDValue();
+
+ // Masks were constructed with the assumption that we would find a chain of
+ // length 4. If not, then we need to zero out the unused most-significant
+ // bytes (via the perm selector 0x0c) so they do not affect the dot
+ // calculation.
+ if (ChainLength < 4) {
+ fixMasks(Src0s, ChainLength);
+ fixMasks(Src1s, ChainLength);
+ }
+
+ SDValue Src0, Src1;
+
+ // If we are just using a single source for both, and have permuted the
+ // bytes consistently, we can just use the sources without permuting
+ // (commutation).
+ bool UseOriginalSrc = false;
+ if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
+ Src0s.begin()->second == Src1s.begin()->second &&
+ Src0s.begin()->first.getValueSizeInBits() == 32 &&
+ Src1s.begin()->first.getValueSizeInBits() == 32) {
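+ // Identical selectors with four distinct bytes just permute the lanes of a
+ // single 32-bit source; dot4 sums over all lanes, so the original sources
+ // can be used unpermuted.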
+ SmallVector<unsigned, 4> SrcBytes;
+ auto Src0Mask = Src0s.begin()->second;
+ SrcBytes.push_back(Src0Mask & 0xFF000000);
+ bool UniqueEntries = true;
+ for (auto I = 1; I < 4; I++) {
+ auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
+
+ if (is_contained(SrcBytes, NextByte)) {
+ UniqueEntries = false;
+ break;
+ }
+ SrcBytes.push_back(NextByte);
+ }
+
+ if (UniqueEntries) {
+ UseOriginalSrc = true;
+ // Must be 32 bits to enter the conditional above.
+ assert(Src0s.begin()->first.getValueSizeInBits() == 32);
+ assert(Src1s.begin()->first.getValueSizeInBits() == 32);
+ Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first);
+ Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first);
+ }
+ }
+
+ if (!UseOriginalSrc) {
+ Src0 = resolveSources(DAG, SL, Src0s, false, true);
+ Src1 = resolveSources(DAG, SL, Src1s, false, true);
+ }
+
+ assert(IsSigned);
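+ // The last collected addend (or the zero constant from the special case) is
+ // the accumulator; bring it to i32 for the intrinsic.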
+ SDValue Src2 =
+ DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
+
+ SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
+ : Intrinsic::amdgcn_udot4,
+ SL, MVT::i64);
+
+ assert(!VT.isVector());
+ auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
+ Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
+
+ return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
+ }
+
if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
return SDValue();
@@ -12295,8 +13556,7 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
if (LHS.getOpcode() == ISD::USUBO_CARRY) {
// sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
- auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
- if (!C || !C->isZero())
+ if (!isNullConstant(LHS.getOperand(1)))
return SDValue();
SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
@@ -12417,6 +13677,41 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N,
return SDValue();
}
+SDValue SITargetLowering::performFDivCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+ SDLoc SL(N);
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::f16 || !Subtarget->has16BitInsts())
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ SDNodeFlags Flags = N->getFlags();
+ SDNodeFlags RHSFlags = RHS->getFlags();
+ if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
+ !RHS->hasOneUse())
+ return SDValue();
+
+ if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
+ bool IsNegative = false;
+ if (CLHS->isExactlyValue(1.0) ||
+ (IsNegative = CLHS->isExactlyValue(-1.0))) {
+ // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
+ // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
+ if (RHS.getOpcode() == ISD::FSQRT) {
+ // TODO: Or in RHS flags, somehow missing from SDNodeFlags
+ SDValue Rsq =
+ DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
+ return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
+ }
+ }
+ }
+
+ return SDValue();
+}
+
SDValue SITargetLowering::performFMACombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12666,7 +13961,7 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
- if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+ if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
return SDValue();
switch (N->getOpcode()) {
case ISD::ADD:
@@ -12680,12 +13975,16 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performFAddCombine(N, DCI);
case ISD::FSUB:
return performFSubCombine(N, DCI);
+ case ISD::FDIV:
+ return performFDivCombine(N, DCI);
case ISD::SETCC:
return performSetCCCombine(N, DCI);
case ISD::FMAXNUM:
case ISD::FMINNUM:
case ISD::FMAXNUM_IEEE:
case ISD::FMINNUM_IEEE:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
case ISD::SMAX:
case ISD::SMIN:
case ISD::UMAX:
@@ -12699,6 +13998,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
return performAndCombine(N, DCI);
case ISD::OR:
return performOrCombine(N, DCI);
+ case ISD::FSHR: {
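+ // Try to rewrite a divergent 32-bit funnel shift as a V_PERM_B32 byte
+ // permute when that instruction is available.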
+ const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+ if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
+ TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
+ return matchPERM(N, DCI);
+ }
+ break;
+ }
case ISD::XOR:
return performXorCombine(N, DCI);
case ISD::ZERO_EXTEND:
@@ -12793,7 +14100,7 @@ static unsigned SubIdx2Lane(unsigned Idx) {
}
}
-/// Adjust the writemask of MIMG instructions
+/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
SelectionDAG &DAG) const {
unsigned Opcode = Node->getMachineOpcode();
@@ -12811,7 +14118,7 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
bool UsesTFC = ((int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
- Node->getConstantOperandVal(LWEIdx))
+ (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx)))
? true
: false;
unsigned TFCLane = 0;
@@ -12943,7 +14250,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
continue;
} else {
SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
- DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
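+ // UpdateNodeOperands may CSE to an existing equivalent node; if so, redirect
+ // all uses to it and delete the now-dead original user.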
+ SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
+ if (NewUser != User) {
+ DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
+ DAG.RemoveDeadNode(User);
+ }
}
switch (Idx) {
@@ -13019,7 +14330,7 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
unsigned Opcode = Node->getMachineOpcode();
- if (TII->isMIMG(Opcode) && !TII->get(Opcode).mayStore() &&
+ if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
!TII->isGather4(Opcode) &&
AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
return adjustWritemask(Node, DAG);
@@ -13106,7 +14417,7 @@ void SITargetLowering::AddIMGInit(MachineInstr &MI) const {
return;
unsigned TFEVal = TFE ? TFE->getImm() : 0;
- unsigned LWEVal = LWE->getImm();
+ unsigned LWEVal = LWE ? LWE->getImm() : 0;
unsigned D16Val = D16 ? D16->getImm() : 0;
if (!TFEVal && !LWEVal)
@@ -13183,7 +14494,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const {
const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
- MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
if (TII->isVOP3(MI.getOpcode())) {
// Make sure constant bus requirements are respected.
@@ -13194,11 +14507,16 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
// use between vgpr and agpr as agpr tuples tend to be big.
if (!MI.getDesc().operands().empty()) {
unsigned Opc = MI.getOpcode();
+ bool HasAGPRs = Info->mayNeedAGPRs();
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
- for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
- AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) {
+ int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+ for (auto I :
+ {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0),
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) {
if (I == -1)
break;
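+ // If AGPRs may be needed, leave src2 alone here; it is resolved to an AGPR
+ // further below.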
+ if ((I == Src2Idx) && (HasAGPRs))
+ break;
MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() || !Op.getReg().isVirtual())
continue;
@@ -13216,6 +14534,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
MRI.setRegClass(Op.getReg(), NewRC);
}
+ if (!HasAGPRs)
+ return;
+
// Resolve the rest of AV operands to AGPRs.
if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) {
if (Src2->isReg() && Src2->getReg().isVirtual()) {
@@ -13233,7 +14554,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
return;
}
- if (TII->isMIMG(MI)) {
+ if (TII->isImage(MI)) {
if (!MI.mayStore())
AddIMGInit(MI);
TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
@@ -13377,7 +14698,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
return std::pair(0U, RC);
}
- if (Constraint.startswith("{") && Constraint.endswith("}")) {
+ if (Constraint.starts_with("{") && Constraint.ends_with("}")) {
StringRef RegName(Constraint.data() + 1, Constraint.size() - 2);
if (RegName.consume_front("v")) {
RC = &AMDGPU::VGPR_32RegClass;
@@ -13467,7 +14788,7 @@ static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
}
void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
- std::string &Constraint,
+ StringRef Constraint,
std::vector<SDValue> &Ops,
SelectionDAG &DAG) const {
if (isImmConstraint(Constraint)) {
@@ -13516,8 +14837,7 @@ bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
return false;
}
-bool SITargetLowering::checkAsmConstraintVal(SDValue Op,
- const std::string &Constraint,
+bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
uint64_t Val) const {
if (Constraint.size() == 1) {
switch (Constraint[0]) {
@@ -13735,8 +15055,9 @@ void SITargetLowering::computeKnownBitsForTargetInstr(
const MachineRegisterInfo &MRI, unsigned Depth) const {
const MachineInstr *MI = MRI.getVRegDef(R);
switch (MI->getOpcode()) {
- case AMDGPU::G_INTRINSIC: {
- switch (MI->getIntrinsicID()) {
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_CONVERGENT: {
+ switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
case Intrinsic::amdgcn_workitem_id_x:
knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0);
break;
@@ -13801,21 +15122,16 @@ Align SITargetLowering::computeKnownAlignForTargetInstr(
GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
unsigned Depth) const {
const MachineInstr *MI = MRI.getVRegDef(R);
- switch (MI->getOpcode()) {
- case AMDGPU::G_INTRINSIC:
- case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
+ if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
// FIXME: Can this move to generic code? What about the case where the call
// site specifies a lower alignment?
- Intrinsic::ID IID = MI->getIntrinsicID();
+ Intrinsic::ID IID = GI->getIntrinsicID();
LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
if (MaybeAlign RetAlign = Attrs.getRetAlignment())
return *RetAlign;
- return Align(1);
- }
- default:
- return Align(1);
}
+ return Align(1);
}
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {