Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 306
 1 file changed, 227 insertions(+), 79 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d98acfc6c532..519c5b936536 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,10 +19,12 @@
#include "SIRegisterInfo.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -465,11 +467,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
if (!Subtarget->hasBCNT(64))
setOperationAction(ISD::CTPOP, MVT::i64, Expand);
- if (Subtarget->hasFFBH())
+ if (Subtarget->hasFFBH()) {
+ setOperationAction(ISD::CTLZ, MVT::i32, Custom);
setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+ }
- if (Subtarget->hasFFBL())
+ if (Subtarget->hasFFBL()) {
+ setOperationAction(ISD::CTTZ, MVT::i32, Custom);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+ }
// We only really have 32-bit BFE instructions (and 16-bit on VI).
//
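
[Editor's sketch: illustrative, not part of the patch] Making ISD::CTLZ and ISD::CTTZ Custom alongside the *_ZERO_UNDEF variants means the lowering now also handles the zero-input case itself: ffbh/ffbl implement the zero-undef forms. A minimal scalar sketch of the contract being preserved (__builtin_clz/__builtin_ctz are GCC/Clang builtins that are themselves undefined at zero, hence the guard):

    #include <cstdint>

    uint32_t ctlz32(uint32_t X) {
      return X == 0 ? 32u : static_cast<uint32_t>(__builtin_clz(X));
    }
    uint32_t cttz32(uint32_t X) {
      return X == 0 ? 32u : static_cast<uint32_t>(__builtin_ctz(X));
    }
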
@@ -1061,7 +1067,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
AMDGPU::lookupRsrcIntrinsic(IntrID)) {
AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
(Intrinsic::ID)IntrID);
- if (Attr.hasFnAttribute(Attribute::ReadNone))
+ if (Attr.hasFnAttr(Attribute::ReadNone))
return false;
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -1076,7 +1082,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
}
Info.flags = MachineMemOperand::MODereferenceable;
- if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
+ if (Attr.hasFnAttr(Attribute::ReadOnly)) {
unsigned DMaskLanes = 4;
if (RsrcIntr->IsImage) {
@@ -1100,7 +1106,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// FIXME: What does alignment mean for an image?
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.flags |= MachineMemOperand::MOLoad;
- } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
+ } else if (Attr.hasFnAttr(Attribute::WriteOnly)) {
Info.opc = ISD::INTRINSIC_VOID;
Type *DataTy = CI.getArgOperand(0)->getType();
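
[Editor's sketch: illustrative, not part of the patch] The three renamed queries decide how a buffer/image intrinsic is classified. A simplified model of the mapping visible in this routine (the read-write case and the DMask handling are omitted):

    enum class MemBehavior { ReadNone, ReadOnly, WriteOnly };
    enum class Lowering { NotMemIntrinsic, IntrinsicWChain, IntrinsicVoid };

    Lowering classifyRsrcIntrinsic(MemBehavior B) {
      switch (B) {
      case MemBehavior::ReadNone:  return Lowering::NotMemIntrinsic;
      case MemBehavior::ReadOnly:  return Lowering::IntrinsicWChain; // MOLoad
      case MemBehavior::WriteOnly: return Lowering::IntrinsicVoid;   // store
      }
      return Lowering::NotMemIntrinsic; // unreachable; silences warnings
    }
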
@@ -1423,7 +1429,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
}
bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
- const SelectionDAG &DAG) const {
+ const MachineFunction &MF) const {
if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
return (MemVT.getSizeInBits() <= 4 * 32);
} else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
@@ -1657,12 +1663,17 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
const ArgDescriptor *InputPtrReg;
const TargetRegisterClass *RC;
LLT ArgTy;
+ MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
std::tie(InputPtrReg, RC, ArgTy) =
Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ // We may not have the kernarg segment argument if we have no kernel
+ // arguments.
+ if (!InputPtrReg)
+ return DAG.getConstant(0, SL, PtrVT);
+
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
@@ -1808,6 +1819,19 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
LLT Ty;
std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
+ if (!Reg) {
+ if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
+ // It's possible for a kernarg intrinsic call to appear in a kernel with
+ // no allocated segment, in which case we do not add the user sgpr
+ // argument, so just return null.
+ return DAG.getConstant(0, SDLoc(), VT);
+ }
+
+ // It's undefined behavior if a function marked with the amdgpu-no-*
+ // attributes uses the corresponding intrinsic.
+ return DAG.getUNDEF(VT);
+ }
+
return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
}
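
[Editor's sketch: illustrative, not part of the patch] Together with the lowerKernArgParameterPtr hunk above, the fallback policy for a missing preloaded register is: the kernarg segment pointer folds to null (a kernel without arguments has no segment), while anything else becomes undef because the amdgpu-no-* attribute promised it is unused. A standalone model, with std::nullopt standing in for getUNDEF:

    #include <cstdint>
    #include <optional>

    enum class Preload { KernargSegmentPtr, DispatchPtr, QueuePtr /* ... */ };

    std::optional<uint64_t> lowerMissingPreload(Preload P) {
      if (P == Preload::KernargSegmentPtr)
        return UINT64_C(0); // no kernel arguments => no segment => null
      return std::nullopt;  // amdgpu-no-* was set; using this is UB
    }
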
@@ -2023,31 +2047,33 @@ void SITargetLowering::allocateSpecialInputSGPRs(
SIMachineFunctionInfo &Info) const {
auto &ArgInfo = Info.getArgInfo();
- // TODO: Unify handling with private memory pointers.
+ // We need to allocate these in place regardless of their use.
+ const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI;
- if (Info.hasDispatchPtr())
+ // TODO: Unify handling with private memory pointers.
+ if (IsFixed || Info.hasDispatchPtr())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
- if (Info.hasQueuePtr())
+ if (IsFixed || Info.hasQueuePtr())
allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
// Implicit arg ptr takes the place of the kernarg segment pointer. This is a
// constant offset from the kernarg segment.
- if (Info.hasImplicitArgPtr())
+ if (IsFixed || Info.hasImplicitArgPtr())
allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
- if (Info.hasDispatchID())
+ if (IsFixed || Info.hasDispatchID())
allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
// flat_scratch_init is not applicable for non-kernel functions.
- if (Info.hasWorkGroupIDX())
+ if (IsFixed || Info.hasWorkGroupIDX())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
- if (Info.hasWorkGroupIDY())
+ if (IsFixed || Info.hasWorkGroupIDY())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
- if (Info.hasWorkGroupIDZ())
+ if (IsFixed || Info.hasWorkGroupIDZ())
allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
}
@@ -2590,9 +2616,12 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue ReturnAddrReg = CreateLiveInRegister(
DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
- SDValue ReturnAddrVirtualReg = DAG.getRegister(
- MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
- MVT::i64);
+ SDValue ReturnAddrVirtualReg =
+ DAG.getRegister(MF.getRegInfo().createVirtualRegister(
+ CallConv != CallingConv::AMDGPU_Gfx
+ ? &AMDGPU::CCR_SGPR_64RegClass
+ : &AMDGPU::Gfx_CCR_SGPR_64RegClass),
+ MVT::i64);
Chain =
DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
Flag = Chain.getValue(1);
@@ -2655,8 +2684,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(Flag);
unsigned Opc = AMDGPUISD::ENDPGM;
- if (!IsWaveEnd)
- Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
+ if (!IsWaveEnd) {
+ if (IsShader)
+ Opc = AMDGPUISD::RETURN_TO_EPILOG;
+ else if (CallConv == CallingConv::AMDGPU_Gfx)
+ Opc = AMDGPUISD::RET_GFX_FLAG;
+ else
+ Opc = AMDGPUISD::RET_FLAG;
+ }
+
return DAG.getNode(Opc, DL, MVT::Other, RetOps);
}
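
[Editor's sketch: illustrative, not part of the patch] Combined with the register-class change above, the return lowering now distinguishes the amdgpu_gfx calling convention end to end. The opcode choice, restated compactly (enumerators stand in for the AMDGPUISD opcodes):

    enum class RetOpc { EndPgm, ReturnToEpilog, RetGfxFlag, RetFlag };

    RetOpc pickReturnOpc(bool IsWaveEnd, bool IsShader, bool IsGfxCallConv) {
      if (IsWaveEnd)
        return RetOpc::EndPgm;         // terminates the wave
      if (IsShader)
        return RetOpc::ReturnToEpilog; // shader parts return via the epilog
      return IsGfxCallConv ? RetOpc::RetGfxFlag : RetOpc::RetFlag;
    }
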
@@ -2747,21 +2783,28 @@ void SITargetLowering::passSpecialInputs(
// TODO: Unify with private memory register handling. This is complicated by
// the fact that at least in kernels, the input argument is not necessarily
// in the same location as the input.
- AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
- AMDGPUFunctionArgInfo::DISPATCH_PTR,
- AMDGPUFunctionArgInfo::QUEUE_PTR,
- AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
- AMDGPUFunctionArgInfo::DISPATCH_ID,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
+ static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
+ StringLiteral> ImplicitAttrs[] = {
+ {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
+ {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
+ {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
+ {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
+ {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
+ {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
+ {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"}
};
- for (auto InputID : InputRegs) {
+ for (auto Attr : ImplicitAttrs) {
const ArgDescriptor *OutgoingArg;
const TargetRegisterClass *ArgRC;
LLT ArgTy;
+ AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
+
+ // If the callee is known not to use this input value, skip copying it.
+ if (CLI.CB->hasFnAttr(Attr.second))
+ continue;
+
std::tie(OutgoingArg, ArgRC, ArgTy) =
CalleeArgInfo->getPreloadedValue(InputID);
if (!OutgoingArg)
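
[Editor's sketch: illustrative, not part of the patch] The table pairs each implicit input with the amdgpu-no-* attribute that proves the callee never reads it; if the attribute is present at the call, the copy is skipped entirely. A self-contained model with the attribute lookup abstracted as a callback:

    #include <functional>
    #include <string_view>
    #include <utility>
    #include <vector>

    using HasFnAttr = std::function<bool(std::string_view)>;

    // Returns the implicit-input IDs that must still be passed to the callee.
    std::vector<int> inputsToPass(
        const std::vector<std::pair<int, std::string_view>> &ImplicitAttrs,
        const HasFnAttr &CallHasAttr) {
      std::vector<int> Needed;
      for (const auto &[InputID, NoAttr] : ImplicitAttrs)
        if (!CallHasAttr(NoAttr)) // attribute present => provably unused
          Needed.push_back(InputID);
      return Needed;
    }
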
@@ -2780,11 +2823,14 @@ void SITargetLowering::passSpecialInputs(
if (IncomingArg) {
InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
- } else {
+ } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
// The implicit arg ptr is special because it doesn't have a corresponding
// input for kernels, and is computed from the kernarg segment pointer.
- assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
InputReg = getImplicitArgPtr(DAG, DL);
+ } else {
+ // We may have proven the input wasn't needed, although the ABI still
+ // requires it. We just need to allocate the register appropriately.
+ InputReg = DAG.getUNDEF(ArgVT);
}
if (OutgoingArg->isRegister()) {
@@ -2827,11 +2873,17 @@ void SITargetLowering::passSpecialInputs(
SDValue InputReg;
SDLoc SL;
+ const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
+ const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
+ const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
+
// If incoming ids are not packed we need to pack them.
- if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX)
+ if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
+ NeedWorkItemIDX)
InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
- if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
+ if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
+ NeedWorkItemIDY) {
SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
DAG.getShiftAmountConstant(10, MVT::i32, SL));
@@ -2839,7 +2891,8 @@ void SITargetLowering::passSpecialInputs(
DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
}
- if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
+ if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
+ NeedWorkItemIDZ) {
SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
DAG.getShiftAmountConstant(20, MVT::i32, SL));
@@ -2847,7 +2900,7 @@ void SITargetLowering::passSpecialInputs(
DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
}
- if (!InputReg.getNode()) {
+ if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
// Workitem ids are already packed; any of the present incoming arguments
// will carry all required fields.
ArgDescriptor IncomingArg = ArgDescriptor::createArg(
@@ -2858,13 +2911,17 @@ void SITargetLowering::passSpecialInputs(
}
if (OutgoingArg->isRegister()) {
- RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ if (InputReg)
+ RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+
CCInfo.AllocateReg(OutgoingArg->getRegister());
} else {
unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
- SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
- SpecialArgOffset);
- MemOpChains.push_back(ArgStore);
+ if (InputReg) {
+ SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+ SpecialArgOffset);
+ MemOpChains.push_back(ArgStore);
+ }
}
}
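
[Editor's sketch: illustrative, not part of the patch] The 10- and 20-bit shifts imply the packed workitem-ID layout: x in bits [9:0], y in [19:10], z in [29:20], ten bits per dimension. Standalone packing/unpacking (the masks are added here for safety; the lowering relies on the IDs already fitting):

    #include <cstdint>

    uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
      return (X & 0x3ffu) | ((Y & 0x3ffu) << 10) | ((Z & 0x3ffu) << 20);
    }

    uint32_t unpackWorkItemIDY(uint32_t Packed) {
      return (Packed >> 10) & 0x3ffu;
    }
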
@@ -4091,7 +4148,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
- if (TRI->getRegSizeInBits(*Src2RC) == 64) {
+ unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
+ assert(WaveSize == 64 || WaveSize == 32);
+
+ if (WaveSize == 64) {
if (ST.hasScalarCompareEq64()) {
BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
.addReg(Src2.getReg())
@@ -4121,8 +4181,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
- BuildMI(*BB, MII, DL, TII->get(AMDGPU::COPY), CarryDest.getReg())
- .addReg(AMDGPU::SCC);
+ unsigned SelOpc =
+ (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+
+ BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
+ .addImm(-1)
+ .addImm(0);
+
MI.eraseFromParent();
return BB;
}
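
[Editor's sketch: illustrative, not part of the patch] Replacing the COPY from SCC with S_CSELECT_B32/B64 of -1 and 0 materializes a full wave-sized carry mask rather than copying a 1-bit condition into a wider register. What the select computes, in plain C++:

    #include <cstdint>

    uint64_t carryMask(bool SCC, unsigned WaveSize) {
      const uint64_t AllOnes = (WaveSize == 64) ? ~UINT64_C(0) : 0xffffffffu;
      return SCC ? AllOnes : 0; // S_CSELECT dest = SCC ? -1 : 0
    }
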
@@ -4261,6 +4326,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MI.eraseFromParent();
return BB;
}
+ case AMDGPU::V_ADDC_U32_e32:
+ case AMDGPU::V_SUBB_U32_e32:
+ case AMDGPU::V_SUBBREV_U32_e32:
+ // These instructions have an implicit use of vcc which counts towards the
+ // constant bus limit.
+ TII->legalizeOperands(MI);
+ return BB;
case AMDGPU::DS_GWS_INIT:
case AMDGPU::DS_GWS_SEMA_BR:
case AMDGPU::DS_GWS_BARRIER:
@@ -4818,7 +4890,7 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
}
if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
// (ballot 0) -> 0
- if (Arg->isNullValue())
+ if (Arg->isZero())
return DAG.getConstant(0, SL, VT);
// (ballot 1) -> EXEC/EXEC_LO
@@ -5266,9 +5338,18 @@ SDValue SITargetLowering::lowerTrapHsaQueuePtr(
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Register UserSGPR = Info->getQueuePtrUserSGPR();
- assert(UserSGPR != AMDGPU::NoRegister);
- SDValue QueuePtr = CreateLiveInRegister(
- DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+
+ SDValue QueuePtr;
+ if (UserSGPR == AMDGPU::NoRegister) {
+ // We are probably in a function incorrectly marked with
+ // amdgpu-no-queue-ptr. This is undefined behavior; rather than delete the
+ // trap, just use a null pointer.
+ QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+ } else {
+ QueuePtr = CreateLiveInRegister(
+ DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+ }
+
SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
QueuePtr, SDValue());
@@ -5345,7 +5426,11 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
Register UserSGPR = Info->getQueuePtrUserSGPR();
- assert(UserSGPR != AMDGPU::NoRegister);
+ if (UserSGPR == AMDGPU::NoRegister) {
+ // We are probably in a function incorrectly marked with
+ // amdgpu-no-queue-ptr. This is undefined behavior.
+ return DAG.getUNDEF(MVT::i32);
+ }
SDValue QueuePtr = CreateLiveInRegister(
DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
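
[Editor's sketch: illustrative, not part of the patch] Both hunks handle the same situation (a function wrongly carrying amdgpu-no-queue-ptr) with deliberately different fallbacks: the trap path must not disappear, so it degrades to a null queue pointer, while the aperture query may return undef. Condensed, with std::nullopt modeling getUNDEF:

    #include <cstdint>
    #include <optional>

    std::optional<uint64_t> queuePtrFallback(bool ForTrap) {
      if (ForTrap)
        return UINT64_C(0); // keep the trap alive; give it a null queue ptr
      return std::nullopt;  // aperture: the attribute says it is never used
    }
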
@@ -5936,6 +6021,9 @@ static SDValue constructRetValue(SelectionDAG &DAG,
EVT LegalReqRetVT = ReqRetVT;
if (!ReqRetVT.isVector()) {
+ if (!Data.getValueType().isInteger())
+ Data = DAG.getNode(ISD::BITCAST, DL,
+ Data.getValueType().changeTypeToInteger(), Data);
Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
} else {
// We need to widen the return vector to a legal type
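
[Editor's sketch: illustrative, not part of the patch] ISD::TRUNCATE only operates on integers, so a floating-point return payload must first be reinterpreted as an integer of the same width; that is all the added bitcast does. Scalar equivalent:

    #include <cstdint>
    #include <cstring>

    uint16_t truncateF32Payload(float Data) {
      uint32_t Bits;
      std::memcpy(&Bits, &Data, sizeof(Bits)); // the ISD::BITCAST step
      return static_cast<uint16_t>(Bits);      // the ISD::TRUNCATE step
    }
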
@@ -6124,7 +6212,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
if (MIPMappingInfo) {
if (auto *ConstantLod = dyn_cast<ConstantSDNode>(
Op.getOperand(ArgOffset + Intr->MipIndex))) {
- if (ConstantLod->isNullValue()) {
+ if (ConstantLod->isZero()) {
IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
VAddrEnd--; // remove 'mip'
}
@@ -6659,7 +6747,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// intrinsic has the numerator as the first operand to match a normal
// division operation.
- SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+ SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
Denominator, Numerator);
@@ -6793,7 +6881,7 @@ static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset,
}
if (VIndex && (!isa<ConstantSDNode>(VIndex) ||
- !cast<ConstantSDNode>(VIndex)->isNullValue())) {
+ !cast<ConstantSDNode>(VIndex)->isZero())) {
// The strided index component of the address is not known to be zero, so we
// cannot represent it in the MMO. Give up.
MMO->setValue((Value *)nullptr);
@@ -7341,7 +7429,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
- SDLoc DL(Op);
MemSDNode *M = cast<MemSDNode>(Op);
SDValue NodePtr = M->getOperand(2);
SDValue RayExtent = M->getOperand(3);
@@ -7360,12 +7447,27 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return SDValue();
}
- bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
- bool Is64 = NodePtr.getValueType() == MVT::i64;
- unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
- : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
- : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
- : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+ const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
+ const bool Is64 = NodePtr.getValueType() == MVT::i64;
+ const unsigned NumVDataDwords = 4;
+ const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
+ const bool UseNSA = Subtarget->hasNSAEncoding() &&
+ NumVAddrDwords <= Subtarget->getNSAMaxSize();
+ const unsigned BaseOpcodes[2][2] = {
+ {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
+ {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
+ AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
+ int Opcode;
+ if (UseNSA) {
+ Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
+ AMDGPU::MIMGEncGfx10NSA, NumVDataDwords,
+ NumVAddrDwords);
+ } else {
+ Opcode = AMDGPU::getMIMGOpcode(
+ BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
+ PowerOf2Ceil(NumVAddrDwords));
+ }
+ assert(Opcode != -1);
SmallVector<SDValue, 16> Ops;
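
[Editor's sketch: illustrative, not part of the patch] Instead of hard-coding NSA opcodes, the intersect_ray lowering now computes the VAddr footprint and asks the MIMG tables for a matching encoding; the non-NSA (Default) encoding takes a single vector operand padded to a power-of-two dword count. The size arithmetic in isolation (powerOf2Ceil reimplemented here; LLVM's lives in llvm/Support/MathExtras.h):

    // a16 packs the f16 ray components two per dword, shrinking the address.
    unsigned numVAddrDwords(bool Is64, bool IsA16) {
      return IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    }

    unsigned powerOf2Ceil(unsigned N) {
      unsigned P = 1;
      while (P < N)
        P <<= 1;
      return P; // 8 -> 8, 9/11/12 -> 16
    }
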
@@ -7405,6 +7507,20 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
packLanes(RayOrigin, true);
packLanes(RayDir, true);
packLanes(RayInvDir, false);
+
+ if (!UseNSA) {
+ // Build a single vector containing all the operands so far prepared.
+ if (NumVAddrDwords > 8) {
+ SDValue Undef = DAG.getUNDEF(MVT::i32);
+ Ops.append(16 - Ops.size(), Undef);
+ }
+ assert(Ops.size() == 8 || Ops.size() == 16);
+ SDValue MergedOps = DAG.getBuildVector(
+ Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops);
+ Ops.clear();
+ Ops.push_back(MergedOps);
+ }
+
Ops.push_back(TDescr);
if (IsA16)
Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
@@ -7610,7 +7726,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
Op.getOperand(0) // Chain
};
- unsigned Opc = Done->isNullValue() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
+ unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
case Intrinsic::amdgcn_s_barrier: {
@@ -8241,6 +8357,16 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Cond = Op.getOperand(0);
+ if (Subtarget->hasScalarCompareEq64() && Op->getOperand(0)->hasOneUse() &&
+ !Op->isDivergent()) {
+ if (VT == MVT::i64)
+ return Op;
+ SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(1));
+ SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(2));
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getSelect(DL, MVT::i64, Cond, LHS, RHS));
+ }
+
SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
SDValue One = DAG.getConstant(1, DL, MVT::i32);
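
[Editor's sketch: illustrative, not part of the patch] The new fast path keeps a uniform (non-divergent) 64-bit select on the scalar unit: a 64-bit type other than i64 is round-tripped through i64 so the existing scalar select patterns apply. C++20 scalar model:

    #include <bit>
    #include <cstdint>

    double selectUniformF64(bool Cond, double A, double B) {
      const uint64_t R = Cond ? std::bit_cast<uint64_t>(A)  // BITCAST in,
                              : std::bit_cast<uint64_t>(B); // select on i64
      return std::bit_cast<double>(R);                      // BITCAST out
    }
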
@@ -9358,7 +9484,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (CRHS) {
if (SDValue Split
- = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
+ = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
+ N->getOperand(0), CRHS))
return Split;
}
@@ -9445,7 +9572,7 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
// fp_class x, 0 -> false
if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
- if (CMask->isNullValue())
+ if (CMask->isZero())
return DAG.getConstant(0, SDLoc(N), MVT::i1);
}
@@ -10348,7 +10475,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
}
- if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
+ if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) {
MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
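
[Editor's sketch: illustrative, not part of the patch] The change from < to <= fixes an off-by-one: numBitsSigned yields the minimum signed width holding the value, and a legal 32-bit operand of mad_i64_i32 may need exactly 32 bits (e.g. INT32_MIN). A standalone check, assuming the GCC/Clang __builtin_clrsbll builtin:

    #include <cassert>
    #include <cstdint>

    unsigned numBitsSigned(int64_t V) {
      // 64 minus the redundant leading sign bits = minimum signed width.
      return 64u - static_cast<unsigned>(__builtin_clrsbll(V));
    }

    int main() {
      assert(numBitsSigned(-1) == 1);                // fits any width
      assert(numBitsSigned(INT32_MIN) == 32);        // exactly 32: accepted
      assert(numBitsSigned(INT64_C(1) << 32) == 34); // too wide: rejected
      return 0;
    }
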
@@ -10434,7 +10561,7 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
if (LHS.getOpcode() == ISD::SUBCARRY) {
// sub (subcarry x, 0, cc), y => subcarry x, y, cc
auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
- if (!C || !C->isNullValue())
+ if (!C || !C->isZero())
return SDValue();
SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
@@ -10657,20 +10784,20 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
// setcc (sext from i1 cc), -1, eq|sle|uge) => cc
// setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
// setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
- if ((CRHS->isAllOnesValue() &&
+ if ((CRHS->isAllOnes() &&
(CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
- (CRHS->isNullValue() &&
+ (CRHS->isZero() &&
(CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
DAG.getConstant(-1, SL, MVT::i1));
- if ((CRHS->isAllOnesValue() &&
+ if ((CRHS->isAllOnes() &&
(CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
- (CRHS->isNullValue() &&
+ (CRHS->isZero() &&
(CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
return LHS.getOperand(0);
}
- uint64_t CRHSVal = CRHS->getZExtValue();
+ const APInt &CRHSVal = CRHS->getAPIntValue();
if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
LHS.getOpcode() == ISD::SELECT &&
isa<ConstantSDNode>(LHS.getOperand(1)) &&
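
[Editor's sketch: illustrative, not part of the patch] The folds in the preceding hunk rest on sext from i1 producing only 0 or -1, so comparing against either constant collapses to the boolean itself or its negation. Worked scalar equivalent of two of the cases:

    #include <cstdint>

    int64_t sextI1(bool CC) { return CC ? -1 : 0; }

    // setcc (sext cc), -1, eq  ==>  cc
    bool foldEqAllOnes(bool CC) { return sextI1(CC) == -1; }
    // setcc (sext cc), 0, eq   ==>  xor cc, -1
    bool foldEqZero(bool CC) { return sextI1(CC) == 0; }
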
@@ -10682,8 +10809,8 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
// setcc (select cc, CT, CF), CF, ne => cc
// setcc (select cc, CT, CF), CT, ne => xor cc, -1
// setcc (select cc, CT, CF), CT, eq => cc
- uint64_t CT = LHS.getConstantOperandVal(1);
- uint64_t CF = LHS.getConstantOperandVal(2);
+ const APInt &CT = LHS.getConstantOperandAPInt(1);
+ const APInt &CF = LHS.getConstantOperandAPInt(2);
if ((CF == CRHSVal && CC == ISD::SETEQ) ||
(CT == CRHSVal && CC == ISD::SETNE))
@@ -10747,7 +10874,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
// cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
// cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
- Shift = DAG.getZExtOrTrunc(Shift.getOperand(0),
+ SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
SDLoc(Shift.getOperand(0)), MVT::i32);
unsigned ShiftOffset = 8 * Offset;
@@ -10758,7 +10885,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
- MVT::f32, Shift);
+ MVT::f32, Shifted);
}
}
}
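
[Editor's sketch: illustrative, not part of the patch] The rename from reusing Shift to a fresh Shifted is the entire fix: the old code clobbered a value it might still need on the no-fold path. The shape of the bug, abstracted with stand-in helpers:

    #include <optional>

    int transform(int V) { return V + 8; }    // stands in for getZExtOrTrunc
    bool profitable(int V) { return V < 32; } // stands in for the offset test

    std::optional<int> combine(int Shift) {
      const int Shifted = transform(Shift); // was written back onto Shift
      if (profitable(Shifted))
        return Shifted;    // fold fires on the transformed value
      return std::nullopt; // no fold: later uses of Shift stay unclobbered
    }
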
@@ -12086,6 +12213,25 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
TargetLowering::AtomicExpansionKind
SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+
+ auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) {
+ OptimizationRemarkEmitter ORE(RMW->getFunction());
+ LLVMContext &Ctx = RMW->getFunction()->getContext();
+ SmallVector<StringRef> SSNs;
+ Ctx.getSyncScopeNames(SSNs);
+ auto MemScope = SSNs[RMW->getSyncScopeID()].empty()
+ ? "system"
+ : SSNs[RMW->getSyncScopeID()];
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
+ << "Hardware instruction generated for atomic "
+ << RMW->getOperationName(RMW->getOperation())
+ << " operation at memory scope " << MemScope
+ << " due to an unsafe request.";
+ });
+ return Kind;
+ };
+
switch (RMW->getOperation()) {
case AtomicRMWInst::FAdd: {
Type *Ty = RMW->getType();
@@ -12120,28 +12266,30 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
return AtomicExpansionKind::CmpXChg;
- return AtomicExpansionKind::None;
+ return ReportUnsafeHWInst(AtomicExpansionKind::None);
}
if (AS == AMDGPUAS::FLAT_ADDRESS)
return AtomicExpansionKind::CmpXChg;
- return RMW->use_empty() ? AtomicExpansionKind::None
+ return RMW->use_empty() ? ReportUnsafeHWInst(AtomicExpansionKind::None)
: AtomicExpansionKind::CmpXChg;
}
// DS FP atomics do respect the denormal mode, but the rounding mode is fixed
// to round-to-nearest-even.
// The only exception is DS_ADD_F64 which never flushes regardless of mode.
- if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
+ if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
if (!Ty->isDoubleTy())
return AtomicExpansionKind::None;
- return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
- RMW->getFunction()
- ->getFnAttribute("amdgpu-unsafe-fp-atomics")
- .getValueAsString() == "true")
- ? AtomicExpansionKind::None
+ if (fpModeMatchesGlobalFPAtomicMode(RMW))
+ return AtomicExpansionKind::None;
+
+ return RMW->getFunction()
+ ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+ .getValueAsString() == "true"
+ ? ReportUnsafeHWInst(AtomicExpansionKind::None)
: AtomicExpansionKind::CmpXChg;
}
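
[Editor's sketch: illustrative, not part of the patch] The restructured LDS floating-point atomic-add decision, flattened into one function (enumerators mirror AtomicExpansionKind; the remark emission is elided):

    enum class Expansion { None, CmpXChg };

    Expansion classifyLDSFPAtomicAdd(bool IsDouble, bool FPModeMatchesGlobal,
                                     bool UnsafeFPAtomicsAttr) {
      if (!IsDouble)
        return Expansion::None;        // f32 DS add honors the denormal mode
      if (FPModeMatchesGlobal)
        return Expansion::None;        // safe: modes agree for DS_ADD_F64
      return UnsafeFPAtomicsAttr
                 ? Expansion::None     // opted in; a remark reports the HW use
                 : Expansion::CmpXChg; // expand to a compare-exchange loop
    }
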