Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp  265
1 file changed, 158 insertions(+), 107 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 56ebf9c06741..e73d87cd66af 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -90,16 +90,21 @@ using namespace llvm;
STATISTIC(NumTailCalls, "Number of tail calls");
-static cl::opt<bool> EnableVGPRIndexMode(
- "amdgpu-vgpr-index-mode",
- cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
- cl::init(false));
-
static cl::opt<bool> DisableLoopAlignment(
"amdgpu-disable-loop-alignment",
cl::desc("Do not align and prefetch loops"),
cl::init(false));
+static bool hasFP32Denormals(const MachineFunction &MF) {
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ return Info->getMode().FP32Denormals;
+}
+
+static bool hasFP64FP16Denormals(const MachineFunction &MF) {
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ return Info->getMode().FP64FP16Denormals;
+}
+
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
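
The two helpers above make the denormal query a per-function property (read from
SIMachineFunctionInfo's mode) instead of a subtarget-wide one. A minimal usage
sketch; only hasFP32Denormals() is from the patch, the surrounding function is
illustrative:

    // Hypothetical caller: is forming v_mad_f32 safe for this function?
    // v_mad_f32 flushes denormals, so it is only usable when the function
    // runs with fp32 denormals disabled.
    static bool madF32IsSafeToForm(const SelectionDAG &DAG) {
      return !hasFP32Denormals(DAG.getMachineFunction());
    }
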
@@ -160,6 +165,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
computeRegisterProperties(Subtarget->getRegisterInfo());
+ // The boolean content concept here is too inflexible. Compares only ever
+ // really produce a 1-bit result. Any copy/extend from these will turn into a
+ // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
+ // it's what most targets use.
+ setBooleanContents(ZeroOrOneBooleanContent);
+ setBooleanVectorContents(ZeroOrOneBooleanContent);
+
// We need to custom lower vector stores from local memory
setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
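
With ZeroOrOneBooleanContent, the DAG may assume a widened i1 compare is exactly
0 or 1. A small sketch of what that buys (not from the patch; DL, X and Y stand
for whatever the caller has in hand):

    // Widening a compare under ZeroOrOneBooleanContent: the resulting i32 is
    // known to be 0 or 1, so later combines can treat it as a single mask bit.
    static SDValue widenCompare(SelectionDAG &DAG, const SDLoc &DL,
                                SDValue X, SDValue Y) {
      SDValue Cmp = DAG.getSetCC(DL, MVT::i1, X, Y, ISD::SETEQ);
      return DAG.getZExtOrTrunc(Cmp, DL, MVT::i32); // lowers to a 0/1 select
    }
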
@@ -358,14 +370,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Custom);
if (Subtarget->has16BitInsts()) {
+ setOperationAction(ISD::FPOW, MVT::f16, Promote);
setOperationAction(ISD::FLOG, MVT::f16, Custom);
setOperationAction(ISD::FEXP, MVT::f16, Custom);
setOperationAction(ISD::FLOG10, MVT::f16, Custom);
}
- // v_mad_f32 does not support denormals according to some sources.
- if (!Subtarget->hasFP32Denormals())
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
+ // v_mad_f32 does not support denormals. We report it as unconditionally
+ // legal, and the context where it is formed will disallow it when fp32
+ // denormals are enabled.
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
if (!Subtarget->hasBFI()) {
// fcopysign can be done in a single instruction with BFI.
@@ -473,8 +487,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
- setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
// F16 - Constant Actions.
setOperationAction(ISD::ConstantFP, MVT::f16, Legal);
@@ -489,6 +501,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
setOperationAction(ISD::FCOS, MVT::f16, Promote);
setOperationAction(ISD::FSIN, MVT::f16, Promote);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);
+
setOperationAction(ISD::FP_TO_SINT, MVT::f16, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
@@ -503,7 +519,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP3 Actions.
setOperationAction(ISD::FMA, MVT::f16, Legal);
- if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
+ if (STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -761,12 +777,13 @@ const GCNSubtarget *SITargetLowering::getSubtarget() const {
//
// There is only one special case, when denormals are enabled, where this is
// OK to use, and we don't currently handle it.
-bool SITargetLowering::isFPExtFoldable(unsigned Opcode,
- EVT DestVT, EVT SrcVT) const {
+bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
+ EVT DestVT, EVT SrcVT) const {
return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
(Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
- DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
- SrcVT.getScalarType() == MVT::f16;
+ DestVT.getScalarType() == MVT::f32 &&
+ SrcVT.getScalarType() == MVT::f16 &&
+ !hasFP32Denormals(DAG.getMachineFunction());
}
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
@@ -1069,23 +1086,18 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
return AM.BaseOffs == 0 && AM.Scale == 0;
}
- // GFX9 added a 13-bit signed offset. When using regular flat instructions,
- // the sign bit is ignored and is treated as a 12-bit unsigned offset.
-
- // GFX10 shrinked signed offset to 12 bits. When using regular flat
- // instructions, the sign bit is also ignored and is treated as 11-bit
- // unsigned offset.
-
- if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
- return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;
-
- // Just r + i
- return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
+ return AM.Scale == 0 &&
+ (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
+ AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS,
+ /*Signed=*/false));
}
bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const {
if (Subtarget->hasFlatGlobalInsts())
- return isInt<13>(AM.BaseOffs) && AM.Scale == 0;
+ return AM.Scale == 0 &&
+ (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
+ AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS,
+ /*Signed=*/true));
if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
// Assume we will use FLAT for all global memory accesses
@@ -1326,13 +1338,6 @@ EVT SITargetLowering::getOptimalMemOpType(
return MVT::Other;
}
-static bool isFlatGlobalAddrSpace(unsigned AS) {
- return AS == AMDGPUAS::GLOBAL_ADDRESS ||
- AS == AMDGPUAS::FLAT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS ||
- AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
-}
-
bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
@@ -1466,9 +1471,7 @@ SDValue SITargetLowering::lowerKernargMemParameter(
const SDLoc &SL, SDValue Chain,
uint64_t Offset, unsigned Align, bool Signed,
const ISD::InputArg *Arg) const {
- Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
- PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
- MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
// Try to avoid using an extload by loading earlier than the argument address,
// and extracting the relevant bits. The load should hopefully be merged with
@@ -2666,9 +2669,7 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
const Function *ParentFn = CI->getParent()->getParent();
if (AMDGPU::isEntryFunctionCC(ParentFn->getCallingConv()))
return false;
-
- auto Attr = ParentFn->getFnAttribute("disable-tail-calls");
- return (Attr.getValueAsString() != "true");
+ return true;
}
// The wave scratch offset register is used as the global base pointer.
@@ -2787,10 +2788,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
MVT PtrVT = MVT::i32;
// Walk the register/memloc assignments, inserting copies/loads.
- for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
- ++i, ++realArgIdx) {
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
- SDValue Arg = OutVals[realArgIdx];
+ SDValue Arg = OutVals[i];
// Promote the value if needed.
switch (VA.getLocInfo()) {
@@ -2830,7 +2830,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
MaybeAlign Alignment;
if (IsTailCall) {
- ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
unsigned OpSize = Flags.isByVal() ?
Flags.getByValSize() : VA.getValVT().getStoreSize();
@@ -2868,8 +2868,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
/*isVol = */ false, /*AlwaysInline = */ true,
/*isTailCall = */ false, DstInfo,
- MachinePointerInfo(UndefValue::get(Type::getInt8PtrTy(
- *DAG.getContext(), AMDGPUAS::PRIVATE_ADDRESS))));
+ MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
MemOpChains.push_back(Cpy);
} else {
@@ -2986,7 +2985,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
IsThisReturn ? OutVals[0] : SDValue());
}
-Register SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
+Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
.Case("m0", AMDGPU::M0)
@@ -3411,7 +3410,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
std::tie(SubReg, Offset)
= computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
- bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
+ const bool UseGPRIdxMode = ST.useVGPRIndexMode();
if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
MachineBasicBlock::iterator I(&MI);
@@ -3506,7 +3505,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
SrcVec->getReg(),
Offset);
- bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
+ const bool UseGPRIdxMode = ST.useVGPRIndexMode();
if (Idx->getReg() == AMDGPU::NoRegister) {
MachineBasicBlock::iterator I(&MI);
@@ -3920,7 +3919,8 @@ MVT SITargetLowering::getScalarShiftAmountTy(const DataLayout &, EVT VT) const {
// however does not support denormals, so we do report fma as faster if we have
// a fast fma device and require denormals.
//
-bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const {
VT = VT.getScalarType();
switch (VT.getSimpleVT().SimpleTy) {
@@ -3929,7 +3929,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
// mad available which returns the same result as the separate operations
// which we should prefer over fma. We can't use this if we want to support
// denormals, so only report this in these cases.
- if (Subtarget->hasFP32Denormals())
+ if (hasFP32Denormals(MF))
return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
// If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
@@ -3938,7 +3938,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
case MVT::f64:
return true;
case MVT::f16:
- return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
+ return Subtarget->has16BitInsts() && hasFP64FP16Denormals(MF);
default:
break;
}
@@ -3946,6 +3946,21 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
return false;
}
+bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG,
+ const SDNode *N) const {
+ // TODO: Check future ftz flag
+ // v_mad_f32/v_mac_f32 do not support denormals.
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::f32)
+ return !hasFP32Denormals(DAG.getMachineFunction());
+ if (VT == MVT::f16) {
+ return Subtarget->hasMadF16() &&
+ !hasFP64FP16Denormals(DAG.getMachineFunction());
+ }
+
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
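
The isFMADLegalForFAddFSub hook lets a combiner ask, per node, whether FMAD may
be formed; the override above answers from the function's denormal mode rather
than the subtarget. A sketch of a caller, with everything except the hook name
assumed for illustration:

    // Hypothetical combine: fuse (fadd (fmul A, B), C) into FMAD only when the
    // target says FMAD is usable for this particular node.
    static SDValue tryFormFMAD(SelectionDAG &DAG, const TargetLowering &TLI,
                               SDNode *N, SDValue A, SDValue B, SDValue C) {
      if (!TLI.isFMADLegalForFAddFSub(DAG, N))
        return SDValue();
      return DAG.getNode(ISD::FMAD, SDLoc(N), N->getValueType(0), A, B, C);
    }
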
@@ -4416,8 +4431,8 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
const Triple &TT = getTargetMachine().getTargetTriple();
- return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
+ return (GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
AMDGPU::shouldEmitConstantsToTextSection(TT);
}
@@ -4425,9 +4440,9 @@ bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
// FIXME: Either avoid relying on address space here or change the default
// address space for functions to avoid the explicit check.
return (GV->getValueType()->isFunctionTy() ||
- GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
- GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
+ GV->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+ GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
+ GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@@ -4694,10 +4709,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
// TODO: Use custom target PseudoSourceValue.
// TODO: We should use the value from the IR intrinsic call, but it might not
// be available and how do we get it?
- Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
- AMDGPUAS::CONSTANT_ADDRESS));
-
- MachinePointerInfo PtrInfo(V, StructOffset);
+ MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
MinAlign(64, StructOffset),
MachineMemOperand::MODereferenceable |
@@ -5646,11 +5658,16 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
SDValue Offset, SDValue GLC, SDValue DLC,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
+
+ const DataLayout &DataLayout = DAG.getDataLayout();
+ unsigned Align =
+ DataLayout.getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- VT.getStoreSize(), VT.getStoreSize());
+ VT.getStoreSize(), Align);
if (!Offset->isDivergent()) {
SDValue Ops[] = {
@@ -5659,6 +5676,20 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
GLC,
DLC,
};
+
+ // Widen vec3 load to vec4.
+ if (VT.isVector() && VT.getVectorNumElements() == 3) {
+ EVT WidenedVT =
+ EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), 4);
+ auto WidenedOp = DAG.getMemIntrinsicNode(
+ AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
+ MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
+ auto Subvector = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
+ DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+ return Subvector;
+ }
+
return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
DAG.getVTList(VT), Ops, VT, MMO);
}
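
As a reading aid (not part of the patch), the vec3 path above produces nodes of
roughly this shape for a v3f32 load, with the operands taken unchanged from Ops:

    //   t0: v4f32 = SBUFFER_LOAD rsrc, offset, glc, dlc   ; widened to vec4
    //   t1: v3f32 = extract_subvector t0, 0               ; original type back
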
@@ -5670,11 +5701,10 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
MVT LoadVT = VT.getSimpleVT();
unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
assert((LoadVT.getScalarType() == MVT::i32 ||
- LoadVT.getScalarType() == MVT::f32) &&
- isPowerOf2_32(NumElts));
+ LoadVT.getScalarType() == MVT::f32));
if (NumElts == 8 || NumElts == 16) {
- NumLoads = NumElts == 16 ? 4 : 2;
+ NumLoads = NumElts / 4;
LoadVT = MVT::v4i32;
}
@@ -5698,8 +5728,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
for (unsigned i = 0; i < NumLoads; ++i) {
Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
- Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
- Ops, LoadVT, MMO));
+ Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
+ LoadVT, MMO, DAG));
}
if (VT == MVT::v8i32 || VT == MVT::v16i32)
@@ -5918,22 +5948,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
}
}
- case Intrinsic::amdgcn_interp_p2_f16: {
- SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
- Op.getOperand(6), SDValue());
- SDValue Ops[] = {
- Op.getOperand(2), // Src0
- Op.getOperand(3), // Attrchan
- Op.getOperand(4), // Attr
- DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
- Op.getOperand(1), // Src2
- DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
- Op.getOperand(5), // high
- DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
- ToM0.getValue(1)
- };
- return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
- }
case Intrinsic::amdgcn_sin:
return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
@@ -7088,13 +7102,16 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
EVT VT = Op.getOperand(3).getValueType();
auto *M = cast<MemSDNode>(Op);
- unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD
- : AMDGPUISD::ATOMIC_FADD;
+ if (VT.isVector()) {
+ return DAG.getMemIntrinsicNode(
+ AMDGPUISD::ATOMIC_PK_FADD, DL, Op->getVTList(), Ops, VT,
+ M->getMemOperand());
+ }
- return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT,
- M->getMemOperand());
+ return DAG.getAtomic(ISD::ATOMIC_LOAD_FADD, DL, VT,
+ DAG.getVTList(VT, MVT::Other), Ops,
+ M->getMemOperand()).getValue(1);
}
-
case Intrinsic::amdgcn_end_cf:
return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
Op->getOperand(2), Chain), 0);
@@ -7451,8 +7468,11 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
// resource descriptor, we can only make private accesses up to a certain
// size.
switch (Subtarget->getMaxPrivateElementSize()) {
- case 4:
- return scalarizeVectorLoad(Load, DAG);
+ case 4: {
+ SDValue Ops[2];
+ std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
+ return DAG.getMergeValues(Ops, DL);
+ }
case 8:
if (NumElements > 2)
return SplitVectorLoad(Op, DAG);
@@ -7529,7 +7549,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
const SDNodeFlags Flags = Op->getFlags();
bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
- if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
+ if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
return SDValue();
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
@@ -7672,7 +7692,7 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
const SDLoc &SL, const GCNSubtarget *ST) {
assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
- int DPDenormModeDefault = ST->hasFP64Denormals()
+ int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
? FP_DENORM_FLUSH_NONE
: FP_DENORM_FLUSH_IN_FLUSH_OUT;
@@ -7708,7 +7728,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
(1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
- if (!Subtarget->hasFP32Denormals()) {
+ const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
+
+ if (!HasFP32Denormals) {
SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue EnableDenorm;
@@ -7752,8 +7774,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
NumeratorScaled, Fma3);
- if (!Subtarget->hasFP32Denormals()) {
-
+ if (!HasFP32Denormals) {
SDValue DisableDenorm;
if (Subtarget->hasDenormModeInst()) {
const SDValue DisableDenormValue =
@@ -8727,7 +8748,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
auto F = CFP->getValueAPF();
if (F.isNaN() && F.isSignaling())
return false;
- return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
+ return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType());
}
// If source is a result of another standard FP operation it is already in
@@ -8796,7 +8817,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
// snans will be quieted, so we only need to worry about denormals.
if (Subtarget->supportsMinMaxDenormModes() ||
- denormalsEnabledForType(Op.getValueType()))
+ denormalsEnabledForType(DAG, Op.getValueType()))
return true;
// Flushing may be required.
@@ -8868,7 +8889,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
LLVM_FALLTHROUGH;
}
default:
- return denormalsEnabledForType(Op.getValueType()) &&
+ return denormalsEnabledForType(DAG, Op.getValueType()) &&
DAG.isKnownNeverSNaN(Op);
}
@@ -8879,7 +8900,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
SDValue SITargetLowering::getCanonicalConstantFP(
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
// Flush denormals to 0 if not enabled.
- if (C.isDenormal() && !denormalsEnabledForType(VT))
+ if (C.isDenormal() && !denormalsEnabledForType(DAG, VT))
return DAG.getConstantFP(0.0, SL, VT);
if (C.isNaN()) {
@@ -9417,8 +9438,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
// Only do this if we are not trying to support denormals. v_mad_f32 does not
// support denormals ever.
- if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
- (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
+ if (((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) ||
+ (VT == MVT::f16 && !hasFP64FP16Denormals(DAG.getMachineFunction()) &&
getSubtarget()->hasMadF16())) &&
isOperationLegal(ISD::FMAD, VT))
return ISD::FMAD;
@@ -9427,7 +9448,7 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
if ((Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath ||
(N0->getFlags().hasAllowContract() &&
N1->getFlags().hasAllowContract())) &&
- isFMAFasterThanFMulAndFAdd(VT)) {
+ isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
return ISD::FMA;
}
@@ -9543,6 +9564,8 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
case ISD::SIGN_EXTEND:
case ISD::ANY_EXTEND: {
auto Cond = RHS.getOperand(0);
+ // If this won't be a real VOPC output, we would still need to insert an
+ // extra instruction anyway.
if (!isBoolSGPR(Cond))
break;
SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
@@ -9573,6 +9596,26 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
SDValue LHS = N->getOperand(0);
SDValue RHS = N->getOperand(1);
+ // sub x, zext (setcc) => subcarry x, 0, setcc
+ // sub x, sext (setcc) => addcarry x, 0, setcc
+ unsigned Opc = RHS.getOpcode();
+ switch (Opc) {
+ default: break;
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ANY_EXTEND: {
+ auto Cond = RHS.getOperand(0);
+ // If this won't be a real VOPC output, we would still need to insert an
+ // extra instruction anyway.
+ if (!isBoolSGPR(Cond))
+ break;
+ SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
+ SDValue Args[] = { LHS, DAG.getConstant(0, SL, MVT::i32), Cond };
+ Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::ADDCARRY : ISD::SUBCARRY;
+ return DAG.getNode(Opc, SL, VTList, Args);
+ }
+ }
+
if (LHS.getOpcode() == ISD::SUBCARRY) {
// sub (subcarry x, 0, cc), y => subcarry x, y, cc
auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
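
The new fold mirrors the add combine shown earlier in the diff: when the
subtrahend is an extended boolean, the i1 can feed the carry input directly.
A worked example in DAG terms (illustrative notation, not patch code):

    //   sub x, (zext cc)  -->  subcarry x, 0, cc   ; x - cc via the borrow-in
    //   sub x, (sext cc)  -->  addcarry x, 0, cc   ; sext cc is 0 or -1, so add cc
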
@@ -10884,14 +10927,14 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
return false;
}
-bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
+bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
+ EVT VT) const {
switch (VT.getScalarType().getSimpleVT().SimpleTy) {
case MVT::f32:
- return Subtarget->hasFP32Denormals();
+ return hasFP32Denormals(DAG.getMachineFunction());
case MVT::f64:
- return Subtarget->hasFP64Denormals();
case MVT::f16:
- return Subtarget->hasFP16Denormals();
+ return hasFP64FP16Denormals(DAG.getMachineFunction());
default:
return false;
}
@@ -10930,6 +10973,12 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// TODO: Do have these for flat. Older targets also had them for buffers.
unsigned AS = RMW->getPointerAddressSpace();
+
+ if (AS == AMDGPUAS::GLOBAL_ADDRESS && Subtarget->hasAtomicFaddInsts()) {
+ return RMW->use_empty() ? AtomicExpansionKind::None :
+ AtomicExpansionKind::CmpXChg;
+ }
+
return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ?
AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg;
}
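
The new global-address case distinguishes the two forms of an fp add atomic by
whether its result is consumed; a summary of the mapping above (descriptive
only, not patch code):

    //   atomicrmw fadd, result unused  -> AtomicExpansionKind::None
    //                                     (can select the native global fadd)
    //   atomicrmw fadd, result used    -> AtomicExpansionKind::CmpXChg
    //                                     (expanded to a compare-and-swap loop)
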
@@ -10956,6 +11005,8 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
}
static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+ if (!isa<Instruction>(V))
+ return false;
if (!Visited.insert(V).second)
return false;
bool Result = false;