Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r-- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 493
1 file changed, 258 insertions(+), 235 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 23cc9404532d..940ec6f31c69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -16,7 +16,6 @@
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUFrameLowering.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -38,6 +37,11 @@ using namespace llvm;
#include "AMDGPUGenCallingConv.inc"
+static cl::opt<bool> AMDGPUBypassSlowDiv(
+ "amdgpu-bypass-slow-div",
+ cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
+ cl::init(true));
+
// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
unsigned StoreSize = VT.getStoreSizeInBits();
@@ -103,6 +107,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
+
+ setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
+
+ setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
+
+ setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
+
+ setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
+
+ setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
+
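(Promote with AddPromotedToType here means these wide 64-bit-element vectors are handled as i32 vectors with twice the element count: a v4i64 load, for example, is issued as the bitwise-identical v8i32 load, 256 bits either way, and the result bitcast back, so only the i32 vector types need selection patterns.)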
// There are no 64-bit extloads. These should be done as a 32-bit extload and
// an extension to 64-bit.
for (MVT VT : MVT::integer_valuetypes()) {
@@ -161,11 +183,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
@@ -203,6 +227,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v4i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
+
+ setOperationAction(ISD::STORE, MVT::v4f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
+
+ setOperationAction(ISD::STORE, MVT::v8i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::v8f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::v16i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
+
+ setOperationAction(ISD::STORE, MVT::v16f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
+
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
@@ -227,12 +269,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
+ setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
+ setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
setOperationAction(ISD::Constant, MVT::i32, Legal);
setOperationAction(ISD::Constant, MVT::i64, Legal);
@@ -297,6 +348,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
@@ -329,6 +388,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBE, VT, Legal);
}
+ // The hardware supports 32-bit FSHR, but not FSHL.
+ setOperationAction(ISD::FSHR, MVT::i32, Legal);
+
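(Marking only FSHR legal is sufficient because generic legalization can rewrite FSHL in terms of FSHR; modulo the bit width the two are mirror images. A scalar sketch of the identity, illustrative rather than the in-tree expansion:

    #include <cstdint>

    uint32_t fshr32(uint32_t hi, uint32_t lo, uint32_t s) {
      s &= 31;
      return s ? (lo >> s) | (hi << (32 - s)) : lo;
    }

    // fshl(hi, lo, s) == fshr(hi, lo, 32 - s) for s != 0; a funnel shift
    // left by 0 just returns hi, handled as a separate case.
    uint32_t fshl32(uint32_t hi, uint32_t lo, uint32_t s) {
      s &= 31;
      return s ? fshr32(hi, lo, 32 - s) : hi;
    }
)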
// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
@@ -381,7 +443,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
- setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
@@ -483,6 +545,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MaxStoresPerMemmove = 0xffffffff;
MaxStoresPerMemset = 0xffffffff;
+ // The expansion for 64-bit division is enormous.
+ if (AMDGPUBypassSlowDiv)
+ addBypassSlowDiv(64, 32);
+
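(addBypassSlowDiv(64, 32) asks CodeGenPrepare to guard each 64-bit integer divide with a runtime check and take a 32-bit divide when both operands happen to fit, the common case for dynamic values. Roughly this shape, as a sketch; the real transform is done on IR:

    #include <cstdint>

    uint64_t udiv64_bypassed(uint64_t a, uint64_t b) {
      if (((a | b) >> 32) == 0)                      // both operands fit in 32 bits
        return uint64_t(uint32_t(a) / uint32_t(b));  // cheap 32-bit path
      return a / b;                                  // full 64-bit expansion
    }
)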
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
@@ -609,6 +675,17 @@ bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
return true;
}
+EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const {
+ assert(!VT.isVector() && "only scalar expected");
+
+ // Round to the next multiple of 32-bits.
+ unsigned Size = VT.getSizeInBits();
+ if (Size <= 32)
+ return MVT::i32;
+ return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
+}
+
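(For example, this rounds i1/i8/i16 returns up to i32, i33 through i64 to i64, and i65 through i96 to i96; a scalar restatement of the same formula:

    static unsigned roundSizeToDwords(unsigned SizeInBits) {
      return SizeInBits <= 32 ? 32 : 32 * ((SizeInBits + 31) / 32);
    }
)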
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}
@@ -641,8 +718,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
unsigned NewSize = NewVT.getStoreSizeInBits();
- // If we are reducing to a 32-bit load, this is always better.
- if (NewSize == 32)
+ // If we are reducing to a 32-bit load or a smaller multi-dword load,
+ // this is always better.
+ if (NewSize >= 32)
return true;
EVT OldVT = N->getValueType(0);
@@ -733,6 +811,26 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
}
}
+SDValue AMDGPUTargetLowering::getNegatedExpression(
+ SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
+ NegatibleCost &Cost, unsigned Depth) const {
+
+ switch (Op.getOpcode()) {
+ case ISD::FMA:
+ case ISD::FMAD: {
+ // Negating a fma is not free if it has users without source mods.
+ if (!allUsesHaveSourceMods(Op.getNode()))
+ return SDValue();
+ break;
+ }
+ default:
+ break;
+ }
+
+ return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
+ ForCodeSize, Cost, Depth);
+}
+
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
@@ -912,7 +1010,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
CallingConv::ID CC = Fn.getCallingConv();
- unsigned MaxAlign = 1;
+ Align MaxAlign = Align(1);
uint64_t ExplicitArgOffset = 0;
const DataLayout &DL = Fn.getParent()->getDataLayout();
@@ -920,12 +1018,12 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
for (const Argument &Arg : Fn.args()) {
Type *BaseArgTy = Arg.getType();
- unsigned Align = DL.getABITypeAlignment(BaseArgTy);
- MaxAlign = std::max(Align, MaxAlign);
+ Align Alignment = DL.getABITypeAlign(BaseArgTy);
+ MaxAlign = std::max(Alignment, MaxAlign);
unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
- uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
- ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
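(A worked example of the bookkeeping above, for hypothetical kernel arguments (i8, i64, i32) and an assumed ExplicitOffset of 36:

    i8 : alignTo(0, 1)  = 0   -> ArgOffset 36, ExplicitArgOffset becomes 1
    i64: alignTo(1, 8)  = 8   -> ArgOffset 44, ExplicitArgOffset becomes 16
    i32: alignTo(16, 4) = 16  -> ArgOffset 52, ExplicitArgOffset becomes 20
)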
// We're basically throwing away everything passed into us and starting over
// to get accurate in-memory offsets. The "PartOffset" is completely useless
@@ -999,6 +1097,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
assert(MemVT.getVectorNumElements() == 3 ||
MemVT.getVectorNumElements() == 5);
MemVT = MemVT.getPow2VectorType(State.getContext());
+ } else if (!MemVT.isSimple() && !MemVT.isVector()) {
+ MemVT = MemVT.getRoundIntegerType(State.getContext());
}
unsigned PartOffset = 0;
@@ -1140,7 +1240,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::FLOG:
- return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
+ return LowerFLOG(Op, DAG, numbers::ln2f);
case ISD::FLOG10:
return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
case ISD::FEXP:
@@ -1196,10 +1296,23 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isEntryFunction()) {
+ SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
- Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
+ Fn, "local memory global used by non-kernel function",
+ DL.getDebugLoc(), DS_Warning);
DAG.getContext()->diagnose(BadLDSDecl);
+
+ // We currently don't have a way to correctly allocate LDS objects that
+ // aren't directly associated with a kernel. We do force inlining of
+ // functions that use local objects. However, if these dead functions are
+ // not eliminated, we don't want a compile time error. Just emit a warning
+ // and a trap, since there should be no callable path here.
+ SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
+ SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ Trap, DAG.getRoot());
+ DAG.setRoot(OutputChain);
+ return DAG.getUNDEF(Op.getValueType());
}
// XXX: What does the value of G->getOffset() mean?
@@ -1208,7 +1321,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
// TODO: We could emit code to handle the initialization somewhere.
if (!hasDefinedInitializer(GV)) {
- unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+ unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
}
}
@@ -1383,12 +1496,11 @@ AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
(HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
N.getValueType().getVectorNumElements() &&
"More vector elements requested than available!");
- auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
- DAG.getConstant(0, DL, IdxTy));
+ DAG.getVectorIdxConstant(0, DL));
SDValue Hi = DAG.getNode(
HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
- HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
+ HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
return std::make_pair(Lo, Hi);
}
@@ -1433,18 +1545,17 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
- auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
SDValue Join;
if (LoVT == HiVT) {
// This is the case that the vector is power of two so was evenly split.
Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
} else {
Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
- DAG.getConstant(0, SL, IdxTy));
- Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
- : ISD::INSERT_VECTOR_ELT,
- SL, VT, Join, HiLoad,
- DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
+ DAG.getVectorIdxConstant(0, SL));
+ Join = DAG.getNode(
+ HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
+ VT, Join, HiLoad,
+ DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
}
SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
@@ -1474,7 +1585,7 @@ SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
return DAG.getMergeValues(
{DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
- DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
+ DAG.getVectorIdxConstant(0, SL)),
WideLoad.getValue(1)},
SL);
}
@@ -1588,9 +1699,11 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
// float fr = mad(fqneg, fb, fa);
- unsigned OpCode = MFI->getMode().FP32Denormals ?
- (unsigned)AMDGPUISD::FMAD_FTZ :
- (unsigned)ISD::FMAD;
+ unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
+ (unsigned)ISD::FMA :
+ !MFI->getMode().allFP32Denormals() ?
+ (unsigned)ISD::FMAD :
+ (unsigned)AMDGPUISD::FMAD_FTZ;
SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
// int iq = (int)fq;
@@ -1673,9 +1786,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// Compute denominator reciprocal.
- unsigned FMAD = MFI->getMode().FP32Denormals ?
- (unsigned)AMDGPUISD::FMAD_FTZ :
- (unsigned)ISD::FMAD;
+ unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
+ (unsigned)ISD::FMA :
+ !MFI->getMode().allFP32Denormals() ?
+ (unsigned)ISD::FMAD :
+ (unsigned)AMDGPUISD::FMAD_FTZ;
SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
@@ -1861,103 +1976,43 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
return Res;
}
- SDValue Num = Op.getOperand(0);
- SDValue Den = Op.getOperand(1);
-
- // RCP = URECIP(Den) = 2^32 / Den + e
- // e is rounding error.
- SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
-
- // RCP_LO = mul(RCP, Den) */
- SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
-
- // RCP_HI = mulhu (RCP, Den) */
- SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
-
- // NEG_RCP_LO = -RCP_LO
- SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- RCP_LO);
-
- // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
- SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
- NEG_RCP_LO, RCP_LO,
- ISD::SETEQ);
- // Calculate the rounding error from the URECIP instruction
- // E = mulhu(ABS_RCP_LO, RCP)
- SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
-
- // RCP_A_E = RCP + E
- SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
-
- // RCP_S_E = RCP - E
- SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
-
- // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
- SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
- RCP_A_E, RCP_S_E,
- ISD::SETEQ);
- // Quotient = mulhu(Tmp0, Num)
- SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
-
- // Num_S_Remainder = Quotient * Den
- SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
-
- // Remainder = Num - Num_S_Remainder
- SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
-
- // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
- SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
- DAG.getConstant(-1, DL, VT),
- DAG.getConstant(0, DL, VT),
- ISD::SETUGE);
- // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
- SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
- Num_S_Remainder,
- DAG.getConstant(-1, DL, VT),
- DAG.getConstant(0, DL, VT),
- ISD::SETUGE);
- // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
- SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
- Remainder_GE_Zero);
-
- // Calculate Division result:
-
- // Quotient_A_One = Quotient + 1
- SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
- DAG.getConstant(1, DL, VT));
-
- // Quotient_S_One = Quotient - 1
- SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
- DAG.getConstant(1, DL, VT));
-
- // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
- SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
- Quotient, Quotient_A_One, ISD::SETEQ);
-
- // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
- Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
- Quotient_S_One, Div, ISD::SETEQ);
-
- // Calculate Rem result:
-
- // Remainder_S_Den = Remainder - Den
- SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
-
- // Remainder_A_Den = Remainder + Den
- SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
-
- // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
- SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
- Remainder, Remainder_S_Den, ISD::SETEQ);
-
- // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
- Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
- Remainder_A_Den, Rem, ISD::SETEQ);
- SDValue Ops[2] = {
- Div,
- Rem
- };
- return DAG.getMergeValues(Ops, DL);
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+
+ // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
+ // algorithm used here.
+
+ // Initial estimate of inv(y).
+ SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
+
+ // One round of UNR.
+ SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
+ SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
+ Z = DAG.getNode(ISD::ADD, DL, VT, Z,
+ DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
+
+ // Quotient/remainder estimate.
+ SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
+ SDValue R =
+ DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
+
+ // First quotient/remainder refinement.
+ EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue One = DAG.getConstant(1, DL, VT);
+ SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
+ Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
+ R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
+
+ // Second quotient/remainder refinement.
+ Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
+ Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
+ R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
+
+ return DAG.getMergeValues({Q, R}, DL);
}
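(As a scalar sketch of the node sequence built above, with illustrative names, and with URECIP's float-based reciprocal estimate stood in for by exact division:

    #include <cstdint>

    void udivrem32(uint32_t x, uint32_t y, uint32_t &q, uint32_t &r) {
      uint32_t z = uint32_t((uint64_t(1) << 32) / y); // initial estimate of 2^32 / y
      // One Newton-Raphson step: z += mulhu(z, -y * z).
      uint32_t negYZ = (0u - y) * z;
      z += uint32_t((uint64_t(z) * negYZ) >> 32);
      q = uint32_t((uint64_t(x) * z) >> 32);          // q = mulhu(x, z)
      r = x - q * y;
      if (r >= y) { ++q; r -= y; }                    // first refinement
      if (r >= y) { ++q; r -= y; }                    // second refinement
    }

Two fix-up rounds suffice because the refined quotient estimate is never more than two short.)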
SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
@@ -2164,8 +2219,7 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
// Don't handle v2f16. The extra instructions to scalarize and repack around the
// compare and vselect end up producing worse code than scalarizing the whole
// operation.
-SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
EVT VT = Op.getValueType();
@@ -2194,75 +2248,6 @@ SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
}
-SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
- SDLoc SL(Op);
- SDValue X = Op.getOperand(0);
-
- SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
-
- const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- const SDValue One = DAG.getConstant(1, SL, MVT::i32);
- const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
- const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
- EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
-
- SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
-
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
-
- SDValue Exp = extractF64Exponent(Hi, SL, DAG);
-
- const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
- MVT::i64);
-
- SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
- SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
- DAG.getConstant(INT64_C(0x0008000000000000), SL,
- MVT::i64),
- Exp);
-
- SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
- SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
- DAG.getConstant(0, SL, MVT::i64), Tmp0,
- ISD::SETNE);
-
- SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
- D, DAG.getConstant(0, SL, MVT::i64));
- SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
-
- K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
- K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
-
- SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
- SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
- SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
-
- SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
- ExpEqNegOne,
- DAG.getConstantFP(1.0, SL, MVT::f64),
- DAG.getConstantFP(0.0, SL, MVT::f64));
-
- SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
-
- K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
- K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
-
- return K;
-}
-
-SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
-
- if (isOperationLegal(ISD::FTRUNC, VT))
- return LowerFROUND_LegalFTRUNC(Op, DAG);
-
- if (VT == MVT::f64)
- return LowerFROUND64(Op, DAG);
-
- llvm_unreachable("unhandled type");
-}
-
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -2793,6 +2778,7 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
static SDValue simplifyI24(SDNode *Node24,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
@@ -2806,11 +2792,11 @@ static SDValue simplifyI24(SDNode *Node24,
APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
- // First try to simplify using GetDemandedBits which allows the operands to
- // have other uses, but will only perform simplifications that involve
- // bypassing some nodes for this user.
- SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
- SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
+ // First try to simplify using SimplifyMultipleUseDemandedBits which allows
+ // the operands to have other uses, but will only perform simplifications that
+ // involve bypassing some nodes for this user.
+ SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
+ SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
if (DemandedLHS || DemandedRHS)
return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
DemandedLHS ? DemandedLHS : LHS,
@@ -2818,7 +2804,6 @@ static SDValue simplifyI24(SDNode *Node24,
// Now try SimplifyDemandedBits which can simplify the nodes used by our
// operands if this node is the only user.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
return SDValue(Node24, 0);
if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
@@ -2877,7 +2862,7 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
return SDValue();
LoadSDNode *LN = cast<LoadSDNode>(N);
- if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+ if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
return SDValue();
SDLoc SL(N);
@@ -2885,16 +2870,17 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
EVT VT = LN->getMemoryVT();
unsigned Size = VT.getStoreSize();
- unsigned Align = LN->getAlignment();
- if (Align < Size && isTypeLegal(VT)) {
+ Align Alignment = LN->getAlign();
+ if (Alignment < Size && isTypeLegal(VT)) {
bool IsFast;
unsigned AS = LN->getAddressSpace();
// Expand unaligned loads earlier than legalization. Due to visitation order
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
- if (!allowsMisalignedMemoryAccesses(
- VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
+ LN->getMemOperand()->getFlags(),
+ &IsFast)) {
SDValue Ops[2];
if (VT.isVector())
@@ -2931,7 +2917,7 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
return SDValue();
StoreSDNode *SN = cast<StoreSDNode>(N);
- if (SN->isVolatile() || !ISD::isNormalStore(SN))
+ if (!SN->isSimple() || !ISD::isNormalStore(SN))
return SDValue();
EVT VT = SN->getMemoryVT();
@@ -2939,8 +2925,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
- unsigned Align = SN->getAlignment();
- if (Align < Size && isTypeLegal(VT)) {
+ Align Alignment = SN->getAlign();
+ if (Alignment < Size && isTypeLegal(VT)) {
bool IsFast;
unsigned AS = SN->getAddressSpace();
@@ -2948,8 +2934,9 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
- if (!allowsMisalignedMemoryAccesses(
- VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
+ SN->getMemOperand()->getFlags(),
+ &IsFast)) {
if (VT.isVector())
return scalarizeVectorStore(SN, DAG);
@@ -3012,6 +2999,16 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_mul_u24:
return simplifyI24(N, DCI);
+ case Intrinsic::amdgcn_fract:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_ldexp: {
+ // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted.
+ SDValue Src = N->getOperand(1);
+ return Src.isUndef() ? Src : SDValue();
+ }
default:
return SDValue();
}
@@ -3465,24 +3462,24 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue CmpLHS = Cond.getOperand(0);
- unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
- AMDGPUISD::FFBH_U32;
-
// select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
// select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
if (CCOpcode == ISD::SETEQ &&
(isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
- RHS.getOperand(0) == CmpLHS &&
- isNegativeOne(LHS)) {
+ RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
+ unsigned Opc =
+ isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
return getFFBX_U32(DAG, CmpLHS, SL, Opc);
}
// select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
// select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
if (CCOpcode == ISD::SETNE &&
- (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
- LHS.getOperand(0) == CmpLHS &&
- isNegativeOne(RHS)) {
+ (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
+ LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
+ unsigned Opc =
+ isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
+
return getFFBX_U32(DAG, CmpLHS, SL, Opc);
}
@@ -4117,12 +4114,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
- unsigned Reg, EVT VT,
+ Register Reg, EVT VT,
const SDLoc &SL,
bool RawReg) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned VReg;
+ Register VReg;
if (!MRI.isLiveIn(Reg)) {
VReg = MRI.createVirtualRegister(RC);
@@ -4266,11 +4263,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DIV_FMAS)
NODE_NAME_CASE(DIV_FIXUP)
NODE_NAME_CASE(FMAD_FTZ)
- NODE_NAME_CASE(TRIG_PREOP)
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RCP_LEGACY)
- NODE_NAME_CASE(RSQ_LEGACY)
NODE_NAME_CASE(RCP_IFLAG)
NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
@@ -4298,8 +4293,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MAD_U64_U32)
NODE_NAME_CASE(PERM)
NODE_NAME_CASE(TEXTURE_FETCH)
- NODE_NAME_CASE(EXPORT)
- NODE_NAME_CASE(EXPORT_DONE)
NODE_NAME_CASE(R600_EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
@@ -4323,12 +4316,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
NODE_NAME_CASE(LDS)
- NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
- NODE_NAME_CASE(INTERP_P1LL_F16)
- NODE_NAME_CASE(INTERP_P1LV_F16)
- NODE_NAME_CASE(INTERP_P2_F16)
NODE_NAME_CASE(LOAD_D16_HI)
NODE_NAME_CASE(LOAD_D16_LO)
NODE_NAME_CASE(LOAD_D16_HI_I8)
@@ -4347,6 +4336,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ATOMIC_DEC)
NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
+ NODE_NAME_CASE(ATOMIC_LOAD_CSUB)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
@@ -4373,6 +4363,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_INC)
NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+ NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
NODE_NAME_CASE(ATOMIC_PK_FADD)
@@ -4539,11 +4530,10 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
}
case AMDGPUISD::LDS: {
auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
- unsigned Align = GA->getGlobal()->getAlignment();
+ Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
Known.Zero.setHighBits(16);
- if (Align)
- Known.Zero.setLowBits(Log2_32(Align));
+ Known.Zero.setLowBits(Log2(Alignment));
break;
}
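(An LDS address inherits known bits from the variable's placement: LDS offsets fit in 16 bits, so the high 16 bits are zero, and an alignment of 2^k forces the low k bits to zero; e.g. align 8 gives Log2(Alignment) == 3 known-zero low bits.)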
case ISD::INTRINSIC_WO_CHAIN: {
@@ -4607,6 +4597,29 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
}
}
+unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
+ GISelKnownBits &Analysis, Register R,
+ const APInt &DemandedElts, const MachineRegisterInfo &MRI,
+ unsigned Depth) const {
+ const MachineInstr *MI = MRI.getVRegDef(R);
+ if (!MI)
+ return 1;
+
+ // TODO: Check range metadata on MMO.
+ switch (MI->getOpcode()) {
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
+ return 25;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
+ return 17;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+ return 24;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+ return 16;
+ default:
+ return 1;
+ }
+}
+
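(The constants follow from how the extending buffer loads fill a 32-bit result:

    signed byte  (sext i8)  : bits 31..7 copy bit 7   -> 32 - 8 + 1  = 25 sign bits
    signed short (sext i16) : bits 31..15 copy bit 15 -> 32 - 16 + 1 = 17 sign bits
    unsigned byte  (zext i8)  : bits 31..8 known zero  -> at least 24 sign bits
    unsigned short (zext i16) : bits 31..16 known zero -> at least 16 sign bits
)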
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN,
@@ -4648,7 +4661,6 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case AMDGPUISD::RCP:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_LEGACY:
- case AMDGPUISD::RSQ_LEGACY:
case AMDGPUISD::RSQ_CLAMP: {
if (SNaN)
return true;
@@ -4665,7 +4677,6 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case AMDGPUISD::DIV_SCALE:
case AMDGPUISD::DIV_FMAS:
case AMDGPUISD::DIV_FIXUP:
- case AMDGPUISD::TRIG_PREOP:
// TODO: Refine on operands.
return SNaN;
case AMDGPUISD::SIN_HW:
@@ -4692,6 +4703,18 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
}
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_rsq_clamp: {
+ if (SNaN)
+ return true;
+
+ // TODO: Need an is-known-positive check.
+ return false;
+ }
+ case Intrinsic::amdgcn_trig_preop:
case Intrinsic::amdgcn_fdot2:
// TODO: Refine on operand
return SNaN;