aboutsummaryrefslogtreecommitdiff
path: root/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp282
1 files changed, 219 insertions, 63 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 254d02d4ce5b..fcbdf51b03c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -323,24 +323,26 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
- // This is totally unsupported, just custom lower to produce an error.
+ // For R600, this is totally unsupported, just custom lower to produce an
+ // error.
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
// Library functions. These default to Expand, but we have instructions
// for them.
- setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR, ISD::FRINT,
- ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
+ setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
+ ISD::FROUNDEVEN, ISD::FTRUNC, ISD::FMINNUM, ISD::FMAXNUM},
MVT::f32, Legal);
setOperationAction(ISD::FLOG2, MVT::f32, Custom);
setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
- setOperationAction({ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2}, MVT::f32,
- Custom);
+ setOperationAction(
+ {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
+ Custom);
setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
- setOperationAction(ISD::FROUNDEVEN, {MVT::f16, MVT::f32, MVT::f64}, Custom);
+ setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Custom);
@@ -351,7 +353,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
}
- setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP}, MVT::f16, Custom);
+ setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
+ Custom);
// FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
// scalarization code. Can be removed when IS_FPCLASS expand isn't called by
@@ -383,7 +386,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MVT::v12f32, MVT::v16f16, MVT::v16i16, MVT::v16f32, MVT::v16i32,
MVT::v32f32, MVT::v32i32, MVT::v2f64, MVT::v2i64, MVT::v3f64,
MVT::v3i64, MVT::v4f64, MVT::v4i64, MVT::v8f64, MVT::v8i64,
- MVT::v16f64, MVT::v16i64},
+ MVT::v16f64, MVT::v16i64, MVT::v32i16, MVT::v32f16},
Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
@@ -456,14 +459,17 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
for (MVT VT : FloatVectorTypes) {
setOperationAction(
- {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM, ISD::FADD,
- ISD::FCEIL, ISD::FCOS, ISD::FDIV, ISD::FEXP2,
- ISD::FEXP, ISD::FLOG2, ISD::FREM, ISD::FLOG,
- ISD::FLOG10, ISD::FPOW, ISD::FFLOOR, ISD::FTRUNC,
- ISD::FMUL, ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
- ISD::FSQRT, ISD::FSIN, ISD::FSUB, ISD::FNEG,
- ISD::VSELECT, ISD::SELECT_CC, ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE,
- ISD::SETCC, ISD::FCANONICALIZE},
+ {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
+ ISD::FADD, ISD::FCEIL, ISD::FCOS,
+ ISD::FDIV, ISD::FEXP2, ISD::FEXP,
+ ISD::FEXP10, ISD::FLOG2, ISD::FREM,
+ ISD::FLOG, ISD::FLOG10, ISD::FPOW,
+ ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
+ ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
+ ISD::FSQRT, ISD::FSIN, ISD::FSUB,
+ ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
+ ISD::FCOPYSIGN, ISD::VECTOR_SHUFFLE, ISD::SETCC,
+ ISD::FCANONICALIZE, ISD::FROUNDEVEN},
VT, Expand);
}
@@ -584,6 +590,7 @@ static bool fnegFoldsIntoOpcode(unsigned Opc) {
case ISD::FTRUNC:
case ISD::FRINT:
case ISD::FNEARBYINT:
+ case ISD::FROUNDEVEN:
case ISD::FCANONICALIZE:
case AMDGPUISD::RCP:
case AMDGPUISD::RCP_LEGACY:
@@ -1001,6 +1008,9 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_LS:
return CC_AMDGPU;
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
+ return CC_AMDGPU_CS_CHAIN;
case CallingConv::C:
case CallingConv::Fast:
case CallingConv::Cold:
@@ -1024,6 +1034,8 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_CS_Chain:
+ case CallingConv::AMDGPU_CS_ChainPreserve:
case CallingConv::AMDGPU_HS:
case CallingConv::AMDGPU_ES:
case CallingConv::AMDGPU_LS:
@@ -1315,6 +1327,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FLOG10:
return LowerFLOGCommon(Op, DAG);
case ISD::FEXP:
+ case ISD::FEXP10:
return lowerFEXP(Op, DAG);
case ISD::FEXP2:
return lowerFEXP2(Op, DAG);
@@ -1360,6 +1373,7 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Lowered);
return;
case ISD::FEXP:
+ case ISD::FEXP10:
if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
Results.push_back(Lowered);
return;
@@ -1714,7 +1728,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
Load->getChain(), BasePtr, SrcValue, LoMemVT,
BaseAlign, Load->getMemOperand()->getFlags());
- SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Size));
+ SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
SDValue HiLoad =
DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
@@ -2362,7 +2376,8 @@ SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
}
-SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
+ SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -2389,18 +2404,19 @@ SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
}
-SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op,
+ SelectionDAG &DAG) const {
// FNEARBYINT and FRINT are the same, except in their handling of FP
// exceptions. Those aren't really meaningful for us, and OpenCL only has
// rint, so just treat them as equivalent.
- return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0));
+ return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(0));
}
-SDValue AMDGPUTargetLowering::LowerFROUNDEVEN(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFRINT(SDValue Op, SelectionDAG &DAG) const {
auto VT = Op.getValueType();
auto Arg = Op.getOperand(0u);
- return DAG.getNode(ISD::FRINT, SDLoc(Op), VT, Arg);
+ return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
}
// XXX - May require not supporting f32 denormals?
@@ -2423,18 +2439,16 @@ SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
const SDValue One = DAG.getConstantFP(1.0, SL, VT);
- const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
-
- SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
EVT SetCCVT =
getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
+ SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
- SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
-
- return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
+ SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
+ return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
}
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
@@ -2468,7 +2482,18 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) {
case ISD::FP_EXTEND:
return Src.getOperand(0).getValueType() == MVT::f16;
case ISD::FP16_TO_FP:
+ case ISD::FFREXP:
return true;
+ case ISD::INTRINSIC_WO_CHAIN: {
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(Src.getOperand(0))->getZExtValue();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_frexp_mant:
+ return true;
+ default:
+ return false;
+ }
+ }
default:
return false;
}
@@ -2476,15 +2501,17 @@ static bool valueIsKnownNeverF32Denorm(SDValue Src) {
llvm_unreachable("covered opcode switch");
}
-static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags) {
+bool AMDGPUTargetLowering::allowApproxFunc(const SelectionDAG &DAG,
+ SDNodeFlags Flags) {
if (Flags.hasApproximateFuncs())
return true;
auto &Options = DAG.getTarget().Options;
return Options.UnsafeFPMath || Options.ApproxFuncFPMath;
}
-static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src,
- SDNodeFlags Flags) {
+bool AMDGPUTargetLowering::needsDenormHandlingF32(const SelectionDAG &DAG,
+ SDValue Src,
+ SDNodeFlags Flags) {
return !valueIsKnownNeverF32Denorm(Src) &&
DAG.getMachineFunction()
.getDenormalMode(APFloat::IEEEsingle())
@@ -2528,7 +2555,7 @@ SDValue AMDGPUTargetLowering::getIsFinite(SelectionDAG &DAG, SDValue Src,
std::pair<SDValue, SDValue>
AMDGPUTargetLowering::getScaledLogInput(SelectionDAG &DAG, const SDLoc SL,
SDValue Src, SDNodeFlags Flags) const {
- if (allowApproxFunc(DAG, Flags) || !needsDenormHandlingF32(DAG, Src, Flags))
+ if (!needsDenormHandlingF32(DAG, Src, Flags))
return {};
MVT VT = MVT::f32;
@@ -2609,9 +2636,7 @@ SDValue AMDGPUTargetLowering::LowerFLOGCommon(SDValue Op,
X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
}
- SDValue Lowered = LowerFLOGUnsafe(
- X, DL, DAG, IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2,
- Flags);
+ SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
DAG.getTargetConstant(0, DL, MVT::i32), Flags);
@@ -2696,11 +2721,37 @@ SDValue AMDGPUTargetLowering::LowerFLOG10(SDValue Op, SelectionDAG &DAG) const {
// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
// promote f16 operation.
SDValue AMDGPUTargetLowering::LowerFLOGUnsafe(SDValue Src, const SDLoc &SL,
- SelectionDAG &DAG,
- double Log2BaseInverted,
+ SelectionDAG &DAG, bool IsLog10,
SDNodeFlags Flags) const {
EVT VT = Src.getValueType();
- unsigned LogOp = VT == MVT::f32 ? AMDGPUISD::LOG : ISD::FLOG2;
+ unsigned LogOp =
+ VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
+
+ double Log2BaseInverted =
+ IsLog10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
+
+ if (VT == MVT::f32) {
+ auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
+ if (ScaledInput) {
+ SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
+ SDValue ScaledResultOffset =
+ DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
+
+ SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
+
+ SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
+ ScaledResultOffset, Zero, Flags);
+
+ SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
+
+ if (Subtarget->hasFastFMAF32())
+ return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
+ Flags);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
+ return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
+ }
+ }
+
SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
@@ -2728,7 +2779,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
assert(VT == MVT::f32);
- if (allowApproxFunc(DAG, Flags) || !needsDenormHandlingF32(DAG, Src, Flags))
+ if (!needsDenormHandlingF32(DAG, Src, Flags))
return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
// bool needs_scaling = x < -0x1.f80000p+6f;
@@ -2759,14 +2810,95 @@ SDValue AMDGPUTargetLowering::lowerFEXP2(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
}
-SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue Op, const SDLoc &SL,
+SDValue AMDGPUTargetLowering::lowerFEXPUnsafe(SDValue X, const SDLoc &SL,
SelectionDAG &DAG,
SDNodeFlags Flags) const {
- // exp2(M_LOG2E_F * f);
- EVT VT = Op.getValueType();
- const SDValue K = DAG.getConstantFP(numbers::log2e, SL, VT);
- SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, Op, K, Flags);
- return DAG.getNode(VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2, SL, VT, Mul,
+ EVT VT = X.getValueType();
+ const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
+
+ if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
+ // exp2(M_LOG2E_F * f);
+ SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
+ return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
+ : (unsigned)ISD::FEXP2,
+ SL, VT, Mul, Flags);
+ }
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
+ SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
+
+ SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
+
+ SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
+
+ SDValue AdjustedX =
+ DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
+
+ SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
+
+ SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
+
+ SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
+ SDValue AdjustedResult =
+ DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
+
+ return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
+ Flags);
+}
+
+/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
+/// handled correctly.
+SDValue AMDGPUTargetLowering::lowerFEXP10Unsafe(SDValue X, const SDLoc &SL,
+ SelectionDAG &DAG,
+ SDNodeFlags Flags) const {
+ const EVT VT = X.getValueType();
+ const unsigned Exp2Op = VT == MVT::f32 ? AMDGPUISD::EXP : ISD::FEXP2;
+
+ if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
+ // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
+ SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
+ SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
+
+ SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
+ SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
+ SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
+ SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
+ return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
+ }
+
+ // bool s = x < -0x1.2f7030p+5f;
+ // x += s ? 0x1.0p+5f : 0.0f;
+ // exp10 = exp2(x * 0x1.a92000p+1f) *
+ // exp2(x * 0x1.4f0978p-11f) *
+ // (s ? 0x1.9f623ep-107f : 1.0f);
+
+ EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+
+ SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
+ SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
+
+ SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
+ SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
+ SDValue AdjustedX =
+ DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
+
+ SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
+ SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
+
+ SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
+ SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
+ SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
+ SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
+
+ SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
+
+ SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
+ SDValue AdjustedResult =
+ DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
+
+ return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
Flags);
}
@@ -2775,7 +2907,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
SDNodeFlags Flags = Op->getFlags();
- const bool IsExp10 = false; // TODO: For some reason exp10 is missing
+ const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
if (VT.getScalarType() == MVT::f16) {
// v_exp_f16 (fmul x, log2e)
@@ -2799,9 +2931,9 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
// TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
// library behavior. Also, is known-not-daz source sufficient?
- if (allowApproxFunc(DAG, Flags) && !needsDenormHandlingF32(DAG, X, Flags)) {
- assert(!IsExp10 && "todo exp10 support");
- return lowerFEXPUnsafe(X, SL, DAG, Flags);
+ if (allowApproxFunc(DAG, Flags)) {
+ return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
+ : lowerFEXPUnsafe(X, SL, DAG, Flags);
}
// Algorithm:
@@ -2868,7 +3000,7 @@ SDValue AMDGPUTargetLowering::lowerFEXP(SDValue Op, SelectionDAG &DAG) const {
PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
}
- SDValue E = DAG.getNode(ISD::FRINT, SL, VT, PH, Flags);
+ SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
// It is unsafe to contract this fsub into the PH multiply.
SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
@@ -3675,8 +3807,7 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rcp_legacy:
case Intrinsic::amdgcn_rsq_legacy:
- case Intrinsic::amdgcn_rsq_clamp:
- case Intrinsic::amdgcn_ldexp: {
+ case Intrinsic::amdgcn_rsq_clamp: {
// FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(1);
return Src.isUndef() ? Src : SDValue();
@@ -3989,8 +4120,7 @@ static SDValue getAddOneOp(const SDNode *V) {
if (V->getOpcode() != ISD::ADD)
return SDValue();
- auto *C = dyn_cast<ConstantSDNode>(V->getOperand(1));
- return C && C->isOne() ? V->getOperand(0) : SDValue();
+ return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
}
SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
@@ -4220,8 +4350,7 @@ SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
SDValue LHS, SDValue RHS,
DAGCombinerInfo &DCI) const {
- ConstantSDNode *CmpRhs = dyn_cast<ConstantSDNode>(Cond.getOperand(1));
- if (!CmpRhs || !CmpRhs->isZero())
+ if (!isNullConstant(Cond.getOperand(1)))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -4615,6 +4744,7 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
case ISD::FTRUNC:
case ISD::FRINT:
case ISD::FNEARBYINT: // XXX - Should fround be handled?
+ case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FCANONICALIZE:
case AMDGPUISD::RCP:
@@ -4976,6 +5106,36 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
return performAssertSZExtCombine(N, DCI);
case ISD::INTRINSIC_WO_CHAIN:
return performIntrinsicWOChainCombine(N, DCI);
+ case AMDGPUISD::FMAD_FTZ: {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+
+ // FMAD_FTZ is a FMAD + flush denormals to zero.
+ // We flush the inputs, the intermediate step, and the output.
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
+ if (N0CFP && N1CFP && N2CFP) {
+ const auto FTZ = [](const APFloat &V) {
+ if (V.isDenormal()) {
+ APFloat Zero(V.getSemantics(), 0);
+ return V.isNegative() ? -Zero : Zero;
+ }
+ return V;
+ };
+
+ APFloat V0 = FTZ(N0CFP->getValueAPF());
+ APFloat V1 = FTZ(N1CFP->getValueAPF());
+ APFloat V2 = FTZ(N2CFP->getValueAPF());
+ V0.multiply(V1, APFloat::rmNearestTiesToEven);
+ V0 = FTZ(V0);
+ V0.add(V2, APFloat::rmNearestTiesToEven);
+ return DAG.getConstantFP(FTZ(V0), DL, VT);
+ }
+ break;
+ }
}
return SDValue();
}
@@ -5117,8 +5277,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CALL)
NODE_NAME_CASE(TC_RETURN)
NODE_NAME_CASE(TC_RETURN_GFX)
+ NODE_NAME_CASE(TC_RETURN_CHAIN)
NODE_NAME_CASE(TRAP)
NODE_NAME_CASE(RET_GLUE)
+ NODE_NAME_CASE(WAVE_ADDRESS)
NODE_NAME_CASE(RETURN_TO_EPILOG)
NODE_NAME_CASE(ENDPGM)
NODE_NAME_CASE(ENDPGM_TRAP)
@@ -5711,12 +5873,6 @@ AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
}
}
-bool AMDGPUTargetLowering::isConstantUnsignedBitfieldExtractLegal(
- unsigned Opc, LLT Ty1, LLT Ty2) const {
- return (Ty1 == LLT::scalar(32) || Ty1 == LLT::scalar(64)) &&
- Ty2 == LLT::scalar(32);
-}
-
/// Whether it is profitable to sink the operands of an
/// Instruction I to the basic block of I.
/// This helps using several modifiers (like abs and neg) more often.