Diffstat (limited to 'llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 493
1 file changed, 258 insertions, 235 deletions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 23cc9404532d..940ec6f31c69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -16,7 +16,6 @@
 #include "AMDGPU.h"
 #include "AMDGPUCallLowering.h"
 #include "AMDGPUFrameLowering.h"
-#include "AMDGPURegisterInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "Utils/AMDGPUBaseInfo.h"
@@ -38,6 +37,11 @@ using namespace llvm;
 
 #include "AMDGPUGenCallingConv.inc"
 
+static cl::opt<bool> AMDGPUBypassSlowDiv(
+    "amdgpu-bypass-slow-div",
+    cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
+    cl::init(true));
+
 // Find a larger type to do a load / store of a vector with.
 EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
   unsigned StoreSize = VT.getStoreSizeInBits();
@@ -103,6 +107,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
+
+  setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
+
+  setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
+
+  setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
+
+  setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
+
+  setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
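The new amdgpu-bypass-slow-div flag pairs with the addBypassSlowDiv(64, 32) call added to the constructor later in this patch. A minimal sketch of the fast path it enables, in plain C++ (names are illustrative, not from the patch; the real transform is done on the IR by the BypassSlowDivision utility in CodeGenPrepare): when both 64-bit operands are dynamically found to fit in 32 bits, the cheap 32-bit divide runs instead of the enormous 64-bit expansion.

    #include <cstdint>

    // Hypothetical scalar model of the bypass.
    uint64_t udiv64_with_bypass(uint64_t n, uint64_t d) {
      // One test covers both operands: their high halves OR to zero.
      if (((n | d) >> 32) == 0)
        return uint32_t(n) / uint32_t(d); // fast 32-bit path
      return n / d;                       // falls back to the full expansion
    }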
+
   // There are no 64-bit extloads. These should be done as a 32-bit extload and
   // an extension to 64-bit.
   for (MVT VT : MVT::integer_valuetypes()) {
@@ -161,11 +183,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
 
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
 
   setOperationAction(ISD::STORE, MVT::f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
@@ -203,6 +227,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::STORE, MVT::v2f64, Promote);
   AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
 
+  setOperationAction(ISD::STORE, MVT::v4i64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
+
+  setOperationAction(ISD::STORE, MVT::v4f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
+
+  setOperationAction(ISD::STORE, MVT::v8i64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
+
+  setOperationAction(ISD::STORE, MVT::v8f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
+
+  setOperationAction(ISD::STORE, MVT::v16i64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
+
+  setOperationAction(ISD::STORE, MVT::v16f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
+
   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
   setTruncStoreAction(MVT::i64, MVT::i8, Expand);
   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
@@ -227,12 +269,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
   setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
 
+  setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
+  setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
   setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
   setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
 
   setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
   setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
 
+  setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
+  setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
+  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
+  setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
+  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
+  setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
+  setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
+
   setOperationAction(ISD::Constant, MVT::i32, Legal);
   setOperationAction(ISD::Constant, MVT::i64, Legal);
@@ -297,6 +348,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
   setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
@@ -329,6 +388,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SUBE, VT, Legal);
   }
 
+  // The hardware supports 32-bit FSHR, but not FSHL.
+  setOperationAction(ISD::FSHR, MVT::i32, Legal);
+
   // The hardware supports 32-bit ROTR, but not ROTL.
   setOperationAction(ISD::ROTL, MVT::i32, Expand);
   setOperationAction(ISD::ROTL, MVT::i64, Expand);
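Why marking only FSHR legal is enough: a 32-bit funnel shift left can always be rewritten as a funnel shift right of the same operand pair with the complementary amount, which is what legalization produces. A sketch of the identity in plain C++ (fshr32 stands in for the hardware v_alignbit_b32 operation; not code from the patch):

    #include <cstdint>

    uint32_t fshr32(uint32_t hi, uint32_t lo, uint32_t s) {
      s &= 31;
      return s ? (lo >> s) | (hi << (32 - s)) : lo;
    }

    uint32_t fshl32(uint32_t hi, uint32_t lo, uint32_t s) {
      // fshl(hi, lo, s) == fshr(hi, lo, 32 - s) for s % 32 != 0;
      // a shift of 0 returns the high word unchanged.
      return (s & 31) ? fshr32(hi, lo, 32 - (s & 31)) : hi;
    }

The same complement trick covers the rotates expanded just above, since rotl(x, s) is fshl(x, x, s).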
@@ -381,7 +443,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::UREM, VT, Expand);
     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
-    setOperationAction(ISD::SDIVREM, VT, Custom);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
     setOperationAction(ISD::UDIVREM, VT, Expand);
     setOperationAction(ISD::SELECT, VT, Expand);
     setOperationAction(ISD::VSELECT, VT, Expand);
@@ -483,6 +545,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   MaxStoresPerMemmove = 0xffffffff;
   MaxStoresPerMemset = 0xffffffff;
 
+  // The expansion for 64-bit division is enormous.
+  if (AMDGPUBypassSlowDiv)
+    addBypassSlowDiv(64, 32);
+
   setTargetDAGCombine(ISD::BITCAST);
   setTargetDAGCombine(ISD::SHL);
   setTargetDAGCombine(ISD::SRA);
@@ -609,6 +675,17 @@ bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
   return true;
 }
 
+EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+                                              ISD::NodeType ExtendKind) const {
+  assert(!VT.isVector() && "only scalar expected");
+
+  // Round to the next multiple of 32-bits.
+  unsigned Size = VT.getSizeInBits();
+  if (Size <= 32)
+    return MVT::i32;
+  return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
+}
+
 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
   return MVT::i32;
 }
@@ -641,8 +718,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
 
   unsigned NewSize = NewVT.getStoreSizeInBits();
 
-  // If we are reducing to a 32-bit load, this is always better.
-  if (NewSize == 32)
+  // If we are reducing to a 32-bit load or a smaller multi-dword load,
+  // this is always better.
+  if (NewSize >= 32)
     return true;
 
   EVT OldVT = N->getValueType(0);
@@ -733,6 +811,26 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
   }
 }
 
+SDValue AMDGPUTargetLowering::getNegatedExpression(
+    SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
+    NegatibleCost &Cost, unsigned Depth) const {
+
+  switch (Op.getOpcode()) {
+  case ISD::FMA:
+  case ISD::FMAD: {
+    // Negating a fma is not free if it has users without source mods.
+    if (!allUsesHaveSourceMods(Op.getNode()))
+      return SDValue();
+    break;
+  }
+  default:
+    break;
+  }
+
+  return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
+                                              ForCodeSize, Cost, Depth);
+}
+
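The getNegatedExpression override above guards a fold that is otherwise free on AMDGPU: floating-point negation can ride along as a source modifier on the consuming instruction. A hedged sketch of the algebra involved (std::fma stands in for the hardware mad/fma; not code from the patch):

    #include <cmath>

    // -(a * b + c) == (-a) * b + (-c): the negation distributes into the
    // FMA's operands, where it can be encoded for free as source
    // modifiers. The override blocks this fold only when some user of
    // the result cannot absorb a modifier, since that would force an
    // explicit sign-flip instruction instead.
    float negated_fma(float a, float b, float c) {
      return std::fma(-a, b, -c); // same value as -std::fma(a, b, c)
    }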
 //===---------------------------------------------------------------------===//
 // Target Properties
 //===---------------------------------------------------------------------===//
@@ -912,7 +1010,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
   const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
   CallingConv::ID CC = Fn.getCallingConv();
 
-  unsigned MaxAlign = 1;
+  Align MaxAlign = Align(1);
   uint64_t ExplicitArgOffset = 0;
   const DataLayout &DL = Fn.getParent()->getDataLayout();
@@ -920,12 +1018,12 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
   for (const Argument &Arg : Fn.args()) {
     Type *BaseArgTy = Arg.getType();
-    unsigned Align = DL.getABITypeAlignment(BaseArgTy);
-    MaxAlign = std::max(Align, MaxAlign);
+    Align Alignment = DL.getABITypeAlign(BaseArgTy);
+    MaxAlign = std::max(Alignment, MaxAlign);
     unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
 
-    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
-    ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+    uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
+    ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
 
     // We're basically throwing away everything passed into us and starting over
     // to get accurate in-memory offsets. The "PartOffset" is completely useless
@@ -999,6 +1097,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
       assert(MemVT.getVectorNumElements() == 3 ||
              MemVT.getVectorNumElements() == 5);
       MemVT = MemVT.getPow2VectorType(State.getContext());
+    } else if (!MemVT.isSimple() && !MemVT.isVector()) {
+      MemVT = MemVT.getRoundIntegerType(State.getContext());
     }
 
     unsigned PartOffset = 0;
@@ -1140,7 +1240,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   case ISD::FROUND: return LowerFROUND(Op, DAG);
   case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
   case ISD::FLOG:
-    return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
+    return LowerFLOG(Op, DAG, numbers::ln2f);
   case ISD::FLOG10:
     return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
   case ISD::FEXP:
@@ -1196,10 +1296,23 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
       G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
     if (!MFI->isEntryFunction()) {
+      SDLoc DL(Op);
       const Function &Fn = DAG.getMachineFunction().getFunction();
       DiagnosticInfoUnsupported BadLDSDecl(
-        Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
+          Fn, "local memory global used by non-kernel function",
+          DL.getDebugLoc(), DS_Warning);
       DAG.getContext()->diagnose(BadLDSDecl);
+
+      // We currently don't have a way to correctly allocate LDS objects that
+      // aren't directly associated with a kernel. We do force inlining of
+      // functions that use local objects. However, if these dead functions are
+      // not eliminated, we don't want a compile time error. Just emit a warning
+      // and a trap, since there should be no callable path here.
+      SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
+      SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                        Trap, DAG.getRoot());
+      DAG.setRoot(OutputChain);
+      return DAG.getUNDEF(Op.getValueType());
     }
 
     // XXX: What does the value of G->getOffset() mean?
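One line in the hunk above is easy to misread as a functional change: the FLOG constant 1.0F / numbers::log2ef was replaced by numbers::ln2f. These denote the same value, since AMDGPU lowers log on top of the hardware log2 using ln x = ln 2 * log2 x, and 1 / log2(e) = ln 2. A quick check, with the POSIX math constants used purely for illustration:

    #include <cmath>
    #include <cstdio>

    int main() {
      float oldC = 1.0f / (float)M_LOG2E; // previous expression
      float newC = (float)M_LN2;          // replacement constant
      // Both print ~0.693147; any difference is at most an ulp of
      // float rounding.
      std::printf("%.9g %.9g\n", oldC, newC);
    }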
@@ -1208,7 +1321,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
 
     // TODO: We could emit code to handle the initialization somewhere.
     if (!hasDefinedInitializer(GV)) {
-      unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+      unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
       return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
     }
   }
@@ -1383,12 +1496,11 @@ AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
          (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
              N.getValueType().getVectorNumElements() &&
          "More vector elements requested than available!");
-  auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
   SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
-                           DAG.getConstant(0, DL, IdxTy));
+                           DAG.getVectorIdxConstant(0, DL));
   SDValue Hi = DAG.getNode(
       HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
-      HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
+      HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
   return std::make_pair(Lo, Hi);
 }
 
@@ -1433,18 +1545,17 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
                      HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
                      HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
 
-  auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
   SDValue Join;
   if (LoVT == HiVT) {
     // This is the case that the vector is power of two so was evenly split.
     Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
   } else {
     Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
-                       DAG.getConstant(0, SL, IdxTy));
-    Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
-                                       : ISD::INSERT_VECTOR_ELT,
-                       SL, VT, Join, HiLoad,
-                       DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
+                       DAG.getVectorIdxConstant(0, SL));
+    Join = DAG.getNode(
+        HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
+        VT, Join, HiLoad,
+        DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
   }
 
   SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
@@ -1474,7 +1585,7 @@ SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
                   WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
   return DAG.getMergeValues(
       {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
-                   DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
+                   DAG.getVectorIdxConstant(0, SL)),
        WideLoad.getValue(1)}, SL);
 }
 
@@ -1588,9 +1699,11 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
   const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
 
   // float fr = mad(fqneg, fb, fa);
-  unsigned OpCode = MFI->getMode().FP32Denormals ?
-                    (unsigned)AMDGPUISD::FMAD_FTZ :
-                    (unsigned)ISD::FMAD;
+  unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
+                    (unsigned)ISD::FMA :
+                    !MFI->getMode().allFP32Denormals() ?
+                    (unsigned)ISD::FMAD :
+                    (unsigned)AMDGPUISD::FMAD_FTZ;
   SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
 
   // int iq = (int)fq;
@@ -1673,9 +1786,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   // Compute denominator reciprocal.
-  unsigned FMAD = MFI->getMode().FP32Denormals ?
-                  (unsigned)AMDGPUISD::FMAD_FTZ :
-                  (unsigned)ISD::FMAD;
+  unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
+                  (unsigned)ISD::FMA :
+                  !MFI->getMode().allFP32Denormals() ?
+                  (unsigned)ISD::FMAD :
+                  (unsigned)AMDGPUISD::FMAD_FTZ;
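The same three-way opcode choice now appears in both LowerDIVREM24 and LowerUDIVREM64. Restated as a hypothetical helper (the enum is illustrative; the real code uses the ISD / AMDGPUISD opcode values directly):

    enum class FmadKind { FMA, FMAD, FMAD_FTZ };

    // Mirrors the ternary chain above: targets without v_mad_f32/
    // v_mac_f32 must use a real fma; otherwise plain fmad suffices
    // unless FP32 denormals are enabled, where the flush-to-zero
    // variant is required for a correct result.
    FmadKind pickFmad(bool hasMadMacF32Insts, bool allFP32Denormals) {
      if (!hasMadMacF32Insts)
        return FmadKind::FMA;
      return allFP32Denormals ? FmadKind::FMAD_FTZ : FmadKind::FMAD;
    }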
 
   SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
   SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
@@ -1861,103 +1976,43 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
     return Res;
   }
 
-  SDValue Num = Op.getOperand(0);
-  SDValue Den = Op.getOperand(1);
-
-  // RCP = URECIP(Den) = 2^32 / Den + e
-  // e is rounding error.
-  SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
-
-  // RCP_LO = mul(RCP, Den) */
-  SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
-
-  // RCP_HI = mulhu (RCP, Den) */
-  SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
-
-  // NEG_RCP_LO = -RCP_LO
-  SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
-                                   RCP_LO);
-
-  // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
-  SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
-                                       NEG_RCP_LO, RCP_LO,
-                                       ISD::SETEQ);
-  // Calculate the rounding error from the URECIP instruction
-  // E = mulhu(ABS_RCP_LO, RCP)
-  SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
-
-  // RCP_A_E = RCP + E
-  SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
-
-  // RCP_S_E = RCP - E
-  SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
-
-  // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
-  SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
-                                 RCP_A_E, RCP_S_E,
-                                 ISD::SETEQ);
-  // Quotient = mulhu(Tmp0, Num)
-  SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
-
-  // Num_S_Remainder = Quotient * Den
-  SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
-
-  // Remainder = Num - Num_S_Remainder
-  SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
-
-  // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
-  SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
-                                             DAG.getConstant(-1, DL, VT),
-                                             DAG.getConstant(0, DL, VT),
-                                             ISD::SETUGE);
-  // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
-  SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
-                                              Num_S_Remainder,
-                                              DAG.getConstant(-1, DL, VT),
-                                              DAG.getConstant(0, DL, VT),
-                                              ISD::SETUGE);
-  // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
-  SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
-                             Remainder_GE_Zero);
-
-  // Calculate Division result:
-
-  // Quotient_A_One = Quotient + 1
-  SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
-                                       DAG.getConstant(1, DL, VT));
-
-  // Quotient_S_One = Quotient - 1
-  SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
-                                       DAG.getConstant(1, DL, VT));
-
-  // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
-  SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
-                                Quotient, Quotient_A_One, ISD::SETEQ);
-
-  // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
-  Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
-                        Quotient_S_One, Div, ISD::SETEQ);
-
-  // Calculate Rem result:
-
-  // Remainder_S_Den = Remainder - Den
-  SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
-
-  // Remainder_A_Den = Remainder + Den
-  SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
-
-  // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
-  SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
-                                Remainder, Remainder_S_Den, ISD::SETEQ);
-
-  // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
-  Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
-                        Remainder_A_Den, Rem, ISD::SETEQ);
-  SDValue Ops[2] = {
-    Div,
-    Rem
-  };
-  return DAG.getMergeValues(Ops, DL);
+  SDValue X = Op.getOperand(0);
+  SDValue Y = Op.getOperand(1);
+
+  // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
+  // algorithm used here.
+
+  // Initial estimate of inv(y).
+  SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
+
+  // One round of UNR.
+  SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
+  SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
+  Z = DAG.getNode(ISD::ADD, DL, VT, Z,
+                  DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
+
+  // Quotient/remainder estimate.
+  SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
+  SDValue R =
+      DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
+
+  // First quotient/remainder refinement.
+  EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+  SDValue One = DAG.getConstant(1, DL, VT);
+  SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
+  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
+  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
+
+  // Second quotient/remainder refinement.
+  Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
+  Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+                  DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
+  R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+                  DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
+
+  return DAG.getMergeValues({Q, R}, DL);
 }
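A plain-C++ model of the replacement expansion, mirroring the DAG nodes above one-for-one. urecip here is an illustrative stand-in for the AMDGPUISD::URECIP hardware reciprocal estimate; the single Newton-Raphson (UNR) round and the two conditional refinements exist precisely to absorb that estimate's error.

    #include <cstdint>

    // Illustrative estimate of 2^32 / y; the real node is derived from
    // a float v_rcp and has its own small error, which the steps below
    // correct. Assumes y != 0, as udiv itself does.
    static uint32_t urecip(uint32_t y) {
      return (uint32_t)(0xFFFFFFFFull / y);
    }

    void udivrem32(uint32_t x, uint32_t y, uint32_t &q, uint32_t &r) {
      uint32_t z = urecip(y);                       // initial inv(y) estimate
      // One round of UNR: z += mulhu(z, -y * z).
      uint32_t negYZ = (0u - y) * z;                // -y*z mod 2^32
      z += (uint32_t)(((uint64_t)z * negYZ) >> 32); // mulhu(z, negYZ)
      // Quotient/remainder estimate.
      q = (uint32_t)(((uint64_t)x * z) >> 32);      // mulhu(x, z)
      r = x - q * y;
      // Two refinement steps bound the estimate's error.
      if (r >= y) { ++q; r -= y; }
      if (r >= y) { ++q; r -= y; }
    }

The estimate never overshoots, so the remainder computation cannot underflow; each refinement corrects a quotient deficit of one, and the error after one UNR round is small enough that two steps always suffice.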
 
 SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
@@ -2164,8 +2219,7 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
 // Don't handle v2f16. The extra instructions to scalarize and repack around the
 // compare and vselect end up producing worse code than scalarizing the whole
 // operation.
-SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
-                                                      SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue X = Op.getOperand(0);
   EVT VT = Op.getValueType();
@@ -2194,75 +2248,6 @@ SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
   return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
 }
 
-SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc SL(Op);
-  SDValue X = Op.getOperand(0);
-
-  SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
-
-  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
-  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
-  const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
-  const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
-  EVT SetCCVT =
-      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
-
-  SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
-
-  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
-
-  SDValue Exp = extractF64Exponent(Hi, SL, DAG);
-
-  const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
-                                       MVT::i64);
-
-  SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
-  SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
-                          DAG.getConstant(INT64_C(0x0008000000000000), SL,
-                                          MVT::i64),
-                          Exp);
-
-  SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
-  SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
-                              DAG.getConstant(0, SL, MVT::i64), Tmp0,
-                              ISD::SETNE);
-
-  SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
-                             D, DAG.getConstant(0, SL, MVT::i64));
-  SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
-
-  K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
-  K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
-
-  SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
-  SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
-  SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
-
-  SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
-                            ExpEqNegOne,
-                            DAG.getConstantFP(1.0, SL, MVT::f64),
-                            DAG.getConstantFP(0.0, SL, MVT::f64));
-
-  SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
-
-  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
-  K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
-
-  return K;
-}
-
-SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-
-  if (isOperationLegal(ISD::FTRUNC, VT))
-    return LowerFROUND_LegalFTRUNC(Op, DAG);
-
-  if (VT == MVT::f64)
-    return LowerFROUND64(Op, DAG);
-
-  llvm_unreachable("unhandled type");
-}
-
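With FTRUNC available for every type that reaches it, the FTRUNC-based path above becomes the only FROUND lowering, and the f64 exponent-bit-twiddling version is deleted. A sketch of the retained path's behavior in scalar form, matching llvm.round's round-half-away-from-zero semantics:

    #include <cmath>

    float round_via_trunc(float x) {
      float t = std::trunc(x);    // T = ftrunc(x)
      float diff = x - t;         // fractional part, carries x's sign
      // If the fraction is at least one half, step one whole unit
      // further away from zero.
      float sel = std::fabs(diff) >= 0.5f ? std::copysign(1.0f, x) : 0.0f;
      return t + sel;             // T + Sel, as in the DAG code above
    }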
 SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue Src = Op.getOperand(0);
@@ -2793,6 +2778,7 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
 static SDValue simplifyI24(SDNode *Node24,
                            TargetLowering::DAGCombinerInfo &DCI) {
   SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
 
   SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
@@ -2806,11 +2792,11 @@ static SDValue simplifyI24(SDNode *Node24,
 
   APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
 
-  // First try to simplify using GetDemandedBits which allows the operands to
-  // have other uses, but will only perform simplifications that involve
-  // bypassing some nodes for this user.
-  SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
-  SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
+  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
+  // the operands to have other uses, but will only perform simplifications that
+  // involve bypassing some nodes for this user.
+  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
+  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
   if (DemandedLHS || DemandedRHS)
     return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
                        DemandedLHS ? DemandedLHS : LHS,
@@ -2818,7 +2804,6 @@ static SDValue simplifyI24(SDNode *Node24,
 
   // Now try SimplifyDemandedBits which can simplify the nodes used by our
   // operands if this node is the only user.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
     return SDValue(Node24, 0);
   if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
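What the switch to SimplifyMultipleUseDemandedBits preserves: only the low 24 bits of each mul24 operand are demanded, so a node that merely establishes the high bits can be looked through for this one use even while other users still need it. An illustrative scalar model:

    #include <cstdint>

    // Models v_mul_u24: only bits [23:0] of each operand participate
    // in the product.
    uint32_t mul_u24(uint32_t a, uint32_t b) {
      return (a & 0xFFFFFF) * (b & 0xFFFFFF);
    }
    // Hence mul_u24(a & 0xFFFFFF, b) == mul_u24(a, b) for all a, b:
    // the combiner can bypass such an AND when feeding this user,
    // without deleting the AND if other users depend on it.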
@@ -2877,7 +2862,7 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
     return SDValue();
 
   LoadSDNode *LN = cast<LoadSDNode>(N);
-  if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
     return SDValue();
 
   SDLoc SL(N);
@@ -2885,16 +2870,17 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
   EVT VT = LN->getMemoryVT();
   unsigned Size = VT.getStoreSize();
-  unsigned Align = LN->getAlignment();
-  if (Align < Size && isTypeLegal(VT)) {
+  Align Alignment = LN->getAlign();
+  if (Alignment < Size && isTypeLegal(VT)) {
     bool IsFast;
     unsigned AS = LN->getAddressSpace();
 
     // Expand unaligned loads earlier than legalization. Due to visitation order
     // problems during legalization, the emitted instructions to pack and unpack
     // the bytes again are not eliminated in the case of an unaligned copy.
-    if (!allowsMisalignedMemoryAccesses(
-            VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
+                                        LN->getMemOperand()->getFlags(),
+                                        &IsFast)) {
       SDValue Ops[2];
 
       if (VT.isVector())
@@ -2931,7 +2917,7 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
     return SDValue();
 
   StoreSDNode *SN = cast<StoreSDNode>(N);
-  if (SN->isVolatile() || !ISD::isNormalStore(SN))
+  if (!SN->isSimple() || !ISD::isNormalStore(SN))
     return SDValue();
 
   EVT VT = SN->getMemoryVT();
@@ -2939,8 +2925,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
 
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
-  unsigned Align = SN->getAlignment();
-  if (Align < Size && isTypeLegal(VT)) {
+  Align Alignment = SN->getAlign();
+  if (Alignment < Size && isTypeLegal(VT)) {
     bool IsFast;
     unsigned AS = SN->getAddressSpace();
 
     // Expand unaligned stores earlier than legalization. Due to visitation
     // order problems during legalization, the emitted instructions to pack and
     // unpack the bytes again are not eliminated in the case of an unaligned
     // copy.
-    if (!allowsMisalignedMemoryAccesses(
-            VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
+    if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
+                                        SN->getMemOperand()->getFlags(),
+                                        &IsFast)) {
       if (VT.isVector())
         return scalarizeVectorStore(SN, DAG);
@@ -3012,6 +2999,16 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
   case Intrinsic::amdgcn_mul_i24:
   case Intrinsic::amdgcn_mul_u24:
     return simplifyI24(N, DCI);
+  case Intrinsic::amdgcn_fract:
+  case Intrinsic::amdgcn_rsq:
+  case Intrinsic::amdgcn_rcp_legacy:
+  case Intrinsic::amdgcn_rsq_legacy:
+  case Intrinsic::amdgcn_rsq_clamp:
+  case Intrinsic::amdgcn_ldexp: {
+    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
+    SDValue Src = N->getOperand(1);
+    return Src.isUndef() ? Src : SDValue();
+  }
   default:
     return SDValue();
   }
@@ -3465,24 +3462,24 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C
   ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
   SDValue CmpLHS = Cond.getOperand(0);
 
-  unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
-                                             AMDGPUISD::FFBH_U32;
-
   // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
   // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
   if (CCOpcode == ISD::SETEQ &&
       (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
-      RHS.getOperand(0) == CmpLHS &&
-      isNegativeOne(LHS)) {
+      RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
+    unsigned Opc =
+        isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
   }
 
   // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
   // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
   if (CCOpcode == ISD::SETNE &&
-      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
-      LHS.getOperand(0) == CmpLHS &&
-      isNegativeOne(RHS)) {
+      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
+      LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
+    unsigned Opc =
+        isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
+
     return getFFBX_U32(DAG, CmpLHS, SL, Opc);
   }
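The combine above relies on the hardware bit-scan behavior for zero: ffbh_u32/ffbl_b32 already return -1 when the input is 0, so the zero-guarded ctlz_zero_undef/cttz_zero_undef pattern collapses into the bare instruction. A scalar sketch (__builtin_clz is used for illustration; it is undefined at 0, which is exactly why the select exists before the combine fires):

    #include <cstdint>

    uint32_t ctlz_guarded(uint32_t x) {
      // select (setcc x, 0, eq), -1, (ctlz_zero_undef x)
      return x == 0 ? 0xFFFFFFFFu : (uint32_t)__builtin_clz(x);
      // ...which is what a single ffbh_u32 x computes on its own.
    }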
 
@@ -4117,12 +4114,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
 
 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                    const TargetRegisterClass *RC,
-                                                   unsigned Reg, EVT VT,
+                                                   Register Reg, EVT VT,
                                                    const SDLoc &SL,
                                                    bool RawReg) const {
   MachineFunction &MF = DAG.getMachineFunction();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  unsigned VReg;
+  Register VReg;
 
   if (!MRI.isLiveIn(Reg)) {
     VReg = MRI.createVirtualRegister(RC);
@@ -4266,11 +4263,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(DIV_FMAS)
   NODE_NAME_CASE(DIV_FIXUP)
   NODE_NAME_CASE(FMAD_FTZ)
-  NODE_NAME_CASE(TRIG_PREOP)
   NODE_NAME_CASE(RCP)
   NODE_NAME_CASE(RSQ)
   NODE_NAME_CASE(RCP_LEGACY)
-  NODE_NAME_CASE(RSQ_LEGACY)
   NODE_NAME_CASE(RCP_IFLAG)
   NODE_NAME_CASE(FMUL_LEGACY)
   NODE_NAME_CASE(RSQ_CLAMP)
@@ -4298,8 +4293,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(MAD_U64_U32)
   NODE_NAME_CASE(PERM)
   NODE_NAME_CASE(TEXTURE_FETCH)
-  NODE_NAME_CASE(EXPORT)
-  NODE_NAME_CASE(EXPORT_DONE)
   NODE_NAME_CASE(R600_EXPORT)
   NODE_NAME_CASE(CONST_ADDRESS)
   NODE_NAME_CASE(REGISTER_LOAD)
@@ -4323,12 +4316,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CONST_DATA_PTR)
   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
   NODE_NAME_CASE(LDS)
-  NODE_NAME_CASE(KILL)
   NODE_NAME_CASE(DUMMY_CHAIN)
   case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
-  NODE_NAME_CASE(INTERP_P1LL_F16)
-  NODE_NAME_CASE(INTERP_P1LV_F16)
-  NODE_NAME_CASE(INTERP_P2_F16)
   NODE_NAME_CASE(LOAD_D16_HI)
   NODE_NAME_CASE(LOAD_D16_LO)
   NODE_NAME_CASE(LOAD_D16_HI_I8)
@@ -4347,6 +4336,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(ATOMIC_DEC)
   NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
   NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
+  NODE_NAME_CASE(ATOMIC_LOAD_CSUB)
   NODE_NAME_CASE(BUFFER_LOAD)
   NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
   NODE_NAME_CASE(BUFFER_LOAD_USHORT)
@@ -4373,6 +4363,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BUFFER_ATOMIC_INC)
   NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+  NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
   NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
   NODE_NAME_CASE(ATOMIC_PK_FADD)
@@ -4539,11 +4530,10 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
   }
   case AMDGPUISD::LDS: {
     auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
-    unsigned Align = GA->getGlobal()->getAlignment();
+    Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
 
     Known.Zero.setHighBits(16);
-    if (Align)
-      Known.Zero.setLowBits(Log2_32(Align));
+    Known.Zero.setLowBits(Log2(Alignment));
     break;
   }
   case ISD::INTRINSIC_WO_CHAIN: {
@@ -4607,6 +4597,29 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
   }
 }
 
+unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
+    GISelKnownBits &Analysis, Register R,
+    const APInt &DemandedElts, const MachineRegisterInfo &MRI,
+    unsigned Depth) const {
+  const MachineInstr *MI = MRI.getVRegDef(R);
+  if (!MI)
+    return 1;
+
+  // TODO: Check range metadata on MMO.
+  switch (MI->getOpcode()) {
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
+    return 25;
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
+    return 17;
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+    return 24;
+  case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+    return 16;
+  default:
+    return 1;
+  }
+}
+
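The constants in the new computeNumSignBitsForTargetInstr come from the usual extension arithmetic: sign-extending an N-bit load into 32 bits leaves 32 - N + 1 copies of the sign bit, while zero-extending leaves 32 - N known-zero high bits. Illustrative helpers, not from the patch:

    unsigned signBitsOfSExtLoad(unsigned srcBits) { return 32 - srcBits + 1; }
    unsigned signBitsOfZExtLoad(unsigned srcBits) { return 32 - srcBits; }
    // SBYTE:  32 - 8 + 1  = 25    SSHORT: 32 - 16 + 1 = 17
    // UBYTE:  32 - 8      = 24    USHORT: 32 - 16     = 16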
 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
                                                         const SelectionDAG &DAG,
                                                         bool SNaN,
@@ -4648,7 +4661,6 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
   case AMDGPUISD::RCP:
   case AMDGPUISD::RSQ:
   case AMDGPUISD::RCP_LEGACY:
-  case AMDGPUISD::RSQ_LEGACY:
   case AMDGPUISD::RSQ_CLAMP: {
     if (SNaN)
       return true;
@@ -4665,7 +4677,6 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
   case AMDGPUISD::DIV_SCALE:
   case AMDGPUISD::DIV_FMAS:
   case AMDGPUISD::DIV_FIXUP:
-  case AMDGPUISD::TRIG_PREOP:
     // TODO: Refine on operands.
     return SNaN;
   case AMDGPUISD::SIN_HW:
@@ -4692,6 +4703,18 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
       return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
              DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
     }
+    case Intrinsic::amdgcn_rcp:
+    case Intrinsic::amdgcn_rsq:
+    case Intrinsic::amdgcn_rcp_legacy:
+    case Intrinsic::amdgcn_rsq_legacy:
+    case Intrinsic::amdgcn_rsq_clamp: {
+      if (SNaN)
+        return true;
+
+      // TODO: Need is known positive check.
+      return false;
+    }
+    case Intrinsic::amdgcn_trig_preop:
     case Intrinsic::amdgcn_fdot2:
       // TODO: Refine on operand
       return SNaN;