Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2276
1 file changed, 1442 insertions, 834 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e73d87cd66af..d035aa8f72bd 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -11,11 +11,6 @@ // //===----------------------------------------------------------------------===// -#if defined(_MSC_VER) || defined(__MINGW32__) -// Provide M_PI. -#define _USE_MATH_DEFINES -#endif - #include "SIISelLowering.h" #include "AMDGPU.h" #include "AMDGPUSubtarget.h" @@ -40,6 +35,7 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/DAGCombine.h" #include "llvm/CodeGen/ISDOpcodes.h" +#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -95,14 +91,24 @@ static cl::opt<bool> DisableLoopAlignment( cl::desc("Do not align and prefetch loops"), cl::init(false)); +static cl::opt<bool> VGPRReserveforSGPRSpill( + "amdgpu-reserve-vgpr-for-sgpr-spill", + cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true)); + +static cl::opt<bool> UseDivergentRegisterIndexing( + "amdgpu-use-divergent-register-indexing", + cl::Hidden, + cl::desc("Use indirect register addressing for divergent indexes"), + cl::init(false)); + static bool hasFP32Denormals(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - return Info->getMode().FP32Denormals; + return Info->getMode().allFP32Denormals(); } static bool hasFP64FP16Denormals(const MachineFunction &MF) { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - return Info->getMode().FP64FP16Denormals; + return Info->getMode().allFP64FP16Denormals(); } static unsigned findFirstFreeSGPR(CCState &CCInfo) { @@ -141,12 +147,21 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass); addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass); - addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); + addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass); addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass); - addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass); + addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass); + addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass); + + addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass); addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass); + addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass); + addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass); + + addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass); + addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass); + if (Subtarget->has16BitInsts()) { addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); @@ -158,10 +173,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } - if (Subtarget->hasMAIInsts()) { - addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); - addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); - } + addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); + addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass); computeRegisterProperties(Subtarget->getRegisterInfo()); @@ -202,6 +215,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand); 
setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand); + setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand); + setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand); + setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand); + + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand); + setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand); + setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand); + setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand); + setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand); setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); @@ -224,6 +248,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand); + setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand); + setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom); @@ -260,7 +290,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // with > 4 elements. for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, - MVT::v32i32, MVT::v32f32 }) { + MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, + MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -304,6 +335,48 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32); } + for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32); + } + + for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32); + + setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32); + } + + for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) { + setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32); + + setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32); + + setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote); + AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32); + + 
setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote); + AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32); + } + setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand); @@ -361,9 +434,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); } - setOperationAction(ISD::BSWAP, MVT::i32, Legal); setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + // FIXME: This should be narrowed to i32, but that only happens if i64 is + // illegal. + // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32. + setOperationAction(ISD::BSWAP, MVT::i64, Legal); + setOperationAction(ISD::BSWAP, MVT::i32, Legal); + // On SI this is s_memtime and s_memrealtime on VI. setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal); setOperationAction(ISD::TRAP, MVT::Other, Custom); @@ -376,10 +454,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FLOG10, MVT::f16, Custom); } - // v_mad_f32 does not support denormals. We report it as unconditionally - // legal, and the context where it is formed will disallow it when fp32 - // denormals are enabled. - setOperationAction(ISD::FMAD, MVT::f32, Legal); + if (Subtarget->hasMadMacF32Insts()) + setOperationAction(ISD::FMAD, MVT::f32, Legal); if (!Subtarget->hasBFI()) { // fcopysign can be done in a single instruction with BFI. @@ -463,7 +539,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SREM, MVT::i16, Promote); setOperationAction(ISD::UREM, MVT::i16, Promote); - setOperationAction(ISD::BSWAP, MVT::i16, Promote); setOperationAction(ISD::BITREVERSE, MVT::i16, Promote); setOperationAction(ISD::CTTZ, MVT::i16, Promote); @@ -499,8 +574,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // F16 - VOP1 Actions. setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); - setOperationAction(ISD::FCOS, MVT::f16, Promote); - setOperationAction(ISD::FSIN, MVT::f16, Promote); + setOperationAction(ISD::FCOS, MVT::f16, Custom); + setOperationAction(ISD::FSIN, MVT::f16, Custom); setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom); @@ -545,6 +620,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, } } + // v_perm_b32 can handle either of these. + setOperationAction(ISD::BSWAP, MVT::i16, Legal); + setOperationAction(ISD::BSWAP, MVT::v2i16, Legal); + setOperationAction(ISD::BSWAP, MVT::v4i16, Custom); + // XXX - Do these do anything? Vector constants turn into build_vector. setOperationAction(ISD::Constant, MVT::v2i16, Legal); setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal); @@ -686,6 +766,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); } + setOperationAction(ISD::SMULO, MVT::i64, Custom); + setOperationAction(ISD::UMULO, MVT::i64, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); @@ -762,6 +845,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX); setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD); + // FIXME: In other contexts we pretend this is a per-function property. 
+ setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32); + setSchedulingPreference(Sched::RegPressure); } @@ -783,6 +869,7 @@ bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) && DestVT.getScalarType() == MVT::f32 && SrcVT.getScalarType() == MVT::f16 && + // TODO: This probably only requires no input flushing? !hasFP32Denormals(DAG.getMachineFunction()); } @@ -877,45 +964,33 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); } -static MVT memVTFromAggregate(Type *Ty) { - // Only limited forms of aggregate type currently expected. - assert(Ty->isStructTy() && "Expected struct type"); - +static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) { + assert(DMaskLanes != 0); - Type *ElementType = nullptr; - unsigned NumElts; - if (Ty->getContainedType(0)->isVectorTy()) { - VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0)); - ElementType = VecComponent->getElementType(); - NumElts = VecComponent->getNumElements(); - } else { - ElementType = Ty->getContainedType(0); - NumElts = 1; + if (auto *VT = dyn_cast<FixedVectorType>(Ty)) { + unsigned NumElts = std::min(DMaskLanes, VT->getNumElements()); + return EVT::getVectorVT(Ty->getContext(), + EVT::getEVT(VT->getElementType()), + NumElts); } - assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type"); + return EVT::getEVT(Ty); +} - // Calculate the size of the memVT type from the aggregate - unsigned Pow2Elts = 0; - unsigned ElementSize; - switch (ElementType->getTypeID()) { - default: - llvm_unreachable("Unknown type!"); - case Type::IntegerTyID: - ElementSize = cast<IntegerType>(ElementType)->getBitWidth(); - break; - case Type::HalfTyID: - ElementSize = 16; - break; - case Type::FloatTyID: - ElementSize = 32; - break; - } - unsigned AdditionalElts = ElementSize == 16 ? 2 : 1; - Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts); +// Peek through TFE struct returns to only use the data size. +static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) { + auto *ST = dyn_cast<StructType>(Ty); + if (!ST) + return memVTFromImageData(Ty, DMaskLanes); - return MVT::getVectorVT(MVT::getVT(ElementType, false), - Pow2Elts); + // Some intrinsics return an aggregate type - special case to work out the + // correct memVT. + // + // Only limited forms of aggregate type currently expected. + if (ST->getNumContainedTypes() != 2 || + !ST->getContainedType(1)->isIntegerTy(32)) + return EVT(); + return memVTFromImageData(ST->getContainedType(0), DMaskLanes); } bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, @@ -944,17 +1019,40 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.flags = MachineMemOperand::MODereferenceable; if (Attr.hasFnAttribute(Attribute::ReadOnly)) { + unsigned DMaskLanes = 4; + + if (RsrcIntr->IsImage) { + const AMDGPU::ImageDimIntrinsicInfo *Intr + = AMDGPU::getImageDimIntrinsicInfo(IntrID); + const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = + AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode); + + if (!BaseOpcode->Gather4) { + // If this isn't a gather, we may have excess loaded elements in the + // IR type. Check the dmask for the real number of elements loaded. + unsigned DMask + = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue(); + DMaskLanes = DMask == 0 ? 
1 : countPopulation(DMask); + } + + Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes); + } else + Info.memVT = EVT::getEVT(CI.getType()); + + // FIXME: What does alignment mean for an image? Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.memVT = MVT::getVT(CI.getType(), true); - if (Info.memVT == MVT::Other) { - // Some intrinsics return an aggregate type - special case to work out - // the correct memVT - Info.memVT = memVTFromAggregate(CI.getType()); - } Info.flags |= MachineMemOperand::MOLoad; } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) { Info.opc = ISD::INTRINSIC_VOID; - Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType()); + + Type *DataTy = CI.getArgOperand(0)->getType(); + if (RsrcIntr->IsImage) { + unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue(); + unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask); + Info.memVT = memVTFromImageData(DataTy, DMaskLanes); + } else + Info.memVT = EVT::getEVT(DataTy); + Info.flags |= MachineMemOperand::MOStore; } else { // Atomic @@ -1031,6 +1129,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return true; } + case Intrinsic::amdgcn_global_atomic_csub: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags = MachineMemOperand::MOLoad | + MachineMemOperand::MOStore | + MachineMemOperand::MODereferenceable | + MachineMemOperand::MOVolatile; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1226,9 +1335,10 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // addressing modes, so treat them as having no offset like flat // instructions. return isLegalFlatAddressingMode(AM); - } else { - llvm_unreachable("unhandled address space"); } + + // Assume a user alias of global for unknown address spaces. + return isLegalGlobalAddressingMode(AM); } bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, @@ -1279,9 +1389,11 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { + // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so + // 2-byte alignment is worse than 1 unless doing a 2-byte accesss. *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS || AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ? - (Align % 4 == 0) : true; + Align >= 4 : Align != 2; } return true; @@ -1320,18 +1432,17 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( } EVT SITargetLowering::getOptimalMemOpType( - uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, - bool ZeroMemset, bool MemcpyStrSrc, - const AttributeList &FuncAttributes) const { + const MemOp &Op, const AttributeList &FuncAttributes) const { // FIXME: Should account for address space here. // The default fallback uses the private pointer size as a guess for a type to // use. Make sure we switch these to 64-bit accesses. - if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global + if (Op.size() >= 16 && + Op.isDstAligned(Align(4))) // XXX: Should only do for global return MVT::v4i32; - if (Size >= 8 && DstAlign >= 4) + if (Op.size() >= 8 && Op.isDstAligned(Align(4))) return MVT::v2i32; // Use the default. 
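The dmask handling added to getTgtMemIntrinsic above amounts to trimming the intrinsic's IR result type to the number of enabled dmask lanes (gather4 and the TFE struct return are special-cased separately). The following is a minimal standalone sketch of that trimming rule, not code from the patch: the helper name is made up and __builtin_popcount stands in for llvm::countPopulation.

#include <algorithm>

// Sketch: given the element count of the IR result vector and the image
// instruction's dmask operand, compute how many elements the hardware really
// transfers -- the count the patch feeds into Info.memVT.
unsigned memVTNumElements(unsigned IRVectorElts, unsigned DMask) {
  // A dmask of 0 still moves one element (mirrors the DMask == 0 ? 1 : ... above).
  unsigned DMaskLanes = DMask == 0 ? 1 : (unsigned)__builtin_popcount(DMask);
  // The memory type never carries more lanes than the IR type declares.
  return std::min(DMaskLanes, IRVectorElts);
}

For example, an image load declared as <4 x float> with dmask = 0b0101 transfers only two components, so the memVT recorded for the memory operand is a 2-element vector rather than the full v4f32.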
@@ -1416,9 +1527,10 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, const ArgDescriptor *InputPtrReg; const TargetRegisterClass *RC; + LLT ArgTy; - std::tie(InputPtrReg, RC) - = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); + std::tie(InputPtrReg, RC, ArgTy) = + Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS); @@ -1457,7 +1569,7 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, } if (MemVT.isFloatingPoint()) - Val = getFPExtOrFPTrunc(DAG, Val, SL, VT); + Val = getFPExtOrFPRound(DAG, Val, SL, VT); else if (Signed) Val = DAG.getSExtOrTrunc(Val, SL, VT); else @@ -1467,16 +1579,15 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT, } SDValue SITargetLowering::lowerKernargMemParameter( - SelectionDAG &DAG, EVT VT, EVT MemVT, - const SDLoc &SL, SDValue Chain, - uint64_t Offset, unsigned Align, bool Signed, - const ISD::InputArg *Arg) const { + SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain, + uint64_t Offset, Align Alignment, bool Signed, + const ISD::InputArg *Arg) const { MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS); // Try to avoid using an extload by loading earlier than the argument address, // and extracting the relevant bits. The load should hopefully be merged with // the previous argument. - if (MemVT.getStoreSize() < 4 && Align < 4) { + if (MemVT.getStoreSize() < 4 && Alignment < 4) { // TODO: Handle align < 4 and size >= 4 (can happen with packed structs). int64_t AlignDownOffset = alignDown(Offset, 4); int64_t OffsetDiff = Offset - AlignDownOffset; @@ -1502,9 +1613,9 @@ SDValue SITargetLowering::lowerKernargMemParameter( } SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset); - SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align, + SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment, MachineMemOperand::MODereferenceable | - MachineMemOperand::MOInvariant); + MachineMemOperand::MOInvariant); SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg); return DAG.getMergeValues({ Val, Load.getValue(1) }, SL); @@ -1565,8 +1676,9 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, AMDGPUFunctionArgInfo::PreloadedValue PVID) const { const ArgDescriptor *Reg; const TargetRegisterClass *RC; + LLT Ty; - std::tie(Reg, RC) = MFI.getPreloadedValue(PVID); + std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT); } @@ -1666,7 +1778,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u, unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs); if (RegIdx == ArgVGPRs.size()) { // Spill to stack required. - int64_t Offset = CCInfo.AllocateStack(4, 4); + int64_t Offset = CCInfo.AllocateStack(4, Align(4)); return ArgDescriptor::createStack(Offset, Mask); } @@ -1706,10 +1818,11 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } -void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) const { +/// Allocate implicit function VGPR arguments at the end of allocated user +/// arguments. 
+void SITargetLowering::allocateSpecialInputVGPRs( + CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { const unsigned Mask = 0x3ff; ArgDescriptor Arg; @@ -1727,6 +1840,20 @@ void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo, Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); } +/// Allocate implicit function VGPR arguments in fixed registers. +void SITargetLowering::allocateSpecialInputVGPRsFixed( + CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { + Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31); + if (!Reg) + report_fatal_error("failed to allocated VGPR for implicit arguments"); + + const unsigned Mask = 0x3ff; + Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask)); + Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10)); + Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20)); +} + void SITargetLowering::allocateSpecialInputSGPRs( CCState &CCInfo, MachineFunction &MF, @@ -1742,8 +1869,10 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (Info.hasQueuePtr()) ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo); - if (Info.hasKernargSegmentPtr()) - ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo); + // Implicit arg ptr takes the place of the kernarg segment pointer. This is a + // constant offset from the kernarg segment. + if (Info.hasImplicitArgPtr()) + ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); if (Info.hasDispatchID()) ArgInfo.DispatchID = allocateSGPR64Input(CCInfo); @@ -1758,9 +1887,6 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (Info.hasWorkGroupIDZ()) ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo); - - if (Info.hasImplicitArgPtr()) - ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo); } // Allocate special inputs passed in user SGPRs. @@ -1916,67 +2042,45 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, Info.setScratchRSrcReg(ReservedBufferReg); } - // hasFP should be accurate for kernels even before the frame is finalized. - if (ST.getFrameLowering()->hasFP(MF)) { - MachineRegisterInfo &MRI = MF.getRegInfo(); + MachineRegisterInfo &MRI = MF.getRegInfo(); - // Try to use s32 as the SP, but move it if it would interfere with input - // arguments. This won't work with calls though. - // - // FIXME: Move SP to avoid any possible inputs, or find a way to spill input - // registers. - if (!MRI.isLiveIn(AMDGPU::SGPR32)) { - Info.setStackPtrOffsetReg(AMDGPU::SGPR32); - } else { - assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); + // For entry functions we have to set up the stack pointer if we use it, + // whereas non-entry functions get this "for free". This means there is no + // intrinsic advantage to using S32 over S34 in cases where we do not have + // calls but do need a frame pointer (i.e. if we are requested to have one + // because frame pointer elimination is disabled). To keep things simple we + // only ever use S32 as the call ABI stack pointer, and so using it does not + // imply we need a separate frame pointer. + // + // Try to use s32 as the SP, but move it if it would interfere with input + // arguments. This won't work with calls though. + // + // FIXME: Move SP to avoid any possible inputs, or find a way to spill input + // registers. 
+ if (!MRI.isLiveIn(AMDGPU::SGPR32)) { + Info.setStackPtrOffsetReg(AMDGPU::SGPR32); + } else { + assert(AMDGPU::isShader(MF.getFunction().getCallingConv())); - if (MFI.hasCalls()) - report_fatal_error("call in graphics shader with too many input SGPRs"); + if (MFI.hasCalls()) + report_fatal_error("call in graphics shader with too many input SGPRs"); - for (unsigned Reg : AMDGPU::SGPR_32RegClass) { - if (!MRI.isLiveIn(Reg)) { - Info.setStackPtrOffsetReg(Reg); - break; - } + for (unsigned Reg : AMDGPU::SGPR_32RegClass) { + if (!MRI.isLiveIn(Reg)) { + Info.setStackPtrOffsetReg(Reg); + break; } - - if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) - report_fatal_error("failed to find register for SP"); } - if (MFI.hasCalls()) { - Info.setScratchWaveOffsetReg(AMDGPU::SGPR33); - Info.setFrameOffsetReg(AMDGPU::SGPR33); - } else { - unsigned ReservedOffsetReg = - TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); - Info.setScratchWaveOffsetReg(ReservedOffsetReg); - Info.setFrameOffsetReg(ReservedOffsetReg); - } - } else if (RequiresStackAccess) { - assert(!MFI.hasCalls()); - // We know there are accesses and they will be done relative to SP, so just - // pin it to the input. - // - // FIXME: Should not do this if inline asm is reading/writing these - // registers. - Register PreloadedSP = Info.getPreloadedReg( - AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); - - Info.setStackPtrOffsetReg(PreloadedSP); - Info.setScratchWaveOffsetReg(PreloadedSP); - Info.setFrameOffsetReg(PreloadedSP); - } else { - assert(!MFI.hasCalls()); + if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG) + report_fatal_error("failed to find register for SP"); + } - // There may not be stack access at all. There may still be spills, or - // access of a constant pointer (in which cases an extra copy will be - // emitted in the prolog). - unsigned ReservedOffsetReg - = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF); - Info.setStackPtrOffsetReg(ReservedOffsetReg); - Info.setScratchWaveOffsetReg(ReservedOffsetReg); - Info.setFrameOffsetReg(ReservedOffsetReg); + // hasFP should be accurate for entry functions even before the frame is + // finalized, because it does not rely on the known stack size, only + // properties like whether variable sized objects are present. + if (ST.getFrameLowering()->hasFP(MF)) { + Info.setFrameOffsetReg(AMDGPU::SGPR33); } } @@ -2110,6 +2214,10 @@ SDValue SITargetLowering::LowerFormalArguments( if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); + } else { + // For the fixed ABI, pass workitem IDs in the last argument register. + if (AMDGPUTargetMachine::EnableFixedFunctionABI) + allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } if (IsKernel) { @@ -2126,9 +2234,9 @@ SDValue SITargetLowering::LowerFormalArguments( // // FIXME: Alignment of explicit arguments totally broken with non-0 explicit // kern arg offset. 
- const unsigned KernelArgBaseAlign = 16; + const Align KernelArgBaseAlign = Align(16); - for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { + for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) { const ISD::InputArg &Arg = Ins[i]; if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) { InVals.push_back(DAG.getUNDEF(Arg.VT)); @@ -2143,10 +2251,11 @@ SDValue SITargetLowering::LowerFormalArguments( EVT MemVT = VA.getLocVT(); const uint64_t Offset = VA.getLocMemOffset(); - unsigned Align = MinAlign(KernelArgBaseAlign, Offset); + Align Alignment = commonAlignment(KernelArgBaseAlign, Offset); - SDValue Arg = lowerKernargMemParameter( - DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]); + SDValue Arg = + lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment, + Ins[i].Flags.isSExt(), &Ins[i]); Chains.push_back(Arg.getValue(1)); auto *ParamTy = @@ -2221,7 +2330,7 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - if (!IsEntryFunc) { + if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) { // Special inputs come after user arguments. allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info); } @@ -2231,8 +2340,6 @@ SDValue SITargetLowering::LowerFormalArguments( allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); - CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); - CCInfo.AllocateReg(Info->getFrameOffsetReg()); allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info); } @@ -2442,50 +2549,51 @@ void SITargetLowering::passSpecialInputs( SDValue Chain) const { // If we don't have a call site, this was a call inserted by // legalization. These can never use special inputs. - if (!CLI.CS) + if (!CLI.CB) return; - const Function *CalleeFunc = CLI.CS.getCalledFunction(); - assert(CalleeFunc); - SelectionDAG &DAG = CLI.DAG; const SDLoc &DL = CLI.DL; const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - - auto &ArgUsageInfo = - DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); - const AMDGPUFunctionArgInfo &CalleeArgInfo - = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); - const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo(); + const AMDGPUFunctionArgInfo *CalleeArgInfo + = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo; + if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) { + auto &ArgUsageInfo = + DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>(); + CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc); + } + // TODO: Unify with private memory register handling. This is complicated by // the fact that at least in kernels, the input argument is not necessarily // in the same location as the input. 
AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = { AMDGPUFunctionArgInfo::DISPATCH_PTR, AMDGPUFunctionArgInfo::QUEUE_PTR, - AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR, + AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, AMDGPUFunctionArgInfo::DISPATCH_ID, AMDGPUFunctionArgInfo::WORKGROUP_ID_X, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, - AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, - AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR + AMDGPUFunctionArgInfo::WORKGROUP_ID_Z }; for (auto InputID : InputRegs) { const ArgDescriptor *OutgoingArg; const TargetRegisterClass *ArgRC; + LLT ArgTy; - std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID); + std::tie(OutgoingArg, ArgRC, ArgTy) = + CalleeArgInfo->getPreloadedValue(InputID); if (!OutgoingArg) continue; const ArgDescriptor *IncomingArg; const TargetRegisterClass *IncomingArgRC; - std::tie(IncomingArg, IncomingArgRC) - = CallerArgInfo.getPreloadedValue(InputID); + LLT Ty; + std::tie(IncomingArg, IncomingArgRC, Ty) = + CallerArgInfo.getPreloadedValue(InputID); assert(IncomingArgRC == ArgRC); // All special arguments are ints for now. @@ -2503,8 +2611,11 @@ void SITargetLowering::passSpecialInputs( if (OutgoingArg->isRegister()) { RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); + if (!CCInfo.AllocateReg(OutgoingArg->getRegister())) + report_fatal_error("failed to allocate implicit input argument"); } else { - unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4); + unsigned SpecialArgOffset = + CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4)); SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset); MemOpChains.push_back(ArgStore); @@ -2515,33 +2626,34 @@ void SITargetLowering::passSpecialInputs( // packed. const ArgDescriptor *OutgoingArg; const TargetRegisterClass *ArgRC; + LLT Ty; - std::tie(OutgoingArg, ArgRC) = - CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); + std::tie(OutgoingArg, ArgRC, Ty) = + CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X); if (!OutgoingArg) - std::tie(OutgoingArg, ArgRC) = - CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); + std::tie(OutgoingArg, ArgRC, Ty) = + CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y); if (!OutgoingArg) - std::tie(OutgoingArg, ArgRC) = - CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); + std::tie(OutgoingArg, ArgRC, Ty) = + CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z); if (!OutgoingArg) return; - const ArgDescriptor *IncomingArgX - = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first; - const ArgDescriptor *IncomingArgY - = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first; - const ArgDescriptor *IncomingArgZ - = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first; + const ArgDescriptor *IncomingArgX = std::get<0>( + CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X)); + const ArgDescriptor *IncomingArgY = std::get<0>( + CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y)); + const ArgDescriptor *IncomingArgZ = std::get<0>( + CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z)); SDValue InputReg; SDLoc SL; // If incoming ids are not packed we need to pack them. 
- if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX) + if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX) InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX); - if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) { + if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) { SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY); Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y, DAG.getShiftAmountConstant(10, MVT::i32, SL)); @@ -2549,7 +2661,7 @@ void SITargetLowering::passSpecialInputs( DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y; } - if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) { + if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) { SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ); Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z, DAG.getShiftAmountConstant(20, MVT::i32, SL)); @@ -2569,8 +2681,9 @@ void SITargetLowering::passSpecialInputs( if (OutgoingArg->isRegister()) { RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg); + CCInfo.AllocateReg(OutgoingArg->getRegister()); } else { - unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4); + unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4)); SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset); MemOpChains.push_back(ArgStore); @@ -2703,10 +2816,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, "unsupported call to variadic function "); } - if (!CLI.CS.getInstruction()) + if (!CLI.CB) report_fatal_error("unsupported libcall legalization"); - if (!CLI.CS.getCalledFunction()) { + if (!AMDGPUTargetMachine::EnableFixedFunctionABI && + !CLI.CB->getCalledFunction()) { return lowerUnhandledCall(CLI, InVals, "unsupported indirect call to function "); } @@ -2726,7 +2840,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) { IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) { + if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) { report_fatal_error("failed to perform tail call elimination on a call " "site marked musttail"); } @@ -2743,12 +2857,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; + SmallVector<SDValue, 8> MemOpChains; // Analyze operands of the call, assigning locations to each operand. SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); + if (AMDGPUTargetMachine::EnableFixedFunctionABI) { + // With a fixed ABI, allocate fixed registers before user arguments. + passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); + } + CCInfo.AnalyzeCallOperands(Outs, AssignFn); // Get a count of how many bytes are to be pushed on the stack. @@ -2767,7 +2888,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // arguments to begin at SP+0. Completely unused for non-tail calls. int32_t FPDiff = 0; MachineFrameInfo &MFI = MF.getFrameInfo(); - SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass; // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass @@ -2784,7 +2904,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Chain = DAG.getTokenFactor(DL, CopyFromChains); } - SmallVector<SDValue, 8> MemOpChains; MVT PtrVT = MVT::i32; // Walk the register/memloc assignments, inserting copies/loads. @@ -2837,7 +2956,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // FIXME: We can have better than the minimum byval required alignment. Alignment = Flags.isByVal() - ? MaybeAlign(Flags.getByValAlign()) + ? Flags.getNonZeroByValAlign() : commonAlignment(Subtarget->getStackAlignment(), Offset); Offset = Offset + FPDiff; @@ -2864,11 +2983,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (Outs[i].Flags.isByVal()) { SDValue SizeNode = DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32); - SDValue Cpy = DAG.getMemcpy( - Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(), - /*isVol = */ false, /*AlwaysInline = */ true, - /*isTailCall = */ false, DstInfo, - MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS)); + SDValue Cpy = + DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode, + Outs[i].Flags.getNonZeroByValAlign(), + /*isVol = */ false, /*AlwaysInline = */ true, + /*isTailCall = */ false, DstInfo, + MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS)); MemOpChains.push_back(Cpy); } else { @@ -2879,8 +2999,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } } - // Copy special input registers after user input arguments. - passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); + if (!AMDGPUTargetMachine::EnableFixedFunctionABI) { + // Copy special input registers after user input arguments. + passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); + } if (!MemOpChains.empty()) Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains); @@ -2927,9 +3049,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(Callee); // Add a redundant copy of the callee global which will not be legalized, as // we need direct access to the callee later. - GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee); - const GlobalValue *GV = GSD->getGlobal(); - Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64)); + if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) { + const GlobalValue *GV = GSD->getGlobal(); + Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64)); + } else { + Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64)); + } if (IsTailCall) { // Each tail call may have to adjust the stack by a different amount, so @@ -2985,6 +3110,71 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, IsThisReturn ? OutVals[0] : SDValue()); } +// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC, +// except for applying the wave size scale to the increment amount. +SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl( + SDValue Op, SelectionDAG &DAG) const { + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + SDLoc dl(Op); + EVT VT = Op.getValueType(); + SDValue Tmp1 = Op; + SDValue Tmp2 = Op.getValue(1); + SDValue Tmp3 = Op.getOperand(2); + SDValue Chain = Tmp1.getOperand(0); + + Register SPReg = Info->getStackPtrOffsetReg(); + + // Chain the dynamic stack allocation so that it doesn't modify the stack + // pointer when other instructions are using the stack. 
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); + + SDValue Size = Tmp2.getOperand(1); + SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); + Chain = SP.getValue(1); + MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); + const TargetFrameLowering *TFL = ST.getFrameLowering(); + unsigned Opc = + TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ? + ISD::ADD : ISD::SUB; + + SDValue ScaledSize = DAG.getNode( + ISD::SHL, dl, VT, Size, + DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32)); + + Align StackAlign = TFL->getStackAlign(); + Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value + if (Alignment && *Alignment > StackAlign) { + Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, + DAG.getConstant(-(uint64_t)Alignment->value() + << ST.getWavefrontSizeLog2(), + dl, VT)); + } + + Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain + Tmp2 = DAG.getCALLSEQ_END( + Chain, DAG.getIntPtrConstant(0, dl, true), + DAG.getIntPtrConstant(0, dl, true), SDValue(), dl); + + return DAG.getMergeValues({Tmp1, Tmp2}, dl); +} + +SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, + SelectionDAG &DAG) const { + // We only handle constant sizes here to allow non-entry block, static sized + // allocas. A truly dynamic value is more difficult to support because we + // don't know if the size value is uniform or not. If the size isn't uniform, + // we would need to do a wave reduction to get the maximum size to know how + // much to increment the uniform stack pointer. + SDValue Size = Op.getOperand(1); + if (isa<ConstantSDNode>(Size)) + return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion. + + return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch<Register>(RegName) @@ -3310,9 +3500,15 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx, InitResultReg, DstReg, PhiReg, TmpExec, Offset, UseGPRIdxMode, IsIndirectSrc); - - MachineBasicBlock::iterator First = RemainderBB->begin(); - BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec) + MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock(); + MachineFunction::iterator MBBI(LoopBB); + ++MBBI; + MF->insert(MBBI, LandingPad); + LoopBB->removeSuccessor(RemainderBB); + LandingPad->addSuccessor(RemainderBB); + LoopBB->addSuccessor(LandingPad); + MachineBasicBlock::iterator First = LandingPad->begin(); + BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec) .addReg(SaveExec); return InsPt; @@ -3331,7 +3527,7 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI, if (Offset >= NumElts || Offset < 0) return std::make_pair(AMDGPU::sub0, Offset); - return std::make_pair(AMDGPU::sub0 + Offset, 0); + return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0); } // Return true if the index is an SGPR and was set. 
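The wave-size scaling in lowerDYNAMIC_STACKALLOCImpl above is what distinguishes it from the generic ExpandDYNAMIC_STACKALLOC path: alloca sizes and alignments are per-lane values, while the SGPR stack pointer counts bytes of per-wave scratch. Below is a rough scalar model of that arithmetic, not the DAG code itself; the function and parameter names are invented, it assumes the upward-growing (ADD) case, and wave64 gives WavefrontSizeLog2 == 6.

#include <cstdint>

// Sketch of the pointer arithmetic built by lowerDYNAMIC_STACKALLOCImpl:
// shift the per-lane size up by the wave size, add it to the per-wave SP,
// then apply the AND with the negated, wave-scaled alignment. (The real code
// only does the AND when the requested alignment exceeds the default stack
// alignment; simplified here to any alignment > 1.)
uint64_t bumpWaveScratchSP(uint64_t SP, uint64_t PerLaneSize,
                           uint64_t PerLaneAlign, unsigned WavefrontSizeLog2) {
  uint64_t ScaledSize = PerLaneSize << WavefrontSizeLog2; // the ISD::SHL node
  uint64_t NewSP = SP + ScaledSize;                       // ISD::ADD (stack grows up)
  if (PerLaneAlign > 1) {
    uint64_t ScaledAlign = PerLaneAlign << WavefrontSizeLog2;
    NewSP &= ~(ScaledAlign - 1);                          // AND with -(align << log2)
  }
  return NewSP;
}

For instance, a 16-byte alloca on a wave64 subtarget advances the stack pointer by 16 << 6 = 1024 bytes of scratch for the whole wave, before any alignment rounding.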
@@ -3465,24 +3661,6 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, return LoopBB; } -static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI, - const TargetRegisterClass *VecRC) { - switch (TRI.getRegSizeInBits(*VecRC)) { - case 32: // 4 bytes - return AMDGPU::V_MOVRELD_B32_V1; - case 64: // 8 bytes - return AMDGPU::V_MOVRELD_B32_V2; - case 128: // 16 bytes - return AMDGPU::V_MOVRELD_B32_V4; - case 256: // 32 bytes - return AMDGPU::V_MOVRELD_B32_V8; - case 512: // 64 bytes - return AMDGPU::V_MOVRELD_B32_V16; - default: - llvm_unreachable("unsupported size for MOVRELD pseudos"); - } -} - static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST) { @@ -3522,28 +3700,18 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, return &MBB; } + const MCInstrDesc &MovRelDesc + = TII->getIndirectRegWritePseudo(TRI.getRegSizeInBits(*VecRC), 32, false); + if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) { MachineBasicBlock::iterator I(&MI); const DebugLoc &DL = MI.getDebugLoc(); - - if (UseGPRIdxMode) { - BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) - .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst - .add(*Val) - .addReg(Dst, RegState::ImplicitDefine) - .addReg(SrcVec->getReg(), RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); - + BuildMI(MBB, I, DL, MovRelDesc, Dst) + .addReg(SrcVec->getReg()) + .add(*Val) + .addImm(SubReg); + if (UseGPRIdxMode) BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); - } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); - - BuildMI(MBB, I, DL, MovRelDesc) - .addReg(Dst, RegState::Define) - .addReg(SrcVec->getReg()) - .add(*Val) - .addImm(SubReg - AMDGPU::sub0); - } MI.eraseFromParent(); return &MBB; @@ -3560,26 +3728,14 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, Offset, UseGPRIdxMode, false); MachineBasicBlock *LoopBB = InsPt->getParent(); - if (UseGPRIdxMode) { - BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect)) - .addReg(PhiReg, RegState::Undef, SubReg) // vdst - .add(*Val) // src0 - .addReg(Dst, RegState::ImplicitDefine) - .addReg(PhiReg, RegState::Implicit) - .addReg(AMDGPU::M0, RegState::Implicit); + BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst) + .addReg(PhiReg) + .add(*Val) + .addImm(AMDGPU::sub0); + if (UseGPRIdxMode) BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF)); - } else { - const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC)); - - BuildMI(*LoopBB, InsPt, DL, MovRelDesc) - .addReg(Dst, RegState::Define) - .addReg(PhiReg) - .add(*Val) - .addImm(SubReg - AMDGPU::sub0); - } MI.eraseFromParent(); - return LoopBB; } @@ -3590,17 +3746,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineFunction *MF = BB->getParent(); SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>(); - if (TII->isMIMG(MI)) { - if (MI.memoperands_empty() && MI.mayLoadOrStore()) { - report_fatal_error("missing mem operand from MIMG instruction"); - } - // Add a memoperand for mimg instructions so that they aren't assumed to - // be ordered memory instuctions. 
+ switch (MI.getOpcode()) { + case AMDGPU::S_UADDO_PSEUDO: + case AMDGPU::S_USUBO_PSEUDO: { + const DebugLoc &DL = MI.getDebugLoc(); + MachineOperand &Dest0 = MI.getOperand(0); + MachineOperand &Dest1 = MI.getOperand(1); + MachineOperand &Src0 = MI.getOperand(2); + MachineOperand &Src1 = MI.getOperand(3); + + unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO) + ? AMDGPU::S_ADD_I32 + : AMDGPU::S_SUB_I32; + BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1); + + BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg()) + .addImm(1) + .addImm(0); + MI.eraseFromParent(); return BB; } - - switch (MI.getOpcode()) { case AMDGPU::S_ADD_U64_PSEUDO: case AMDGPU::S_SUB_U64_PSEUDO: { MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); @@ -3616,35 +3782,150 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); - MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, - Src0, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32RegClass); - MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, - Src0, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32RegClass); + MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); + MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); - MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, - Src1, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32RegClass); - MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, - Src1, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32RegClass); + MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass); + MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass); bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32; unsigned HiOpc = IsAdd ? 
AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32; - BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) - .add(Src0Sub0) - .add(Src1Sub0); - BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) - .add(Src0Sub1) - .add(Src1Sub1); + BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0); + BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1); BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) - .addReg(DestSub0) - .addImm(AMDGPU::sub0) - .addReg(DestSub1) - .addImm(AMDGPU::sub1); + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::V_ADD_U64_PSEUDO: + case AMDGPU::V_SUB_U64_PSEUDO: { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + const DebugLoc &DL = MI.getDebugLoc(); + + bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO); + + const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); + + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + + Register CarryReg = MRI.createVirtualRegister(CarryRC); + Register DeadCarryReg = MRI.createVirtualRegister(CarryRC); + + MachineOperand &Dest = MI.getOperand(0); + MachineOperand &Src0 = MI.getOperand(1); + MachineOperand &Src1 = MI.getOperand(2); + + const TargetRegisterClass *Src0RC = Src0.isReg() + ? MRI.getRegClass(Src0.getReg()) + : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *Src1RC = Src1.isReg() + ? MRI.getRegClass(Src1.getReg()) + : &AMDGPU::VReg_64RegClass; + + const TargetRegisterClass *Src0SubRC = + TRI->getSubRegClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *Src1SubRC = + TRI->getSubRegClass(Src1RC, AMDGPU::sub1); + + MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); + MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + + MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); + MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); + + unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64; + MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0) + .addReg(CarryReg, RegState::Define) + .add(SrcReg0Sub0) + .add(SrcReg1Sub0) + .addImm(0); // clamp bit + + unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64; + MachineInstr *HiHalf = + BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1) + .addReg(DeadCarryReg, RegState::Define | RegState::Dead) + .add(SrcReg0Sub1) + .add(SrcReg1Sub1) + .addReg(CarryReg, RegState::Kill) + .addImm(0); // clamp bit + + BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg()) + .addReg(DestSub0) + .addImm(AMDGPU::sub0) + .addReg(DestSub1) + .addImm(AMDGPU::sub1); + TII->legalizeOperands(*LoHalf); + TII->legalizeOperands(*HiHalf); + MI.eraseFromParent(); + return BB; + } + case AMDGPU::S_ADD_CO_PSEUDO: + case AMDGPU::S_SUB_CO_PSEUDO: { + // This pseudo has a chance to be selected + // only from uniform add/subcarry node. All the VGPR operands + // therefore assumed to be splat vectors. 
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + MachineBasicBlock::iterator MII = MI; + const DebugLoc &DL = MI.getDebugLoc(); + MachineOperand &Dest = MI.getOperand(0); + MachineOperand &CarryDest = MI.getOperand(1); + MachineOperand &Src0 = MI.getOperand(2); + MachineOperand &Src1 = MI.getOperand(3); + MachineOperand &Src2 = MI.getOperand(4); + unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) + ? AMDGPU::S_ADDC_U32 + : AMDGPU::S_SUBB_U32; + if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) { + Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0) + .addReg(Src0.getReg()); + Src0.setReg(RegOp0); + } + if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) { + Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1) + .addReg(Src1.getReg()); + Src1.setReg(RegOp1); + } + Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + if (TRI->isVectorRegister(MRI, Src2.getReg())) { + BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2) + .addReg(Src2.getReg()); + Src2.setReg(RegOp2); + } + + if (TRI->getRegSizeInBits(*MRI.getRegClass(Src2.getReg())) == 64) { + BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64)) + .addReg(Src2.getReg()) + .addImm(0); + } else { + BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32)) + .addReg(Src2.getReg()) + .addImm(0); + } + + BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1); + + BuildMI(*BB, MII, DL, TII->get(AMDGPU::COPY), CarryDest.getReg()) + .addReg(AMDGPU::SCC); MI.eraseFromParent(); return BB; } @@ -3741,12 +4022,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::SI_INDIRECT_SRC_V4: case AMDGPU::SI_INDIRECT_SRC_V8: case AMDGPU::SI_INDIRECT_SRC_V16: + case AMDGPU::SI_INDIRECT_SRC_V32: return emitIndirectSrc(MI, *BB, *getSubtarget()); case AMDGPU::SI_INDIRECT_DST_V1: case AMDGPU::SI_INDIRECT_DST_V2: case AMDGPU::SI_INDIRECT_DST_V4: case AMDGPU::SI_INDIRECT_DST_V8: case AMDGPU::SI_INDIRECT_DST_V16: + case AMDGPU::SI_INDIRECT_DST_V32: return emitIndirectDst(MI, *BB, *getSubtarget()); case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO: case AMDGPU::SI_KILL_I1_PSEUDO: @@ -3870,6 +4153,75 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( } return emitGWSMemViolTestLoop(MI, BB); + case AMDGPU::S_SETREG_B32: { + if (!getSubtarget()->hasDenormModeInst()) + return BB; + + // Try to optimize cases that only set the denormal mode or rounding mode. + // + // If the s_setreg_b32 fully sets all of the bits in the rounding mode or + // denormal mode to a constant, we can use s_round_mode or s_denorm_mode + // instead. + // + // FIXME: This could be predicates on the immediate, but tablegen doesn't + // allow you to have a no side effect instruction in the output of a + // sideeffecting pattern. + + // TODO: Should also emit a no side effects pseudo if only FP bits are + // touched, even if not all of them or to a variable. 
+ unsigned ID, Offset, Width; + AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width); + if (ID != AMDGPU::Hwreg::ID_MODE) + return BB; + + const unsigned WidthMask = maskTrailingOnes<unsigned>(Width); + const unsigned SetMask = WidthMask << Offset; + unsigned SetDenormOp = 0; + unsigned SetRoundOp = 0; + + // The dedicated instructions can only set the whole denorm or round mode at + // once, not a subset of bits in either. + if (Width == 8 && (SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK | + AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) { + // If this fully sets both the round and denorm mode, emit the two + // dedicated instructions for these. + assert(Offset == 0); + SetRoundOp = AMDGPU::S_ROUND_MODE; + SetDenormOp = AMDGPU::S_DENORM_MODE; + } else if (Width == 4) { + if ((SetMask & AMDGPU::Hwreg::FP_ROUND_MASK) == SetMask) { + SetRoundOp = AMDGPU::S_ROUND_MODE; + assert(Offset == 0); + } else if ((SetMask & AMDGPU::Hwreg::FP_DENORM_MASK) == SetMask) { + SetDenormOp = AMDGPU::S_DENORM_MODE; + assert(Offset == 4); + } + } + + if (SetRoundOp || SetDenormOp) { + MachineRegisterInfo &MRI = BB->getParent()->getRegInfo(); + MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg()); + if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) { + unsigned ImmVal = Def->getOperand(1).getImm(); + if (SetRoundOp) { + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp)) + .addImm(ImmVal & 0xf); + + // If we also have the denorm mode, get just the denorm mode bits. + ImmVal >>= 4; + } + + if (SetDenormOp) { + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp)) + .addImm(ImmVal & 0xf); + } + + MI.eraseFromParent(); + } + } + + return BB; + } default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); } @@ -3925,10 +4277,13 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, switch (VT.getSimpleVT().SimpleTy) { case MVT::f32: { - // This is as fast on some subtargets. However, we always have full rate f32 - // mad available which returns the same result as the separate operations - // which we should prefer over fma. We can't use this if we want to support - // denormals, so only report this in these cases. + // If mad is not available this depends only on if f32 fma is full rate. + if (!Subtarget->hasMadMacF32Insts()) + return Subtarget->hasFastFMAF32(); + + // Otherwise f32 mad is always full rate and returns the same result as + // the separate operations so should be preferred over fma. + // However does not support denomals. if (hasFP32Denormals(MF)) return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts(); @@ -3946,13 +4301,14 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, return false; } -bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG, - const SDNode *N) const { +bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG, + const SDNode *N) const { // TODO: Check future ftz flag // v_mad_f32/v_mac_f32 do not support denormals. 
EVT VT = N->getValueType(0); if (VT == MVT::f32) - return !hasFP32Denormals(DAG.getMachineFunction()); + return Subtarget->hasMadMacF32Insts() && + !hasFP32Denormals(DAG.getMachineFunction()); if (VT == MVT::f16) { return Subtarget->hasMadF16() && !hasFP64FP16Denormals(DAG.getMachineFunction()); @@ -3971,7 +4327,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const { unsigned Opc = Op.getOpcode(); EVT VT = Op.getValueType(); - assert(VT == MVT::v4f16); + assert(VT == MVT::v4f16 || VT == MVT::v4i16); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4080,6 +4436,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FABS: case ISD::FNEG: case ISD::FCANONICALIZE: + case ISD::BSWAP: return splitUnaryVectorOp(Op, DAG); case ISD::FMINNUM: case ISD::FMAXNUM: @@ -4101,6 +4458,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM_IEEE: case ISD::FMAXNUM_IEEE: return splitBinaryVectorOp(Op, DAG); + case ISD::SMULO: + case ISD::UMULO: + return lowerXMULO(Op, DAG); + case ISD::DYNAMIC_STACKALLOC: + return LowerDYNAMIC_STACKALLOC(Op, DAG); } return SDValue(); } @@ -4204,9 +4566,8 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); const auto *CD = cast<ConstantSDNode>(N->getOperand(3)); - int CondCode = CD->getSExtValue(); - if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE || - CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE) + unsigned CondCode = CD->getZExtValue(); + if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode))) return DAG.getUNDEF(VT); ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode); @@ -4241,11 +4602,9 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, EVT VT = N->getValueType(0); const auto *CD = cast<ConstantSDNode>(N->getOperand(3)); - int CondCode = CD->getSExtValue(); - if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE || - CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) { + unsigned CondCode = CD->getZExtValue(); + if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode))) return DAG.getUNDEF(VT); - } SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); @@ -4268,6 +4627,43 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, return DAG.getZExtOrTrunc(SetCC, SL, VT); } +static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + EVT VT = N->getValueType(0); + SDValue Src = N->getOperand(1); + SDLoc SL(N); + + if (Src.getOpcode() == ISD::SETCC) { + // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...) 
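+    // e.g. i64 %m = llvm.amdgcn.ballot.i64(icmp ult i32 %a, %b) folds the
+    // compare straight into the lane-mask-producing node, so no separate
+    // i1 materialization of the compare result is needed.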
+ return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0), + Src.getOperand(1), Src.getOperand(2)); + } + if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) { + // (ballot 0) -> 0 + if (Arg->isNullValue()) + return DAG.getConstant(0, SL, VT); + + // (ballot 1) -> EXEC/EXEC_LO + if (Arg->isOne()) { + Register Exec; + if (VT.getScalarSizeInBits() == 32) + Exec = AMDGPU::EXEC_LO; + else if (VT.getScalarSizeInBits() == 64) + Exec = AMDGPU::EXEC; + else + return SDValue(); + + return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT); + } + } + + // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0) + // ISD::SETNE) + return DAG.getNode( + AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32), + DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE)); +} + void SITargetLowering::ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const { @@ -4440,9 +4836,7 @@ bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { // FIXME: Either avoid relying on address space here or change the default // address space for functions to avoid the explicit check. return (GV->getValueType()->isFunctionTy() || - GV->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS || - GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || - GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) && + !isNonGlobalAddrSpace(GV->getAddressSpace())) && !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); } @@ -4451,6 +4845,14 @@ bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const { return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV); } +bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const { + if (!GV->hasExternalLinkage()) + return true; + + const auto OS = getTargetMachine().getTargetTriple().getOS(); + return OS == Triple::AMDHSA || OS == Triple::AMDPAL; +} + /// This transforms the control flow intrinsics to get the branch destination as /// last parameter, also switches branch target with BR if the need arise SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, @@ -4470,16 +4872,10 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, } else { // Get the target from BR if we don't negate the condition BR = findUser(BRCOND, ISD::BR); + assert(BR && "brcond missing unconditional branch user"); Target = BR->getOperand(1); } - // FIXME: This changes the types of the intrinsics instead of introducing new - // nodes with the correct types. - // e.g. llvm.amdgcn.loop - - // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3 - // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088> - unsigned CFNode = isCFIntrinsic(Intr); if (CFNode == 0) { // This is a uniform branch so we don't need to legalize. @@ -4524,7 +4920,6 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, }; SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops); DAG.ReplaceAllUsesWith(BR, NewBR.getNode()); - BR = NewBR.getNode(); } SDValue Chain = SDValue(Result, Result->getNumValues() - 1); @@ -4577,13 +4972,14 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT); } -SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG, +SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op, const SDLoc &DL, EVT VT) const { return Op.getValueType().bitsLE(VT) ? 
DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) : - DAG.getNode(ISD::FTRUNC, DL, VT, Op); + DAG.getNode(ISD::FP_ROUND, DL, VT, Op, + DAG.getTargetConstant(0, DL, MVT::i32)); } SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { @@ -4609,7 +5005,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); bool IsIEEEMode = Info->getMode().IEEE; - // FIXME: Assert during eslection that this is only selected for + // FIXME: Assert during selection that this is only selected for // ieee_mode. Currently a combine can produce the ieee version for non-ieee // mode functions, but this happens to be OK since it's only done in cases // where there is known no sNaN. @@ -4621,6 +5017,42 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, return Op; } +SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + SDLoc SL(Op); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + bool isSigned = Op.getOpcode() == ISD::SMULO; + + if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) { + const APInt &C = RHSC->getAPIntValue(); + // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X } + if (C.isPowerOf2()) { + // smulo(x, signed_min) is same as umulo(x, signed_min). + bool UseArithShift = isSigned && !C.isMinSignedValue(); + SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32); + SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt); + SDValue Overflow = DAG.getSetCC(SL, MVT::i1, + DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, + SL, VT, Result, ShiftAmt), + LHS, ISD::SETNE); + return DAG.getMergeValues({ Result, Overflow }, SL); + } + } + + SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS); + SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, + SL, VT, LHS, RHS); + + SDValue Sign = isSigned + ? DAG.getNode(ISD::SRA, SL, VT, Result, + DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32)) + : DAG.getConstant(0, SL, VT); + SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE); + + return DAG.getMergeValues({ Result, Overflow }, SL); +} + SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Chain = Op.getOperand(0); @@ -4694,7 +5126,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - unsigned UserSGPR = Info->getQueuePtrUserSGPR(); + Register UserSGPR = Info->getQueuePtrUserSGPR(); assert(UserSGPR != AMDGPU::NoRegister); SDValue QueuePtr = CreateLiveInRegister( @@ -4765,6 +5197,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op, } } + if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT && + Src.getValueType() == MVT::i64) + return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src); + // global <-> flat are no-ops and never emitted. 
const MachineFunction &MF = DAG.getMachineFunction(); @@ -5036,8 +5472,9 @@ SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, - const SDLoc &DL, unsigned Offset, EVT PtrVT, + const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags = SIInstrInfo::MO_NONE) { + assert(isInt<32>(Offset + 4) && "32-bit offset is expected!"); // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is // lowered to the following code sequence: // @@ -5086,9 +5523,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GSD->getGlobal(); if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && - (!GV->hasExternalLinkage() || - getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || - getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) || + shouldUseLDSConstAddress(GV)) || GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); @@ -5114,11 +5549,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext()); PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS); const DataLayout &DataLayout = DAG.getDataLayout(); - unsigned Align = DataLayout.getABITypeAlignment(PtrTy); + Align Alignment = DataLayout.getABITypeAlign(PtrTy); MachinePointerInfo PtrInfo = MachinePointerInfo::getGOT(DAG.getMachineFunction()); - return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align, + return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment, MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant); } @@ -5144,8 +5579,8 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, MVT VT, unsigned Offset) const { SDLoc SL(Op); - SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL, - DAG.getEntryNode(), Offset, 4, false); + SDValue Param = lowerKernargMemParameter( + DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false); // The local size values will have the hi 16-bits as zero. return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param, DAG.getValueType(VT)); @@ -5181,6 +5616,9 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, } else if (Elts.size() == 2) { Type = MVT::v2f32; NumElts = 2; + } else if (Elts.size() == 3) { + Type = MVT::v3f32; + NumElts = 3; } else if (Elts.size() <= 4) { Type = MVT::v4f32; NumElts = 4; @@ -5230,6 +5668,24 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, return Value == 0; } +static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, + SDValue Src, int ExtraElts) { + EVT SrcVT = Src.getValueType(); + + SmallVector<SDValue, 8> Elts; + + if (SrcVT.isVector()) + DAG.ExtractVectorElements(Src, Elts); + else + Elts.push_back(Src); + + SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType()); + while (ExtraElts--) + Elts.push_back(Undef); + + return DAG.getBuildVector(CastVT, DL, Elts); +} + // Re-construct the required return value for a image load intrinsic. 
// This is more complicated due to the optional use TexFailCtrl which means the required // return type is an aggregate @@ -5241,76 +5697,56 @@ static SDValue constructRetValue(SelectionDAG &DAG, const SDLoc &DL, LLVMContext &Context) { // Determine the required return type. This is the same regardless of IsTexFail flag EVT ReqRetVT = ResultTypes[0]; - EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT; int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1; - EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT; - EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts) - : AdjEltVT - : ReqRetVT; - - // Extract data part of the result - // Bitcast the result to the same type as the required return type - int NumElts; - if (IsD16 && !Unpacked) - NumElts = NumVDataDwords << 1; - else - NumElts = NumVDataDwords; + int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ? + ReqRetNumElts : (ReqRetNumElts + 1) / 2; - EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts) - : AdjEltVT; + int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ? + DMaskPop : (DMaskPop + 1) / 2; - // Special case for v6f16. Rather than add support for this, use v3i32 to - // extract the data elements - bool V6F16Special = false; - if (NumElts == 6) { - CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2); - DMaskPop >>= 1; - ReqRetNumElts >>= 1; - V6F16Special = true; - AdjVT = MVT::v2i32; - } + MVT DataDwordVT = NumDataDwords == 1 ? + MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords); - SDValue N = SDValue(Result, 0); - SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N); + MVT MaskPopVT = MaskPopDwords == 1 ? + MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords); - // Iterate over the result - SmallVector<SDValue, 4> BVElts; + SDValue Data(Result, 0); + SDValue TexFail; - if (CastVT.isVector()) { - DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop); - } else { - BVElts.push_back(CastRes); - } - int ExtraElts = ReqRetNumElts - DMaskPop; - while(ExtraElts--) - BVElts.push_back(DAG.getUNDEF(AdjEltVT)); + if (IsTexFail) { + SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32); + if (MaskPopVT.isVector()) { + Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT, + SDValue(Result, 0), ZeroIdx); + } else { + Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT, + SDValue(Result, 0), ZeroIdx); + } - SDValue PreTFCRes; - if (ReqRetNumElts > 1) { - SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts); - if (IsD16 && Unpacked) - PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked); - else - PreTFCRes = NewVec; - } else { - PreTFCRes = BVElts[0]; + TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, + SDValue(Result, 0), + DAG.getConstant(MaskPopDwords, DL, MVT::i32)); } - if (V6F16Special) - PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes); + if (DataDwordVT.isVector()) + Data = padEltsToUndef(DAG, DL, DataDwordVT, Data, + NumDataDwords - MaskPopDwords); - if (!IsTexFail) { - if (Result->getNumValues() > 1) - return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL); - else - return PreTFCRes; - } + if (IsD16) + Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked); + + if (!ReqRetVT.isVector()) + Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data); + + Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data); - // Extract the TexFail result and insert into aggregate return - SmallVector<SDValue, 1> TFCElt; 
- DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1); - SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]); - return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL); + if (TexFail) + return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL); + + if (Result->getNumValues() == 1) + return Data; + + return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL); } static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, @@ -5331,6 +5767,35 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, return Value == 0; } +static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op, + MVT PackVectorVT, + SmallVectorImpl<SDValue> &PackedAddrs, + unsigned DimIdx, unsigned EndIdx, + unsigned NumGradients) { + SDLoc DL(Op); + for (unsigned I = DimIdx; I < EndIdx; I++) { + SDValue Addr = Op.getOperand(I); + + // Gradients are packed with undef for each coordinate. + // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this: + // 1D: undef,dx/dh; undef,dx/dv + // 2D: dy/dh,dx/dh; dy/dv,dx/dv + // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv + if (((I + 1) >= EndIdx) || + ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 || + I == DimIdx + NumGradients - 1))) { + if (Addr.getValueType() != MVT::i16) + Addr = DAG.getBitcast(MVT::i16, Addr); + Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr); + } else { + Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)}); + I++; + } + Addr = DAG.getBitcast(MVT::f32, Addr); + PackedAddrs.push_back(Addr); + } +} + SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr, SelectionDAG &DAG) const { @@ -5350,6 +5815,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end()); SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end()); bool IsD16 = false; + bool IsG16 = false; bool IsA16 = false; SDValue VData; int NumVDataDwords; @@ -5456,41 +5922,67 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } - // Check for 16 bit addresses and pack if true. + // Push back extra arguments. + for (unsigned I = 0; I < BaseOpcode->NumExtraArgs; I++) + VAddrs.push_back(Op.getOperand(AddrIdx + I)); + + // Check for 16 bit addresses or derivatives and pack if true. unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; + unsigned CoordIdx = DimIdx + NumGradients; + unsigned CoordsEnd = AddrIdx + NumMIVAddrs; + MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType(); - const MVT VAddrScalarVT = VAddrVT.getScalarType(); - if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) && - ST->hasFeature(AMDGPU::FeatureR128A16)) { - IsA16 = true; - const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; - for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) { - SDValue AddrLo, AddrHi; - // Push back extra arguments. - if (i < DimIdx) { - AddrLo = Op.getOperand(i); - } else { - AddrLo = Op.getOperand(i); - // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, - // in 1D, derivatives dx/dh and dx/dv are packed with undef. 
- if (((i + 1) >= (AddrIdx + NumMIVAddrs)) || - ((NumGradients / 2) % 2 == 1 && - (i == DimIdx + (NumGradients / 2) - 1 || - i == DimIdx + NumGradients - 1))) { - AddrHi = DAG.getUNDEF(MVT::f16); - } else { - AddrHi = Op.getOperand(i + 1); - i++; - } - AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT, - {AddrLo, AddrHi}); - AddrLo = DAG.getBitcast(MVT::i32, AddrLo); + MVT VAddrScalarVT = VAddrVT.getScalarType(); + MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16; + IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; + + VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType(); + VAddrScalarVT = VAddrVT.getScalarType(); + IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16; + if (IsA16 || IsG16) { + if (IsA16) { + if (!ST->hasA16()) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit addresses\n"); + return Op; + } + if (!IsG16) { + LLVM_DEBUG( + dbgs() << "Failed to lower image intrinsic: 16 bit addresses " + "need 16 bit derivatives but got 32 bit derivatives\n"); + return Op; } - VAddrs.push_back(AddrLo); + } else if (!ST->hasG16()) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit derivatives\n"); + return Op; + } + + if (BaseOpcode->Gradients && !IsA16) { + if (!ST->hasG16()) { + LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not " + "support 16 bit derivatives\n"); + return Op; + } + // Activate g16 + const AMDGPU::MIMGG16MappingInfo *G16MappingInfo = + AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode); + IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16 + } + + // Don't compress addresses for G16 + const int PackEndIdx = IsA16 ? CoordsEnd : CoordIdx; + packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, DimIdx, + PackEndIdx, NumGradients); + + if (!IsA16) { + // Add uncompressed address + for (unsigned I = CoordIdx; I < CoordsEnd; I++) + VAddrs.push_back(Op.getOperand(I)); } } else { - for (unsigned i = 0; i < NumMIVAddrs; ++i) - VAddrs.push_back(Op.getOperand(AddrIdx + i)); + for (unsigned I = DimIdx; I < CoordsEnd; I++) + VAddrs.push_back(Op.getOperand(I)); } // If the register allocator cannot place the address registers contiguously @@ -5557,8 +6049,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } EVT NewVT = NumVDataDwords > 1 ? - EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords) - : MVT::f32; + EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords) + : MVT::i32; ResultTypes[0] = NewVT; if (ResultTypes.size() == 3) { @@ -5603,10 +6095,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op, Ops.push_back(DLC); Ops.push_back(GLC); Ops.push_back(SLC); - Ops.push_back(IsA16 && // a16 or r128 + Ops.push_back(IsA16 && // r128, a16 for gfx9 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); - Ops.push_back(TFE); // tfe - Ops.push_back(LWE); // lwe + if (IsGFX10) + Ops.push_back(IsA16 ? True : False); + Ops.push_back(TFE); + Ops.push_back(LWE); if (!IsGFX10) Ops.push_back(DimInfo->DA ? 
True : False); if (BaseOpcode->HasD16) @@ -5655,26 +6149,25 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, - SDValue Offset, SDValue GLC, SDValue DLC, + SDValue Offset, SDValue CachePolicy, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const DataLayout &DataLayout = DAG.getDataLayout(); - unsigned Align = - DataLayout.getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext())); + Align Alignment = + DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext())); MachineMemOperand *MMO = MF.getMachineMemOperand( MachinePointerInfo(), MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant, - VT.getStoreSize(), Align); + VT.getStoreSize(), Alignment); if (!Offset->isDivergent()) { SDValue Ops[] = { Rsrc, Offset, // Offset - GLC, - DLC, + CachePolicy }; // Widen vec3 load to vec4. @@ -5684,9 +6177,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, auto WidenedOp = DAG.getMemIntrinsicNode( AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT, MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize())); - auto Subvector = DAG.getNode( - ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp, - DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp, + DAG.getVectorIdxConstant(0, DL)); return Subvector; } @@ -5705,11 +6197,10 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, if (NumElts == 8 || NumElts == 16) { NumLoads = NumElts / 4; - LoadVT = MVT::v4i32; + LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4); } SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); - unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue(); SDValue Ops[] = { DAG.getEntryNode(), // Chain Rsrc, // rsrc @@ -5717,13 +6208,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, {}, // voffset {}, // soffset {}, // offset - DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy + CachePolicy, // cachepolicy DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; // Use the alignment to ensure that the required offsets will fit into the // immediate offsets. - setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4); + setBufferOffsets(Offset, DAG, &Ops[3], + NumLoads > 1 ? Align(16 * NumLoads) : Align(4)); uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue(); for (unsigned i = 0; i < NumLoads; ++i) { @@ -5732,7 +6224,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, LoadVT, MMO, DAG)); } - if (VT == MVT::v8i32 || VT == MVT::v16i32) + if (NumElts == 8 || NumElts == 16) return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads); return Loads[0]; @@ -5777,6 +6269,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR); } case Intrinsic::amdgcn_kernarg_segment_ptr: { + if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) { + // This only makes sense to call in a kernel, so just lower to null. 
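+      // e.g. a call to llvm.amdgcn.kernarg.segment.ptr from a callable
+      // (non-kernel) function has no kernarg segment of its own to point
+      // at, so it simply yields a null pointer instead of a preloaded
+      // SGPR pair.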
+ return DAG.getConstant(0, DL, VT); + } + return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR); } @@ -5790,8 +6287,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_rsq_legacy: if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); - - return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1)); + return SDValue(); case Intrinsic::amdgcn_rcp_legacy: if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) return emitRemovedIntrinsicError(DAG, DL, VT); @@ -5815,37 +6311,43 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_X, 4, false); + SI::KernelInputOffsets::NGROUPS_X, Align(4), + false); case Intrinsic::r600_read_ngroups_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Y, 4, false); + SI::KernelInputOffsets::NGROUPS_Y, Align(4), + false); case Intrinsic::r600_read_ngroups_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::NGROUPS_Z, 4, false); + SI::KernelInputOffsets::NGROUPS_Z, Align(4), + false); case Intrinsic::r600_read_global_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false); + SI::KernelInputOffsets::GLOBAL_SIZE_X, + Align(4), false); case Intrinsic::r600_read_global_size_y: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Y, + Align(4), false); case Intrinsic::r600_read_global_size_z: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(), - SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false); + SI::KernelInputOffsets::GLOBAL_SIZE_Z, + Align(4), false); case Intrinsic::r600_read_local_size_x: if (Subtarget->isAmdHsaOS()) return emitNonHSAIntrinsicError(DAG, DL, VT); @@ -5865,29 +6367,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return lowerImplicitZextParam(DAG, Op, MVT::i16, SI::KernelInputOffsets::LOCAL_SIZE_Z); case Intrinsic::amdgcn_workgroup_id_x: - case Intrinsic::r600_read_tgid_x: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_X); case Intrinsic::amdgcn_workgroup_id_y: - case Intrinsic::r600_read_tgid_y: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Y); case Intrinsic::amdgcn_workgroup_id_z: - case Intrinsic::r600_read_tgid_z: return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::WORKGROUP_ID_Z); case Intrinsic::amdgcn_workitem_id_x: - case Intrinsic::r600_read_tidig_x: return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDX); case Intrinsic::amdgcn_workitem_id_y: - case Intrinsic::r600_read_tidig_y: return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDY); case 
Intrinsic::amdgcn_workitem_id_z: - case Intrinsic::r600_read_tidig_z: return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); @@ -5901,53 +6397,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr, IsGFX10 ? &DLC : nullptr)) return Op; - return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC, + return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), DAG); } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); - case Intrinsic::amdgcn_interp_p1_f16: { - SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, - Op.getOperand(5), SDValue()); - if (getSubtarget()->getLDSBankCount() == 16) { - // 16 bank LDS - - // FIXME: This implicitly will insert a second CopyToReg to M0. - SDValue S = DAG.getNode( - ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32, - DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32), - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - Op.getOperand(5)); // m0 - - SDValue Ops[] = { - Op.getOperand(1), // Src0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers - S, // Src2 - holds two f16 values selected by high - DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers - Op.getOperand(4), // high - DAG.getTargetConstant(0, DL, MVT::i1), // $clamp - DAG.getTargetConstant(0, DL, MVT::i32) // $omod - }; - return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); - } else { - // 32 bank LDS - SDValue Ops[] = { - Op.getOperand(1), // Src0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers - Op.getOperand(4), // high - DAG.getTargetConstant(0, DL, MVT::i1), // $clamp - DAG.getTargetConstant(0, DL, MVT::i32), // $omod - ToM0.getValue(1) - }; - return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops); - } - } case Intrinsic::amdgcn_sin: return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); @@ -5988,9 +6442,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); - case Intrinsic::amdgcn_trig_preop: - return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, - Op.getOperand(1), Op.getOperand(2)); case Intrinsic::amdgcn_div_scale: { const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3)); @@ -6020,6 +6471,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_fcmp: { return lowerFCMPIntrinsic(*this, Op.getNode(), DAG); } + case Intrinsic::amdgcn_ballot: + return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG); case Intrinsic::amdgcn_fmed3: return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -6098,6 +6551,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, DAG.getConstant(1, SL, MVT::i32)); return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); } + case Intrinsic::amdgcn_alignbit: + return DAG.getNode(ISD::FSHR, DL, VT, + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::amdgcn_reloc_constant: { + Module *M = const_cast<Module *>(MF.getFunction().getParent()); + const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD(); + auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString(); + 
auto RelocSymbol = cast<GlobalVariable>( + M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext()))); + SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0, + SIInstrInfo::MO_ABS32_LO); + return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -6131,6 +6597,28 @@ static unsigned getBufferOffsetForMMO(SDValue VOffset, cast<ConstantSDNode>(Offset)->getSExtValue(); } +static unsigned getDSShaderTypeValue(const MachineFunction &MF) { + switch (MF.getFunction().getCallingConv()) { + case CallingConv::AMDGPU_PS: + return 1; + case CallingConv::AMDGPU_VS: + return 2; + case CallingConv::AMDGPU_GS: + return 3; + case CallingConv::AMDGPU_HS: + case CallingConv::AMDGPU_LS: + case CallingConv::AMDGPU_ES: + report_fatal_error("ds_ordered_count unsupported for this calling conv"); + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::C: + case CallingConv::Fast: + default: + // Assume other calling conventions are various compute callable functions + return 0; + } +} + SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); @@ -6146,8 +6634,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, unsigned IndexOperand = M->getConstantOperandVal(7); unsigned WaveRelease = M->getConstantOperandVal(8); unsigned WaveDone = M->getConstantOperandVal(9); - unsigned ShaderType; - unsigned Instruction; unsigned OrderedCountIndex = IndexOperand & 0x3f; IndexOperand &= ~0x3f; @@ -6166,36 +6652,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (IndexOperand) report_fatal_error("ds_ordered_count: bad index operand"); - switch (IntrID) { - case Intrinsic::amdgcn_ds_ordered_add: - Instruction = 0; - break; - case Intrinsic::amdgcn_ds_ordered_swap: - Instruction = 1; - break; - } - if (WaveDone && !WaveRelease) report_fatal_error("ds_ordered_count: wave_done requires wave_release"); - switch (DAG.getMachineFunction().getFunction().getCallingConv()) { - case CallingConv::AMDGPU_CS: - case CallingConv::AMDGPU_KERNEL: - ShaderType = 0; - break; - case CallingConv::AMDGPU_PS: - ShaderType = 1; - break; - case CallingConv::AMDGPU_VS: - ShaderType = 2; - break; - case CallingConv::AMDGPU_GS: - ShaderType = 3; - break; - default: - report_fatal_error("ds_ordered_count unsupported for this calling conv"); - } - + unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 
0 : 1; + unsigned ShaderType = getDSShaderTypeValue(DAG.getMachineFunction()); unsigned Offset0 = OrderedCountIndex << 2; unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | (Instruction << 4); @@ -6425,6 +6886,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: case Intrinsic::amdgcn_buffer_atomic_sub: + case Intrinsic::amdgcn_buffer_atomic_csub: case Intrinsic::amdgcn_buffer_atomic_smin: case Intrinsic::amdgcn_buffer_atomic_umin: case Intrinsic::amdgcn_buffer_atomic_smax: @@ -6467,6 +6929,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_buffer_atomic_sub: Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB; break; + case Intrinsic::amdgcn_buffer_atomic_csub: + Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB; + break; case Intrinsic::amdgcn_buffer_atomic_smin: Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN; break; @@ -6715,6 +7180,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); } + case Intrinsic::amdgcn_global_atomic_csub: { + MemSDNode *M = cast<MemSDNode>(Op); + SDValue Ops[] = { + M->getOperand(0), // Chain + M->getOperand(2), // Ptr + M->getOperand(3) // Value + }; + + return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op), + M->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = @@ -6750,9 +7227,8 @@ SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops, WidenedMemVT, MMO); if (WidenedVT != VT) { - auto Extract = DAG.getNode( - ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp, - DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp, + DAG.getVectorIdxConstant(0, DL)); NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL); } return NewOp; @@ -6792,52 +7268,29 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); switch (IntrinsicID) { - case Intrinsic::amdgcn_exp: { - const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); - const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3)); - const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8)); - const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9)); - - const SDValue Ops[] = { - Chain, - DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt - DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en - Op.getOperand(4), // src0 - Op.getOperand(5), // src1 - Op.getOperand(6), // src2 - Op.getOperand(7), // src3 - DAG.getTargetConstant(0, DL, MVT::i1), // compr - DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) - }; - - unsigned Opc = Done->isNullValue() ? - AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; - return DAG.getNode(Opc, DL, Op->getVTList(), Ops); - } case Intrinsic::amdgcn_exp_compr: { - const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2)); - const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3)); SDValue Src0 = Op.getOperand(4); SDValue Src1 = Op.getOperand(5); - const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6)); - const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7)); + // Hack around illegal type on SI by directly selecting it. 
+ if (isTypeLegal(Src0.getValueType())) + return SDValue(); + const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6)); SDValue Undef = DAG.getUNDEF(MVT::f32); const SDValue Ops[] = { - Chain, - DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt - DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en - DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), - DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), + Op.getOperand(2), // tgt + DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0 + DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1 Undef, // src2 Undef, // src3 + Op.getOperand(7), // vm DAG.getTargetConstant(1, DL, MVT::i1), // compr - DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1) + Op.getOperand(3), // en + Op.getOperand(0) // Chain }; - unsigned Opc = Done->isNullValue() ? - AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; - return DAG.getNode(Opc, DL, Op->getVTList(), Ops); + unsigned Opc = Done->isNullValue() ? AMDGPU::EXP : AMDGPU::EXP_DONE; + return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { @@ -7183,13 +7636,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets( // three offsets (voffset, soffset and instoffset) into the SDValue[3] array // pointed to by Offsets. unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset, - SelectionDAG &DAG, SDValue *Offsets, - unsigned Align) const { + SelectionDAG &DAG, SDValue *Offsets, + Align Alignment) const { SDLoc DL(CombinedOffset); if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) { uint32_t Imm = C->getZExtValue(); uint32_t SOffset, ImmOffset; - if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) { + if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, + Alignment)) { Offsets[0] = DAG.getConstant(0, DL, MVT::i32); Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); @@ -7202,7 +7656,7 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset, uint32_t SOffset, ImmOffset; int Offset = cast<ConstantSDNode>(N1)->getSExtValue(); if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset, - Subtarget, Align)) { + Subtarget, Alignment)) { Offsets[0] = N0; Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32); Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32); @@ -7413,7 +7867,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. - if (AS == AMDGPUAS::FLAT_ADDRESS) + if (AS == AMDGPUAS::FLAT_ADDRESS && + !Subtarget->hasMultiDwordFlatScratchAddressing()) AS = MFI->hasFlatScratchInit() ? 
AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; @@ -7438,7 +7893,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT || AS == AMDGPUAS::GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && - !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && + Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) && Alignment >= 4 && NumElements < 32) { if (MemVT.isPow2VectorType()) return SDValue(); @@ -7547,55 +8002,54 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, SDValue RHS = Op.getOperand(1); EVT VT = Op.getValueType(); const SDNodeFlags Flags = Op->getFlags(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal(); - if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction())) + bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath || + Flags.hasApproximateFuncs(); + + // Without !fpmath accuracy information, we can't do more because we don't + // know exactly whether rcp is accurate enough to meet !fpmath requirement. + if (!AllowInaccurateRcp) return SDValue(); if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { - if (Unsafe || VT == MVT::f32 || VT == MVT::f16) { - if (CLHS->isExactlyValue(1.0)) { - // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to - // the CI documentation has a worst case error of 1 ulp. - // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to - // use it as long as we aren't trying to use denormals. - // - // v_rcp_f16 and v_rsq_f16 DO support denormals. - - // 1.0 / sqrt(x) -> rsq(x) - - // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP - // error seems really high at 2^29 ULP. - if (RHS.getOpcode() == ISD::FSQRT) - return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); - - // 1.0 / x -> rcp(x) - return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - } + if (CLHS->isExactlyValue(1.0)) { + // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to + // the CI documentation has a worst case error of 1 ulp. + // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to + // use it as long as we aren't trying to use denormals. + // + // v_rcp_f16 and v_rsq_f16 DO support denormals. - // Same as for 1.0, but expand the sign out of the constant. - if (CLHS->isExactlyValue(-1.0)) { - // -1.0 / x -> rcp (fneg x) - SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); - return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); - } + // 1.0 / sqrt(x) -> rsq(x) + + // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP + // error seems really high at 2^29 ULP. + if (RHS.getOpcode() == ISD::FSQRT) + return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); + + // 1.0 / x -> rcp(x) + return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); } - } - if (Unsafe) { - // Turn into multiply by the reciprocal. - // x / y -> x * (1.0 / y) - SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); + // Same as for 1.0, but expand the sign out of the constant. + if (CLHS->isExactlyValue(-1.0)) { + // -1.0 / x -> rcp (fneg x) + SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS); + } } - return SDValue(); + // Turn into multiply by the reciprocal. 
+ // x / y -> x * (1.0 / y) + SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); } static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, - EVT VT, SDValue A, SDValue B, SDValue GlueChain) { + EVT VT, SDValue A, SDValue B, SDValue GlueChain, + SDNodeFlags Flags) { if (GlueChain->getNumValues() <= 1) { - return DAG.getNode(Opcode, SL, VT, A, B); + return DAG.getNode(Opcode, SL, VT, A, B, Flags); } assert(GlueChain->getNumValues() == 3); @@ -7608,15 +8062,16 @@ static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, break; } - return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, - GlueChain.getValue(2)); + return DAG.getNode(Opcode, SL, VTList, + {GlueChain.getValue(1), A, B, GlueChain.getValue(2)}, + Flags); } static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, - SDValue GlueChain) { + SDValue GlueChain, SDNodeFlags Flags) { if (GlueChain->getNumValues() <= 1) { - return DAG.getNode(Opcode, SL, VT, A, B, C); + return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags); } assert(GlueChain->getNumValues() == 3); @@ -7629,8 +8084,9 @@ static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, break; } - return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C, - GlueChain.getValue(2)); + return DAG.getNode(Opcode, SL, VTList, + {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)}, + Flags); } SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const { @@ -7704,6 +8160,13 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG)) return FastLowered; + // The selection matcher assumes anything with a chain selecting to a + // mayRaiseFPException machine instruction. Since we're introducing a chain + // here, we need to explicitly report nofpexcept for the regular fdiv + // lowering. + SDNodeFlags Flags = Op->getFlags(); + Flags.setNoFPExcept(true); + SDLoc SL(Op); SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); @@ -7713,95 +8176,100 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1); SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, - RHS, RHS, LHS); + {RHS, RHS, LHS}, Flags); SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, - LHS, RHS, LHS); + {LHS, RHS, LHS}, Flags); // Denominator is scaled to not be denormal, so using rcp is ok. SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, - DenominatorScaled); + DenominatorScaled, Flags); SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32, - DenominatorScaled); + DenominatorScaled, Flags); const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) | (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_); - const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16); + const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32); const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction()); if (!HasFP32Denormals) { + // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV + // lowering. The chain dependence is insufficient, and we need glue. We do + // not need the glue variants in a strictfp function. 
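+  // The denormal-mode switches bracket the refinement chain, i.e. roughly:
+  //   v_div_scale_f32 (x2), v_rcp_f32
+  //   <enable FP32 denormals>      ; s_denorm_mode or s_setreg_b32
+  //   v_fma_f32 / v_mul_f32 refinement steps
+  //   <restore flush-denormal mode>
+  //   v_div_fmas_f32, v_div_fixup_f32
+  // which is why the nodes below are tied together with chain/glue operands.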
+ SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); - SDValue EnableDenorm; + SDNode *EnableDenorm; if (Subtarget->hasDenormModeInst()) { const SDValue EnableDenormValue = getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget); EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, - DAG.getEntryNode(), EnableDenormValue); + DAG.getEntryNode(), EnableDenormValue).getNode(); } else { const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32); - EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs, - DAG.getEntryNode(), EnableDenormValue, - BitField); + EnableDenorm = + DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, + {EnableDenormValue, BitField, DAG.getEntryNode()}); } SDValue Ops[3] = { NegDivScale0, - EnableDenorm.getValue(0), - EnableDenorm.getValue(1) + SDValue(EnableDenorm, 0), + SDValue(EnableDenorm, 1) }; NegDivScale0 = DAG.getMergeValues(Ops, SL); } SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, - ApproxRcp, One, NegDivScale0); + ApproxRcp, One, NegDivScale0, Flags); SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp, - ApproxRcp, Fma0); + ApproxRcp, Fma0, Flags); SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, - Fma1, Fma1); + Fma1, Fma1, Flags); SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul, - NumeratorScaled, Mul); + NumeratorScaled, Mul, Flags); - SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2); + SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, + Fma2, Fma1, Mul, Fma2, Flags); SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, - NumeratorScaled, Fma3); + NumeratorScaled, Fma3, Flags); if (!HasFP32Denormals) { - SDValue DisableDenorm; + SDNode *DisableDenorm; if (Subtarget->hasDenormModeInst()) { const SDValue DisableDenormValue = getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget); DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other, Fma4.getValue(1), DisableDenormValue, - Fma4.getValue(2)); + Fma4.getValue(2)).getNode(); } else { const SDValue DisableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); - DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other, - Fma4.getValue(1), DisableDenormValue, - BitField, Fma4.getValue(2)); + DisableDenorm = DAG.getMachineNode( + AMDGPU::S_SETREG_B32, SL, MVT::Other, + {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)}); } SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, - DisableDenorm, DAG.getRoot()); + SDValue(DisableDenorm, 0), DAG.getRoot()); DAG.setRoot(OutputChain); } SDValue Scale = NumeratorScaled.getValue(1); SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32, - Fma4, Fma1, Fma3, Scale); + {Fma4, Fma1, Fma3, Scale}, Flags); - return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS); + return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags); } SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { @@ -7916,7 +8384,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibilty that flat instruction access scratch memory // then we need to use the same legalization rules we use for private. 
- if (AS == AMDGPUAS::FLAT_ADDRESS) + if (AS == AMDGPUAS::FLAT_ADDRESS && + !Subtarget->hasMultiDwordFlatScratchAddressing()) AS = MFI->hasFlatScratchInit() ? AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS; @@ -7976,22 +8445,24 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const { SDValue Arg = Op.getOperand(0); SDValue TrigVal; - // TODO: Should this propagate fast-math-flags? + // Propagate fast-math flags so that the multiply we introduce can be folded + // if Arg is already the result of a multiply by constant. + auto Flags = Op->getFlags(); - SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT); + SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT); if (Subtarget->hasTrigReducedRange()) { - SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi); - TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal); + SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); + TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags); } else { - TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi); + TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags); } switch (Op.getOpcode()) { case ISD::FCOS: - return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal); + return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags); case ISD::FSIN: - return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal); + return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags); default: llvm_unreachable("Wrong trig opcode"); } @@ -8032,7 +8503,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const { EVT VT = N->getValueType(0); EVT ScalarVT = VT.getScalarType(); - if (ScalarVT != MVT::f32) + if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -8047,8 +8518,14 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N, // about in practice. if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) { if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) { - SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src); + SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src); DCI.AddToWorklist(Cvt.getNode()); + + // For the f16 case, fold to a cast to f32 and then cast back to f16. 
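+      // e.g. (f16 (uint_to_fp (and i32 %x, 255))) becomes
+      // (fp_round (CVT_F32_UBYTE0 %x)); the hardware conversion only
+      // produces an f32 result.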
+ if (ScalarVT != MVT::f32) { + Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt, + DAG.getTargetConstant(0, DL, MVT::i32)); + } return Cvt; } } @@ -8525,7 +9002,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, } } - if (VT != MVT::i64) + if (VT != MVT::i64 || DCI.isBeforeLegalizeOps()) return SDValue(); // TODO: This could be a generic combine with a predicate for extracting the @@ -8735,6 +9212,11 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, N->getFlags()); } + if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) { + return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, + N0.getOperand(0), N->getFlags()); + } + return AMDGPUTargetLowering::performRcpCombine(N, DCI); } @@ -8776,9 +9258,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case AMDGPUISD::RSQ: case AMDGPUISD::RSQ_CLAMP: case AMDGPUISD::RCP_LEGACY: - case AMDGPUISD::RSQ_LEGACY: case AMDGPUISD::RCP_IFLAG: - case AMDGPUISD::TRIG_PREOP: case AMDGPUISD::DIV_SCALE: case AMDGPUISD::DIV_FMAS: case AMDGPUISD::DIV_FIXUP: @@ -8881,6 +9361,12 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op, case Intrinsic::amdgcn_cubeid: case Intrinsic::amdgcn_frexp_mant: case Intrinsic::amdgcn_fdot2: + case Intrinsic::amdgcn_rcp: + case Intrinsic::amdgcn_rsq: + case Intrinsic::amdgcn_rsq_clamp: + case Intrinsic::amdgcn_rcp_legacy: + case Intrinsic::amdgcn_rsq_legacy: + case Intrinsic::amdgcn_trig_preop: return true; default: break; @@ -9099,8 +9585,7 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, return SDValue(); // Ordered >= (although NaN inputs should have folded away by now). - APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF()); - if (Cmp == APFloat::cmpGreaterThan) + if (K0->getValueAPF() > K1->getValueAPF()) return SDValue(); const MachineFunction &MF = DAG.getMachineFunction(); @@ -9275,6 +9760,50 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, return SDValue(); } +// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be +// expanded into a set of cmp/select instructions. +bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize, + unsigned NumElem, + bool IsDivergentIdx) { + if (UseDivergentRegisterIndexing) + return false; + + unsigned VecSize = EltSize * NumElem; + + // Sub-dword vectors of size 2 dword or less have better implementation. + if (VecSize <= 64 && EltSize < 32) + return false; + + // Always expand the rest of sub-dword instructions, otherwise it will be + // lowered via memory. + if (EltSize < 32) + return true; + + // Always do this if var-idx is divergent, otherwise it will become a loop. + if (IsDivergentIdx) + return true; + + // Large vectors would yield too many compares and v_cndmask_b32 instructions. 
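+  // e.g. a uniform dynamic index into v8i32 costs 8 compares + 8
+  // v_cndmask_b32 = 16 instructions and is expanded, while v16i32
+  // (16 + 16 = 32) or v8i64 (8 + 2*8 = 24) are left for indirect indexing.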
+ unsigned NumInsts = NumElem /* Number of compares */ + + ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */; + return NumInsts <= 16; +} + +static bool shouldExpandVectorDynExt(SDNode *N) { + SDValue Idx = N->getOperand(N->getNumOperands() - 1); + if (isa<ConstantSDNode>(Idx)) + return false; + + SDValue Vec = N->getOperand(0); + EVT VecVT = Vec.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned EltSize = EltVT.getSizeInBits(); + unsigned NumElem = VecVT.getVectorNumElements(); + + return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem, + Idx->isDivergent()); +} + SDValue SITargetLowering::performExtractVectorEltCombine( SDNode *N, DAGCombinerInfo &DCI) const { SDValue Vec = N->getOperand(0); @@ -9336,18 +9865,12 @@ SDValue SITargetLowering::performExtractVectorEltCombine( unsigned EltSize = EltVT.getSizeInBits(); // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx) - // This elminates non-constant index and subsequent movrel or scratch access. - // Sub-dword vectors of size 2 dword or less have better implementation. - // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 - // instructions. - if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) && - !isa<ConstantSDNode>(N->getOperand(1))) { + if (::shouldExpandVectorDynExt(N)) { SDLoc SL(N); SDValue Idx = N->getOperand(1); - EVT IdxVT = Idx.getValueType(); SDValue V; for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) { - SDValue IC = DAG.getConstant(I, SL, IdxVT); + SDValue IC = DAG.getVectorIdxConstant(I, SL); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC); if (I == 0) V = Elt; @@ -9402,17 +9925,10 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N, SDValue Idx = N->getOperand(2); EVT VecVT = Vec.getValueType(); EVT EltVT = VecVT.getVectorElementType(); - unsigned VecSize = VecVT.getSizeInBits(); - unsigned EltSize = EltVT.getSizeInBits(); // INSERT_VECTOR_ELT (<n x e>, var-idx) // => BUILD_VECTOR n x select (e, const-idx) - // This elminates non-constant index and subsequent movrel or scratch access. - // Sub-dword vectors of size 2 dword or less have better implementation. - // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32 - // instructions. - if (isa<ConstantSDNode>(Idx) || - VecSize > 256 || (VecSize <= 64 && EltSize < 32)) + if (!::shouldExpandVectorDynExt(N)) return SDValue(); SelectionDAG &DAG = DCI.DAG; @@ -9919,39 +10435,50 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N, unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0; SDValue Src = N->getOperand(0); - SDValue Srl = N->getOperand(0); - if (Srl.getOpcode() == ISD::ZERO_EXTEND) - Srl = Srl.getOperand(0); + SDValue Shift = N->getOperand(0); - // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero. - if (Srl.getOpcode() == ISD::SRL) { + // TODO: Extend type shouldn't matter (assuming legal types). 
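The CVT_F32_UBYTEn combine that continues below folds a constant shift into the byte index: a left shift lowers the selected byte, a right shift raises it, and the fold is only taken when the adjusted offset still names a whole byte inside the 32-bit source. A sketch of that arithmetic, with byteAfterShiftFold as an illustrative helper, not part of the patch:

// Illustrative helper (not from the patch): byte selected after folding a
// constant shift into CVT_F32_UBYTE<Offset>; returns -1 if the fold fails.
constexpr int byteAfterShiftFold(bool IsShl, int Offset, int ShiftAmt) {
  const int Bits = 8 * Offset + (IsShl ? -ShiftAmt : ShiftAmt);
  return (Bits < 0 || Bits >= 32 || Bits % 8 != 0) ? -1 : Bits / 8;
}
static_assert(byteAfterShiftFold(true, 1, 8) == 0,
              "cvt_f32_ubyte1 (shl x, 8)  -> cvt_f32_ubyte0 x");
static_assert(byteAfterShiftFold(true, 3, 16) == 1,
              "cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x");
static_assert(byteAfterShiftFold(false, 0, 16) == 2,
              "cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x");
static_assert(byteAfterShiftFold(false, 0, 12) == -1,
              "shift amounts that are not a multiple of 8 are not folded");

The code below first looks through a zero-extend (hence the TODO about the extend type) and then performs exactly this offset adjustment before falling back to SimplifyDemandedBits.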
+ if (Shift.getOpcode() == ISD::ZERO_EXTEND) + Shift = Shift.getOperand(0); + + if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) { + // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x + // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x - // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x - - if (const ConstantSDNode *C = - dyn_cast<ConstantSDNode>(Srl.getOperand(1))) { - Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)), - EVT(MVT::i32)); + // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x + if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) { + Shift = DAG.getZExtOrTrunc(Shift.getOperand(0), + SDLoc(Shift.getOperand(0)), MVT::i32); + + unsigned ShiftOffset = 8 * Offset; + if (Shift.getOpcode() == ISD::SHL) + ShiftOffset -= C->getZExtValue(); + else + ShiftOffset += C->getZExtValue(); - unsigned SrcOffset = C->getZExtValue() + 8 * Offset; - if (SrcOffset < 32 && SrcOffset % 8 == 0) { - return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL, - MVT::f32, Srl); + if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) { + return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL, + MVT::f32, Shift); } } } - APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); - - KnownBits Known; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) { - DCI.CommitTargetLoweringOpt(TLO); + APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8); + if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) { + // We simplified Src. If this node is not dead, visit it again so it is + // folded properly. + if (N->getOpcode() != ISD::DELETED_NODE) + DCI.AddToWorklist(N); + return SDValue(N, 0); } + // Handle (or x, (srl y, 8)) pattern when known bits are zero. + if (SDValue DemandedSrc = + TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG)) + return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc); + return SDValue(); } @@ -9964,16 +10491,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, const MachineFunction &MF = DCI.DAG.getMachineFunction(); const APFloat &F = CSrc->getValueAPF(); APFloat Zero = APFloat::getZero(F.getSemantics()); - APFloat::cmpResult Cmp0 = F.compare(Zero); - if (Cmp0 == APFloat::cmpLessThan || - (Cmp0 == APFloat::cmpUnordered && - MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { + if (F < Zero || + (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); } APFloat One(F.getSemantics(), "1.0"); - APFloat::cmpResult Cmp1 = F.compare(One); - if (Cmp1 == APFloat::cmpGreaterThan) + if (F > One) return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0)); return SDValue(CSrc, 0); @@ -10061,10 +10585,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case AMDGPUISD::FRACT: case AMDGPUISD::RSQ: case AMDGPUISD::RCP_LEGACY: - case AMDGPUISD::RSQ_LEGACY: case AMDGPUISD::RCP_IFLAG: case AMDGPUISD::RSQ_CLAMP: case AMDGPUISD::LDEXP: { + // FIXME: This is probably wrong. 
If src is an sNaN, it won't be quieted SDValue Src = N->getOperand(0); if (Src.isUndef()) return Src; @@ -10406,24 +10930,6 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, Ops.push_back(ImpDef.getValue(1)); return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } - case AMDGPU::V_PERMLANE16_B32: - case AMDGPU::V_PERMLANEX16_B32: { - ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0)); - ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2)); - if (!FI->getZExtValue() && !BC->getZExtValue()) - break; - SDValue VDstIn = Node->getOperand(6); - if (VDstIn.isMachineOpcode() - && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) - break; - MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, - SDLoc(Node), MVT::i32); - SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1), - SDValue(BC, 0), Node->getOperand(3), - Node->getOperand(4), Node->getOperand(5), - SDValue(ImpDef, 0), Node->getOperand(7) }; - return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); - } default: break; } @@ -10592,89 +11098,50 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, MVT VT) const { const TargetRegisterClass *RC = nullptr; if (Constraint.size() == 1) { + const unsigned BitWidth = VT.getSizeInBits(); switch (Constraint[0]) { default: return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); case 's': case 'r': - switch (VT.getSizeInBits()) { - default: - return std::make_pair(0U, nullptr); - case 32: + switch (BitWidth) { case 16: RC = &AMDGPU::SReg_32RegClass; break; case 64: RC = &AMDGPU::SGPR_64RegClass; break; - case 96: - RC = &AMDGPU::SReg_96RegClass; - break; - case 128: - RC = &AMDGPU::SGPR_128RegClass; - break; - case 160: - RC = &AMDGPU::SReg_160RegClass; - break; - case 256: - RC = &AMDGPU::SReg_256RegClass; - break; - case 512: - RC = &AMDGPU::SReg_512RegClass; + default: + RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth); + if (!RC) + return std::make_pair(0U, nullptr); break; } break; case 'v': - switch (VT.getSizeInBits()) { - default: - return std::make_pair(0U, nullptr); - case 32: + switch (BitWidth) { case 16: RC = &AMDGPU::VGPR_32RegClass; break; - case 64: - RC = &AMDGPU::VReg_64RegClass; - break; - case 96: - RC = &AMDGPU::VReg_96RegClass; - break; - case 128: - RC = &AMDGPU::VReg_128RegClass; - break; - case 160: - RC = &AMDGPU::VReg_160RegClass; - break; - case 256: - RC = &AMDGPU::VReg_256RegClass; - break; - case 512: - RC = &AMDGPU::VReg_512RegClass; + default: + RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth); + if (!RC) + return std::make_pair(0U, nullptr); break; } break; case 'a': if (!Subtarget->hasMAIInsts()) break; - switch (VT.getSizeInBits()) { - default: - return std::make_pair(0U, nullptr); - case 32: + switch (BitWidth) { case 16: RC = &AMDGPU::AGPR_32RegClass; break; - case 64: - RC = &AMDGPU::AReg_64RegClass; - break; - case 128: - RC = &AMDGPU::AReg_128RegClass; - break; - case 512: - RC = &AMDGPU::AReg_512RegClass; + default: + RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth); + if (!RC) + return std::make_pair(0U, nullptr); break; - case 1024: - RC = &AMDGPU::AReg_1024RegClass; - // v32 types are not legal but we support them here. 
- return std::make_pair(0U, RC); } break; } @@ -10701,9 +11168,29 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(RC->getRegister(Idx), RC); } } + + // FIXME: Returns VS_32 for physical SGPR constraints return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } +static bool isImmConstraint(StringRef Constraint) { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + default: break; + case 'I': + case 'J': + case 'A': + case 'B': + case 'C': + return true; + } + } else if (Constraint == "DA" || + Constraint == "DB") { + return true; + } + return false; +} + SITargetLowering::ConstraintType SITargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { @@ -10715,9 +11202,115 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { return C_RegisterClass; } } + if (isImmConstraint(Constraint)) { + return C_Other; + } return TargetLowering::getConstraintType(Constraint); } +static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) { + if (!AMDGPU::isInlinableIntLiteral(Val)) { + Val = Val & maskTrailingOnes<uint64_t>(Size); + } + return Val; +} + +void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op, + std::string &Constraint, + std::vector<SDValue> &Ops, + SelectionDAG &DAG) const { + if (isImmConstraint(Constraint)) { + uint64_t Val; + if (getAsmOperandConstVal(Op, Val) && + checkAsmConstraintVal(Op, Constraint, Val)) { + Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits()); + Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64)); + } + } else { + TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); + } +} + +bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const { + unsigned Size = Op.getScalarValueSizeInBits(); + if (Size > 64) + return false; + + if (Size == 16 && !Subtarget->has16BitInsts()) + return false; + + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + Val = C->getSExtValue(); + return true; + } + if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) { + Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); + return true; + } + if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) { + if (Size != 16 || Op.getNumOperands() != 2) + return false; + if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef()) + return false; + if (ConstantSDNode *C = V->getConstantSplatNode()) { + Val = C->getSExtValue(); + return true; + } + if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) { + Val = C->getValueAPF().bitcastToAPInt().getSExtValue(); + return true; + } + } + + return false; +} + +bool SITargetLowering::checkAsmConstraintVal(SDValue Op, + const std::string &Constraint, + uint64_t Val) const { + if (Constraint.size() == 1) { + switch (Constraint[0]) { + case 'I': + return AMDGPU::isInlinableIntLiteral(Val); + case 'J': + return isInt<16>(Val); + case 'A': + return checkAsmConstraintValA(Op, Val); + case 'B': + return isInt<32>(Val); + case 'C': + return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) || + AMDGPU::isInlinableIntLiteral(Val); + default: + break; + } + } else if (Constraint.size() == 2) { + if (Constraint == "DA") { + int64_t HiBits = static_cast<int32_t>(Val >> 32); + int64_t LoBits = static_cast<int32_t>(Val); + return checkAsmConstraintValA(Op, HiBits, 32) && + checkAsmConstraintValA(Op, LoBits, 32); + } + if (Constraint == "DB") { + return true; + } + } + llvm_unreachable("Invalid asm constraint"); +} + +bool 
SITargetLowering::checkAsmConstraintValA(SDValue Op, + uint64_t Val, + unsigned MaxSize) const { + unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize); + bool HasInv2Pi = Subtarget->hasInv2PiInlineImm(); + if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) || + (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) || + (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) { + return true; + } + return false; +} + // Figure out which registers should be reserved for stack access. Only after // the function is legalized do we know all of the non-spill stack objects or if // calls are present. @@ -10745,11 +11338,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { if (Info->getFrameOffsetReg() != AMDGPU::FP_REG) MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); - if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) { - MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, - Info->getScratchWaveOffsetReg()); - } - Info->limitOccupancy(MF); if (ST.isWave32() && !MF.empty()) { @@ -10772,15 +11360,18 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { } TargetLoweringBase::finalizeLowering(MF); + + // Allocate a VGPR for future SGPR Spill if + // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used + // FIXME: We won't need this hack if we split SGPR allocation from VGPR + if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill && + !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects()) + Info->reserveVGPRforSGPRSpills(MF); } -void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, - KnownBits &Known, - const APInt &DemandedElts, - const SelectionDAG &DAG, - unsigned Depth) const { - TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, - DAG, Depth); +void SITargetLowering::computeKnownBitsForFrameIndex( + const int FI, KnownBits &Known, const MachineFunction &MF) const { + TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF); // Set the high bits to zero based on the maximum allowed scratch size per // wave. We can't use vaddr in MUBUF instructions if we don't know the address @@ -10788,6 +11379,27 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); } +Align SITargetLowering::computeKnownAlignForTargetInstr( + GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI, + unsigned Depth) const { + const MachineInstr *MI = MRI.getVRegDef(R); + switch (MI->getOpcode()) { + case AMDGPU::G_INTRINSIC: + case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { + // FIXME: Can this move to generic code? What about the case where the call + // site specifies a lower alignment? 
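For reference, the immediate constraints introduced by the inline asm hunks above are validated as follows: 'I' accepts an inlinable integer literal, 'J' a signed 16-bit immediate, 'A' an inlinable literal of the operand's size (respecting hasInv2PiInlineImm()), 'B' a signed 32-bit immediate, 'C' an unsigned 32-bit immediate or an inlinable integer literal, 'DA' a 64-bit value whose high and low 32-bit halves are each inlinable, and 'DB' any 64-bit immediate. A hypothetical use when compiling with clang for amdgcn; the asm text is illustrative only and not taken from the patch:

// Illustrative only: 'I' guarantees the literal can be encoded as an inline
// constant (-16..64), so no extra literal dword is emitted.
void set_to_inline_const(int &Out) {
  __asm__("s_mov_b32 %0, %1" : "=s"(Out) : "I"(42));
}

The computeKnownAlignForTargetInstr body that continues below is independent of this: it simply reports the declared return-value alignment of the intrinsic being inspected.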
+ Intrinsic::ID IID = MI->getIntrinsicID(); + LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext(); + AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID); + if (MaybeAlign RetAlign = Attrs.getRetAlignment()) + return *RetAlign; + return Align(1); + } + default: + return Align(1); + } +} + Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML); const Align CacheLineAlign = Align(64); @@ -10879,30 +11491,19 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N, case ISD::CopyFromReg: { const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1)); - const MachineFunction * MF = FLI->MF; - const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); - const MachineRegisterInfo &MRI = MF->getRegInfo(); - const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo(); - unsigned Reg = R->getReg(); - if (Register::isPhysicalRegister(Reg)) - return !TRI.isSGPRReg(MRI, Reg); - - if (MRI.isLiveIn(Reg)) { - // workitem.id.x workitem.id.y workitem.id.z - // Any VGPR formal argument is also considered divergent - if (!TRI.isSGPRReg(MRI, Reg)) - return true; - // Formal arguments of non-entry functions - // are conservatively considered divergent - else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv())) - return true; - return false; - } - const Value *V = FLI->getValueFromVirtualReg(Reg); - if (V) + const MachineRegisterInfo &MRI = FLI->MF->getRegInfo(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + Register Reg = R->getReg(); + + // FIXME: Why does this need to consider isLiveIn? + if (Reg.isPhysical() || MRI.isLiveIn(Reg)) + return !TRI->isSGPRReg(MRI, Reg); + + if (const Value *V = FLI->getValueFromVirtualReg(R->getReg())) return KDA->isDivergent(V); + assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N)); - return !TRI.isSGPRReg(MRI, Reg); + return !TRI->isSGPRReg(MRI, Reg); } break; case ISD::LOAD: { @@ -11004,7 +11605,19 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const { return RC; } -static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) { +// FIXME: This is a workaround for DivergenceAnalysis not understanding always +// uniform values (as produced by the mask results of control flow intrinsics) +// used outside of divergent blocks. The phi users need to also be treated as +// always uniform. +static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited, + unsigned WaveSize) { + // FIXME: We asssume we never cast the mask results of a control flow + // intrinsic. + // Early exit if the type won't be consistent as a compile time hack. 
+ IntegerType *IT = dyn_cast<IntegerType>(V->getType()); + if (!IT || IT->getBitWidth() != WaveSize) + return false; + if (!isa<Instruction>(V)) return false; if (!Visited.insert(V).second) @@ -11036,7 +11649,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) { } } } else { - Result = hasCFUser(U, Visited); + Result = hasCFUser(U, Visited, WaveSize); } if (Result) break; @@ -11046,36 +11659,16 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) { bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, const Value *V) const { - if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) { - switch (Intrinsic->getIntrinsicID()) { - default: - return false; - case Intrinsic::amdgcn_if_break: - return true; - } - } - if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) { - if (const IntrinsicInst *Intrinsic = - dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) { - switch (Intrinsic->getIntrinsicID()) { - default: - return false; - case Intrinsic::amdgcn_if: - case Intrinsic::amdgcn_else: { - ArrayRef<unsigned> Indices = ExtValue->getIndices(); - if (Indices.size() == 1 && Indices[0] == 1) { - return true; - } - } - } - } - } if (const CallInst *CI = dyn_cast<CallInst>(V)) { - if (isa<InlineAsm>(CI->getCalledValue())) { + if (CI->isInlineAsm()) { + // FIXME: This cannot give a correct answer. This should only trigger in + // the case where inline asm returns mixed SGPR and VGPR results, used + // outside the defining block. We don't have a specific result to + // consider, so this assumes if any value is SGPR, the overall register + // also needs to be SGPR. const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo(); - ImmutableCallSite CS(CI); TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints( - MF.getDataLayout(), Subtarget->getRegisterInfo(), CS); + MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI); for (auto &TC : TargetConstraints) { if (TC.Type == InlineAsm::isOutput) { ComputeConstraintToUse(TC, SDValue()); @@ -11095,5 +11688,20 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF, } } SmallPtrSet<const Value *, 16> Visited; - return hasCFUser(V, Visited); + return hasCFUser(V, Visited, Subtarget->getWavefrontSize()); +} + +std::pair<int, MVT> +SITargetLowering::getTypeLegalizationCost(const DataLayout &DL, + Type *Ty) const { + auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty); + auto Size = DL.getTypeSizeInBits(Ty); + // Maximum load or store can handle 8 dwords for scalar and 4 for + // vector ALU. Let's assume anything above 8 dwords is expensive + // even if legal. + if (Size <= 256) + return Cost; + + Cost.first = (Size + 255) / 256; + return Cost; } |
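Finally, a quick numeric check of the new getTypeLegalizationCost override, under the assumption that the first member of the returned pair is the cost factor callers scale by; costFactor is an illustrative helper, not part of the patch:

// Illustrative helper (not from the patch): types wider than 8 dwords
// (256 bits) are charged one unit per 256-bit piece, even when legal.
constexpr unsigned costFactor(unsigned SizeInBits, unsigned BaseCost) {
  return SizeInBits <= 256 ? BaseCost : (SizeInBits + 255) / 256;
}
static_assert(costFactor(256, 1) == 1, "v8i32 keeps its base cost");
static_assert(costFactor(512, 1) == 2, "v16i32 counts as two 8-dword pieces");
static_assert(costFactor(1024, 1) == 4, "v32i32 and v16i64 count as four");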