Diffstat (limited to 'lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SIISelLowering.cpp | 1918
1 file changed, 1521 insertions, 397 deletions
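Editorial note, not part of the patch: one of the larger behavioral changes below is that the three workitem IDs, which previously each occupied their own VGPR argument, are now packed into a single 32-bit VGPR with X in bits [9:0], Y in bits [19:10], and Z in bits [29:20] (see allocateSpecialInputVGPRs and passSpecialInputs in the diff). A minimal standalone C++ sketch of that layout follows; packWorkItemIDs is a hypothetical helper name, while the 0x3ff mask and the 10/20-bit shifts come from the patch itself:

    #include <cassert>
    #include <cstdint>

    // Illustrative only: the patch emits SHL/OR SelectionDAG nodes with
    // these same shifts and field mask rather than calling a helper.
    uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
      const uint32_t Mask = 0x3ff; // 10 bits per workitem ID
      assert(X <= Mask && Y <= Mask && Z <= Mask);
      return (X & Mask) | ((Y & Mask) << 10) | ((Z & Mask) << 20);
    }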
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 0ba921647097..db0782e2bf3e 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1,9 +1,8 @@
 //===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
 //
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
 //
@@ -19,7 +18,6 @@
 #include "SIISelLowering.h"
 #include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "SIDefines.h"
@@ -95,11 +93,10 @@ static cl::opt<bool> EnableVGPRIndexMode(
   cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
   cl::init(false));
 
-static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
-  "amdgpu-frame-index-zero-bits",
-  cl::desc("High bits of frame index assumed to be zero"),
-  cl::init(5),
-  cl::ReallyHidden);
+static cl::opt<bool> DisableLoopAlignment(
+  "amdgpu-disable-loop-alignment",
+  cl::desc("Do not align and prefetch loops"),
+  cl::init(false));
 
 static unsigned findFirstFreeSGPR(CCState &CCInfo) {
   unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
@@ -125,12 +122,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
 
+  addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
+  addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
+
   addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
 
   addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
 
+  addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
+  addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
+
   addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
 
@@ -148,18 +151,27 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
   }
 
+  if (Subtarget->hasMAIInsts()) {
+    addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
+    addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
+  }
+
   computeRegisterProperties(Subtarget->getRegisterInfo());
 
   // We need to custom lower vector stores from local memory
   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v3i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
+  setOperationAction(ISD::LOAD, MVT::v5i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v8i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v16i32, Custom);
   setOperationAction(ISD::LOAD, MVT::i1, Custom);
   setOperationAction(ISD::LOAD, MVT::v32i32, Custom);
 
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v3i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
+  setOperationAction(ISD::STORE, MVT::v5i32, Custom);
   setOperationAction(ISD::STORE, MVT::v8i32, Custom);
   setOperationAction(ISD::STORE, MVT::v16i32, Custom);
   setOperationAction(ISD::STORE, MVT::i1, Custom);
@@ -218,11 +230,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom);
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom);
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom);
 
   setOperationAction(ISD::BRCOND, MVT::Other, Custom);
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -248,8 +264,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
-  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
-                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16, MVT::v32i32 }) {
+  for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+                  MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
+                  MVT::v32i32, MVT::v32f32 }) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -323,6 +340,18 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
   setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
 
+  // Deal with vec3 vector operations when widened to vec4.
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v3f32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4f32, Custom);
+
+  // Deal with vec5 vector operations when widened to vec8.
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v5f32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i32, Custom);
+  setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8f32, Custom);
+
   // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
   // and output demarshalling
   setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
@@ -400,7 +429,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::FMAXNUM_IEEE, MVT::f64, Legal);
 
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
+  if (Subtarget->haveRoundOpsF64()) {
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
     setOperationAction(ISD::FRINT, MVT::f64, Legal);
@@ -492,7 +521,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   // F16 - VOP3 Actions.
   setOperationAction(ISD::FMA, MVT::f16, Legal);
-  if (!Subtarget->hasFP16Denormals())
+  if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
     setOperationAction(ISD::FMAD, MVT::f16, Legal);
 
   for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -607,6 +636,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
 
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f16, Custom);
+    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
+
     setOperationAction(ISD::SHL, MVT::v4i16, Custom);
     setOperationAction(ISD::SRA, MVT::v4i16, Custom);
     setOperationAction(ISD::SRL, MVT::v4i16, Custom);
@@ -679,6 +711,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FCANONICALIZE);
   setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
   setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
@@ -701,13 +734,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::ATOMIC_LOAD_MAX);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMIN);
   setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
+  setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
 
   setSchedulingPreference(Sched::RegPressure);
-
-  // SI at least has hardware support for floating point exceptions, but no way
-  // of using or handling them is implemented. They are also optional in OpenCL
-  // (Section 7.3)
-  setHasFloatingPointExceptions(Subtarget->hasFPExceptions());
 }
 
 const GCNSubtarget *SITargetLowering::getSubtarget() const {
@@ -910,6 +939,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_ds_fadd:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
@@ -919,13 +950,75 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.align = 0;
     Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
 
+    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
+    if (!Vol->isZero())
+      Info.flags |= MachineMemOperand::MOVolatile;
+
+    return true;
+  }
+  case Intrinsic::amdgcn_buffer_atomic_fadd: {
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
+    Info.ptrVal = MFI->getBufferPSV(
+      *MF.getSubtarget<GCNSubtarget>().getInstrInfo(),
+      CI.getArgOperand(1));
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
     const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
     if (!Vol || !Vol->isZero())
       Info.flags |= MachineMemOperand::MOVolatile;
 
     return true;
   }
+  case Intrinsic::amdgcn_global_atomic_fadd: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    Info.memVT = MVT::getVT(CI.getOperand(0)->getType()
+                            ->getPointerElementType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+    return true;
+  }
+  case Intrinsic::amdgcn_ds_append:
+  case Intrinsic::amdgcn_ds_consume: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(CI.getType());
+    Info.ptrVal = CI.getOperand(0);
+    Info.align = 0;
+    Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+    const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
+    if (!Vol->isZero())
+      Info.flags |= MachineMemOperand::MOVolatile;
+
+    return true;
+  }
+  case Intrinsic::amdgcn_ds_gws_init:
+  case Intrinsic::amdgcn_ds_gws_barrier:
+  case Intrinsic::amdgcn_ds_gws_sema_v:
+  case Intrinsic::amdgcn_ds_gws_sema_br:
+  case Intrinsic::amdgcn_ds_gws_sema_p:
+  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
+    Info.opc = ISD::INTRINSIC_VOID;
+
+    SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    Info.ptrVal =
+        MFI->getGWSPSV(*MF.getSubtarget<GCNSubtarget>().getInstrInfo());
+
+    // This is an abstract access, but we need to specify a type and size.
+    Info.memVT = MVT::i32;
+    Info.size = 4;
+    Info.align = 4;
+
+    Info.flags = MachineMemOperand::MOStore;
+    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
+      Info.flags = MachineMemOperand::MOLoad;
+    return true;
+  }
   default:
     return false;
   }
@@ -937,6 +1030,8 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
   switch (II->getIntrinsicID()) {
   case Intrinsic::amdgcn_atomic_inc:
   case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_ds_ordered_add:
+  case Intrinsic::amdgcn_ds_ordered_swap:
   case Intrinsic::amdgcn_ds_fadd:
   case Intrinsic::amdgcn_ds_fmin:
   case Intrinsic::amdgcn_ds_fmax: {
@@ -960,6 +1055,13 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
 
   // GFX9 added a 13-bit signed offset. When using regular flat instructions,
   // the sign bit is ignored and is treated as a 12-bit unsigned offset.
 
+  // GFX10 shrinked signed offset to 12 bits. When using regular flat
+  // instructions, the sign bit is also ignored and is treated as 11-bit
+  // unsigned offset.
+
+  if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
+    return isUInt<11>(AM.BaseOffs) && AM.Scale == 0;
+
   // Just r + i
   return isUInt<12>(AM.BaseOffs) && AM.Scale == 0;
 }
@@ -1030,7 +1132,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
     return isLegalGlobalAddressingMode(AM);
 
   if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
-      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+      AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
+      AS == AMDGPUAS::BUFFER_FAT_POINTER) {
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
     // FIXME: Can we get the real alignment here?
@@ -1106,16 +1209,15 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
   } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
     unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
     return (MemVT.getSizeInBits() <= MaxPrivateBits);
-  } else if (AS == AMDGPUAS::LOCAL_ADDRESS) {
+  } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
     return (MemVT.getSizeInBits() <= 2 * 32);
   }
   return true;
 }
 
-bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
-                                                      unsigned AddrSpace,
-                                                      unsigned Align,
-                                                      bool *IsFast) const {
+bool SITargetLowering::allowsMisalignedMemoryAccesses(
+    EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+    bool *IsFast) const {
   if (IsFast)
     *IsFast = false;
 
@@ -1178,11 +1280,10 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   return VT.bitsGT(MVT::i32) && Align % 4 == 0;
 }
 
-EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
-                                          unsigned SrcAlign, bool IsMemset,
-                                          bool ZeroMemset,
-                                          bool MemcpyStrSrc,
-                                          MachineFunction &MF) const {
+EVT SITargetLowering::getOptimalMemOpType(
+    uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
+    bool ZeroMemset, bool MemcpyStrSrc,
+    const AttributeList &FuncAttributes) const {
   // FIXME: Should account for address space here.
 
   // The default fallback uses the private pointer size as a guess for a type to
@@ -1201,7 +1302,8 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
 
 static bool isFlatGlobalAddrSpace(unsigned AS) {
   return AS == AMDGPUAS::GLOBAL_ADDRESS ||
          AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::CONSTANT_ADDRESS;
+         AS == AMDGPUAS::CONSTANT_ADDRESS ||
+         AS > AMDGPUAS::MAX_AMDGPU_ADDRESS;
 }
 
 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -1216,8 +1318,8 @@ bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
   return I && I->getMetadata("amdgpu.noclobber");
 }
 
-bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
-                                            unsigned DestAS) const {
+bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
+                                           unsigned DestAS) const {
   // Flat -> private/local is a simple truncate.
   // Flat -> global is no-op
   if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
@@ -1305,6 +1407,17 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          const SDLoc &SL, SDValue Val,
                                          bool Signed,
                                          const ISD::InputArg *Arg) const {
+  // First, if it is a widened vector, narrow it.
+  if (VT.isVector() &&
+      VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
+    EVT NarrowedVT =
+        EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
+                         VT.getVectorNumElements());
+    Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
+                      DAG.getConstant(0, SL, MVT::i32));
+  }
+
+  // Then convert the vector elements or scalar value.
   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
       VT.bitsLT(MemVT)) {
     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
@@ -1441,8 +1554,7 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
 
     // First check if it's a PS input addr.
     if (CallConv == CallingConv::AMDGPU_PS &&
-        !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
-
+        !Arg->Flags.isInReg() && PSInputNum <= 15) {
       bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
 
       // Inconveniently only the first part of the split is marked as isSplit,
@@ -1508,7 +1620,13 @@ static void allocateSpecialEntryInputVGPRs(CCState &CCInfo,
 
 // Try to allocate a VGPR at the end of the argument list, or if no argument
 // VGPRs are left allocating a stack slot.
-static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
+// If \p Mask is is given it indicates bitfield position in the register.
+// If \p Arg is given use it with new ]p Mask instead of allocating new.
+static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
+                                         ArgDescriptor Arg = ArgDescriptor()) {
+  if (Arg.isSet())
+    return ArgDescriptor::createArg(Arg, Mask);
+
   ArrayRef<MCPhysReg> ArgVGPRs
     = makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
   unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
@@ -1516,7 +1634,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
     // Spill to stack required.
     int64_t Offset = CCInfo.AllocateStack(4, 4);
 
-    return ArgDescriptor::createStack(Offset);
+    return ArgDescriptor::createStack(Offset, Mask);
   }
 
   unsigned Reg = ArgVGPRs[RegIdx];
@@ -1525,7 +1643,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo) {
   MachineFunction &MF = CCInfo.getMachineFunction();
   MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
 
-  return ArgDescriptor::createRegister(Reg);
+  return ArgDescriptor::createRegister(Reg, Mask);
 }
 
 static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
@@ -1557,14 +1675,21 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo,
                                       MachineFunction &MF,
                                       const SIRegisterInfo &TRI,
                                       SIMachineFunctionInfo &Info) {
-  if (Info.hasWorkItemIDX())
-    Info.setWorkItemIDX(allocateVGPR32Input(CCInfo));
+  const unsigned Mask = 0x3ff;
+  ArgDescriptor Arg;
+
+  if (Info.hasWorkItemIDX()) {
+    Arg = allocateVGPR32Input(CCInfo, Mask);
+    Info.setWorkItemIDX(Arg);
+  }
 
-  if (Info.hasWorkItemIDY())
-    Info.setWorkItemIDY(allocateVGPR32Input(CCInfo));
+  if (Info.hasWorkItemIDY()) {
+    Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
+    Info.setWorkItemIDY(Arg);
+  }
 
   if (Info.hasWorkItemIDZ())
-    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo));
+    Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
 }
 
 static void allocateSpecialInputSGPRs(CCState &CCInfo,
@@ -1714,6 +1839,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
   // should reserve the arguments and use them directly.
   MachineFrameInfo &MFI = MF.getFrameInfo();
   bool HasStackObjects = MFI.hasStackObjects();
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
   // Record that we know we have non-spill stack objects so we don't need to
   // check all stack objects later.
@@ -1729,65 +1855,89 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
   // the scratch registers to pass in.
   bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
 
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  if (ST.isAmdHsaOrMesa(MF.getFunction())) {
-    if (RequiresStackAccess) {
-      // If we have stack objects, we unquestionably need the private buffer
-      // resource. For the Code Object V2 ABI, this will be the first 4 user
-      // SGPR inputs. We can reserve those and use them directly.
-
-      unsigned PrivateSegmentBufferReg = Info.getPreloadedReg(
-        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
-      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
-
-      if (MFI.hasCalls()) {
-        // If we have calls, we need to keep the frame register in a register
-        // that won't be clobbered by a call, so ensure it is copied somewhere.
-
-        // This is not a problem for the scratch wave offset, because the same
-        // registers are reserved in all functions.
-
-        // FIXME: Nothing is really ensuring this is a call preserved register,
-        // it's just selected from the end so it happens to be.
-        unsigned ReservedOffsetReg
-          = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-        Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-      } else {
-        unsigned PrivateSegmentWaveByteOffsetReg = Info.getPreloadedReg(
-          AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-        Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
-      }
-    } else {
-      unsigned ReservedBufferReg
-        = TRI.reservedPrivateSegmentBufferReg(MF);
-      unsigned ReservedOffsetReg
-        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
-
-      // We tentatively reserve the last registers (skipping the last two
-      // which may contain VCC). After register allocation, we'll replace
-      // these with the ones immediately after those which were really
-      // allocated. In the prologue copies will be inserted from the argument
-      // to these reserved registers.
-      Info.setScratchRSrcReg(ReservedBufferReg);
-      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
-    }
+  if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
+    // If we have stack objects, we unquestionably need the private buffer
+    // resource. For the Code Object V2 ABI, this will be the first 4 user
+    // SGPR inputs. We can reserve those and use them directly.
+
+    unsigned PrivateSegmentBufferReg =
+        Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+    Info.setScratchRSrcReg(PrivateSegmentBufferReg);
   } else {
     unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
+    // We tentatively reserve the last registers (skipping the last registers
+    // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
+    // we'll replace these with the ones immediately after those which were
+    // really allocated. In the prologue copies will be inserted from the
+    // argument to these reserved registers.
 
     // Without HSA, relocations are used for the scratch pointer and the
     // buffer resource setup is always inserted in the prologue. Scratch wave
     // offset is still in an input SGPR.
     Info.setScratchRSrcReg(ReservedBufferReg);
+  }
 
-    if (HasStackObjects && !MFI.hasCalls()) {
-      unsigned ScratchWaveOffsetReg = Info.getPreloadedReg(
-        AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-      Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+  // hasFP should be accurate for kernels even before the frame is finalized.
+  if (ST.getFrameLowering()->hasFP(MF)) {
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+
+    // Try to use s32 as the SP, but move it if it would interfere with input
+    // arguments. This won't work with calls though.
+    //
+    // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
+    // registers.
+    if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
+      Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
     } else {
-      unsigned ReservedOffsetReg
-        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+      assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
+
+      if (MFI.hasCalls())
+        report_fatal_error("call in graphics shader with too many input SGPRs");
+
+      for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
+        if (!MRI.isLiveIn(Reg)) {
+          Info.setStackPtrOffsetReg(Reg);
+          break;
+        }
+      }
+
+      if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
+        report_fatal_error("failed to find register for SP");
+    }
+
+    if (MFI.hasCalls()) {
+      Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
+      Info.setFrameOffsetReg(AMDGPU::SGPR33);
+    } else {
+      unsigned ReservedOffsetReg =
+        TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
       Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+      Info.setFrameOffsetReg(ReservedOffsetReg);
+    }
+  } else if (RequiresStackAccess) {
+    assert(!MFI.hasCalls());
+    // We know there are accesses and they will be done relative to SP, so just
+    // pin it to the input.
+    //
+    // FIXME: Should not do this if inline asm is reading/writing these
+    // registers.
+    unsigned PreloadedSP = Info.getPreloadedReg(
+      AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
+
+    Info.setStackPtrOffsetReg(PreloadedSP);
+    Info.setScratchWaveOffsetReg(PreloadedSP);
+    Info.setFrameOffsetReg(PreloadedSP);
+  } else {
+    assert(!MFI.hasCalls());
+
+    // There may not be stack access at all. There may still be spills, or
+    // access of a constant pointer (in which cases an extra copy will be
+    // emitted in the prolog).
+    unsigned ReservedOffsetReg
+      = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+    Info.setStackPtrOffsetReg(ReservedOffsetReg);
+    Info.setScratchWaveOffsetReg(ReservedOffsetReg);
+    Info.setFrameOffsetReg(ReservedOffsetReg);
   }
 }
 
@@ -1845,7 +1995,6 @@ SDValue SITargetLowering::LowerFormalArguments(
   const Function &Fn = MF.getFunction();
   FunctionType *FType = MF.getFunction().getFunctionType();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
 
   if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
     DiagnosticInfoUnsupported NoGraphicsHSA(
@@ -1854,11 +2003,6 @@ SDValue SITargetLowering::LowerFormalArguments(
     return DAG.getEntryNode();
   }
 
-  // Create stack objects that are used for emitting debugger prologue if
-  // "amdgpu-debugger-emit-prologue" attribute was specified.
-  if (ST.debuggerEmitPrologue())
-    createDebuggerPrologueStackObjects(MF);
-
   SmallVector<ISD::InputArg, 16> Splits;
   SmallVector<CCValAssign, 16> ArgLocs;
   BitVector Skipped(Ins.size());
@@ -1869,12 +2013,6 @@ SDValue SITargetLowering::LowerFormalArguments(
   bool IsKernel = AMDGPU::isKernel(CallConv);
   bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
 
-  if (!IsEntryFunc) {
-    // 4 bytes are reserved at offset 0 for the emergency stack slot. Skip over
-    // this when allocating argument fixed offsets.
-    CCInfo.AllocateStack(4, 4);
-  }
-
   if (IsShader) {
     processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
@@ -1975,7 +2113,8 @@ SDValue SITargetLowering::LowerFormalArguments(
       auto *ParamTy =
         dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
       if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+          ParamTy && (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
+                      ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
         // On SI local pointers are just offsets into LDS, so they are always
         // less than 16-bits.  On CI and newer they could potentially be
         // real pointers, so we can't guarantee their size.
@@ -2002,13 +2141,14 @@ SDValue SITargetLowering::LowerFormalArguments(
       Reg = MF.addLiveIn(Reg, RC);
       SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
 
-      if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
+      if (Arg.Flags.isSRet()) {
        // The return object should be reasonably addressable.
 
        // FIXME: This helps when the return is a real sret. If it is a
        // automatically inserted sret (i.e. CanLowerReturn returns false), an
        // extra copy is inserted in SelectionDAGBuilder which obscures this.
-        unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
+        unsigned NumBits
+          = 32 - getSubtarget()->getKnownHighZeroBitsForFrameIndex();
        Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
          DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
       }
@@ -2126,16 +2266,13 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     SDValue ReturnAddrReg = CreateLiveInRegister(
       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
 
-    // FIXME: Should be able to use a vreg here, but need a way to prevent it
-    // from being allcoated to a CSR.
-
-    SDValue PhysReturnAddrReg = DAG.getRegister(TRI->getReturnAddressReg(MF),
-                                                MVT::i64);
-
-    Chain = DAG.getCopyToReg(Chain, DL, PhysReturnAddrReg, ReturnAddrReg, Flag);
+    SDValue ReturnAddrVirtualReg = DAG.getRegister(
+        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
+        MVT::i64);
+    Chain =
+        DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg, Flag);
     Flag = Chain.getValue(1);
-
-    RetOps.push_back(PhysReturnAddrReg);
+    RetOps.push_back(ReturnAddrVirtualReg);
   }
 
   // Copy the result values into the output registers.
@@ -2295,9 +2432,6 @@ void SITargetLowering::passSpecialInputs(
     AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
     AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
     AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_X,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_Y,
-    AMDGPUFunctionArgInfo::WORKITEM_ID_Z,
     AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
   };
 
@@ -2337,6 +2471,71 @@ void SITargetLowering::passSpecialInputs(
       MemOpChains.push_back(ArgStore);
     }
   }
+
+  // Pack workitem IDs into a single register or pass it as is if already
+  // packed.
+  const ArgDescriptor *OutgoingArg;
+  const TargetRegisterClass *ArgRC;
+
+  std::tie(OutgoingArg, ArgRC) =
+    CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+  if (!OutgoingArg)
+    std::tie(OutgoingArg, ArgRC) =
+      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+  if (!OutgoingArg)
+    std::tie(OutgoingArg, ArgRC) =
+      CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+  if (!OutgoingArg)
+    return;
+
+  const ArgDescriptor *IncomingArgX
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
+  const ArgDescriptor *IncomingArgY
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
+  const ArgDescriptor *IncomingArgZ
+    = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
+
+  SDValue InputReg;
+  SDLoc SL;
+
+  // If incoming ids are not packed we need to pack them.
+  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
+    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
+
+  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
+    SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
+    Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
+                    DAG.getShiftAmountConstant(10, MVT::i32, SL));
+    InputReg = InputReg.getNode() ?
+                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
+  }
+
+  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
+    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
+    Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
+                    DAG.getShiftAmountConstant(20, MVT::i32, SL));
+    InputReg = InputReg.getNode() ?
+                 DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
+  }
+
+  if (!InputReg.getNode()) {
+    // Workitem ids are already packed, any of present incoming arguments
+    // will carry all required fields.
+    ArgDescriptor IncomingArg = ArgDescriptor::createArg(
+      IncomingArgX ? *IncomingArgX :
+      IncomingArgY ? *IncomingArgY :
+                     *IncomingArgZ, ~0u);
+    InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
+  }
+
+  if (OutgoingArg->isRegister()) {
+    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+  } else {
+    unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
+    SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+                                            SpecialArgOffset);
+    MemOpChains.push_back(ArgStore);
+  }
 }
 
 static bool canGuaranteeTCO(CallingConv::ID CC) {
@@ -2478,7 +2677,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
                          "unsupported call from graphics shader of function ");
   }
 
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
   if (IsTailCall) {
     IsTailCall = isEligibleForTailCallOptimization(
       Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
@@ -2505,9 +2703,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
   CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
 
-  // The first 4 bytes are reserved for the callee's emergency stack slot.
-  CCInfo.AllocateStack(4, 4);
-
   CCInfo.AnalyzeCallOperands(Outs, AssignFn);
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -2528,31 +2723,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   MachineFrameInfo &MFI = MF.getFrameInfo();
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
 
-  SDValue CallerSavedFP;
-
   // Adjust the stack pointer for the new arguments...
   // These operations are automatically eliminated by the prolog/epilog pass
   if (!IsSibCall) {
     Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
 
-    unsigned OffsetReg = Info->getScratchWaveOffsetReg();
+    SmallVector<SDValue, 4> CopyFromChains;
 
     // In the HSA case, this should be an identity copy.
     SDValue ScratchRSrcReg
       = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
     RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg);
-
-    // TODO: Don't hardcode these registers and get from the callee function.
-    SDValue ScratchWaveOffsetReg
-      = DAG.getCopyFromReg(Chain, DL, OffsetReg, MVT::i32);
-    RegsToPass.emplace_back(AMDGPU::SGPR4, ScratchWaveOffsetReg);
-
-    if (!Info->isEntryFunction()) {
-      // Avoid clobbering this function's FP value. In the current convention
-      // callee will overwrite this, so do save/restore around the call site.
-      CallerSavedFP = DAG.getCopyFromReg(Chain, DL,
-                                         Info->getFrameOffsetReg(), MVT::i32);
-    }
+    CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
+    Chain = DAG.getTokenFactor(DL, CopyFromChains);
   }
 
   SmallVector<SDValue, 8> MemOpChains;
@@ -2694,6 +2877,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   std::vector<SDValue> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Callee);
+  // Add a redundant copy of the callee global which will not be legalized, as
+  // we need direct access to the callee later.
+  GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
+  const GlobalValue *GV = GSD->getGlobal();
+  Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
 
   if (IsTailCall) {
     // Each tail call may have to adjust the stack by a different amount, so
@@ -2735,12 +2923,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
   Chain = Call.getValue(0);
   InFlag = Call.getValue(1);
 
-  if (CallerSavedFP) {
-    SDValue FPReg = DAG.getRegister(Info->getFrameOffsetReg(), MVT::i32);
-    Chain = DAG.getCopyToReg(Chain, DL, FPReg, CallerSavedFP, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
   uint64_t CalleePopBytes = NumBytes;
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getTargetConstant(0, DL, MVT::i32),
                              DAG.getTargetConstant(CalleePopBytes, DL, MVT::i32),
@@ -2773,8 +2955,8 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT,
 
   }
 
-  if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
-      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
+  if (!Subtarget->hasFlatScrRegister() &&
+       Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
     report_fatal_error(Twine("invalid register \""
                              + StringRef(RegName)  + "\" for subtarget."));
   }
@@ -2830,6 +3012,107 @@ MachineBasicBlock *SITargetLowering::splitKillBlock(MachineInstr &MI,
   return SplitBB;
 }
 
+// Split block \p MBB at \p MI, as to insert a loop. If \p InstInLoop is true,
+// \p MI will be the only instruction in the loop body block. Otherwise, it will
+// be the first instruction in the remainder block.
+//
+/// \returns { LoopBody, Remainder }
+static std::pair<MachineBasicBlock *, MachineBasicBlock *>
+splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) {
+  MachineFunction *MF = MBB.getParent();
+  MachineBasicBlock::iterator I(&MI);
+
+  // To insert the loop we need to split the block. Move everything after this
+  // point to a new block, and insert a new empty block between the two.
+  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
+  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
+  MachineFunction::iterator MBBI(MBB);
+  ++MBBI;
+
+  MF->insert(MBBI, LoopBB);
+  MF->insert(MBBI, RemainderBB);
+
+  LoopBB->addSuccessor(LoopBB);
+  LoopBB->addSuccessor(RemainderBB);
+
+  // Move the rest of the block into a new block.
+  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
+
+  if (InstInLoop) {
+    auto Next = std::next(I);
+
+    // Move instruction to loop body.
+    LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
+
+    // Move the rest of the block.
+    RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
+  } else {
+    RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
+  }
+
+  MBB.addSuccessor(LoopBB);
+
+  return std::make_pair(LoopBB, RemainderBB);
+}
+
+MachineBasicBlock *
+SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI,
+                                         MachineBasicBlock *BB) const {
+  const DebugLoc &DL = MI.getDebugLoc();
+
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+
+  MachineBasicBlock *LoopBB;
+  MachineBasicBlock *RemainderBB;
+  const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
+
+  MachineBasicBlock::iterator Prev = std::prev(MI.getIterator());
+
+  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true);
+
+  MachineBasicBlock::iterator I = LoopBB->end();
+  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0);
+
+  const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg(
+    AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1);
+
+  // Clear TRAP_STS.MEM_VIOL
+  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+    .addImm(0)
+    .addImm(EncodedReg);
+
+  // This is a pain, but we're not allowed to have physical register live-ins
+  // yet. Insert a pair of copies if the VGPR0 hack is necessary.
+  if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) {
+    unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0)
+      .add(*Src);
+
+    BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg())
+      .addReg(Data0);
+
+    MRI.setSimpleHint(Data0, Src->getReg());
+  }
+
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT))
+    .addImm(0);
+
+  unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+
+  // Load and check TRAP_STS.MEM_VIOL
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
+    .addImm(EncodedReg);
+
+  // FIXME: Do we need to use an isel pseudo that may clobber scc?
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
+    .addReg(Reg, RegState::Kill)
+    .addImm(0);
+  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
+    .addMBB(LoopBB);
+
+  return RemainderBB;
+}
+
 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
 // wavefront. If the value is uniform and just happens to be in a VGPR, this
 // will only do one iteration. In the worst case, this will loop 64 times.
@@ -2849,12 +3132,16 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
                                                   int Offset,
                                                   bool UseGPRIdxMode,
                                                   bool IsIndirectSrc) {
+  MachineFunction *MF = OrigBB.getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineBasicBlock::iterator I = LoopBB.begin();
 
-  unsigned PhiExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
-  unsigned NewExec = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  const TargetRegisterClass *BoolRC = TRI->getBoolRC();
+  unsigned PhiExec = MRI.createVirtualRegister(BoolRC);
+  unsigned NewExec = MRI.createVirtualRegister(BoolRC);
   unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
-  unsigned CondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  unsigned CondReg = MRI.createVirtualRegister(BoolRC);
 
   BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
     .addReg(InitReg)
@@ -2878,7 +3165,9 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
       .addReg(IdxReg.getReg(), 0, IdxReg.getSubReg());
 
   // Update EXEC, save the original EXEC value to VCC.
-  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), NewExec)
+  BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_AND_SAVEEXEC_B32
+                                                : AMDGPU::S_AND_SAVEEXEC_B64),
+          NewExec)
     .addReg(CondReg, RegState::Kill);
 
   MRI.setSimpleHint(NewExec, CondReg);
@@ -2894,7 +3183,7 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
         .addImm(Offset);
     }
     unsigned IdxMode = IsIndirectSrc ?
-      VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+      AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
     MachineInstr *SetOn =
       BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
       .addReg(IdxReg, RegState::Kill)
@@ -2913,10 +3202,12 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(
   }
 
   // Update EXEC, switch all done bits to 0 and all todo bits to 1.
+  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   MachineInstr *InsertPt =
-    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC)
-    .addReg(AMDGPU::EXEC)
-    .addReg(NewExec);
+    BuildMI(LoopBB, I, DL, TII->get(ST.isWave32() ? AMDGPU::S_XOR_B32_term
+                                                  : AMDGPU::S_XOR_B64_term), Exec)
+      .addReg(Exec)
+      .addReg(NewExec);
 
   // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
   // s_cbranch_scc0?
@@ -2942,38 +3233,28 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                                   bool UseGPRIdxMode,
                                                   bool IsIndirectSrc) {
   MachineFunction *MF = MBB.getParent();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   const DebugLoc &DL = MI.getDebugLoc();
   MachineBasicBlock::iterator I(&MI);
 
+  const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
   unsigned DstReg = MI.getOperand(0).getReg();
-  unsigned SaveExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
-  unsigned TmpExec = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+  unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC);
+  unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC);
+  unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+  unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
 
   BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
 
   // Save the EXEC mask
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B64), SaveExec)
-    .addReg(AMDGPU::EXEC);
+  BuildMI(MBB, I, DL, TII->get(MovExecOpc), SaveExec)
+    .addReg(Exec);
 
-  // To insert the loop we need to split the block. Move everything after this
-  // point to a new block, and insert a new empty block between the two.
-  MachineBasicBlock *LoopBB = MF->CreateMachineBasicBlock();
-  MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
-  MachineFunction::iterator MBBI(MBB);
-  ++MBBI;
-
-  MF->insert(MBBI, LoopBB);
-  MF->insert(MBBI, RemainderBB);
-
-  LoopBB->addSuccessor(LoopBB);
-  LoopBB->addSuccessor(RemainderBB);
-
-  // Move the rest of the block into a new block.
-  RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
-  RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
-
-  MBB.addSuccessor(LoopBB);
+  MachineBasicBlock *LoopBB;
+  MachineBasicBlock *RemainderBB;
+  std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, MBB, false);
 
   const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
 
@@ -2982,7 +3263,7 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
                                       Offset, UseGPRIdxMode, IsIndirectSrc);
 
   MachineBasicBlock::iterator First = RemainderBB->begin();
-  BuildMI(*RemainderBB, First, DL, TII->get(AMDGPU::S_MOV_B64), AMDGPU::EXEC)
+  BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec)
     .addReg(SaveExec);
 
   return InsPt;
@@ -3025,7 +3306,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
 
   if (UseGPRIdxMode) {
     unsigned IdxMode = IsIndirectSrc ?
-      VGPRIndexMode::SRC0_ENABLE : VGPRIndexMode::DST_ENABLE;
+      AMDGPU::VGPRIndexMode::SRC0_ENABLE : AMDGPU::VGPRIndexMode::DST_ENABLE;
     if (Offset == 0) {
       MachineInstr *SetOn =
         BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_ON))
@@ -3274,6 +3555,9 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   case AMDGPU::S_ADD_U64_PSEUDO:
   case AMDGPU::S_SUB_U64_PSEUDO: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
+    const TargetRegisterClass *BoolRC = TRI->getBoolRC();
    const DebugLoc &DL = MI.getDebugLoc();
 
    MachineOperand &Dest = MI.getOperand(0);
@@ -3284,17 +3568,17 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
    unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
 
    MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
-     Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+     Src0, BoolRC, AMDGPU::sub0,
      &AMDGPU::SReg_32_XM0RegClass);
    MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
-     Src0, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+     Src0, BoolRC, AMDGPU::sub1,
      &AMDGPU::SReg_32_XM0RegClass);
 
    MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
-     Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub0,
+     Src1, BoolRC, AMDGPU::sub0,
      &AMDGPU::SReg_32_XM0RegClass);
    MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
-     Src1, &AMDGPU::SReg_64RegClass, AMDGPU::sub1,
+     Src1, BoolRC, AMDGPU::sub1,
      &AMDGPU::SReg_32_XM0RegClass);
 
    bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
@@ -3330,6 +3614,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
 
     MI.eraseFromParent();
     return BB;
 
+  case AMDGPU::SI_INIT_EXEC_LO:
+    // This should be before all vector instructions.
+    BuildMI(*BB, &*BB->begin(), MI.getDebugLoc(), TII->get(AMDGPU::S_MOV_B32),
+            AMDGPU::EXEC_LO)
+        .addImm(MI.getOperand(0).getImm());
+    MI.eraseFromParent();
+    return BB;
+
   case AMDGPU::SI_INIT_EXEC_FROM_INPUT: {
     // Extract the thread count from an SGPR input and set EXEC accordingly.
     // Since BFM can't shift by 64, handle that case with CMP + CMOV.
@@ -3363,24 +3655,31 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     (void)Found;
 
     // This should be before all vector instructions.
+    unsigned Mask = (getSubtarget()->getWavefrontSize() << 1) - 1;
+    bool isWave32 = getSubtarget()->isWave32();
+    unsigned Exec = isWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg)
         .addReg(InputReg)
-        .addImm((MI.getOperand(1).getImm() & 0x7f) | 0x70000);
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFM_B64),
-            AMDGPU::EXEC)
+        .addImm((MI.getOperand(1).getImm() & Mask) | 0x70000);
+    BuildMI(*BB, FirstMI, DebugLoc(),
+            TII->get(isWave32 ? AMDGPU::S_BFM_B32 : AMDGPU::S_BFM_B64),
+            Exec)
        .addReg(CountReg)
        .addImm(0);
     BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMP_EQ_U32))
        .addReg(CountReg, RegState::Kill)
-       .addImm(64);
-    BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_CMOV_B64),
-            AMDGPU::EXEC)
+       .addImm(getSubtarget()->getWavefrontSize());
+    BuildMI(*BB, FirstMI, DebugLoc(),
+            TII->get(isWave32 ? AMDGPU::S_CMOV_B32 : AMDGPU::S_CMOV_B64),
+            Exec)
        .addImm(-1);
     MI.eraseFromParent();
     return BB;
   }
 
   case AMDGPU::GET_GROUPSTATICSIZE: {
+    assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
+           getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
     DebugLoc DL = MI.getDebugLoc();
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
         .add(MI.getOperand(0))
@@ -3405,6 +3704,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     return splitKillBlock(MI, BB);
   case AMDGPU::V_CNDMASK_B64_PSEUDO: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+    const SIRegisterInfo *TRI = ST.getRegisterInfo();
 
     unsigned Dst = MI.getOperand(0).getReg();
     unsigned Src0 = MI.getOperand(1).getReg();
@@ -3414,16 +3715,21 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
 
     unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    unsigned SrcCondCopy = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);
+    const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+    unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC);
 
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy)
       .addReg(SrcCond);
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
+      .addImm(0)
      .addReg(Src0, 0, AMDGPU::sub0)
+      .addImm(0)
      .addReg(Src1, 0, AMDGPU::sub0)
      .addReg(SrcCondCopy);
     BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
+      .addImm(0)
      .addReg(Src0, 0, AMDGPU::sub1)
+      .addImm(0)
      .addReg(Src1, 0, AMDGPU::sub1)
      .addReg(SrcCondCopy);
 
@@ -3457,40 +3763,60 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
      .addReg(Info->getFrameOffsetReg(), RegState::Implicit);
     return BB;
   }
-  case AMDGPU::SI_CALL_ISEL:
-  case AMDGPU::SI_TCRETURN_ISEL: {
+  case AMDGPU::SI_CALL_ISEL: {
     const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
     const DebugLoc &DL = MI.getDebugLoc();
+
     unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
 
-    MachineRegisterInfo &MRI = MF->getRegInfo();
-    unsigned GlobalAddrReg = MI.getOperand(0).getReg();
-    MachineInstr *PCRel = MRI.getVRegDef(GlobalAddrReg);
-    assert(PCRel->getOpcode() == AMDGPU::SI_PC_ADD_REL_OFFSET);
+    MachineInstrBuilder MIB;
+    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
 
-    const GlobalValue *G = PCRel->getOperand(1).getGlobal();
+    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I)
+      MIB.add(MI.getOperand(I));
 
-    MachineInstrBuilder MIB;
-    if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
-      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg)
-        .add(MI.getOperand(0))
-        .addGlobalAddress(G);
-    } else {
-      MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_TCRETURN))
-        .add(MI.getOperand(0))
-        .addGlobalAddress(G);
+    MIB.cloneMemRefs(MI);
+    MI.eraseFromParent();
+    return BB;
+  }
+  case AMDGPU::V_ADD_I32_e32:
+  case AMDGPU::V_SUB_I32_e32:
+  case AMDGPU::V_SUBREV_I32_e32: {
+    // TODO: Define distinct V_*_I32_Pseudo instructions instead.
+    const DebugLoc &DL = MI.getDebugLoc();
+    unsigned Opc = MI.getOpcode();
 
-      // There is an additional imm operand for tcreturn, but it should be in the
-      // right place already.
+    bool NeedClampOperand = false;
+    if (TII->pseudoToMCOpcode(Opc) == -1) {
+      Opc = AMDGPU::getVOPe64(Opc);
+      NeedClampOperand = true;
     }
 
-    for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I)
-      MIB.add(MI.getOperand(I));
+    auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
+    if (TII->isVOP3(*I)) {
+      const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+      const SIRegisterInfo *TRI = ST.getRegisterInfo();
+      I.addReg(TRI->getVCC(), RegState::Define);
+    }
+    I.add(MI.getOperand(1))
+     .add(MI.getOperand(2));
+    if (NeedClampOperand)
+      I.addImm(0); // clamp bit for e64 encoding
+
+    TII->legalizeOperands(*I);
 
-    MIB.cloneMemRefs(MI);
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::DS_GWS_INIT:
+  case AMDGPU::DS_GWS_SEMA_V:
+  case AMDGPU::DS_GWS_SEMA_BR:
+  case AMDGPU::DS_GWS_SEMA_P:
+  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
+  case AMDGPU::DS_GWS_BARRIER:
+    if (getSubtarget()->hasGWSAutoReplay())
+      return BB;
+    return emitGWSMemViolTestLoop(MI, BB);
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   }
@@ -3617,6 +3943,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
   case ISD::BRCOND: return LowerBRCOND(Op, DAG);
+  case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
   case ISD::LOAD: {
     SDValue Result = LowerLOAD(Op, DAG);
     assert((!Result.getNode() ||
@@ -3641,10 +3968,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
   case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG);
   case ISD::ADDRSPACECAST: return lowerADDRSPACECAST(Op, DAG);
+  case ISD::INSERT_SUBVECTOR:
+    return lowerINSERT_SUBVECTOR(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:
     return lowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT:
     return lowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:
+    return lowerVECTOR_SHUFFLE(Op, DAG);
   case ISD::BUILD_VECTOR:
     return lowerBUILD_VECTOR(Op, DAG);
   case ISD::FP_ROUND:
@@ -3742,10 +4073,7 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
 static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
                                   SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
-  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
-  if (!CD)
-    return DAG.getUNDEF(VT);
-
+  const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
   int CondCode = CD->getSExtValue();
   if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
       CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
@@ -3753,7 +4081,6 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
 
   ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
 
-
   SDValue LHS = N->getOperand(1);
   SDValue RHS = N->getOperand(2);
 
@@ -3769,16 +4096,20 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
 
   ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
 
-  return DAG.getNode(AMDGPUISD::SETCC, DL, VT, LHS, RHS,
-                     DAG.getCondCode(CCOpcode));
+  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
+  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
+
+  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
+                              DAG.getCondCode(CCOpcode));
+  if (VT.bitsEq(CCVT))
+    return SetCC;
+  return DAG.getZExtOrTrunc(SetCC, DL, VT);
 }
 
 static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
                                   SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
-  const auto *CD = dyn_cast<ConstantSDNode>(N->getOperand(3));
-  if (!CD)
-    return DAG.getUNDEF(VT);
+  const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
 
   int CondCode = CD->getSExtValue();
   if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
@@ -3798,8 +4129,13 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
   FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
   ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
 
-  return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src0,
-                     Src1, DAG.getCondCode(CCOpcode));
+  unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
+  EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
+  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0,
+                              Src1, DAG.getCondCode(CCOpcode));
+  if (VT.bitsEq(CCVT))
+    return SetCC;
+  return DAG.getZExtOrTrunc(SetCC, SL, VT);
 }
 
 void SITargetLowering::ReplaceNodeResults(SDNode *N,
@@ -3957,32 +4293,6 @@ unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
   return 0;
 }
 
-void SITargetLowering::createDebuggerPrologueStackObjects(
-    MachineFunction &MF) const {
-  // Create stack objects that are used for emitting debugger prologue.
-  //
-  // Debugger prologue writes work group IDs and work item IDs to scratch memory
-  // at fixed location in the following format:
-  //   offset 0:  work group ID x
-  //   offset 4:  work group ID y
-  //   offset 8:  work group ID z
-  //   offset 16: work item ID x
-  //   offset 20: work item ID y
-  //   offset 24: work item ID z
-  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  int ObjectIdx = 0;
-
-  // For each dimension:
-  for (unsigned i = 0; i < 3; ++i) {
-    // Create fixed stack object for work group ID.
-    ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4, true);
-    Info->setDebuggerWorkGroupIDStackObjectIndex(i, ObjectIdx);
-    // Create fixed stack object for work item ID.
-    ObjectIdx = MF.getFrameInfo().CreateFixedObject(4, i * 4 + 16, true);
-    Info->setDebuggerWorkItemIDStackObjectIndex(i, ObjectIdx);
-  }
-}
-
 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
   const Triple &TT = getTargetMachine().getTargetTriple();
   return (GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
@@ -3991,7 +4301,10 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
 }
 
 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
-  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
+  // FIXME: Either avoid relying on address space here or change the default
+  // address space for functions to avoid the explicit check.
+  return (GV->getValueType()->isFunctionTy() ||
+          GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
          GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
         !shouldEmitFixup(GV) &&
@@ -4103,6 +4416,31 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   return Chain;
 }
 
+SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  MVT VT = Op.getSimpleValueType();
+  SDLoc DL(Op);
+  // Checking the depth
+  if (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue() != 0)
+    return DAG.getConstant(0, DL, VT);
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  // Check for kernel and shader functions
+  if (Info->isEntryFunction())
+    return DAG.getConstant(0, DL, VT);
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+  // There is a call to @llvm.returnaddress in this function
+  MFI.setReturnAddressIsTaken(true);
+
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+  // Get the return address reg and mark it as an implicit live-in
+  unsigned Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
+                              getRegClassFor(VT, Op.getNode()->isDivergent()));
+
+  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
 SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
                                             SDValue Op,
                                             const SDLoc &DL,
@@ -4131,7 +4469,9 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
 
 SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
                                                SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction());
+  const MachineFunction &MF = DAG.getMachineFunction();
+  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  bool IsIEEEMode = Info->getMode().IEEE;
 
   // FIXME: Assert during eslection that this is only selected for
   // ieee_mode. Currently a combine can produce the ieee version for non-ieee
@@ -4302,6 +4642,32 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
   return DAG.getUNDEF(ASC->getValueType(0));
 }
 
+// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
+// the small vector and inserting them into the big vector. That is better than
+// the default expansion of doing it via a stack slot. Even though the use of
+// the stack slot would be optimized away afterwards, the stack slot itself
+// remains.
+SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, + SelectionDAG &DAG) const { + SDValue Vec = Op.getOperand(0); + SDValue Ins = Op.getOperand(1); + SDValue Idx = Op.getOperand(2); + EVT VecVT = Vec.getValueType(); + EVT InsVT = Ins.getValueType(); + EVT EltVT = VecVT.getVectorElementType(); + unsigned InsNumElts = InsVT.getVectorNumElements(); + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + SDLoc SL(Op); + + for (unsigned I = 0; I != InsNumElts; ++I) { + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins, + DAG.getConstant(I, SL, MVT::i32)); + Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt, + DAG.getConstant(IdxVal + I, SL, MVT::i32)); + } + return Vec; +} + SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { SDValue Vec = Op.getOperand(0); @@ -4352,12 +4718,12 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, MVT IntVT = MVT::getIntegerVT(VecSize); // Avoid stack access for dynamic indexing. - SDValue Val = InsVal; - if (InsVal.getValueType() == MVT::f16) - Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal); - // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec - SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val); + + // Create a congruent vector with the target value in each element so that + // the required element can be masked and ORed into the target vector. + SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT, + DAG.getSplatBuildVector(VecVT, SL, InsVal)); assert(isPowerOf2_32(EltSize)); SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32); @@ -4419,6 +4785,63 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT); } +static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) { + assert(Elt % 2 == 0); + return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0); +} + +SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + EVT ResultVT = Op.getValueType(); + ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op); + + EVT PackVT = ResultVT.isInteger() ? MVT::v2i16 : MVT::v2f16; + EVT EltVT = PackVT.getVectorElementType(); + int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements(); + + // vector_shuffle <0,1,6,7> lhs, rhs + // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2) + // + // vector_shuffle <6,7,2,3> lhs, rhs + // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2) + // + // vector_shuffle <6,7,0,1> lhs, rhs + // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0) + + // Avoid scalarizing when both halves are reading from consecutive elements. + SmallVector<SDValue, 4> Pieces; + for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) { + if (elementPairIsContiguous(SVN->getMask(), I)) { + const int Idx = SVN->getMaskElt(I); + int VecIdx = Idx < SrcNumElts ? 0 : 1; + int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts; + SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, + PackVT, SVN->getOperand(VecIdx), + DAG.getConstant(EltIdx, SL, MVT::i32)); + Pieces.push_back(SubVec); + } else { + const int Idx0 = SVN->getMaskElt(I); + const int Idx1 = SVN->getMaskElt(I + 1); + int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1; + int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1; + int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts; + int EltIdx1 = Idx1 < SrcNumElts ? 
Idx1 : Idx1 - SrcNumElts; + + SDValue Vec0 = SVN->getOperand(VecIdx0); + SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec0, DAG.getConstant(EltIdx0, SL, MVT::i32)); + + SDValue Vec1 = SVN->getOperand(VecIdx1); + SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec1, DAG.getConstant(EltIdx1, SL, MVT::i32)); + Pieces.push_back(DAG.getBuildVector(PackVT, SL, { Elt0, Elt1 })); + } + } + + return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces); +} + SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); @@ -4512,11 +4935,18 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to // compute the correct address. - SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, - GAFlags); - SDValue PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, - GAFlags == SIInstrInfo::MO_NONE ? - GAFlags : GAFlags + 1); + unsigned LoFlags = GAFlags; + if (LoFlags == SIInstrInfo::MO_NONE) + LoFlags = SIInstrInfo::MO_REL32; + SDValue PtrLo = + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags); + SDValue PtrHi; + if (GAFlags == SIInstrInfo::MO_NONE) { + PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); + } else { + PtrHi = + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags + 1); + } return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); } @@ -4525,7 +4955,10 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SelectionDAG &DAG) const { GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op); const GlobalValue *GV = GSD->getGlobal(); - if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS || + if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + (!GV->hasExternalLinkage() || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA || + getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) || GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS || GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG); @@ -4533,7 +4966,12 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDLoc DL(GSD); EVT PtrVT = Op.getValueType(); - // FIXME: Should not make address space based decisions here. + if (GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) { + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(), + SIInstrInfo::MO_ABS32_LO); + return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA); + } + if (shouldEmitFixup(GV)) return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); else if (shouldEmitPCReloc(GV)) @@ -4641,10 +5079,8 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, } static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, - SDValue *GLC, SDValue *SLC) { - auto CachePolicyConst = dyn_cast<ConstantSDNode>(CachePolicy.getNode()); - if (!CachePolicyConst) - return false; + SDValue *GLC, SDValue *SLC, SDValue *DLC) { + auto CachePolicyConst = cast<ConstantSDNode>(CachePolicy.getNode()); uint64_t Value = CachePolicyConst->getZExtValue(); SDLoc DL(CachePolicy); @@ -4656,6 +5092,10 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG, *SLC = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32); Value &= ~(uint64_t)0x2; } + if (DLC) { + *DLC = DAG.getTargetConstant((Value & 0x4) ? 
1 : 0, DL, MVT::i32); + Value &= ~(uint64_t)0x4; + } return Value == 0; } @@ -4689,14 +5129,14 @@ static SDValue constructRetValue(SelectionDAG &DAG, EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts) : AdjEltVT; - // Special case for v8f16. Rather than add support for this, use v4i32 to + // Special case for v6f16. Rather than add support for this, use v3i32 to // extract the data elements - bool V8F16Special = false; - if (CastVT == MVT::v8f16) { - CastVT = MVT::v4i32; + bool V6F16Special = false; + if (NumElts == 6) { + CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2); DMaskPop >>= 1; ReqRetNumElts >>= 1; - V8F16Special = true; + V6F16Special = true; AdjVT = MVT::v2i32; } @@ -4726,7 +5166,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, PreTFCRes = BVElts[0]; } - if (V8F16Special) + if (V6F16Special) PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes); if (!IsTexFail) { @@ -4745,9 +5185,7 @@ static SDValue constructRetValue(SelectionDAG &DAG, static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE, SDValue *LWE, bool &IsTexFail) { - auto TexFailCtrlConst = dyn_cast<ConstantSDNode>(TexFailCtrl.getNode()); - if (!TexFailCtrlConst) - return false; + auto TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode()); uint64_t Value = TexFailCtrlConst->getZExtValue(); if (Value) { @@ -4774,7 +5212,10 @@ SDValue SITargetLowering::lowerImage(SDValue Op, const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim); const AMDGPU::MIMGLZMappingInfo *LZMappingInfo = AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode); + const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo = + AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode); unsigned IntrOpcode = Intr->BaseOpcode; + bool IsGFX10 = Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10; SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end()); SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end()); @@ -4810,9 +5251,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } else { unsigned DMaskIdx = BaseOpcode->Store ? 3 : isa<MemSDNode>(Op) ? 2 : 1; - auto DMaskConst = dyn_cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); - if (!DMaskConst) - return Op; + auto DMaskConst = cast<ConstantSDNode>(Op.getOperand(DMaskIdx)); DMask = DMaskConst->getZExtValue(); DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask); @@ -4821,8 +5260,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, MVT StoreVT = VData.getSimpleValueType(); if (StoreVT.getScalarType() == MVT::f16) { - if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || - !BaseOpcode->HasD16) + if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) return Op; // D16 is unsupported for this instruction IsD16 = true; @@ -4835,8 +5273,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, // and whether packing is supported. 
MVT LoadVT = ResultTypes[0].getSimpleVT(); if (LoadVT.getScalarType() == MVT::f16) { - if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS || - !BaseOpcode->HasD16) + if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16) return Op; // D16 is unsupported for this instruction IsD16 = true; @@ -4878,6 +5315,17 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } } + // Optimize _mip away, when 'lod' is zero + if (MIPMappingInfo) { + if (auto ConstantLod = + dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx+NumVAddrs-1))) { + if (ConstantLod->isNullValue()) { + IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip + NumMIVAddrs--; // remove 'lod' + } + } + } + // Check for 16 bit addresses and pack if true. unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs; MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType(); @@ -4915,7 +5363,22 @@ SDValue SITargetLowering::lowerImage(SDValue Op, VAddrs.push_back(Op.getOperand(AddrIdx + i)); } - SDValue VAddr = getBuildDwordsVector(DAG, DL, VAddrs); + // If the register allocator cannot place the address registers contiguously + // without introducing moves, then using the non-sequential address encoding + // is always preferable, since it saves VALU instructions and is usually a + // wash in terms of code size or even better. + // + // However, we currently have no way of hinting to the register allocator that + // MIMG addresses should be placed contiguously when it is possible to do so, + // so force non-NSA for the common 2-address case as a heuristic. + // + // SIShrinkInstructions will convert NSA encodings to non-NSA after register + // allocation when possible. + bool UseNSA = + ST->hasFeature(AMDGPU::FeatureNSAEncoding) && VAddrs.size() >= 3; + SDValue VAddr; + if (!UseNSA) + VAddr = getBuildDwordsVector(DAG, DL, VAddrs); SDValue True = DAG.getTargetConstant(1, DL, MVT::i1); SDValue False = DAG.getTargetConstant(0, DL, MVT::i1); @@ -4926,9 +5389,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, CtrlIdx = AddrIdx + NumVAddrs + 1; } else { auto UnormConst = - dyn_cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2)); - if (!UnormConst) - return Op; + cast<ConstantSDNode>(Op.getOperand(AddrIdx + NumVAddrs + 2)); Unorm = UnormConst->getZExtValue() ? True : False; CtrlIdx = AddrIdx + NumVAddrs + 3; @@ -4965,9 +5426,6 @@ SDValue SITargetLowering::lowerImage(SDValue Op, return Undef; } - // Have to use a power of 2 number of dwords - NumVDataDwords = 1 << Log2_32_Ceil(NumVDataDwords); - EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords) : MVT::f32; @@ -4983,45 +5441,66 @@ SDValue SITargetLowering::lowerImage(SDValue Op, SDValue GLC; SDValue SLC; + SDValue DLC; if (BaseOpcode->Atomic) { GLC = True; // TODO no-return optimization - if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC)) + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, nullptr, &SLC, + IsGFX10 ? &DLC : nullptr)) return Op; } else { - if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC)) + if (!parseCachePolicy(Op.getOperand(CtrlIdx + 1), DAG, &GLC, &SLC, + IsGFX10 ? 
&DLC : nullptr)) return Op; } - SmallVector<SDValue, 14> Ops; + SmallVector<SDValue, 26> Ops; if (BaseOpcode->Store || BaseOpcode->Atomic) Ops.push_back(VData); // vdata - Ops.push_back(VAddr); + if (UseNSA) { + for (const SDValue &Addr : VAddrs) + Ops.push_back(Addr); + } else { + Ops.push_back(VAddr); + } Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs)); // rsrc if (BaseOpcode->Sampler) Ops.push_back(Op.getOperand(AddrIdx + NumVAddrs + 1)); // sampler Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32)); + if (IsGFX10) + Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32)); Ops.push_back(Unorm); + if (IsGFX10) + Ops.push_back(DLC); Ops.push_back(GLC); Ops.push_back(SLC); Ops.push_back(IsA16 && // a16 or r128 ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False); Ops.push_back(TFE); // tfe Ops.push_back(LWE); // lwe - Ops.push_back(DimInfo->DA ? True : False); + if (!IsGFX10) + Ops.push_back(DimInfo->DA ? True : False); if (BaseOpcode->HasD16) Ops.push_back(IsD16 ? True : False); if (isa<MemSDNode>(Op)) Ops.push_back(Op.getOperand(0)); // chain - int NumVAddrDwords = VAddr.getValueType().getSizeInBits() / 32; + int NumVAddrDwords = + UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32; int Opcode = -1; - if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, - NumVDataDwords, NumVAddrDwords); - if (Opcode == -1) - Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, + if (IsGFX10) { + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, + UseNSA ? AMDGPU::MIMGEncGfx10NSA + : AMDGPU::MIMGEncGfx10Default, NumVDataDwords, NumVAddrDwords); + } else { + if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8, + NumVDataDwords, NumVAddrDwords); + if (Opcode == -1) + Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6, + NumVDataDwords, NumVAddrDwords); + } assert(Opcode != -1); MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops); @@ -5046,7 +5525,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op, } SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, - SDValue Offset, SDValue GLC, + SDValue Offset, SDValue GLC, SDValue DLC, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); MachineMemOperand *MMO = MF.getMachineMemOperand( @@ -5059,7 +5538,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Ops[] = { Rsrc, Offset, // Offset - GLC // glc + GLC, + DLC, }; return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(VT), Ops, VT, MMO); @@ -5263,16 +5743,18 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32, SDLoc(DAG.getEntryNode()), MFI->getArgInfo().WorkItemIDZ); - case SIIntrinsic::SI_load_const: { - SDValue Load = - lowerSBuffer(MVT::i32, DL, Op.getOperand(1), Op.getOperand(2), - DAG.getTargetConstant(0, DL, MVT::i1), DAG); - return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Load); - } + case Intrinsic::amdgcn_wavefrontsize: + return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(), + SDLoc(Op), MVT::i32); case Intrinsic::amdgcn_s_buffer_load: { - unsigned Cache = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue(); - return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), - DAG.getTargetConstant(Cache & 1, DL, MVT::i1), DAG); + bool IsGFX10 = Subtarget->getGeneration() >= 
AMDGPUSubtarget::GFX10; + SDValue GLC; + SDValue DLC = DAG.getTargetConstant(0, DL, MVT::i1); + if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr, + IsGFX10 ? &DLC : nullptr)) + return Op; + return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC, + DAG); } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); @@ -5295,12 +5777,70 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), Glue); } + case Intrinsic::amdgcn_interp_p1_f16: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); + SDValue Glue = M0.getValue(1); + if (getSubtarget()->getLDSBankCount() == 16) { + // 16 bank LDS + SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + Glue); + SDValue Ops[] = { + Op.getOperand(1), // Src0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + S, // Src2 - holds two f16 values selected by high + DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + Op.getOperand(4), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + DAG.getConstant(0, DL, MVT::i32) // $omod + }; + return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); + } else { + // 32 bank LDS + SDValue Ops[] = { + Op.getOperand(1), // Src0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + Op.getOperand(4), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + DAG.getConstant(0, DL, MVT::i32), // $omod + Glue + }; + return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops); + } + } + case Intrinsic::amdgcn_interp_p2_f16: { + SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6)); + SDValue Glue = SDValue(M0.getNode(), 1); + SDValue Ops[] = { + Op.getOperand(2), // Src0 + Op.getOperand(3), // Attrchan + Op.getOperand(4), // Attr + DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + Op.getOperand(1), // Src2 + DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + Op.getOperand(5), // high + DAG.getConstant(0, DL, MVT::i1), // $clamp + Glue + }; + return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops); + } case Intrinsic::amdgcn_sin: return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1)); case Intrinsic::amdgcn_cos: return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1)); + case Intrinsic::amdgcn_mul_u24: + return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_mul_i24: + return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::amdgcn_log_clamp: { if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS) return SDValue(); @@ -5334,10 +5874,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, Op.getOperand(1), Op.getOperand(2)); case Intrinsic::amdgcn_div_scale: { - // 3rd parameter required to be a constant. - const ConstantSDNode *Param = dyn_cast<ConstantSDNode>(Op.getOperand(3)); - if (!Param) - return DAG.getMergeValues({ DAG.getUNDEF(VT), DAG.getUNDEF(MVT::i1) }, DL); + const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3)); // Translate to the operands expected by the machine instruction. The // first parameter must be the same as the first instruction. 
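The parseCachePolicy changes above fold the glc/slc/dlc controls into a single immediate: bit 0 is glc, bit 1 is slc, and on GFX10 bit 2 is dlc; any leftover bit makes the policy invalid. A minimal standalone C++ sketch of that decoding, with illustrative names (decodeCachePolicy and CachePolicy are not part of the patch):

#include <cstdint>
#include <optional>

struct CachePolicy { bool GLC, SLC, DLC; };

// Mirrors parseCachePolicy: peel off the known bits, reject anything left.
std::optional<CachePolicy> decodeCachePolicy(uint64_t Value, bool IsGFX10) {
  CachePolicy CP;
  CP.GLC = (Value & 0x1) != 0; Value &= ~UINT64_C(0x1);
  CP.SLC = (Value & 0x2) != 0; Value &= ~UINT64_C(0x2);
  CP.DLC = false;
  if (IsGFX10) { // the dlc bit only exists on GFX10
    CP.DLC = (Value & 0x4) != 0;
    Value &= ~UINT64_C(0x4);
  }
  if (Value != 0) // unknown bits set: invalid cache policy
    return std::nullopt;
  return CP;
}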
@@ -5423,6 +5960,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::amdgcn_fmad_ftz: return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + + case Intrinsic::amdgcn_if_break: + return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT, + Op->getOperand(1), Op->getOperand(2)), 0); + + case Intrinsic::amdgcn_groupstaticsize: { + Triple::OSType OS = getTargetMachine().getTargetTriple().getOS(); + if (OS == Triple::AMDHSA || OS == Triple::AMDPAL) + return Op; + + const Module *M = MF.getFunction().getParent(); + const GlobalValue *GV = + M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize)); + SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0, + SIInstrInfo::MO_ABS32_LO); + return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -5438,9 +5992,99 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDLoc DL(Op); switch (IntrID) { + case Intrinsic::amdgcn_ds_ordered_add: + case Intrinsic::amdgcn_ds_ordered_swap: { + MemSDNode *M = cast<MemSDNode>(Op); + SDValue Chain = M->getOperand(0); + SDValue M0 = M->getOperand(2); + SDValue Value = M->getOperand(3); + unsigned IndexOperand = M->getConstantOperandVal(7); + unsigned WaveRelease = M->getConstantOperandVal(8); + unsigned WaveDone = M->getConstantOperandVal(9); + unsigned ShaderType; + unsigned Instruction; + + unsigned OrderedCountIndex = IndexOperand & 0x3f; + IndexOperand &= ~0x3f; + unsigned CountDw = 0; + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) { + CountDw = (IndexOperand >> 24) & 0xf; + IndexOperand &= ~(0xf << 24); + + if (CountDw < 1 || CountDw > 4) { + report_fatal_error( + "ds_ordered_count: dword count must be between 1 and 4"); + } + } + + if (IndexOperand) + report_fatal_error("ds_ordered_count: bad index operand"); + + switch (IntrID) { + case Intrinsic::amdgcn_ds_ordered_add: + Instruction = 0; + break; + case Intrinsic::amdgcn_ds_ordered_swap: + Instruction = 1; + break; + } + + if (WaveDone && !WaveRelease) + report_fatal_error("ds_ordered_count: wave_done requires wave_release"); + + switch (DAG.getMachineFunction().getFunction().getCallingConv()) { + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + ShaderType = 0; + break; + case CallingConv::AMDGPU_PS: + ShaderType = 1; + break; + case CallingConv::AMDGPU_VS: + ShaderType = 2; + break; + case CallingConv::AMDGPU_GS: + ShaderType = 3; + break; + default: + report_fatal_error("ds_ordered_count unsupported for this calling conv"); + } + + unsigned Offset0 = OrderedCountIndex << 2; + unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) | + (Instruction << 4); + + if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) + Offset1 |= (CountDw - 1) << 6; + + unsigned Offset = Offset0 | (Offset1 << 8); + + SDValue Ops[] = { + Chain, + Value, + DAG.getTargetConstant(Offset, DL, MVT::i16), + copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue + }; + return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL, + M->getVTList(), Ops, M->getMemoryVT(), + M->getMemOperand()); + } + case Intrinsic::amdgcn_ds_fadd: { + MemSDNode *M = cast<MemSDNode>(Op); + unsigned Opc; + switch (IntrID) { + case Intrinsic::amdgcn_ds_fadd: + Opc = ISD::ATOMIC_LOAD_FADD; + break; + } + + return DAG.getAtomic(Opc, SDLoc(Op), M->getMemoryVT(), + M->getOperand(0), M->getOperand(2), M->getOperand(3), + 
M->getMemOperand()); + } case Intrinsic::amdgcn_atomic_inc: case Intrinsic::amdgcn_atomic_dec: - case Intrinsic::amdgcn_ds_fadd: case Intrinsic::amdgcn_ds_fmin: case Intrinsic::amdgcn_ds_fmax: { MemSDNode *M = cast<MemSDNode>(Op); @@ -5452,9 +6096,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_atomic_dec: Opc = AMDGPUISD::ATOMIC_DEC; break; - case Intrinsic::amdgcn_ds_fadd: - Opc = AMDGPUISD::ATOMIC_LOAD_FADD; - break; case Intrinsic::amdgcn_ds_fmin: Opc = AMDGPUISD::ATOMIC_LOAD_FMIN; break; @@ -5503,8 +6144,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: { @@ -5531,8 +6178,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: { @@ -5559,8 +6212,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand()); + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (LoadVT.getScalarType() == MVT::i8 || + LoadVT.getScalarType() == MVT::i16) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -5588,9 +6247,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_raw_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -5612,9 +6271,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + 
Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_struct_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -5636,9 +6295,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, if (LoadVT.getScalarType() == MVT::f16) return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG, Ops); - return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, - Op->getVTList(), Ops, LoadVT, - M->getMemOperand()); + return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL, + Op->getVTList(), Ops, LoadVT, M->getMemOperand(), + DAG); } case Intrinsic::amdgcn_buffer_atomic_swap: case Intrinsic::amdgcn_buffer_atomic_add: @@ -5913,6 +6572,39 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } } +// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to +// dwordx4 if on SI. +SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL, + SDVTList VTList, + ArrayRef<SDValue> Ops, EVT MemVT, + MachineMemOperand *MMO, + SelectionDAG &DAG) const { + EVT VT = VTList.VTs[0]; + EVT WidenedVT = VT; + EVT WidenedMemVT = MemVT; + if (!Subtarget->hasDwordx3LoadStores() && + (WidenedVT == MVT::v3i32 || WidenedVT == MVT::v3f32)) { + WidenedVT = EVT::getVectorVT(*DAG.getContext(), + WidenedVT.getVectorElementType(), 4); + WidenedMemVT = EVT::getVectorVT(*DAG.getContext(), + WidenedMemVT.getVectorElementType(), 4); + MMO = DAG.getMachineFunction().getMachineMemOperand(MMO, 0, 16); + } + + assert(VTList.NumVTs == 2); + SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]); + + auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops, + WidenedMemVT, MMO); + if (WidenedVT != VT) { + auto Extract = DAG.getNode( + ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp, + DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout()))); + NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL); + } + return NewOp; +} + SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG) const { EVT StoreVT = VData.getValueType(); @@ -6129,6 +6821,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + + // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) + return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } @@ -6155,6 +6853,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + + // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) + return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } @@ -6181,10 +6885,63 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT; Opc = IsD16 ? 
AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc; MemSDNode *M = cast<MemSDNode>(Op); + + // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics + EVT VDataType = VData.getValueType().getScalarType(); + if (VDataType == MVT::i8 || VDataType == MVT::i16) + return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M); + return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, M->getMemoryVT(), M->getMemOperand()); } + case Intrinsic::amdgcn_buffer_atomic_fadd: { + unsigned Slc = cast<ConstantSDNode>(Op.getOperand(6))->getZExtValue(); + unsigned IdxEn = 1; + if (auto Idx = dyn_cast<ConstantSDNode>(Op.getOperand(4))) + IdxEn = Idx->getZExtValue() != 0; + SDValue Ops[] = { + Chain, + Op.getOperand(2), // vdata + Op.getOperand(3), // rsrc + Op.getOperand(4), // vindex + SDValue(), // voffset -- will be set by setBufferOffsets + SDValue(), // soffset -- will be set by setBufferOffsets + SDValue(), // offset -- will be set by setBufferOffsets + DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + }; + setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + EVT VT = Op.getOperand(2).getValueType(); + + auto *M = cast<MemSDNode>(Op); + unsigned Opcode = VT.isVector() ? AMDGPUISD::BUFFER_ATOMIC_PK_FADD + : AMDGPUISD::BUFFER_ATOMIC_FADD; + + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); + } + + case Intrinsic::amdgcn_global_atomic_fadd: { + SDValue Ops[] = { + Chain, + Op.getOperand(2), // ptr + Op.getOperand(3) // vdata + }; + EVT VT = Op.getOperand(3).getValueType(); + + auto *M = cast<MemSDNode>(Op); + unsigned Opcode = VT.isVector() ? AMDGPUISD::ATOMIC_PK_FADD + : AMDGPUISD::ATOMIC_FADD; + + return DAG.getMemIntrinsicNode(Opcode, DL, Op->getVTList(), Ops, VT, + M->getMemOperand()); + } + + case Intrinsic::amdgcn_end_cf: + return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other, + Op->getOperand(2), Chain), 0); + default: { if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -6283,6 +7040,38 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset, Offsets[2] = DAG.getConstant(0, DL, MVT::i32); } +// Handle 8 bit and 16 bit buffer loads +SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG, + EVT LoadVT, SDLoc DL, + ArrayRef<SDValue> Ops, + MemSDNode *M) const { + EVT IntVT = LoadVT.changeTypeToInteger(); + unsigned Opc = (LoadVT.getScalarType() == MVT::i8) ? + AMDGPUISD::BUFFER_LOAD_UBYTE : AMDGPUISD::BUFFER_LOAD_USHORT; + + SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other); + SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList, + Ops, IntVT, + M->getMemOperand()); + SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL, + LoadVT.getScalarType(), BufferLoad); + return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL); +} + +// Handle 8 bit and 16 bit buffer stores +SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG, + EVT VDataType, SDLoc DL, + SDValue Ops[], + MemSDNode *M) const { + SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]); + Ops[1] = BufferStoreExt; + unsigned Opc = (VDataType == MVT::i8) ? 
AMDGPUISD::BUFFER_STORE_BYTE : + AMDGPUISD::BUFFER_STORE_SHORT; + ArrayRef<SDValue> OpsRef = makeArrayRef(&Ops[0], 9); + return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType, + M->getMemOperand()); +} + static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT) { @@ -6395,8 +7184,25 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr, RealMemVT, MMO); + if (!MemVT.isVector()) { + SDValue Ops[] = { + DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + NewLD.getValue(1) + }; + + return DAG.getMergeValues(Ops, DL); + } + + SmallVector<SDValue, 3> Elts; + for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) { + SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD, + DAG.getConstant(I, DL, MVT::i32)); + + Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt)); + } + SDValue Ops[] = { - DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD), + DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1) }; @@ -6409,15 +7215,21 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { assert(Op.getValueType().getVectorElementType() == MVT::i32 && "Custom lowering for non-i32 vectors hasn't been implemented."); - unsigned Alignment = Load->getAlignment(); - unsigned AS = Load->getAddressSpace(); if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT, - AS, Alignment)) { + *Load->getMemOperand())) { SDValue Ops[2]; std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG); return DAG.getMergeValues(Ops, DL); } + unsigned Alignment = Load->getAlignment(); + unsigned AS = Load->getAddressSpace(); + if (Subtarget->hasLDSMisalignedBug() && + AS == AMDGPUAS::FLAT_ADDRESS && + Alignment < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) { + return SplitVectorLoad(Op, DAG); + } + MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibility that flat instructions access scratch memory @@ -6430,8 +7242,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { if (AS == AMDGPUAS::CONSTANT_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) { - if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) - return SDValue(); + if (!Op->isDivergent() && Alignment >= 4 && NumElements < 32) { + if (MemVT.isPow2VectorType()) + return SDValue(); + if (NumElements == 3) + return WidenVectorLoad(Op, DAG); + return SplitVectorLoad(Op, DAG); + } // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads. @@ -6443,8 +7260,13 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() && !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) && - Alignment >= 4 && NumElements < 32) - return SDValue(); + Alignment >= 4 && NumElements < 32) { + if (MemVT.isPow2VectorType()) + return SDValue(); + if (NumElements == 3) + return WidenVectorLoad(Op, DAG); + return SplitVectorLoad(Op, DAG); + } // Non-uniform loads will be selected to MUBUF instructions, so they // have the same legalization requirements as global and private // loads.
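The i1-vector branch added to LowerLOAD above reads the whole mask with one extending i32 load and then peels each lane off with an SRL by the lane index followed by a TRUNCATE to i1. A small sketch of the same bit-unpacking in plain C++ (unpackI1Vector is an illustrative name, not part of the patch):

#include <cstdint>
#include <vector>

// One bit per element: shift the loaded word right by the lane index and
// keep the low bit, which is what the SRL + TRUNCATE pair above selects.
std::vector<bool> unpackI1Vector(uint32_t Loaded, unsigned NumElements) {
  std::vector<bool> Elts;
  Elts.reserve(NumElements);
  for (unsigned I = 0; I != NumElements; ++I)
    Elts.push_back(((Loaded >> I) & 1u) != 0);
  return Elts;
}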
@@ -6456,7 +7278,10 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); - // v4 loads are supported for private and global memory. + // v3 loads not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return WidenVectorLoad(Op, DAG); + // v3 and v4 loads are supported for private and global memory. return SDValue(); } if (AS == AMDGPUAS::PRIVATE_ADDRESS) { @@ -6474,11 +7299,14 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // Same as global/flat if (NumElements > 4) return SplitVectorLoad(Op, DAG); + // v3 loads not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return WidenVectorLoad(Op, DAG); return SDValue(); default: llvm_unreachable("unsupported private_element_size"); } - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { // Use ds_read_b128 if possible. if (Subtarget->useDS128() && Load->getAlignment() >= 16 && MemVT.getStoreSize() == 16) @@ -6794,7 +7622,7 @@ SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const { SDValue Scale; - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) { + if (!Subtarget->hasUsableDivScaleConditionOutput()) { // Workaround a hardware bug on SI where the condition output from div_scale // is not usable. @@ -6856,12 +7684,18 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { assert(VT.isVector() && Store->getValue().getValueType().getScalarType() == MVT::i32); - unsigned AS = Store->getAddressSpace(); if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, - AS, Store->getAlignment())) { + *Store->getMemOperand())) { return expandUnalignedStore(Store, DAG); } + unsigned AS = Store->getAddressSpace(); + if (Subtarget->hasLDSMisalignedBug() && + AS == AMDGPUAS::FLAT_ADDRESS && + Store->getAlignment() < VT.getStoreSize() && VT.getSizeInBits() > 32) { + return SplitVectorStore(Op, DAG); + } + MachineFunction &MF = DAG.getMachineFunction(); SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>(); // If there is a possibility that flat instructions access scratch memory @@ -6875,6 +7709,9 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { AS == AMDGPUAS::FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorStore(Op, DAG); + // v3 stores not supported on SI. + if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores()) + return SplitVectorStore(Op, DAG); return SDValue(); } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) { switch (Subtarget->getMaxPrivateElementSize()) { @@ -6885,16 +7722,16 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SplitVectorStore(Op, DAG); return SDValue(); case 16: - if (NumElements > 4) + if (NumElements > 4 || NumElements == 3) return SplitVectorStore(Op, DAG); return SDValue(); default: llvm_unreachable("unsupported private_element_size"); } - } else if (AS == AMDGPUAS::LOCAL_ADDRESS) { + } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { // Use ds_write_b128 if possible.
if (Subtarget->useDS128() && Store->getAlignment() >= 16 && - VT.getStoreSize() == 16) + VT.getStoreSize() == 16 && NumElements != 3) return SDValue(); if (NumElements > 2) @@ -6905,7 +7742,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { // out-of-bounds even if base + offsets is in bounds. Split vectorized // stores here to avoid emitting ds_write2_b32. We may re-combine the // store later in the SILoadStoreOptimizer. - if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS && + if (!Subtarget->hasUsableDSOffset() && NumElements == 2 && VT.getStoreSize() == 8 && Store->getAlignment() < 8) { return SplitVectorStore(Op, DAG); @@ -7614,6 +8451,43 @@ SDValue SITargetLowering::performZeroExtendCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performSignExtendInRegCombine(SDNode *N, + DAGCombinerInfo &DCI) + const { + SDValue Src = N->getOperand(0); + auto *VTSign = cast<VTSDNode>(N->getOperand(1)); + + if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE && + VTSign->getVT() == MVT::i8) || + (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT && + VTSign->getVT() == MVT::i16)) && + Src.hasOneUse()) { + auto *M = cast<MemSDNode>(Src); + SDValue Ops[] = { + Src.getOperand(0), // Chain + Src.getOperand(1), // rsrc + Src.getOperand(2), // vindex + Src.getOperand(3), // voffset + Src.getOperand(4), // soffset + Src.getOperand(5), // offset + Src.getOperand(6), + Src.getOperand(7) + }; + // replace with BUFFER_LOAD_BYTE/SHORT + SDVTList ResList = DCI.DAG.getVTList(MVT::i32, + Src.getOperand(0).getValueType()); + unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE) ? + AMDGPUISD::BUFFER_LOAD_BYTE : AMDGPUISD::BUFFER_LOAD_SHORT; + SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(Opc, SDLoc(N), + ResList, + Ops, M->getMemoryVT(), + M->getMemOperand()); + return DCI.DAG.getMergeValues({BufferLoadSignExt, + BufferLoadSignExt.getValue(1)}, SDLoc(N)); + } + return SDValue(); +} + SDValue SITargetLowering::performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -8013,9 +8887,12 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, if (Cmp == APFloat::cmpGreaterThan) return SDValue(); + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + // TODO: Check IEEE bit enabled? EVT VT = Op0.getValueType(); - if (Subtarget->enableDX10Clamp()) { + if (Info->getMode().DX10Clamp) { // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the // hardware fmed3 behavior converting to a min. // FIXME: Should this be allowing -0.0? @@ -8059,10 +8936,10 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N, // Only do this if the inner op has one use since this will just increase // register pressure for no benefit.
- if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY && - !VT.isVector() && VT != MVT::f64 && - ((VT != MVT::f16 && VT != MVT::i16) || Subtarget->hasMin3Max3_16())) { + !VT.isVector() && + (VT == MVT::i32 || VT == MVT::f32 || + ((VT == MVT::f16 || VT == MVT::i16) && Subtarget->hasMin3Max3_16()))) { // max(max(a, b), c) -> max3(a, b, c) // min(min(a, b), c) -> min3(a, b, c) if (Op0.getOpcode() == Opc && Op0.hasOneUse()) { @@ -8149,9 +9026,12 @@ SDValue SITargetLowering::performFMed3Combine(SDNode *N, return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2); } + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother // handling no dx10-clamp? - if (Subtarget->enableDX10Clamp()) { + if (Info->getMode().DX10Clamp) { // If NaNs are clamped to 0, we are free to reorder the inputs. if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1)) @@ -8342,8 +9222,10 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, // Only do this if we are not trying to support denormals. v_mad_f32 does not // support denormals ever. - if ((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || - (VT == MVT::f16 && !Subtarget->hasFP16Denormals())) + if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) || + (VT == MVT::f16 && !Subtarget->hasFP16Denormals() && + getSubtarget()->hasMadF16())) && + isOperationLegal(ISD::FMAD, VT)) return ISD::FMAD; const TargetOptions &Options = DAG.getTarget().Options; @@ -8357,6 +9239,46 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, return 0; } +// For a reassociatable opcode perform: +// op x, (op y, z) -> op (op x, z), y, if x and z are uniform +SDValue SITargetLowering::reassociateScalarOps(SDNode *N, + SelectionDAG &DAG) const { + EVT VT = N->getValueType(0); + if (VT != MVT::i32 && VT != MVT::i64) + return SDValue(); + + unsigned Opc = N->getOpcode(); + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + + if (!(Op0->isDivergent() ^ Op1->isDivergent())) + return SDValue(); + + if (Op0->isDivergent()) + std::swap(Op0, Op1); + + if (Op1.getOpcode() != Opc || !Op1.hasOneUse()) + return SDValue(); + + SDValue Op2 = Op1.getOperand(1); + Op1 = Op1.getOperand(0); + if (!(Op1->isDivergent() ^ Op2->isDivergent())) + return SDValue(); + + if (Op1->isDivergent()) + std::swap(Op1, Op2); + + // If either operand is constant this will conflict with + // DAGCombiner::ReassociateOps().
+ if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) || + DAG.isConstantIntBuildVectorOrConstantInt(Op1)) + return SDValue(); + + SDLoc SL(N); + SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); + return DAG.getNode(Opc, SL, VT, Add1, Op2); +} + static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, @@ -8405,6 +9327,10 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, return SDValue(); } + if (SDValue V = reassociateScalarOps(N, DAG)) { + return V; + } + if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); @@ -8452,14 +9378,10 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, SDValue LHS = N->getOperand(0); SDValue RHS = N->getOperand(1); - unsigned Opc = LHS.getOpcode(); - if (Opc != ISD::SUBCARRY) - std::swap(RHS, LHS); - if (LHS.getOpcode() == ISD::SUBCARRY) { // sub (subcarry x, 0, cc), y => subcarry x, y, cc auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); - if (!C || C->getZExtValue() != 0) + if (!C || !C->isNullValue()) return SDValue(); SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args); @@ -8587,7 +9509,7 @@ SDValue SITargetLowering::performFMACombine(SDNode *N, EVT VT = N->getValueType(0); SDLoc SL(N); - if (!Subtarget->hasDotInsts() || VT != MVT::f32) + if (!Subtarget->hasDot2Insts() || VT != MVT::f32) return SDValue(); // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) -> @@ -8801,11 +9723,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, if (!CSrc) return SDValue(); + const MachineFunction &MF = DCI.DAG.getMachineFunction(); const APFloat &F = CSrc->getValueAPF(); APFloat Zero = APFloat::getZero(F.getSemantics()); APFloat::cmpResult Cmp0 = F.compare(Zero); if (Cmp0 == APFloat::cmpLessThan || - (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) { + (Cmp0 == APFloat::cmpUnordered && + MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) { return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0)); } @@ -8822,7 +9746,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { if (getTargetMachine().getOptLevel() == CodeGenOpt::None) return SDValue(); - switch (N->getOpcode()) { default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); @@ -8873,11 +9796,11 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::ATOMIC_LOAD_MAX: case ISD::ATOMIC_LOAD_UMIN: case ISD::ATOMIC_LOAD_UMAX: + case ISD::ATOMIC_LOAD_FADD: case AMDGPUISD::ATOMIC_INC: case AMDGPUISD::ATOMIC_DEC: - case AMDGPUISD::ATOMIC_LOAD_FADD: case AMDGPUISD::ATOMIC_LOAD_FMIN: - case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. + case AMDGPUISD::ATOMIC_LOAD_FMAX: // TODO: Target mem intrinsics. if (DCI.isBeforeLegalize()) break; return performMemSDNodeCombine(cast<MemSDNode>(N), DCI); @@ -8889,6 +9812,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performXorCombine(N, DCI); case ISD::ZERO_EXTEND: return performZeroExtendCombine(N, DCI); + case ISD::SIGN_EXTEND_INREG: + return performSignExtendInRegCombine(N , DCI); case AMDGPUISD::FP_CLASS: return performClassCombine(N, DCI); case ISD::FCANONICALIZE: @@ -9034,6 +9959,10 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, // Don't allow 0 dmask, as hardware assumes one channel enabled. bool NoChannels = !NewDmask; if (NoChannels) { + if (!UsesTFC) { + // No uses of the result and not using TFC. Then do nothing. 
+ return Node; + } // If the original dmask has one channel - then nothing to do if (OldBitsSet == 1) return Node; @@ -9205,7 +10134,8 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, break; MVT VT = Src0.getValueType().getSimpleVT(); - const TargetRegisterClass *RC = getRegClassFor(VT); + const TargetRegisterClass *RC = + getRegClassFor(VT, Src0.getNode()->isDivergent()); MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT); @@ -9238,6 +10168,24 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node, Ops.push_back(ImpDef.getValue(1)); return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); } + case AMDGPU::V_PERMLANE16_B32: + case AMDGPU::V_PERMLANEX16_B32: { + ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0)); + ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2)); + if (!FI->getZExtValue() && !BC->getZExtValue()) + break; + SDValue VDstIn = Node->getOperand(6); + if (VDstIn.isMachineOpcode() + && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) + break; + MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, + SDLoc(Node), MVT::i32); + SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1), + SDValue(BC, 0), Node->getOperand(3), + Node->getOperand(4), Node->getOperand(5), + SDValue(ImpDef, 0), Node->getOperand(7) }; + return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops); + } default: break; } @@ -9256,6 +10204,36 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. TII->legalizeOperandsVOP3(MRI, MI); + + // Prefer VGPRs over AGPRs in mAI instructions where possible. + // This saves a chain-copy of registers and better balances register + // use between vgpr and agpr as agpr tuples tend to be big. + if (const MCOperandInfo *OpInfo = MI.getDesc().OpInfo) { + unsigned Opc = MI.getOpcode(); + const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); + for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) { + if (I == -1) + break; + MachineOperand &Op = MI.getOperand(I); + if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID && + OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) || + !TargetRegisterInfo::isVirtualRegister(Op.getReg()) || + !TRI->isAGPR(MRI, Op.getReg())) + continue; + auto *Src = MRI.getUniqueVRegDef(Op.getReg()); + if (!Src || !Src->isCopy() || + !TRI->isSGPRReg(MRI, Src->getOperand(1).getReg())) + continue; + auto *RC = TRI->getRegClassForReg(MRI, Op.getReg()); + auto *NewRC = TRI->getEquivalentVGPRClass(RC); + // All uses of agpr64 and agpr32 can also accept vgpr except for + // v_accvgpr_read, but we do not produce agpr reads during selection, + // so no use checks are needed.
+ MRI.setRegClass(Op.getReg(), NewRC); + } + } + return; } @@ -9391,9 +10369,15 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 64: RC = &AMDGPU::SGPR_64RegClass; break; + case 96: + RC = &AMDGPU::SReg_96RegClass; + break; case 128: RC = &AMDGPU::SReg_128RegClass; break; + case 160: + RC = &AMDGPU::SReg_160RegClass; + break; case 256: RC = &AMDGPU::SReg_256RegClass; break; @@ -9419,6 +10403,9 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, case 128: RC = &AMDGPU::VReg_128RegClass; break; + case 160: + RC = &AMDGPU::VReg_160RegClass; + break; case 256: RC = &AMDGPU::VReg_256RegClass; break; @@ -9427,6 +10414,29 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, break; } break; + case 'a': + switch (VT.getSizeInBits()) { + default: + return std::make_pair(0U, nullptr); + case 32: + case 16: + RC = &AMDGPU::AGPR_32RegClass; + break; + case 64: + RC = &AMDGPU::AReg_64RegClass; + break; + case 128: + RC = &AMDGPU::AReg_128RegClass; + break; + case 512: + RC = &AMDGPU::AReg_512RegClass; + break; + case 1024: + RC = &AMDGPU::AReg_1024RegClass; + // v32 types are not legal but we support them here. + return std::make_pair(0U, RC); + } + break; } // We actually support i128, i16 and f16 as inline parameters // even if they are not reported as legal @@ -9440,6 +10450,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, RC = &AMDGPU::VGPR_32RegClass; } else if (Constraint[1] == 's') { RC = &AMDGPU::SGPR_32RegClass; + } else if (Constraint[1] == 'a') { + RC = &AMDGPU::AGPR_32RegClass; } if (RC) { @@ -9459,6 +10471,7 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { default: break; case 's': case 'v': + case 'a': return C_RegisterClass; } } @@ -9471,7 +10484,7 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { void SITargetLowering::finalizeLowering(MachineFunction &MF) const { MachineRegisterInfo &MRI = MF.getRegInfo(); SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); + const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); if (Info->isEntryFunction()) { @@ -9479,31 +10492,45 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const { reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); } - // We have to assume the SP is needed in case there are calls in the function - // during lowering. Calls are only detected after the function is - // lowered. We're about to reserve registers, so don't bother using it if we - // aren't really going to use it. - bool NeedSP = !Info->isEntryFunction() || - MFI.hasVarSizedObjects() || - MFI.hasCalls(); + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), + Info->getStackPtrOffsetReg())); + if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG) + MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); - if (NeedSP) { - unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); - Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + // We need to worry about replacing the default register with itself in case + // of MIR testcases missing the MFI. 
+ if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG) + MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); - assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg()); - assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), - Info->getStackPtrOffsetReg())); - MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); - } + if (Info->getFrameOffsetReg() != AMDGPU::FP_REG) + MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); - MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); - MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); - MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, - Info->getScratchWaveOffsetReg()); + if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) { + MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, + Info->getScratchWaveOffsetReg()); + } Info->limitOccupancy(MF); + if (ST.isWave32() && !MF.empty()) { + // Add a VCC_HI def because many instructions are marked as implicitly + // using VCC while we may only define VCC_LO. If nothing defines VCC_HI + // we may end up with a use of undef. + + const SIInstrInfo *TII = ST.getInstrInfo(); + DebugLoc DL; + + MachineBasicBlock &MBB = MF.front(); + MachineBasicBlock::iterator I = MBB.getFirstNonDebugInstr(); + BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), AMDGPU::VCC_HI); + + for (auto &MBB : MF) { + for (auto &MI : MBB) { + TII->fixImplicitOperands(MI); + } + } + } + TargetLoweringBase::finalizeLowering(MF); } @@ -9515,14 +10542,81 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op, TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts, DAG, Depth); - if (getSubtarget()->enableHugePrivateBuffer()) - return; - - // Technically it may be possible to have a dispatch with a single workitem - // that uses the full private memory size, but that's not really useful. We - // can't use vaddr in MUBUF instructions if we don't know the address + // Set the high bits to zero based on the maximum allowed scratch size per + // wave. We can't use vaddr in MUBUF instructions if we don't know the address + // calculation won't overflow, so assume the sign bit is never set. - Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits); + Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex()); +} + +unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { + const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML); + const unsigned CacheLineAlign = 6; // log2(64) + + // Pre-GFX10 targets do not benefit from loop alignment. + if (!ML || DisableLoopAlignment || + (getSubtarget()->getGeneration() < AMDGPUSubtarget::GFX10) || + getSubtarget()->hasInstFwdPrefetchBug()) + return PrefAlign; + + // On GFX10 the I$ consists of 4 x 64-byte cache lines. + // By default the prefetcher keeps one cache line behind and reads two ahead. + // We can modify it with S_INST_PREFETCH so that larger loops have two lines + // behind and one ahead. + // Therefore we can benefit from aligning loop headers if the loop fits in + // 192 bytes. + // If the loop fits in 64 bytes it always spans no more than two cache lines + // and does not need alignment. + // If the loop is at most 128 bytes we do not need to modify the prefetch. + // If the loop is at most 192 bytes we need two lines behind. + + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + const MachineBasicBlock *Header = ML->getHeader(); + if (Header->getAlignment() != PrefAlign) + return Header->getAlignment(); // Already processed.
+ + unsigned LoopSize = 0; + for (const MachineBasicBlock *MBB : ML->blocks()) { + // If an inner loop block is aligned, assume on average half of the + // alignment size is added as nops. + if (MBB != Header) + LoopSize += (1 << MBB->getAlignment()) / 2; + + for (const MachineInstr &MI : *MBB) { + LoopSize += TII->getInstSizeInBytes(MI); + if (LoopSize > 192) + return PrefAlign; + } + } + + if (LoopSize <= 64) + return PrefAlign; + + if (LoopSize <= 128) + return CacheLineAlign; + + // If any of the parent loops is surrounded by prefetch instructions, do + // not insert new ones for the inner loop, which would reset the parent's + // settings. + for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) { + if (MachineBasicBlock *Exit = P->getExitBlock()) { + auto I = Exit->getFirstNonDebugInstr(); + if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH) + return CacheLineAlign; + } + } + + MachineBasicBlock *Pre = ML->getLoopPreheader(); + MachineBasicBlock *Exit = ML->getExitBlock(); + + if (Pre && Exit) { + BuildMI(*Pre, Pre->getFirstTerminator(), DebugLoc(), + TII->get(AMDGPU::S_INST_PREFETCH)) + .addImm(1); // prefetch 2 lines behind PC + + BuildMI(*Exit, Exit->getFirstNonDebugInstr(), DebugLoc(), + TII->get(AMDGPU::S_INST_PREFETCH)) + .addImm(2); // prefetch 1 line behind PC + } + + return CacheLineAlign; } LLVM_ATTRIBUTE_UNUSED static bool isCopyFromRegOfInlineAsm(const SDNode *N) { do { // Follow the chain until we find an INLINEASM node. N = N->getOperand(0).getNode(); - if (N->getOpcode() == ISD::INLINEASM) + if (N->getOpcode() == ISD::INLINEASM || + N->getOpcode() == ISD::INLINEASM_BR) return true; } while (N->getOpcode() == ISD::CopyFromReg); return false; } @@ -9616,7 +10711,10 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, bool SNaN, unsigned Depth) const { if (Op.getOpcode() == AMDGPUISD::CLAMP) { - if (Subtarget->enableDX10Clamp()) + const MachineFunction &MF = DAG.getMachineFunction(); + const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + + if (Info->getMode().DX10Clamp) return true; // Clamped to 0. return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1); } @@ -9624,3 +10722,29 @@ bool SITargetLowering::isKnownNeverNaNForTargetNode(SDValue Op, return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DAG, SNaN, Depth); } + +TargetLowering::AtomicExpansionKind +SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { + switch (RMW->getOperation()) { + case AtomicRMWInst::FAdd: { + Type *Ty = RMW->getType(); + + // We don't have a way to support 16-bit atomics now, so just leave them + // as-is. + if (Ty->isHalfTy()) + return AtomicExpansionKind::None; + + if (!Ty->isFloatTy()) + return AtomicExpansionKind::CmpXChg; + + // TODO: We do have these for flat. Older targets also had them for buffers. + unsigned AS = RMW->getPointerAddressSpace(); + return (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) ? + AtomicExpansionKind::None : AtomicExpansionKind::CmpXChg; + } + default: + break; + } + + return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW); +}
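For the FAdd case above, returning AtomicExpansionKind::CmpXChg asks AtomicExpandPass to rewrite the atomicrmw into a load plus compare-exchange retry loop; only the f32 LDS case on subtargets with hasLDSFPAtomics keeps the native instruction. A rough C++ analogue of the expanded form, as a hedged illustration rather than the pass's actual IR output:

#include <atomic>

// Retry until no other thread changed Addr between our load and the CAS.
// compare_exchange_weak reloads Old on failure, so the loop makes progress.
float atomicFAdd(std::atomic<float> &Addr, float Val) {
  float Old = Addr.load();
  while (!Addr.compare_exchange_weak(Old, Old + Val)) {
  }
  return Old; // atomicrmw fadd yields the value before the update
}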