Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--   llvm/lib/Target/AMDGPU/SIISelLowering.cpp   306
1 file changed, 227 insertions, 79 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d98acfc6c532..519c5b936536 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,10 +19,12 @@
 #include "SIRegisterInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
 #include "llvm/BinaryFormat/ELF.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
@@ -465,11 +467,15 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   if (!Subtarget->hasBCNT(64))
     setOperationAction(ISD::CTPOP, MVT::i64, Expand);
 
-  if (Subtarget->hasFFBH())
+  if (Subtarget->hasFFBH()) {
+    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
+  }
 
-  if (Subtarget->hasFFBL())
+  if (Subtarget->hasFFBL()) {
+    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Custom);
+  }
 
   // We only really have 32-bit BFE instructions (and 16-bit on VI).
   //
@@ -1061,7 +1067,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
           AMDGPU::lookupRsrcIntrinsic(IntrID)) {
     AttributeList Attr = Intrinsic::getAttributes(CI.getContext(),
                                                   (Intrinsic::ID)IntrID);
-    if (Attr.hasFnAttribute(Attribute::ReadNone))
+    if (Attr.hasFnAttr(Attribute::ReadNone))
       return false;
 
     SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -1076,7 +1082,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     }
 
     Info.flags = MachineMemOperand::MODereferenceable;
-    if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
+    if (Attr.hasFnAttr(Attribute::ReadOnly)) {
       unsigned DMaskLanes = 4;
 
       if (RsrcIntr->IsImage) {
@@ -1100,7 +1106,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
       // FIXME: What does alignment mean for an image?
       Info.opc = ISD::INTRINSIC_W_CHAIN;
       Info.flags |= MachineMemOperand::MOLoad;
-    } else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
+    } else if (Attr.hasFnAttr(Attribute::WriteOnly)) {
       Info.opc = ISD::INTRINSIC_VOID;
 
       Type *DataTy = CI.getArgOperand(0)->getType();
@@ -1423,7 +1429,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
 }
 
 bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
-                                        const SelectionDAG &DAG) const {
+                                        const MachineFunction &MF) const {
   if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
     return (MemVT.getSizeInBits() <= 4 * 32);
   } else if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
@@ -1657,12 +1663,17 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
   const ArgDescriptor *InputPtrReg;
   const TargetRegisterClass *RC;
   LLT ArgTy;
+  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
 
   std::tie(InputPtrReg, RC, ArgTy) =
       Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
 
+  // We may not have the kernarg segment argument if we have no kernel
+  // arguments.
+  if (!InputPtrReg)
+    return DAG.getConstant(0, SL, PtrVT);
+
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
-  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
     MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
 
@@ -1808,6 +1819,19 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
   LLT Ty;
 
   std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
+  if (!Reg) {
+    if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) {
+      // It's possible for a kernarg intrinsic call to appear in a kernel with
+      // no allocated segment, in which case we do not add the user sgpr
+      // argument, so just return null.
+      return DAG.getConstant(0, SDLoc(), VT);
+    }
+
+    // It's undefined behavior if a function marked with the amdgpu-no-*
+    // attributes uses the corresponding intrinsic.
+    return DAG.getUNDEF(VT);
+  }
+
   return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
 }
 
@@ -2023,31 +2047,33 @@ void SITargetLowering::allocateSpecialInputSGPRs(
     SIMachineFunctionInfo &Info) const {
   auto &ArgInfo = Info.getArgInfo();
 
-  // TODO: Unify handling with private memory pointers.
+  // We need to allocate these in place regardless of their use.
+  const bool IsFixed = AMDGPUTargetMachine::EnableFixedFunctionABI;
 
-  if (Info.hasDispatchPtr())
+  // TODO: Unify handling with private memory pointers.
+  if (IsFixed || Info.hasDispatchPtr())
     allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
 
-  if (Info.hasQueuePtr())
+  if (IsFixed || Info.hasQueuePtr())
    allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
 
   // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
   // constant offset from the kernarg segment.
-  if (Info.hasImplicitArgPtr())
+  if (IsFixed || Info.hasImplicitArgPtr())
     allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
 
-  if (Info.hasDispatchID())
+  if (IsFixed || Info.hasDispatchID())
     allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
 
   // flat_scratch_init is not applicable for non-kernel functions.
 
-  if (Info.hasWorkGroupIDX())
+  if (IsFixed || Info.hasWorkGroupIDX())
     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
 
-  if (Info.hasWorkGroupIDY())
+  if (IsFixed || Info.hasWorkGroupIDY())
     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
 
-  if (Info.hasWorkGroupIDZ())
+  if (IsFixed || Info.hasWorkGroupIDZ())
     allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
 }
 
@@ -2590,9 +2616,12 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     SDValue ReturnAddrReg = CreateLiveInRegister(
       DAG, &AMDGPU::SReg_64RegClass, TRI->getReturnAddressReg(MF), MVT::i64);
 
-    SDValue ReturnAddrVirtualReg = DAG.getRegister(
-        MF.getRegInfo().createVirtualRegister(&AMDGPU::CCR_SGPR_64RegClass),
-        MVT::i64);
+    SDValue ReturnAddrVirtualReg =
+        DAG.getRegister(MF.getRegInfo().createVirtualRegister(
+                            CallConv != CallingConv::AMDGPU_Gfx
+                                ? &AMDGPU::CCR_SGPR_64RegClass
+                                : &AMDGPU::Gfx_CCR_SGPR_64RegClass),
+                        MVT::i64);
     Chain = DAG.getCopyToReg(Chain, DL, ReturnAddrVirtualReg, ReturnAddrReg,
                              Flag);
     Flag = Chain.getValue(1);
@@ -2655,8 +2684,15 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   RetOps.push_back(Flag);
 
   unsigned Opc = AMDGPUISD::ENDPGM;
-  if (!IsWaveEnd)
-    Opc = IsShader ? AMDGPUISD::RETURN_TO_EPILOG : AMDGPUISD::RET_FLAG;
+  if (!IsWaveEnd) {
+    if (IsShader)
+      Opc = AMDGPUISD::RETURN_TO_EPILOG;
+    else if (CallConv == CallingConv::AMDGPU_Gfx)
+      Opc = AMDGPUISD::RET_GFX_FLAG;
+    else
+      Opc = AMDGPUISD::RET_FLAG;
+  }
+
   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
 
@@ -2747,21 +2783,28 @@ void SITargetLowering::passSpecialInputs(
   // TODO: Unify with private memory register handling. This is complicated by
   // the fact that at least in kernels, the input argument is not necessarily
   // in the same location as the input.
-  AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
-    AMDGPUFunctionArgInfo::DISPATCH_PTR,
-    AMDGPUFunctionArgInfo::QUEUE_PTR,
-    AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
-    AMDGPUFunctionArgInfo::DISPATCH_ID,
-    AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
-    AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
-    AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
+  static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
+                             StringLiteral> ImplicitAttrs[] = {
+    {AMDGPUFunctionArgInfo::DISPATCH_PTR, "amdgpu-no-dispatch-ptr"},
+    {AMDGPUFunctionArgInfo::QUEUE_PTR, "amdgpu-no-queue-ptr"},
+    {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, "amdgpu-no-implicitarg-ptr"},
+    {AMDGPUFunctionArgInfo::DISPATCH_ID, "amdgpu-no-dispatch-id"},
+    {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, "amdgpu-no-workgroup-id-x"},
+    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, "amdgpu-no-workgroup-id-y"},
+    {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, "amdgpu-no-workgroup-id-z"}
   };
 
-  for (auto InputID : InputRegs) {
+  for (auto Attr : ImplicitAttrs) {
     const ArgDescriptor *OutgoingArg;
     const TargetRegisterClass *ArgRC;
     LLT ArgTy;
 
+    AMDGPUFunctionArgInfo::PreloadedValue InputID = Attr.first;
+
+    // If the callee does not use the attribute value, skip copying the value.
+    if (CLI.CB->hasFnAttr(Attr.second))
+      continue;
+
     std::tie(OutgoingArg, ArgRC, ArgTy) =
         CalleeArgInfo->getPreloadedValue(InputID);
     if (!OutgoingArg)
@@ -2780,11 +2823,14 @@ void SITargetLowering::passSpecialInputs(
 
     if (IncomingArg) {
       InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
-    } else {
+    } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
       // The implicit arg ptr is special because it doesn't have a corresponding
       // input for kernels, and is computed from the kernarg segment pointer.
-      assert(InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
       InputReg = getImplicitArgPtr(DAG, DL);
+    } else {
+      // We may have proven the input wasn't needed, although the ABI is
+      // requiring it. We just need to allocate the register appropriately.
+      InputReg = DAG.getUNDEF(ArgVT);
     }
 
     if (OutgoingArg->isRegister()) {
@@ -2827,11 +2873,17 @@ void SITargetLowering::passSpecialInputs(
   SDValue InputReg;
   SDLoc SL;
 
+  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
+  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
+  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
+
   // If incoming ids are not packed we need to pack them.
-  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX)
+  if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX &&
+      NeedWorkItemIDX)
     InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
 
-  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
+  if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY &&
+      NeedWorkItemIDY) {
     SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
     Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
                     DAG.getShiftAmountConstant(10, MVT::i32, SL));
@@ -2839,7 +2891,8 @@ void SITargetLowering::passSpecialInputs(
                        DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
   }
 
-  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
+  if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ &&
+      NeedWorkItemIDZ) {
    SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
     Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
                     DAG.getShiftAmountConstant(20, MVT::i32, SL));
@@ -2847,7 +2900,7 @@ void SITargetLowering::passSpecialInputs(
                        DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z) : Z;
   }
 
-  if (!InputReg.getNode()) {
+  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
     // Workitem ids are already packed, any of present incoming arguments
     // will carry all required fields.
     ArgDescriptor IncomingArg = ArgDescriptor::createArg(
@@ -2858,13 +2911,17 @@ void SITargetLowering::passSpecialInputs(
   }
 
   if (OutgoingArg->isRegister()) {
-    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+    if (InputReg)
+      RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+
     CCInfo.AllocateReg(OutgoingArg->getRegister());
   } else {
     unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
-    SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
-                                            SpecialArgOffset);
-    MemOpChains.push_back(ArgStore);
+    if (InputReg) {
+      SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
+                                              SpecialArgOffset);
+      MemOpChains.push_back(ArgStore);
+    }
   }
 }
 
@@ -4091,7 +4148,10 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     }
 
     const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
-    if (TRI->getRegSizeInBits(*Src2RC) == 64) {
+    unsigned WaveSize = TRI->getRegSizeInBits(*Src2RC);
+    assert(WaveSize == 64 || WaveSize == 32);
+
+    if (WaveSize == 64) {
      if (ST.hasScalarCompareEq64()) {
         BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
           .addReg(Src2.getReg())
@@ -4121,8 +4181,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
 
     BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
 
-    BuildMI(*BB, MII, DL, TII->get(AMDGPU::COPY), CarryDest.getReg())
-      .addReg(AMDGPU::SCC);
+    unsigned SelOpc =
+        (WaveSize == 64) ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
+
+    BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
+        .addImm(-1)
+        .addImm(0);
+
     MI.eraseFromParent();
     return BB;
   }
@@ -4261,6 +4326,13 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
     MI.eraseFromParent();
     return BB;
   }
+  case AMDGPU::V_ADDC_U32_e32:
+  case AMDGPU::V_SUBB_U32_e32:
+  case AMDGPU::V_SUBBREV_U32_e32:
+    // These instructions have an implicit use of vcc which counts towards the
+    // constant bus limit.
+    TII->legalizeOperands(MI);
+    return BB;
   case AMDGPU::DS_GWS_INIT:
   case AMDGPU::DS_GWS_SEMA_BR:
   case AMDGPU::DS_GWS_BARRIER:
@@ -4818,7 +4890,7 @@ static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
   }
   if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
     // (ballot 0) -> 0
-    if (Arg->isNullValue())
+    if (Arg->isZero())
       return DAG.getConstant(0, SL, VT);
 
     // (ballot 1) -> EXEC/EXEC_LO
@@ -5266,9 +5338,18 @@ SDValue SITargetLowering::lowerTrapHsaQueuePtr(
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   Register UserSGPR = Info->getQueuePtrUserSGPR();
-  assert(UserSGPR != AMDGPU::NoRegister);
-  SDValue QueuePtr = CreateLiveInRegister(
-    DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+
+  SDValue QueuePtr;
+  if (UserSGPR == AMDGPU::NoRegister) {
+    // We probably are in a function incorrectly marked with
+    // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the trap,
+    // so just use a null pointer.
+    QueuePtr = DAG.getConstant(0, SL, MVT::i64);
+  } else {
+    QueuePtr = CreateLiveInRegister(
+      DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
+  }
+
   SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
   SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01,
                                    QueuePtr, SDValue());
@@ -5345,7 +5426,11 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   Register UserSGPR = Info->getQueuePtrUserSGPR();
-  assert(UserSGPR != AMDGPU::NoRegister);
+  if (UserSGPR == AMDGPU::NoRegister) {
+    // We probably are in a function incorrectly marked with
+    // amdgpu-no-queue-ptr. This is undefined.
+    return DAG.getUNDEF(MVT::i32);
+  }
 
   SDValue QueuePtr = CreateLiveInRegister(
     DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
@@ -5936,6 +6021,9 @@ static SDValue constructRetValue(SelectionDAG &DAG,
 
   EVT LegalReqRetVT = ReqRetVT;
   if (!ReqRetVT.isVector()) {
+    if (!Data.getValueType().isInteger())
+      Data = DAG.getNode(ISD::BITCAST, DL,
+                         Data.getValueType().changeTypeToInteger(), Data);
     Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
   } else {
     // We need to widen the return vector to a legal type
@@ -6124,7 +6212,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
     if (MIPMappingInfo) {
       if (auto *ConstantLod = dyn_cast<ConstantSDNode>(
              Op.getOperand(ArgOffset + Intr->MipIndex))) {
-        if (ConstantLod->isNullValue()) {
+        if (ConstantLod->isZero()) {
          IntrOpcode = MIPMappingInfo->NONMIP;  // set new opcode to variant without _mip
          VAddrEnd--;                           // remove 'mip'
         }
@@ -6659,7 +6747,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     // intrinsic has the numerator as the first operand to match a normal
     // division operation.
 
-    SDValue Src0 = Param->isAllOnesValue() ? Numerator : Denominator;
+    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
 
     return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                        Denominator, Numerator);
@@ -6793,7 +6881,7 @@ static void updateBufferMMO(MachineMemOperand *MMO, SDValue VOffset,
   }
 
   if (VIndex && (!isa<ConstantSDNode>(VIndex) ||
-                 !cast<ConstantSDNode>(VIndex)->isNullValue())) {
+                 !cast<ConstantSDNode>(VIndex)->isZero())) {
    // The strided index component of the address is not known to be zero, so we
    // cannot represent it in the MMO. Give up.
     MMO->setValue((Value *)nullptr);
@@ -7341,7 +7429,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
                                    Op->getVTList(), Ops, VT, M->getMemOperand());
   }
   case Intrinsic::amdgcn_image_bvh_intersect_ray: {
-    SDLoc DL(Op);
     MemSDNode *M = cast<MemSDNode>(Op);
     SDValue NodePtr = M->getOperand(2);
     SDValue RayExtent = M->getOperand(3);
@@ -7360,12 +7447,27 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       return SDValue();
     }
 
-    bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
-    bool Is64 = NodePtr.getValueType() == MVT::i64;
-    unsigned Opcode = IsA16 ? Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16_nsa
-                                   : AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16_nsa
-                            : Is64 ? AMDGPU::IMAGE_BVH64_INTERSECT_RAY_nsa
-                                   : AMDGPU::IMAGE_BVH_INTERSECT_RAY_nsa;
+    const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
+    const bool Is64 = NodePtr.getValueType() == MVT::i64;
+    const unsigned NumVDataDwords = 4;
+    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
+    const bool UseNSA = Subtarget->hasNSAEncoding() &&
+                        NumVAddrDwords <= Subtarget->getNSAMaxSize();
+    const unsigned BaseOpcodes[2][2] = {
+        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
+        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
+         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
+    int Opcode;
+    if (UseNSA) {
+      Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
+                                     AMDGPU::MIMGEncGfx10NSA, NumVDataDwords,
+                                     NumVAddrDwords);
+    } else {
+      Opcode = AMDGPU::getMIMGOpcode(
+          BaseOpcodes[Is64][IsA16], AMDGPU::MIMGEncGfx10Default, NumVDataDwords,
+          PowerOf2Ceil(NumVAddrDwords));
+    }
+    assert(Opcode != -1);
 
     SmallVector<SDValue, 16> Ops;
 
@@ -7405,6 +7507,20 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     packLanes(RayOrigin, true);
     packLanes(RayDir, true);
     packLanes(RayInvDir, false);
+
+    if (!UseNSA) {
+      // Build a single vector containing all the operands so far prepared.
+      if (NumVAddrDwords > 8) {
+        SDValue Undef = DAG.getUNDEF(MVT::i32);
+        Ops.append(16 - Ops.size(), Undef);
+      }
+      assert(Ops.size() == 8 || Ops.size() == 16);
+      SDValue MergedOps = DAG.getBuildVector(
+          Ops.size() == 16 ? MVT::v16i32 : MVT::v8i32, DL, Ops);
+      Ops.clear();
+      Ops.push_back(MergedOps);
+    }
+
     Ops.push_back(TDescr);
     if (IsA16)
       Ops.push_back(DAG.getTargetConstant(1, DL, MVT::i1));
@@ -7610,7 +7726,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       Op.getOperand(0) // Chain
     };
 
-    unsigned Opc = Done->isNullValue() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
+    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
     return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
   }
   case Intrinsic::amdgcn_s_barrier: {
@@ -8241,6 +8357,16 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
   SDValue Cond = Op.getOperand(0);
 
+  if (Subtarget->hasScalarCompareEq64() && Op->getOperand(0)->hasOneUse() &&
+      !Op->isDivergent()) {
+    if (VT == MVT::i64)
+      return Op;
+    SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(1));
+    SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(2));
+    return DAG.getNode(ISD::BITCAST, DL, VT,
+                       DAG.getSelect(DL, MVT::i64, Cond, LHS, RHS));
+  }
+
   SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
   SDValue One = DAG.getConstant(1, DL, MVT::i32);
 
@@ -9358,7 +9484,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
     if (CRHS) {
       if (SDValue Split
-            = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR, LHS, CRHS))
+            = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
+                                       N->getOperand(0), CRHS))
        return Split;
     }
 
@@ -9445,7 +9572,7 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
 
   // fp_class x, 0 -> false
   if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) {
-    if (CMask->isNullValue())
+    if (CMask->isZero())
       return DAG.getConstant(0, SDLoc(N), MVT::i1);
   }
 
@@ -10348,7 +10475,7 @@ SDValue SITargetLowering::performAddCombine(SDNode *N,
      return getMad64_32(DAG, SL, VT, MulLHS, MulRHS, AddRHS, false);
     }
 
-    if (numBitsSigned(MulLHS, DAG) < 32 && numBitsSigned(MulRHS, DAG) < 32) {
+    if (numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32) {
      MulLHS = DAG.getSExtOrTrunc(MulLHS, SL, MVT::i32);
      MulRHS = DAG.getSExtOrTrunc(MulRHS, SL, MVT::i32);
      AddRHS = DAG.getSExtOrTrunc(AddRHS, SL, MVT::i64);
@@ -10434,7 +10561,7 @@ SDValue SITargetLowering::performSubCombine(SDNode *N,
   if (LHS.getOpcode() == ISD::SUBCARRY) {
     // sub (subcarry x, 0, cc), y => subcarry x, y, cc
     auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
-    if (!C || !C->isNullValue())
+    if (!C || !C->isZero())
       return SDValue();
     SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) };
     return DAG.getNode(ISD::SUBCARRY, SDLoc(N), LHS->getVTList(), Args);
@@ -10657,20 +10784,20 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
     // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
     // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
     // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
-    if ((CRHS->isAllOnesValue() &&
+    if ((CRHS->isAllOnes() &&
         (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
-        (CRHS->isNullValue() &&
+        (CRHS->isZero() &&
        (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
       return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
                          DAG.getConstant(-1, SL, MVT::i1));
-    if ((CRHS->isAllOnesValue() &&
+    if ((CRHS->isAllOnes() &&
        (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
-        (CRHS->isNullValue() &&
+        (CRHS->isZero() &&
        (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
       return LHS.getOperand(0);
   }
 
-  uint64_t CRHSVal = CRHS->getZExtValue();
+  const APInt &CRHSVal = CRHS->getAPIntValue();
   if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
       LHS.getOpcode() == ISD::SELECT &&
      isa<ConstantSDNode>(LHS.getOperand(1)) &&
@@ -10682,8 +10809,8 @@ SDValue SITargetLowering::performSetCCCombine(SDNode *N,
     // setcc (select cc, CT, CF), CF, ne => cc
     // setcc (select cc, CT, CF), CT, ne => xor cc, -1
     // setcc (select cc, CT, CF), CT, eq => cc
-    uint64_t CT = LHS.getConstantOperandVal(1);
-    uint64_t CF = LHS.getConstantOperandVal(2);
+    const APInt &CT = LHS.getConstantOperandAPInt(1);
+    const APInt &CF = LHS.getConstantOperandAPInt(2);
 
     if ((CF == CRHSVal && CC == ISD::SETEQ) ||
        (CT == CRHSVal && CC == ISD::SETNE))
@@ -10747,7 +10874,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
    // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
    // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
     if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
-      Shift = DAG.getZExtOrTrunc(Shift.getOperand(0),
+      SDValue Shifted = DAG.getZExtOrTrunc(Shift.getOperand(0),
                                  SDLoc(Shift.getOperand(0)), MVT::i32);
 
       unsigned ShiftOffset = 8 * Offset;
@@ -10758,7 +10885,7 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
 
       if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
         return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
-                           MVT::f32, Shift);
+                           MVT::f32, Shifted);
       }
     }
   }
@@ -12086,6 +12213,25 @@ static bool fpModeMatchesGlobalFPAtomicMode(const AtomicRMWInst *RMW) {
 
 TargetLowering::AtomicExpansionKind
 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
+
+  auto ReportUnsafeHWInst = [&](TargetLowering::AtomicExpansionKind Kind) {
+    OptimizationRemarkEmitter ORE(RMW->getFunction());
+    LLVMContext &Ctx = RMW->getFunction()->getContext();
+    SmallVector<StringRef> SSNs;
+    Ctx.getSyncScopeNames(SSNs);
+    auto MemScope = SSNs[RMW->getSyncScopeID()].empty()
+                        ? "system"
+                        : SSNs[RMW->getSyncScopeID()];
+    ORE.emit([&]() {
+      return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
+             << "Hardware instruction generated for atomic "
+             << RMW->getOperationName(RMW->getOperation())
+             << " operation at memory scope " << MemScope
+             << " due to an unsafe request.";
+    });
+    return Kind;
+  };
+
   switch (RMW->getOperation()) {
   case AtomicRMWInst::FAdd: {
     Type *Ty = RMW->getType();
@@ -12120,28 +12266,30 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
          SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
        return AtomicExpansionKind::CmpXChg;
 
-      return AtomicExpansionKind::None;
+      return ReportUnsafeHWInst(AtomicExpansionKind::None);
    }
 
    if (AS == AMDGPUAS::FLAT_ADDRESS)
      return AtomicExpansionKind::CmpXChg;
 
-    return RMW->use_empty() ? AtomicExpansionKind::None
+    return RMW->use_empty() ? ReportUnsafeHWInst(AtomicExpansionKind::None)
                             : AtomicExpansionKind::CmpXChg;
   }
 
  // DS FP atomics do repect the denormal mode, but the rounding mode is fixed
  // to round-to-nearest-even.
  // The only exception is DS_ADD_F64 which never flushes regardless of mode.
-  if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomics()) {
+  if (AS == AMDGPUAS::LOCAL_ADDRESS && Subtarget->hasLDSFPAtomicAdd()) {
    if (!Ty->isDoubleTy())
      return AtomicExpansionKind::None;
 
-    return (fpModeMatchesGlobalFPAtomicMode(RMW) ||
-            RMW->getFunction()
-                ->getFnAttribute("amdgpu-unsafe-fp-atomics")
-                .getValueAsString() == "true")
-               ? AtomicExpansionKind::None
+    if (fpModeMatchesGlobalFPAtomicMode(RMW))
+      return AtomicExpansionKind::None;
+
+    return RMW->getFunction()
+                   ->getFnAttribute("amdgpu-unsafe-fp-atomics")
+                   .getValueAsString() == "true"
+               ? ReportUnsafeHWInst(AtomicExpansionKind::None)
               : AtomicExpansionKind::CmpXChg;
  }
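
The recurring mechanism in this diff is the amdgpu-no-* string attribute family (see the ImplicitAttrs table in passSpecialInputs): an implicit ABI input is only loaded and forwarded when the call site is not marked with the corresponding attribute, and the lowering now degrades to undef or a null pointer instead of asserting when an attribute turns out to be wrong. Below is a minimal standalone sketch of that query using the generic LLVM attribute API; the helper names are illustrative and not part of the patch.

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Mirrors the call-site checks added in passSpecialInputs(): the workitem Y id
// is only packed and passed when the call is not marked
// amdgpu-no-workitem-id-y. CallBase::hasFnAttr also falls back to the called
// function's attributes, so marking either the call or the callee suffices.
static bool needsWorkItemIDY(const CallBase &CB) {
  return !CB.hasFnAttr("amdgpu-no-workitem-id-y");
}

// Hypothetical helper: how an analysis that proves a function never queries
// the workitem ids could record that fact for this lowering to consume.
static void markNoWorkItemIDs(Function &F) {
  for (StringRef A : {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y",
                      "amdgpu-no-workitem-id-z"})
    F.addFnAttr(A);
}

Per the comments in getPreloadedValue and getSegmentAperture, using an intrinsic in a function marked with the corresponding amdgpu-no-* attribute is undefined behavior, so the graceful undef/null fallbacks are a hedge against miscompiled attributes rather than a supported path.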

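The ReportUnsafeHWInst lambda in shouldExpandAtomicRMWInIR wraps the returned expansion kind so that selecting a native atomic under "unsafe" settings leaves an audit trail. A sketch of the same remark machinery in isolation (the function name is illustrative; DEBUG_TYPE is "si-lower" in this file, so remarks of this form should surface under -Rpass=si-lower or in -fsave-optimization-record output):

#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

#define DEBUG_TYPE "si-lower"

// Emit an optimization remark tied to a specific atomicrmw instruction, as
// the lambda in the patch does before returning AtomicExpansionKind::None.
static void noteUnsafeAtomic(AtomicRMWInst *RMW) {
  OptimizationRemarkEmitter ORE(RMW->getFunction());
  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
           << "Hardware instruction generated for atomic "
           << AtomicRMWInst::getOperationName(RMW->getOperation());
  });
}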