Diffstat (limited to 'llvm/lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r--  llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 1659
1 file changed, 1369 insertions, 290 deletions
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 3148f49ff0d5..a7f4d63229b7 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -15,6 +15,7 @@ #include "AMDGPU.h" #include "AMDGPUInstrInfo.h" #include "AMDGPUTargetMachine.h" +#include "GCNSubtarget.h" #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" @@ -28,6 +29,7 @@ #include "llvm/CodeGen/ByteProvider.h" #include "llvm/CodeGen/FunctionLoweringInfo.h" #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h" +#include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -146,8 +148,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v16f64, TRI->getVGPRClassForBitWidth(1024)); if (Subtarget->has16BitInsts()) { - addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); - addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); + if (Subtarget->useRealTrue16Insts()) { + addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass); + addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass); + } else { + addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass); + } // Unless there are also VOP3P operations, not operations are really legal. addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); @@ -158,6 +165,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass); addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass); addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass); + addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass); + addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass); } addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass); @@ -219,7 +228,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, MVT::f64, Promote); AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64); - setOperationAction(ISD::FSQRT, MVT::f64, Custom); + setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom); setOperationAction(ISD::SELECT_CC, {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand); @@ -262,13 +271,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, // We only support LOAD/STORE and vector manipulation ops for vectors // with > 4 elements. 
for (MVT VT : - {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, - MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, - MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, - MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, - MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16, - MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, - MVT::v32i32, MVT::v32f32}) { + {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32, + MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32, + MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16, + MVT::v4f16, MVT::v3i64, MVT::v3f64, MVT::v6i32, MVT::v6f32, + MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64, MVT::v8i16, + MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v16i64, MVT::v16f64, + MVT::v32i32, MVT::v32f32, MVT::v32i16, MVT::v32f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -278,10 +287,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, case ISD::UNDEF: case ISD::EXTRACT_VECTOR_ELT: case ISD::INSERT_VECTOR_ELT: - case ISD::EXTRACT_SUBVECTOR: case ISD::SCALAR_TO_VECTOR: case ISD::IS_FPCLASS: break; + case ISD::EXTRACT_SUBVECTOR: case ISD::INSERT_SUBVECTOR: case ISD::CONCAT_VECTORS: setOperationAction(Op, VT, Custom); @@ -420,6 +429,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (Subtarget->has16BitInsts()) { setOperationAction({ISD::FPOW, ISD::FPOWI}, MVT::f16, Promote); setOperationAction({ISD::FLOG, ISD::FEXP, ISD::FLOG10}, MVT::f16, Custom); + } else { + setOperationAction(ISD::FSQRT, MVT::f16, Custom); } if (Subtarget->hasMadMacF32Insts()) @@ -470,9 +481,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, {MVT::f32, MVT::f64}, Legal); if (Subtarget->haveRoundOpsF64()) - setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FRINT}, MVT::f64, Legal); + setOperationAction({ISD::FTRUNC, ISD::FCEIL, ISD::FROUNDEVEN}, MVT::f64, + Legal); else - setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FRINT, ISD::FFLOOR}, + setOperationAction({ISD::FCEIL, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FFLOOR}, MVT::f64, Custom); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); @@ -544,8 +556,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, if (STI.hasMadF16()) setOperationAction(ISD::FMAD, MVT::f16, Legal); - for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, - MVT::v8f16, MVT::v16i16, MVT::v16f16}) { + for (MVT VT : + {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16, MVT::v8i16, + MVT::v8f16, MVT::v16i16, MVT::v16f16, MVT::v32i16, MVT::v32f16}) { for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) { switch (Op) { case ISD::LOAD: @@ -631,6 +644,16 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v16f16, Promote); AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32); + setOperationAction(ISD::LOAD, MVT::v32i16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32); + setOperationAction(ISD::LOAD, MVT::v32f16, Promote); + AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32); + + setOperationAction(ISD::STORE, MVT::v32i16, Promote); + AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32); + setOperationAction(ISD::STORE, MVT::v32f16, Promote); + AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32); + setOperationAction({ISD::ANY_EXTEND, ISD::ZERO_EXTEND, ISD::SIGN_EXTEND}, MVT::v2i32, Expand); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand); @@ -653,12 +676,15 @@ 
SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction({ISD::FMAXNUM_IEEE, ISD::FMINNUM_IEEE}, MVT::f16, Legal); setOperationAction({ISD::FMINNUM_IEEE, ISD::FMAXNUM_IEEE}, - {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Custom); + {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, + Custom); setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, - {MVT::v4f16, MVT::v8f16, MVT::v16f16}, Expand); + {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}, + Expand); - for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}) { + for (MVT Vec16 : {MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, + MVT::v32i16, MVT::v32f16}) { setOperationAction( {ISD::BUILD_VECTOR, ISD::EXTRACT_VECTOR_ELT, ISD::SCALAR_TO_VECTOR}, Vec16, Custom); @@ -681,10 +707,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v4f16, MVT::v4i16, MVT::v8f16, MVT::v8i16, - MVT::v16f16, MVT::v16i16}, + MVT::v16f16, MVT::v16i16, MVT::v32f16, MVT::v32i16}, Custom); - for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16}) + for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16}) // Split vector operations. setOperationAction({ISD::SHL, ISD::SRA, ISD::SRL, ISD::ADD, ISD::SUB, ISD::MUL, ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX, @@ -692,7 +718,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::SSUBSAT}, VT, Custom); - for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16}) + for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16}) // Split vector operations. setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE}, VT, Custom); @@ -728,7 +754,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v2i8, MVT::v4i8, MVT::v8i8, - MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16}, + MVT::v8i16, MVT::v8f16, MVT::v16i16, MVT::v16f16, + MVT::v32i16, MVT::v32f16}, Custom); setOperationAction({ISD::SMULO, ISD::UMULO}, MVT::i64, Custom); @@ -753,12 +780,22 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, MVT::i8, MVT::i128}, Custom); + setOperationAction(ISD::STACKSAVE, MVT::Other, Custom); + setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); + + // TODO: Could move this to custom lowering, could benefit from combines on + // extract of relevant bits. + setOperationAction(ISD::GET_FPMODE, MVT::i32, Legal); + + setOperationAction(ISD::MUL, MVT::i1, Promote); + setTargetDAGCombine({ISD::ADD, ISD::UADDO_CARRY, ISD::SUB, ISD::USUBO_CARRY, ISD::FADD, ISD::FSUB, + ISD::FDIV, ISD::FMINNUM, ISD::FMAXNUM, ISD::FMINNUM_IEEE, @@ -772,6 +809,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, ISD::AND, ISD::OR, ISD::XOR, + ISD::FSHR, ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FCANONICALIZE, @@ -1284,7 +1322,9 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, } } -bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { +bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM, + unsigned AddrSpace, + uint64_t FlatVariant) const { if (!Subtarget->hasFlatInstOffsets()) { // Flat instructions do not have offsets, and only have the register // address. 
@@ -1292,29 +1332,27 @@ bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const { } return AM.Scale == 0 && - (AM.BaseOffs == 0 || - Subtarget->getInstrInfo()->isLegalFLATOffset( - AM.BaseOffs, AMDGPUAS::FLAT_ADDRESS, SIInstrFlags::FLAT)); + (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( + AM.BaseOffs, AddrSpace, FlatVariant)); } bool SITargetLowering::isLegalGlobalAddressingMode(const AddrMode &AM) const { if (Subtarget->hasFlatGlobalInsts()) - return AM.Scale == 0 && - (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset( - AM.BaseOffs, AMDGPUAS::GLOBAL_ADDRESS, - SIInstrFlags::FlatGlobal)); + return isLegalFlatAddressingMode(AM, AMDGPUAS::GLOBAL_ADDRESS, + SIInstrFlags::FlatGlobal); if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) { - // Assume the we will use FLAT for all global memory accesses - // on VI. - // FIXME: This assumption is currently wrong. On VI we still use - // MUBUF instructions for the r + i addressing mode. As currently - // implemented, the MUBUF instructions only work on buffer < 4GB. - // It may be possible to support > 4GB buffers with MUBUF instructions, - // by setting the stride value in the resource descriptor which would - // increase the size limit to (stride * 4GB). However, this is risky, - // because it has never been validated. - return isLegalFlatAddressingMode(AM); + // Assume the we will use FLAT for all global memory accesses + // on VI. + // FIXME: This assumption is currently wrong. On VI we still use + // MUBUF instructions for the r + i addressing mode. As currently + // implemented, the MUBUF instructions only work on buffer < 4GB. + // It may be possible to support > 4GB buffers with MUBUF instructions, + // by setting the stride value in the resource descriptor which would + // increase the size limit to (stride * 4GB). However, this is risky, + // because it has never been validated. + return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS, + SIInstrFlags::FLAT); } return isLegalMUBUFAddressingMode(AM); @@ -1411,9 +1449,13 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } if (AS == AMDGPUAS::PRIVATE_ADDRESS) - return isLegalMUBUFAddressingMode(AM); + return Subtarget->enableFlatScratch() + ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS, + SIInstrFlags::FlatScratch) + : isLegalMUBUFAddressingMode(AM); - if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) { + if (AS == AMDGPUAS::LOCAL_ADDRESS || + (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) { // Basic, single offset DS instructions allow a 16-bit unsigned immediate // field. // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have @@ -1436,7 +1478,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, // computation. We don't have instructions that compute pointers with any // addressing modes, so treat them as having no offset like flat // instructions. - return isLegalFlatAddressingMode(AM); + return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS, + SIInstrFlags::FLAT); } // Assume a user alias of global for unknown address spaces. 
@@ -1754,7 +1797,7 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG, SDValue BasePtr = DAG.getCopyFromReg(Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT); - return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::Fixed(Offset)); + return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset)); } SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG, @@ -2133,13 +2176,14 @@ void SITargetLowering::allocateSpecialInputSGPRs( const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); // TODO: Unify handling with private memory pointers. - if (Info.hasDispatchPtr()) + if (UserSGPRInfo.hasDispatchPtr()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr); const Module *M = MF.getFunction().getParent(); - if (Info.hasQueuePtr() && + if (UserSGPRInfo.hasQueuePtr() && AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); @@ -2148,7 +2192,7 @@ void SITargetLowering::allocateSpecialInputSGPRs( if (Info.hasImplicitArgPtr()) allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr); - if (Info.hasDispatchID()) + if (UserSGPRInfo.hasDispatchID()) allocateSGPR64Input(CCInfo, ArgInfo.DispatchID); // flat_scratch_init is not applicable for non-kernel functions. @@ -2171,34 +2215,35 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { - if (Info.hasImplicitBufferPtr()) { + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo(); + if (UserSGPRInfo.hasImplicitBufferPtr()) { Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(ImplicitBufferPtrReg); } // FIXME: How should these inputs interact with inreg / custom SGPR inputs? 
- if (Info.hasPrivateSegmentBuffer()) { + if (UserSGPRInfo.hasPrivateSegmentBuffer()) { Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI); MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass); CCInfo.AllocateReg(PrivateSegmentBufferReg); } - if (Info.hasDispatchPtr()) { + if (UserSGPRInfo.hasDispatchPtr()) { Register DispatchPtrReg = Info.addDispatchPtr(TRI); MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchPtrReg); } const Module *M = MF.getFunction().getParent(); - if (Info.hasQueuePtr() && + if (UserSGPRInfo.hasQueuePtr() && AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); } - if (Info.hasKernargSegmentPtr()) { + if (UserSGPRInfo.hasKernargSegmentPtr()) { MachineRegisterInfo &MRI = MF.getRegInfo(); Register InputPtrReg = Info.addKernargSegmentPtr(TRI); CCInfo.AllocateReg(InputPtrReg); @@ -2207,26 +2252,100 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); } - if (Info.hasDispatchID()) { + if (UserSGPRInfo.hasDispatchID()) { Register DispatchIDReg = Info.addDispatchID(TRI); MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(DispatchIDReg); } - if (Info.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { + if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) { Register FlatScratchInitReg = Info.addFlatScratchInit(TRI); MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(FlatScratchInitReg); } + // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read + // these from the dispatch pointer. +} + +// Allocate pre-loaded kernel arguemtns. Arguments to be preloading must be +// sequential starting from the first argument. +void SITargetLowering::allocatePreloadKernArgSGPRs( + CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs, + const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF, + const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const { + Function &F = MF.getFunction(); + unsigned LastExplicitArgOffset = + MF.getSubtarget<GCNSubtarget>().getExplicitKernelArgOffset(); + GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo(); + bool InPreloadSequence = true; + unsigned InIdx = 0; + for (auto &Arg : F.args()) { + if (!InPreloadSequence || !Arg.hasInRegAttr()) + break; + + int ArgIdx = Arg.getArgNo(); + // Don't preload non-original args or parts not in the current preload + // sequence. + if (InIdx < Ins.size() && (!Ins[InIdx].isOrigArg() || + (int)Ins[InIdx].getOrigArgIndex() != ArgIdx)) + break; + + for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() && + (int)Ins[InIdx].getOrigArgIndex() == ArgIdx; + InIdx++) { + assert(ArgLocs[ArgIdx].isMemLoc()); + auto &ArgLoc = ArgLocs[InIdx]; + const Align KernelArgBaseAlign = Align(16); + unsigned ArgOffset = ArgLoc.getLocMemOffset(); + Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset); + unsigned NumAllocSGPRs = + alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32; + + // Arg is preloaded into the previous SGPR. 
+ if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) { + Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back( + Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]); + continue; + } + + unsigned Padding = ArgOffset - LastExplicitArgOffset; + unsigned PaddingSGPRs = alignTo(Padding, 4) / 4; + // Check for free user SGPRs for preloading. + if (PaddingSGPRs + NumAllocSGPRs + 1 /*Synthetic SGPRs*/ > + SGPRInfo.getNumFreeUserSGPRs()) { + InPreloadSequence = false; + break; + } + + // Preload this argument. + const TargetRegisterClass *RC = + TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32); + SmallVectorImpl<MCRegister> *PreloadRegs = + Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs); + + if (PreloadRegs->size() > 1) + RC = &AMDGPU::SGPR_32RegClass; + for (auto &Reg : *PreloadRegs) { + assert(Reg); + MF.addLiveIn(Reg, RC); + CCInfo.AllocateReg(Reg); + } + + LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset; + } + } +} + +void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { + // Always allocate this last since it is a synthetic preload. if (Info.hasLDSKernelId()) { Register Reg = Info.addLDSKernelId(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } - - // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read - // these from the dispatch pointer. } // Allocate special input registers that are initialized per-wave. @@ -2331,7 +2450,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // Everything live out of a block is spilled with fast regalloc, so it's // almost certain that spilling will be required. - if (TM.getOptLevel() == CodeGenOpt::None) + if (TM.getOptLevel() == CodeGenOptLevel::None) HasStackObjects = true; // For now assume stack access is needed in any callee functions, so we need @@ -2477,12 +2596,14 @@ SDValue SITargetLowering::LowerFormalArguments( bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv); if (IsGraphics) { - assert(!Info->hasDispatchPtr() && !Info->hasKernargSegmentPtr() && - !Info->hasWorkGroupInfo() && !Info->hasLDSKernelId() && - !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() && - !Info->hasWorkItemIDZ()); + const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo(); + assert(!UserSGPRInfo.hasDispatchPtr() && + !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() && + !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() && + !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ()); + (void)UserSGPRInfo; if (!Subtarget->enableFlatScratch()) - assert(!Info->hasFlatScratchInit()); + assert(!UserSGPRInfo.hasFlatScratchInit()); if (CallConv != CallingConv::AMDGPU_CS || !Subtarget->hasArchitectedSGPRs()) assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() && !Info->hasWorkGroupIDZ()); @@ -2531,18 +2652,29 @@ SDValue SITargetLowering::LowerFormalArguments( Splits.append(Ins.begin(), Ins.end()); } + if (IsKernel) + analyzeFormalArgumentsCompute(CCInfo, Ins); + if (IsEntryFunc) { allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info); allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info); + if (IsKernel && Subtarget->hasKernargPreload() && + !Subtarget->needsKernargPreloadBackwardsCompatibility()) + allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info); + + allocateLDSKernelId(CCInfo, MF, *TRI, *Info); } else if (!IsGraphics) { // For the fixed ABI, pass workitem IDs in the last argument register. 
allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info); } - if (IsKernel) { - analyzeFormalArgumentsCompute(CCInfo, Ins); - } else { + if (!IsKernel) { CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg); + if (!IsGraphics && !Subtarget->enableFlatScratch()) { + CCInfo.AllocateRegBlock(ArrayRef<MCPhysReg>{AMDGPU::SGPR0, AMDGPU::SGPR1, + AMDGPU::SGPR2, AMDGPU::SGPR3}, + 4); + } CCInfo.AnalyzeFormalArguments(Splits, AssignFn); } @@ -2587,9 +2719,81 @@ SDValue SITargetLowering::LowerFormalArguments( continue; } - SDValue Arg = lowerKernargMemParameter( - DAG, VT, MemVT, DL, Chain, Offset, Alignment, Ins[i].Flags.isSExt(), &Ins[i]); - Chains.push_back(Arg.getValue(1)); + SDValue NewArg; + if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) { + if (MemVT.getStoreSize() < 4 && Alignment < 4) { + // In this case the argument is packed into the previous preload SGPR. + int64_t AlignDownOffset = alignDown(Offset, 4); + int64_t OffsetDiff = Offset - AlignDownOffset; + EVT IntVT = MemVT.changeTypeToInteger(); + + const SIMachineFunctionInfo *Info = + MF.getInfo<SIMachineFunctionInfo>(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + Register Reg = + Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0]; + + assert(Reg); + Register VReg = MRI.getLiveInVirtReg(Reg); + SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); + + SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32); + SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt); + + SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract); + ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal); + NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal, + Ins[i].Flags.isSExt(), &Ins[i]); + + NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL); + } else { + const SIMachineFunctionInfo *Info = + MF.getInfo<SIMachineFunctionInfo>(); + MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo(); + const SmallVectorImpl<MCRegister> &PreloadRegs = + Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs; + + SDValue Copy; + if (PreloadRegs.size() == 1) { + Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]); + const TargetRegisterClass *RC = MRI.getRegClass(VReg); + NewArg = DAG.getCopyFromReg( + Chain, DL, VReg, + EVT::getIntegerVT(*DAG.getContext(), + TRI->getRegSizeInBits(*RC))); + + } else { + // If the kernarg alignment does not match the alignment of the SGPR + // tuple RC that can accommodate this argument, it will be built up + // via copies from from the individual SGPRs that the argument was + // preloaded to. 
+ SmallVector<SDValue, 4> Elts; + for (auto Reg : PreloadRegs) { + Register VReg = MRI.getLiveInVirtReg(Reg); + Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32); + Elts.push_back(Copy); + } + NewArg = + DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32, + PreloadRegs.size()), + DL, Elts); + } + + SDValue CMemVT; + if (VT.isScalarInteger() && VT.bitsLT(NewArg.getSimpleValueType())) + CMemVT = DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewArg); + else + CMemVT = DAG.getBitcast(MemVT, NewArg); + NewArg = convertArgType(DAG, VT, MemVT, DL, CMemVT, + Ins[i].Flags.isSExt(), &Ins[i]); + NewArg = DAG.getMergeValues({NewArg, Chain}, DL); + } + } else { + NewArg = + lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, + Alignment, Ins[i].Flags.isSExt(), &Ins[i]); + } + Chains.push_back(NewArg.getValue(1)); auto *ParamTy = dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex())); @@ -2599,11 +2803,11 @@ SDValue SITargetLowering::LowerFormalArguments( // On SI local pointers are just offsets into LDS, so they are always // less than 16-bits. On CI and newer they could potentially be // real pointers, so we can't guarantee their size. - Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg, - DAG.getValueType(MVT::i16)); + NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg, + DAG.getValueType(MVT::i16)); } - InVals.push_back(Arg); + InVals.push_back(NewArg); continue; } else if (!IsEntryFunc && VA.isMemLoc()) { SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg); @@ -3084,6 +3288,9 @@ bool SITargetLowering::isEligibleForTailCallOptimization( const SmallVectorImpl<ISD::OutputArg> &Outs, const SmallVectorImpl<SDValue> &OutVals, const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { + if (AMDGPU::isChainCC(CalleeCC)) + return true; + if (!mayTailCallThisCC(CalleeCC)) return false; @@ -3168,7 +3375,36 @@ bool SITargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { // The wave scratch offset register is used as the global base pointer. SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const { + CallingConv::ID CallConv = CLI.CallConv; + bool IsChainCallConv = AMDGPU::isChainCC(CallConv); + SelectionDAG &DAG = CLI.DAG; + + TargetLowering::ArgListEntry RequestedExec; + if (IsChainCallConv) { + // The last argument should be the value that we need to put in EXEC. + // Pop it out of CLI.Outs and CLI.OutVals before we do any processing so we + // don't treat it like the rest of the arguments. 
+ RequestedExec = CLI.Args.back(); + assert(RequestedExec.Node && "No node for EXEC"); + + if (!RequestedExec.Ty->isIntegerTy(Subtarget->getWavefrontSize())) + return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC"); + + assert(CLI.Outs.back().OrigArgIndex == 2 && "Unexpected last arg"); + CLI.Outs.pop_back(); + CLI.OutVals.pop_back(); + + if (RequestedExec.Ty->isIntegerTy(64)) { + assert(CLI.Outs.back().OrigArgIndex == 2 && "Exec wasn't split up"); + CLI.Outs.pop_back(); + CLI.OutVals.pop_back(); + } + + assert(CLI.Outs.back().OrigArgIndex != 2 && + "Haven't popped all the pieces of the EXEC mask"); + } + const SDLoc &DL = CLI.DL; SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs; SmallVector<SDValue, 32> &OutVals = CLI.OutVals; @@ -3176,7 +3412,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue Chain = CLI.Chain; SDValue Callee = CLI.Callee; bool &IsTailCall = CLI.IsTailCall; - CallingConv::ID CallConv = CLI.CallConv; bool IsVarArg = CLI.IsVarArg; bool IsSibCall = false; bool IsThisReturn = false; @@ -3207,9 +3442,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, if (IsTailCall) { IsTailCall = isEligibleForTailCallOptimization( Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG); - if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) { + if (!IsTailCall && + ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) { report_fatal_error("failed to perform tail call elimination on a call " - "site marked musttail"); + "site marked musttail or on llvm.amdgcn.cs.chain"); } bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt; @@ -3232,7 +3468,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg); - if (CallConv != CallingConv::AMDGPU_Gfx) { + if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv)) { // With a fixed ABI, allocate fixed registers before user arguments. passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain); } @@ -3258,16 +3494,20 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - if (!IsSibCall) { + if (!IsSibCall) Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL); + if (!IsSibCall || IsChainCallConv) { if (!Subtarget->enableFlatScratch()) { SmallVector<SDValue, 4> CopyFromChains; // In the HSA case, this should be an identity copy. SDValue ScratchRSrcReg = DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32); - RegsToPass.emplace_back(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, ScratchRSrcReg); + RegsToPass.emplace_back(IsChainCallConv + ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51 + : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3, + ScratchRSrcReg); CopyFromChains.push_back(ScratchRSrcReg.getValue(1)); Chain = DAG.getTokenFactor(DL, CopyFromChains); } @@ -3412,6 +3652,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32)); } + if (IsChainCallConv) + Ops.push_back(RequestedExec.Node); + // Add argument registers to the end of the list so that they are known live // into the call. for (auto &RegToPass : RegsToPass) { @@ -3420,8 +3663,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } // Add a register mask operand representing the call-preserved registers. 
- - auto *TRI = static_cast<const SIRegisterInfo*>(Subtarget->getRegisterInfo()); + auto *TRI = static_cast<const SIRegisterInfo *>(Subtarget->getRegisterInfo()); const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv); assert(Mask && "Missing call preserved mask for calling convention"); Ops.push_back(DAG.getRegisterMask(Mask)); @@ -3435,8 +3677,17 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, // actual call instruction. if (IsTailCall) { MFI.setHasTailCall(); - unsigned OPC = CallConv == CallingConv::AMDGPU_Gfx ? - AMDGPUISD::TC_RETURN_GFX : AMDGPUISD::TC_RETURN; + unsigned OPC = AMDGPUISD::TC_RETURN; + switch (CallConv) { + case CallingConv::AMDGPU_Gfx: + OPC = AMDGPUISD::TC_RETURN_GFX; + break; + case CallingConv::AMDGPU_CS_Chain: + case CallingConv::AMDGPU_CS_ChainPreserve: + OPC = AMDGPUISD::TC_RETURN_CHAIN; + break; + } + return DAG.getNode(OPC, DL, NodeTys, Ops); } @@ -3481,22 +3732,21 @@ SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl( SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); Chain = SP.getValue(1); MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue(); - const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); - const TargetFrameLowering *TFL = ST.getFrameLowering(); + const TargetFrameLowering *TFL = Subtarget->getFrameLowering(); unsigned Opc = TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ? ISD::ADD : ISD::SUB; SDValue ScaledSize = DAG.getNode( ISD::SHL, dl, VT, Size, - DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32)); + DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32)); Align StackAlign = TFL->getStackAlign(); Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value if (Alignment && *Alignment > StackAlign) { Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1, DAG.getConstant(-(uint64_t)Alignment->value() - << ST.getWavefrontSizeLog2(), + << Subtarget->getWavefrontSizeLog2(), dl, VT)); } @@ -3520,6 +3770,94 @@ SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG); } +SDValue SITargetLowering::LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const { + if (Op.getValueType() != MVT::i32) + return Op; // Defer to cannot select error. + + Register SP = getStackPointerRegisterToSaveRestore(); + SDLoc SL(Op); + + SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32); + + // Convert from wave uniform to swizzled vector address. This should protect + // from any edge cases where the stacksave result isn't directly used with + // stackrestore. + SDValue VectorAddress = + DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP); + return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL); +} + +SDValue SITargetLowering::lowerGET_ROUNDING(SDValue Op, + SelectionDAG &DAG) const { + SDLoc SL(Op); + assert(Op.getValueType() == MVT::i32); + + uint32_t BothRoundHwReg = + AMDGPU::Hwreg::encodeHwreg(AMDGPU::Hwreg::ID_MODE, 0, 4); + SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32); + + SDValue IntrinID = + DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32); + SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(), + Op.getOperand(0), IntrinID, GetRoundBothImm); + + // There are two rounding modes, one for f32 and one for f64/f16. We only + // report in the standard value range if both are the same. + // + // The raw values also differ from the expected FLT_ROUNDS values. 
Nearest + // ties away from zero is not supported, and the other values are rotated by + // 1. + // + // If the two rounding modes are not the same, report a target defined value. + + // Mode register rounding mode fields: + // + // [1:0] Single-precision round mode. + // [3:2] Double/Half-precision round mode. + // + // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero. + // + // Hardware Spec + // Toward-0 3 0 + // Nearest Even 0 1 + // +Inf 1 2 + // -Inf 2 3 + // NearestAway0 N/A 4 + // + // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit + // table we can index by the raw hardware mode. + // + // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf + + SDValue BitTable = + DAG.getConstant(AMDGPU::FltRoundConversionTable, SL, MVT::i64); + + SDValue Two = DAG.getConstant(2, SL, MVT::i32); + SDValue RoundModeTimesNumBits = + DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two); + + // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we + // knew only one mode was demanded. + SDValue TableValue = + DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits); + SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue); + + SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32); + SDValue TableEntry = + DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask); + + // There's a gap in the 4-bit encoded table and actual enum values, so offset + // if it's an extended value. + SDValue Four = DAG.getConstant(4, SL, MVT::i32); + SDValue IsStandardValue = + DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT); + SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four); + SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue, + TableEntry, EnumOffset); + + return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL); +} + Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = StringSwitch<Register>(RegName) @@ -4463,8 +4801,8 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const SIRegisterInfo *TRI = ST.getRegisterInfo(); Register Dst = MI.getOperand(0).getReg(); - Register Src0 = MI.getOperand(1).getReg(); - Register Src1 = MI.getOperand(2).getReg(); + const MachineOperand &Src0 = MI.getOperand(1); + const MachineOperand &Src1 = MI.getOperand(2); const DebugLoc &DL = MI.getDebugLoc(); Register SrcCond = MI.getOperand(3).getReg(); @@ -4473,20 +4811,42 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); Register SrcCondCopy = MRI.createVirtualRegister(CondRC); + const TargetRegisterClass *Src0RC = Src0.isReg() + ? MRI.getRegClass(Src0.getReg()) + : &AMDGPU::VReg_64RegClass; + const TargetRegisterClass *Src1RC = Src1.isReg() + ? 
MRI.getRegClass(Src1.getReg()) + : &AMDGPU::VReg_64RegClass; + + const TargetRegisterClass *Src0SubRC = + TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0); + const TargetRegisterClass *Src1SubRC = + TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1); + + MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC); + MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC); + + MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC); + MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm( + MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC); + BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) .addReg(SrcCond); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo) - .addImm(0) - .addReg(Src0, 0, AMDGPU::sub0) - .addImm(0) - .addReg(Src1, 0, AMDGPU::sub0) - .addReg(SrcCondCopy); + .addImm(0) + .add(Src0Sub0) + .addImm(0) + .add(Src1Sub0) + .addReg(SrcCondCopy); BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi) - .addImm(0) - .addReg(Src0, 0, AMDGPU::sub1) - .addImm(0) - .addReg(Src1, 0, AMDGPU::sub1) - .addReg(SrcCondCopy); + .addImm(0) + .add(Src0Sub1) + .addImm(0) + .add(Src1Sub1) + .addReg(SrcCondCopy); BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst) .addReg(DstLo) @@ -4843,7 +5203,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); SDValue Lo, Hi; std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4866,7 +5226,7 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i16 || VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 || - VT == MVT::v32f32); + VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16); SDValue Lo0, Hi0; std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); @@ -4926,10 +5286,14 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { "Load should return a value and a chain"); return Result; } - case ISD::FSQRT: - if (Op.getValueType() == MVT::f64) + case ISD::FSQRT: { + EVT VT = Op.getValueType(); + if (VT == MVT::f32) + return lowerFSQRTF32(Op, DAG); + if (VT == MVT::f64) return lowerFSQRTF64(Op, DAG); return SDValue(); + } case ISD::FSIN: case ISD::FCOS: return LowerTrig(Op, DAG); @@ -5027,6 +5391,10 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return lowerXMUL_LOHI(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); + case ISD::STACKSAVE: + return LowerSTACKSAVE(Op, DAG); + case ISD::GET_ROUNDING: + return lowerGET_ROUNDING(Op, DAG); } return SDValue(); } @@ -5382,6 +5750,12 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op)); return; } + case ISD::FSQRT: { + if (N->getValueType(0) != MVT::f16) + break; + Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG)); + break; + } default: AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); break; @@ -5433,6 +5807,9 @@ bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { } bool SITargetLowering::shouldEmitGOTReloc(const 
GlobalValue *GV) const { + if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) + return false; + // FIXME: Either avoid relying on address space here or change the default // address space for functions to avoid the explicit check. return (GV->getValueType()->isFunctionTy() || @@ -5616,7 +5993,8 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op, if (IsIEEEMode) return expandFMINNUM_FMAXNUM(Op.getNode(), DAG); - if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16) + if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 || + VT == MVT::v16f16) return splitBinaryVectorOp(Op, DAG); return Op; } @@ -5711,11 +6089,6 @@ SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const { Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) return lowerTrapEndpgm(Op, DAG); - const Module *M = DAG.getMachineFunction().getFunction().getParent(); - unsigned CodeObjectVersion = AMDGPU::getCodeObjectVersion(*M); - if (CodeObjectVersion <= AMDGPU::AMDHSA_COV3) - return lowerTrapHsaQueuePtr(Op, DAG); - return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG) : lowerTrapHsaQueuePtr(Op, DAG); } @@ -5873,7 +6246,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44; SDValue Ptr = - DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::Fixed(StructOffset)); + DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset)); // TODO: Use custom target PseudoSourceValue. // TODO: We should use the value from the IR intrinsic call, but it might not @@ -6134,7 +6507,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI)) return Combined; - if (VecSize == 128 || VecSize == 256) { + if (VecSize == 128 || VecSize == 256 || VecSize == 512) { SDValue Lo, Hi; EVT LoVT, HiVT; std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT); @@ -6147,9 +6520,7 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, DAG.getConstant(1, SL, MVT::i32))); - } else { - assert(VecSize == 256); - + } else if (VecSize == 256) { SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec); SDValue Parts[4]; for (unsigned P = 0; P < 4; ++P) { @@ -6161,6 +6532,22 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op, Parts[0], Parts[1])); Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64, Parts[2], Parts[3])); + } else { + assert(VecSize == 512); + + SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec); + SDValue Parts[8]; + for (unsigned P = 0; P < 8; ++P) { + Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2, + DAG.getConstant(P, SL, MVT::i32)); + } + + Lo = DAG.getBitcast(LoVT, + DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, + Parts[0], Parts[1], Parts[2], Parts[3])); + Hi = DAG.getBitcast(HiVT, + DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64, + Parts[4], Parts[5],Parts[6], Parts[7])); } EVT IdxVT = Idx.getValueType(); @@ -6326,6 +6713,27 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op, return DAG.getNode(ISD::BITCAST, SL, VT, Blend); } + if (VT == MVT::v32i16 || VT == MVT::v32f16) { + EVT QuarterVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), + VT.getVectorNumElements() / 8); + MVT QuarterIntVT = MVT::getIntegerVT(QuarterVT.getSizeInBits()); + + SmallVector<SDValue, 8> Parts[8]; + for (unsigned I = 0, E = VT.getVectorNumElements() / 8; I != E; ++I) { + for 
(unsigned P = 0; P < 8; ++P) + Parts[P].push_back(Op.getOperand(I + P * E)); + } + SDValue Casts[8]; + for (unsigned P = 0; P < 8; ++P) { + SDValue Vec = DAG.getBuildVector(QuarterVT, SL, Parts[P]); + Casts[P] = DAG.getNode(ISD::BITCAST, SL, QuarterIntVT, Vec); + } + + SDValue Blend = + DAG.getBuildVector(MVT::getVectorVT(QuarterIntVT, 8), SL, Casts); + return DAG.getNode(ISD::BITCAST, SL, VT, Blend); + } + assert(VT == MVT::v2f16 || VT == MVT::v2i16); assert(!Subtarget->hasVOP3PInsts() && "this should be legal"); @@ -6391,24 +6799,12 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant, // which is a 64-bit pc-relative offset from the encoding of the $symbol // operand to the global variable. - // - // What we want here is an offset from the value returned by s_getpc - // (which is the address of the s_add_u32 instruction) to the global - // variable, but since the encoding of $symbol starts 4 bytes after the start - // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too - // small. This requires us to add 4 to the global variable offset in order to - // compute the correct address. Similarly for the s_addc_u32 instruction, the - // encoding of $symbol starts 12 bytes after the start of the s_add_u32 - // instruction. - SDValue PtrLo = - DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags); + SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags); SDValue PtrHi; - if (GAFlags == SIInstrInfo::MO_NONE) { + if (GAFlags == SIInstrInfo::MO_NONE) PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); - } else { - PtrHi = - DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 12, GAFlags + 1); - } + else + PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1); return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi); } @@ -6450,9 +6846,22 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA); } + if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) { + SDValue AddrLo = DAG.getTargetGlobalAddress( + GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO); + AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0}; + + SDValue AddrHi = DAG.getTargetGlobalAddress( + GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI); + AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0}; + + return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi); + } + if (shouldEmitFixup(GV)) return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT); - else if (shouldEmitPCReloc(GV)) + + if (shouldEmitPCReloc(GV)) return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT, SIInstrInfo::MO_REL32); @@ -7341,9 +7750,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return emitRemovedIntrinsicError(DAG, DL, VT); } - case Intrinsic::amdgcn_ldexp: - return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(1), Op.getOperand(2)); - case Intrinsic::amdgcn_fract: return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); @@ -8383,7 +8789,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0); } case Intrinsic::amdgcn_s_barrier: { - if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { + if (getTargetMachine().getOptLevel() > CodeGenOptLevel::None) { const GCNSubtarget &ST = 
MF.getSubtarget<GCNSubtarget>(); unsigned WGSize = ST.getFlatWorkGroupSizes(MF.getFunction()).second; if (WGSize <= ST.getWavefrontSize()) @@ -8620,8 +9026,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds; unsigned OpOffset = HasVIndex ? 1 : 0; SDValue VOffset = Op.getOperand(5 + OpOffset); - auto CVOffset = dyn_cast<ConstantSDNode>(VOffset); - bool HasVOffset = !CVOffset || !CVOffset->isZero(); + bool HasVOffset = !isNullConstant(VOffset); unsigned Size = Op->getConstantOperandVal(4); switch (Size) { @@ -8684,12 +9089,13 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, auto F = LoadMMO->getFlags() & ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); - LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, - Size, LoadMMO->getBaseAlign()); + LoadMMO = + MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, + LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); - MachineMemOperand *StoreMMO = - MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, - sizeof(int32_t), LoadMMO->getBaseAlign()); + MachineMemOperand *StoreMMO = MF.getMachineMemOperand( + StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), + LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); auto Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops); DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); @@ -8760,11 +9166,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS; auto F = LoadMMO->getFlags() & ~(MachineMemOperand::MOStore | MachineMemOperand::MOLoad); - LoadMMO = MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, - Size, LoadMMO->getBaseAlign()); - MachineMemOperand *StoreMMO = - MF.getMachineMemOperand(StorePtrI, F | MachineMemOperand::MOStore, - sizeof(int32_t), Align(4)); + LoadMMO = + MF.getMachineMemOperand(LoadPtrI, F | MachineMemOperand::MOLoad, Size, + LoadMMO->getBaseAlign(), LoadMMO->getAAInfo()); + MachineMemOperand *StoreMMO = MF.getMachineMemOperand( + StorePtrI, F | MachineMemOperand::MOStore, sizeof(int32_t), Align(4), + LoadMMO->getAAInfo()); auto Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops); DAG.setNodeMemRefs(Load, {LoadMMO, StoreMMO}); @@ -9051,7 +9458,7 @@ static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info) { // TODO: Should check if the address can definitely not access stack. if (Info.isEntryFunction()) - return Info.hasFlatScratchInit(); + return Info.getUserSGPRInfo().hasFlatScratchInit(); return true; } @@ -9217,7 +9624,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256) + if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 || + VT.getSizeInBits() == 512) return splitTernaryVectorOp(Op, DAG); assert(VT.getSizeInBits() == 64); @@ -9277,11 +9685,6 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP // error seems really high at 2^29 ULP. - - // XXX - do we need afn for this or is arcp sufficent? 
- if (RHS.getOpcode() == ISD::FSQRT) - return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0)); - // 1.0 / x -> rcp(x) return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); } @@ -9294,8 +9697,8 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, } } - // For f16 require arcp only. - // For f32 require afn+arcp. + // For f16 require afn or arcp. + // For f32 require afn. if (!AllowInaccurateRcp && (VT != MVT::f16 || !Flags.hasAllowReciprocal())) return SDValue(); @@ -9480,28 +9883,44 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); const DenormalMode DenormMode = Info->getMode().FP32Denormals; - const bool HasFP32Denormals = DenormMode == DenormalMode::getIEEE(); + const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE(); + const bool HasDynamicDenormals = + (DenormMode.Input == DenormalMode::Dynamic) || + (DenormMode.Output == DenormalMode::Dynamic); + + SDValue SavedDenormMode; - if (!HasFP32Denormals) { + if (!PreservesDenormals) { // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV // lowering. The chain dependence is insufficient, and we need glue. We do // not need the glue variants in a strictfp function. SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Glue = DAG.getEntryNode(); + if (HasDynamicDenormals) { + SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL, + DAG.getVTList(MVT::i32, MVT::Glue), + {BitField, Glue}); + SavedDenormMode = SDValue(GetReg, 0); + + Glue = DAG.getMergeValues( + {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL); + } + SDNode *EnableDenorm; if (Subtarget->hasDenormModeInst()) { const SDValue EnableDenormValue = getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget); - EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, - DAG.getEntryNode(), EnableDenormValue).getNode(); + EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue, + EnableDenormValue) + .getNode(); } else { const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32); - EnableDenorm = - DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, - {EnableDenormValue, BitField, DAG.getEntryNode()}); + EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs, + {EnableDenormValue, BitField, Glue}); } SDValue Ops[3] = { @@ -9531,12 +9950,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3, NumeratorScaled, Fma3, Flags); - if (!HasFP32Denormals) { - // FIXME: This mishandles dynamic denormal mode. We need to query the - // current mode and restore the original. - + if (!PreservesDenormals) { SDNode *DisableDenorm; - if (Subtarget->hasDenormModeInst()) { + if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) { const SDValue DisableDenormValue = getSPDenormModeValue( FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget); @@ -9544,8 +9960,11 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const { Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2)).getNode(); } else { + assert(HasDynamicDenormals == (bool)SavedDenormMode); const SDValue DisableDenormValue = - DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); + HasDynamicDenormals + ? 
SavedDenormMode + : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32); DisableDenorm = DAG.getMachineNode( AMDGPU::S_SETREG_B32, SL, MVT::Other, @@ -9754,6 +10173,111 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } +// Avoid the full correct expansion for f32 sqrt when promoting from f16. +SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + assert(!Subtarget->has16BitInsts()); + SDNodeFlags Flags = Op->getFlags(); + SDValue Ext = + DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags); + + SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32); + SDValue Sqrt = + DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags); + + return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt, + DAG.getTargetConstant(0, SL, MVT::i32), Flags); +} + +SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const { + SDLoc DL(Op); + SDNodeFlags Flags = Op->getFlags(); + MVT VT = Op.getValueType().getSimpleVT(); + const SDValue X = Op.getOperand(0); + + if (allowApproxFunc(DAG, Flags)) { + // Instruction is 1ulp but ignores denormals. + return DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, VT, + DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags); + } + + SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT); + SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT); + + SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT); + + SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags); + + SDValue SqrtX = + DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags); + + SDValue SqrtS; + if (needsDenormHandlingF32(DAG, X, Flags)) { + SDValue SqrtID = + DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32); + SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags); + + SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS); + SDValue SqrtSNextDownInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, + DAG.getConstant(-1, DL, MVT::i32)); + SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt); + + SDValue NegSqrtSNextDown = + DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags); + + SDValue SqrtVP = + DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags); + + SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt, + DAG.getConstant(1, DL, MVT::i32)); + SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt); + + SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags); + SDValue SqrtVS = + DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags); + + SDValue Zero = DAG.getConstantFP(0.0f, DL, VT); + SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE); + + SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS, + Flags); + + SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT); + SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS, + Flags); + } else { + SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags); + + SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags); + + SDValue Half = DAG.getConstantFP(0.5f, DL, VT); + SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags); + SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags); + + SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags); + SqrtH = 
DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags); + SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags); + + SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags); + SDValue SqrtD = + DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags); + SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags); + } + + SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT); + + SDValue ScaledDown = + DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags); + + SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags); + SDValue IsZeroOrInf = + DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX, + DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32)); + + return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags); +} + SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const { // For double type, the SQRT and RSQ instructions don't have required // precision, we apply Goldschmidt's algorithm to improve the result: @@ -10111,9 +10635,7 @@ SDValue SITargetLowering::splitBinaryBitConstantOp( return SDValue(); } -// Returns true if argument is a boolean value which is not serialized into -// memory or argument and does not require v_cndmask_b32 to be deserialized. -static bool isBoolSGPR(SDValue V) { +bool llvm::isBoolSGPR(SDValue V) { if (V.getValueType() != MVT::i1) return false; switch (V.getOpcode()) { @@ -10427,13 +10949,34 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, if (Depth >= 6) return std::nullopt; + auto ValueSize = Op.getValueSizeInBits(); + if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32) + return std::nullopt; + switch (Op->getOpcode()) { case ISD::TRUNCATE: { - if (Op->getOperand(0).getScalarValueSizeInBits() != 32) + return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); + } + + case ISD::SIGN_EXTEND: + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND_INREG: { + SDValue NarrowOp = Op->getOperand(0); + auto NarrowVT = NarrowOp.getValueType(); + if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) { + auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); + NarrowVT = VTSign->getVT(); + } + if (!NarrowVT.isByteSized()) + return std::nullopt; + uint64_t NarrowByteWidth = NarrowVT.getStoreSize(); + + if (SrcIndex >= NarrowByteWidth) return std::nullopt; return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1); } + case ISD::SRA: case ISD::SRL: { auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); if (!ShiftOp) @@ -10450,9 +10993,6 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0, } default: { - if (Op.getScalarValueSizeInBits() != 32) - return std::nullopt; - return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex); } } @@ -10476,7 +11016,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, unsigned BitWidth = Op.getScalarValueSizeInBits(); if (BitWidth % 8 != 0) return std::nullopt; - assert(Index < BitWidth / 8 && "invalid index requested"); + if (Index > BitWidth / 8 - 1) + return std::nullopt; switch (Op.getOpcode()) { case ISD::OR: { @@ -10519,6 +11060,31 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, return calculateSrcByte(Op->getOperand(0), StartingIndex, Index); } + case ISD::FSHR: { + // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW)) + auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2)); + if (!ShiftOp || Op.getValueType().isVector()) + return std::nullopt; + + uint64_t BitsProvided = 
Op.getValueSizeInBits(); + if (BitsProvided % 8 != 0) + return std::nullopt; + + uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided); + if (BitShift % 8) + return std::nullopt; + + uint64_t ConcatSizeInBytes = BitsProvided / 4; + uint64_t ByteShift = BitShift / 8; + + uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes; + uint64_t BytesProvided = BitsProvided / 8; + SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1); + NewIndex %= BytesProvided; + return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex); + } + + case ISD::SRA: case ISD::SRL: { auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); if (!ShiftOp) @@ -10565,9 +11131,18 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, } case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: - case ISD::ZERO_EXTEND: { + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND_INREG: + case ISD::AssertZext: + case ISD::AssertSext: { SDValue NarrowOp = Op->getOperand(0); - unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits(); + unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits(); + if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG || + Op->getOpcode() == ISD::AssertZext || + Op->getOpcode() == ISD::AssertSext) { + auto *VTSign = cast<VTSDNode>(Op->getOperand(1)); + NarrowBitWidth = VTSign->getVT().getSizeInBits(); + } if (NarrowBitWidth % 8 != 0) return std::nullopt; uint64_t NarrowByteWidth = NarrowBitWidth / 8; @@ -10581,10 +11156,7 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, } case ISD::TRUNCATE: { - unsigned NarrowBitWidth = Op.getScalarValueSizeInBits(); - if (NarrowBitWidth % 8 != 0) - return std::nullopt; - uint64_t NarrowByteWidth = NarrowBitWidth / 8; + uint64_t NarrowByteWidth = BitWidth / 8; if (NarrowByteWidth >= Index) { return calculateByteProvider(Op.getOperand(0), Index, Depth + 1, @@ -10594,8 +11166,16 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, return std::nullopt; } + case ISD::CopyFromReg: { + if (BitWidth / 8 > Index) + return calculateSrcByte(Op, StartingIndex, Index); + + return std::nullopt; + } + case ISD::LOAD: { auto L = cast<LoadSDNode>(Op.getNode()); + unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits(); if (NarrowBitWidth % 8 != 0) return std::nullopt; @@ -10621,6 +11201,41 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, case ISD::BSWAP: return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1, Depth + 1, StartingIndex); + + case ISD::EXTRACT_VECTOR_ELT: { + auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1)); + if (!IdxOp) + return std::nullopt; + auto VecIdx = IdxOp->getZExtValue(); + auto ScalarSize = Op.getScalarValueSizeInBits(); + if (ScalarSize != 32) { + if ((VecIdx + 1) * ScalarSize > 32) + return std::nullopt; + Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index; + } + + return calculateSrcByte(ScalarSize == 32 ? Op : Op.getOperand(0), + StartingIndex, Index); + } + + case AMDGPUISD::PERM: { + auto PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2)); + if (!PermMask) + return std::nullopt; + + auto IdxMask = + (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8); + if (IdxMask > 0x07 && IdxMask != 0x0c) + return std::nullopt; + + auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1); + auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask; + + return IdxMask != 0x0c ? 
calculateSrcByte(NextOp, StartingIndex, NextIndex) + : ByteProvider<SDValue>( + ByteProvider<SDValue>::getConstantZero()); + } + default: { return std::nullopt; } @@ -10630,7 +11245,8 @@ calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth, } // Returns true if the Operand is a scalar and is 16 bits -static bool is16BitScalarOp(SDValue &Operand) { +static bool isExtendedFrom16Bits(SDValue &Operand) { + switch (Operand.getOpcode()) { case ISD::ANY_EXTEND: case ISD::SIGN_EXTEND: @@ -10646,7 +11262,7 @@ static bool is16BitScalarOp(SDValue &Operand) { auto MemVT = L->getMemoryVT(); return !MemVT.isVector() && MemVT.getSizeInBits() == 16; } - return false; + return L->getMemoryVT().getSizeInBits() == 16; } default: return false; @@ -10674,29 +11290,118 @@ static bool addresses16Bits(int Mask) { // Do not lower into v_perm if the operands are actually 16 bit // and the selected bits (based on PermMask) correspond with two // easily addressable 16 bit operands. -static bool hasEightBitAccesses(uint64_t PermMask, SDValue &Op, +static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp) { int Low16 = PermMask & 0xffff; int Hi16 = (PermMask & 0xffff0000) >> 16; - // ByteProvider only accepts 32 bit operands - assert(Op.getValueType().getSizeInBits() == 32); - assert(OtherOp.getValueType().getSizeInBits() == 32); + assert(Op.getValueType().isByteSized()); + assert(OtherOp.getValueType().isByteSized()); - auto OpIs16Bit = is16BitScalarOp(Op); - auto OtherOpIs16Bit = is16BitScalarOp(Op); + auto TempOp = peekThroughBitcasts(Op); + auto TempOtherOp = peekThroughBitcasts(OtherOp); - // If there is a size mismatch, then we must use masking on at least one - // operand - if (OpIs16Bit != OtherOpIs16Bit) + auto OpIs16Bit = + TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp); + if (!OpIs16Bit) return true; - // If both operands are 16 bit, return whether or not we cleanly address both - if (is16BitScalarOp(Op) && is16BitScalarOp(OtherOp)) - return !addresses16Bits(Low16) || !addresses16Bits(Hi16); + auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 || + isExtendedFrom16Bits(TempOtherOp); + if (!OtherOpIs16Bit) + return true; - // Both are 32 bit operands - return true; + // Do we cleanly address both + return !addresses16Bits(Low16) || !addresses16Bits(Hi16); +} + +static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { + SelectionDAG &DAG = DCI.DAG; + EVT VT = N->getValueType(0); + + if (VT != MVT::i32) + return SDValue(); + + // VT is known to be MVT::i32, so we need to provide 4 bytes. 
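The f32 square-root expansion earlier in this diff (lowerFSQRTF32) keeps the hardware instruction away from denormal inputs by scaling: values below 0x1.0p-96f are multiplied by 0x1.0p+32f before the sqrt and the result is multiplied by 0x1.0p-16f afterwards, relying on sqrt(x * 2^32) == sqrt(x) * 2^16. A minimal standalone sketch of just that scaling identity, with std::sqrt standing in for the hardware sqrt (plain C++, not part of the patch):

#include <cmath>
#include <cstdio>

// Mirrors the scaling scheme in lowerFSQRTF32: inputs below 2^-96 are scaled
// up by 2^32 before the square root, and the result is scaled back down by
// 2^-16, since sqrt(x * 2^32) == sqrt(x) * 2^16.
static float sqrtWithScaling(float X) {
  const float ScaleThreshold = 0x1.0p-96f;
  bool NeedScale = X < ScaleThreshold;
  float ScaledX = NeedScale ? X * 0x1.0p+32f : X;
  float SqrtS = std::sqrt(ScaledX); // stands in for the hardware sqrt
  return NeedScale ? SqrtS * 0x1.0p-16f : SqrtS;
}

int main() {
  for (float X : {0x1.0p-120f, 0x1.0p-100f, 0.25f, 2.0f, 1.0e20f})
    std::printf("x=%a scaled=%a plain=%a\n", X, sqrtWithScaling(X),
                std::sqrt(X));
}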
+ SmallVector<ByteProvider<SDValue>, 8> PermNodes; + for (int i = 0; i < 4; i++) { + // Find the ByteProvider that provides the ith byte of the result of OR + std::optional<ByteProvider<SDValue>> P = + calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); + // TODO support constantZero + if (!P || P->isConstantZero()) + return SDValue(); + + PermNodes.push_back(*P); + } + if (PermNodes.size() != 4) + return SDValue(); + + int FirstSrc = 0; + std::optional<int> SecondSrc; + uint64_t PermMask = 0x00000000; + for (size_t i = 0; i < PermNodes.size(); i++) { + auto PermOp = PermNodes[i]; + // Since the mask is applied to Src1:Src2, Src1 bytes must be offset + // by sizeof(Src2) = 4 + int SrcByteAdjust = 4; + + if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) { + if (SecondSrc.has_value()) + if (!PermOp.hasSameSrc(PermNodes[*SecondSrc])) + return SDValue(); + + // Set the index of the second distinct Src node + SecondSrc = i; + assert(!(PermNodes[*SecondSrc].Src->getValueSizeInBits() % 8)); + SrcByteAdjust = 0; + } + assert(PermOp.SrcOffset + SrcByteAdjust < 8); + assert(!DAG.getDataLayout().isBigEndian()); + PermMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8); + } + + SDValue Op = *PermNodes[FirstSrc].Src; + SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src + : *PermNodes[FirstSrc].Src; + + // Check that we haven't just recreated the same FSHR node. + if (N->getOpcode() == ISD::FSHR && + (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) && + (N->getOperand(1) == Op || N->getOperand(1) == OtherOp)) + return SDValue(); + + // Check that we are not just extracting the bytes in order from an op + if (Op == OtherOp && Op.getValueSizeInBits() == 32) { + int Low16 = PermMask & 0xffff; + int Hi16 = (PermMask & 0xffff0000) >> 16; + + bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100); + bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302); + + // The perm op would really just produce Op. So combine into Op + if (WellFormedLow && WellFormedHi) + return DAG.getBitcast(MVT::getIntegerVT(32), Op); + } + + if (hasNon16BitAccesses(PermMask, Op, OtherOp)) { + SDLoc DL(N); + assert(Op.getValueType().isByteSized() && + OtherOp.getValueType().isByteSized()); + + // If the ultimate src is less than 32 bits, then we will only be + // using bytes 0: Op.getValueSizeInBytes() - 1 in the or. + // CalculateByteProvider would not have returned Op as source if we + // used a byte that is outside its ValueType. Thus, we are free to + // ANY_EXTEND as the extended bits are dont-cares. + Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32); + OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32); + + return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, + DAG.getConstant(PermMask, DL, MVT::i32)); + } + + return SDValue(); } SDValue SITargetLowering::performOrCombine(SDNode *N, @@ -10812,69 +11517,8 @@ SDValue SITargetLowering::performOrCombine(SDNode *N, } } if (LHSMask == ~0u || RHSMask == ~0u) { - SmallVector<ByteProvider<SDValue>, 8> PermNodes; - - // VT is known to be MVT::i32, so we need to provide 4 bytes. 
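matchPERM and the AMDGPUISD::PERM case of calculateByteProvider agree on one byte-select encoding: each byte of the 32-bit mask chooses one result byte, selector values 0-3 read from the second operand, 4-7 read from the first operand (hence SrcByteAdjust = 4), and 0x0c produces a zero byte. The sketch below models only that subset (other V_PERM_B32 selector values are not handled) and checks the two facts the combine relies on: an in-order mask over a single source is a no-op, and a byte-aligned fshr is expressible as a perm.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Byte-select model matching how this file uses AMDGPUISD::PERM:
//   selector 0..3  -> byte of Src1 (the "second" operand)
//   selector 4..7  -> byte of Src0 (the "first" operand)
//   selector 0x0c  -> constant zero
// Other selector values exist in the ISA but are not modelled here.
static uint32_t permBytes(uint32_t Src0, uint32_t Src1, uint32_t Mask) {
  uint32_t Result = 0;
  for (int I = 0; I < 4; ++I) {
    uint32_t Sel = (Mask >> (8 * I)) & 0xff;
    uint32_t Byte;
    if (Sel == 0x0c)
      Byte = 0;
    else if (Sel < 4)
      Byte = (Src1 >> (8 * Sel)) & 0xff;
    else
      Byte = (Src0 >> (8 * (Sel - 4))) & 0xff;
    Result |= Byte << (8 * I);
  }
  return Result;
}

int main() {
  uint32_t A = 0xAABBCCDD, B = 0x11223344;
  // Mask 0x07060504 reads bytes 3..0 of Src0 in order, i.e. it reproduces A;
  // this is the "well formed" case matchPERM folds away instead of emitting
  // a v_perm.
  assert(permBytes(A, A, 0x07060504) == A);
  // fshr(A, B, 8) == (A << 24) | (B >> 8); with the encoding above that is
  // mask 0x04030201 (result bytes: B1, B2, B3, A0).
  assert(permBytes(A, B, 0x04030201) == ((A << 24) | (B >> 8)));
  std::printf("ok\n");
}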
- assert(VT == MVT::i32); - for (int i = 0; i < 4; i++) { - // Find the ByteProvider that provides the ith byte of the result of OR - std::optional<ByteProvider<SDValue>> P = - calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i); - // TODO support constantZero - if (!P || P->isConstantZero()) - return SDValue(); - - PermNodes.push_back(*P); - } - if (PermNodes.size() != 4) - return SDValue(); - - int FirstSrc = 0; - std::optional<int> SecondSrc; - uint64_t permMask = 0x00000000; - for (size_t i = 0; i < PermNodes.size(); i++) { - auto PermOp = PermNodes[i]; - // Since the mask is applied to Src1:Src2, Src1 bytes must be offset - // by sizeof(Src2) = 4 - int SrcByteAdjust = 4; - - if (!PermOp.hasSameSrc(PermNodes[FirstSrc])) { - if (SecondSrc.has_value()) - if (!PermOp.hasSameSrc(PermNodes[*SecondSrc])) - return SDValue(); - // Set the index of the second distinct Src node - SecondSrc = i; - assert(PermNodes[*SecondSrc].Src->getValueType().getSizeInBits() == - 32); - SrcByteAdjust = 0; - } - assert(PermOp.SrcOffset + SrcByteAdjust < 8); - assert(!DAG.getDataLayout().isBigEndian()); - permMask |= (PermOp.SrcOffset + SrcByteAdjust) << (i * 8); - } - - SDValue Op = *PermNodes[FirstSrc].Src; - SDValue OtherOp = SecondSrc.has_value() ? *PermNodes[*SecondSrc].Src - : *PermNodes[FirstSrc].Src; - - // Check that we are not just extracting the bytes in order from an op - if (Op == OtherOp) { - int Low16 = permMask & 0xffff; - int Hi16 = (permMask & 0xffff0000) >> 16; - - bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100); - bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302); - - // The perm op would really just produce Op. So combine into Op - if (WellFormedLow && WellFormedHi) - return Op; - } - - if (hasEightBitAccesses(permMask, Op, OtherOp)) { - SDLoc DL(N); - return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp, - DAG.getConstant(permMask, DL, MVT::i32)); - } + if (SDValue Perm = matchPERM(N, DCI)) + return Perm; } } @@ -11021,10 +11665,8 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, SDValue Mask = N->getOperand(1); // fp_class x, 0 -> false - if (const ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Mask)) { - if (CMask->isZero()) - return DAG.getConstant(0, SDLoc(N), MVT::i1); - } + if (isNullConstant(Mask)) + return DAG.getConstant(0, SDLoc(N), MVT::i1); if (N->getOperand(0).isUndef()) return DAG.getUNDEF(MVT::i1); @@ -11049,7 +11691,9 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N, N->getFlags()); } - if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) { + // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here. 
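The performClassCombine change above only swaps in isNullConstant for the hand-rolled constant check; the underlying fold is that fp_class with an empty class mask can never be true. A small standalone illustration of that invariant (the class bits below are invented for the sketch and are not the real FPClassTest encoding):

#include <cassert>
#include <cmath>
#include <limits>

// Invented class bits, for the sketch only.
enum : unsigned { ClsZero = 1, ClsNormal = 2, ClsInf = 4, ClsNan = 8 };

static unsigned classify(double X) {
  unsigned C = 0;
  if (X == 0.0)
    C |= ClsZero;
  if (std::isnormal(X))
    C |= ClsNormal;
  if (std::isinf(X))
    C |= ClsInf;
  if (std::isnan(X))
    C |= ClsNan;
  return C;
}

// fp_class(x, mask) asks whether x falls in any class named by mask; with an
// empty mask the answer is false for every input, which is the fold above.
static bool fpClass(double X, unsigned Mask) {
  return (classify(X) & Mask) != 0;
}

int main() {
  const double Inf = std::numeric_limits<double>::infinity();
  for (double X : {0.0, -1.5, Inf, std::nan("")})
    assert(!fpClass(X, 0u));
}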
+ if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) && + N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) { return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0), N->getFlags()); } @@ -11302,7 +11946,8 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, return false; return true; case AMDGPU::G_INTRINSIC: - switch (MI->getIntrinsicID()) { + case AMDGPU::G_INTRINSIC_CONVERGENT: + switch (cast<GIntrinsic>(MI)->getIntrinsicID()) { case Intrinsic::amdgcn_fmul_legacy: case Intrinsic::amdgcn_fmad_ftz: case Intrinsic::amdgcn_sqrt: @@ -11321,7 +11966,6 @@ bool SITargetLowering::isCanonicalized(Register Reg, MachineFunction &MF, case Intrinsic::amdgcn_div_fmas: case Intrinsic::amdgcn_div_fixup: case Intrinsic::amdgcn_fract: - case Intrinsic::amdgcn_ldexp: case Intrinsic::amdgcn_cvt_pkrtz: case Intrinsic::amdgcn_cubeid: case Intrinsic::amdgcn_cubema: @@ -12203,6 +12847,256 @@ SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N, return Accum; } +// Collect the ultimate src of each of the mul node's operands, and confirm +// each operand is 8 bytes. +static std::optional<ByteProvider<SDValue>> +handleMulOperand(const SDValue &MulOperand) { + auto Byte0 = calculateByteProvider(MulOperand, 0, 0); + if (!Byte0 || Byte0->isConstantZero()) { + return std::nullopt; + } + auto Byte1 = calculateByteProvider(MulOperand, 1, 0); + if (Byte1 && !Byte1->isConstantZero()) { + return std::nullopt; + } + return Byte0; +} + +static unsigned addPermMasks(unsigned First, unsigned Second) { + unsigned FirstCs = First & 0x0c0c0c0c; + unsigned SecondCs = Second & 0x0c0c0c0c; + unsigned FirstNoCs = First & ~0x0c0c0c0c; + unsigned SecondNoCs = Second & ~0x0c0c0c0c; + + assert((FirstCs & 0xFF) | (SecondCs & 0xFF)); + assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00)); + assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000)); + assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000)); + + return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs); +} + +static void placeSources(ByteProvider<SDValue> &Src0, + ByteProvider<SDValue> &Src1, + SmallVectorImpl<std::pair<SDValue, unsigned>> &Src0s, + SmallVectorImpl<std::pair<SDValue, unsigned>> &Src1s, + int Step) { + + assert(Src0.Src.has_value() && Src1.Src.has_value()); + // Src0s and Src1s are empty, just place arbitrarily. + if (Step == 0) { + Src0s.push_back({*Src0.Src, (Src0.SrcOffset << 24) + 0x0c0c0c}); + Src1s.push_back({*Src1.Src, (Src1.SrcOffset << 24) + 0x0c0c0c}); + return; + } + + for (int BPI = 0; BPI < 2; BPI++) { + std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1}; + if (BPI == 1) { + BPP = {Src1, Src0}; + } + unsigned ZeroMask = 0x0c0c0c0c; + unsigned FMask = 0xFF << (8 * (3 - Step)); + + unsigned FirstMask = + BPP.first.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask); + unsigned SecondMask = + BPP.second.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask); + // Attempt to find Src vector which contains our SDValue, if so, add our + // perm mask to the existing one. If we are unable to find a match for the + // first SDValue, attempt to find match for the second. + int FirstGroup = -1; + for (int I = 0; I < 2; I++) { + SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs = + I == 0 ? 
Src0s : Src1s; + auto MatchesFirst = [&BPP](std::pair<SDValue, unsigned> IterElt) { + return IterElt.first == *BPP.first.Src; + }; + + auto Match = llvm::find_if(Srcs, MatchesFirst); + if (Match != Srcs.end()) { + Match->second = addPermMasks(FirstMask, Match->second); + FirstGroup = I; + break; + } + } + if (FirstGroup != -1) { + SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs = + FirstGroup == 1 ? Src0s : Src1s; + auto MatchesSecond = [&BPP](std::pair<SDValue, unsigned> IterElt) { + return IterElt.first == *BPP.second.Src; + }; + auto Match = llvm::find_if(Srcs, MatchesSecond); + if (Match != Srcs.end()) { + Match->second = addPermMasks(SecondMask, Match->second); + } else + Srcs.push_back({*BPP.second.Src, SecondMask}); + return; + } + } + + // If we have made it here, then we could not find a match in Src0s or Src1s + // for either Src0 or Src1, so just place them arbitrarily. + + unsigned ZeroMask = 0x0c0c0c0c; + unsigned FMask = 0xFF << (8 * (3 - Step)); + + Src0s.push_back( + {*Src0.Src, (Src0.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))}); + Src1s.push_back( + {*Src1.Src, (Src1.SrcOffset << (8 * (3 - Step)) | (ZeroMask & ~FMask))}); + + return; +} + +static SDValue +resolveSources(SelectionDAG &DAG, SDLoc SL, + SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs, + bool IsSigned, bool IsAny) { + + // If we just have one source, just permute it accordingly. + if (Srcs.size() == 1) { + auto Elt = Srcs.begin(); + auto EltVal = DAG.getBitcastedAnyExtOrTrunc(Elt->first, SL, MVT::i32); + + // v_perm will produce the original value. + if (Elt->second == 0x3020100) + return EltVal; + + return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal, + DAG.getConstant(Elt->second, SL, MVT::i32)); + } + + auto FirstElt = Srcs.begin(); + auto SecondElt = std::next(FirstElt); + + SmallVector<SDValue, 2> Perms; + + // If we have multiple sources in the chain, combine them via perms (using + // calculated perm mask) and Ors. + while (true) { + auto FirstMask = FirstElt->second; + auto SecondMask = SecondElt->second; + + unsigned FirstCs = FirstMask & 0x0c0c0c0c; + unsigned FirstPlusFour = FirstMask | 0x04040404; + // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any + // original 0x0C. + FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs; + + auto PermMask = addPermMasks(FirstMask, SecondMask); + auto FirstVal = + DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32); + auto SecondVal = + DAG.getBitcastedAnyExtOrTrunc(SecondElt->first, SL, MVT::i32); + + Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal, + SecondVal, + DAG.getConstant(PermMask, SL, MVT::i32))); + + FirstElt = std::next(SecondElt); + if (FirstElt == Srcs.end()) + break; + + SecondElt = std::next(FirstElt); + // If we only have a FirstElt, then just combine that into the cumulative + // source node. + if (SecondElt == Srcs.end()) { + auto EltVal = + DAG.getBitcastedAnyExtOrTrunc(FirstElt->first, SL, MVT::i32); + + Perms.push_back( + DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltVal, EltVal, + DAG.getConstant(FirstElt->second, SL, MVT::i32))); + break; + } + } + + assert(Perms.size() == 1 || Perms.size() == 2); + return Perms.size() == 2 + ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1]) + : Perms[0]; +} + +static void fixMasks(SmallVectorImpl<std::pair<SDValue, unsigned>> &Srcs, + unsigned ChainLength) { + for (auto &[EntryVal, EntryMask] : Srcs) { + EntryMask = EntryMask >> ((4 - ChainLength) * 8); + auto ZeroMask = ChainLength == 2 ? 
0x0c0c0000 : 0x0c000000;
+    EntryMask += ZeroMask;
+  }
+}
+
+static bool isMul(const SDValue Op) {
+  auto Opcode = Op.getOpcode();
+
+  return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
+          Opcode == AMDGPUISD::MUL_I24);
+}
+
+static std::optional<bool>
+checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
+                       ByteProvider<SDValue> &Src1, const SDValue &S0Op,
+                       const SDValue &S1Op, const SelectionDAG &DAG) {
+  // If both ops are i8s (pre legalize-dag), then the signedness semantics
+  // of the dot4 are irrelevant.
+  if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
+    return false;
+
+  auto Known0 = DAG.computeKnownBits(S0Op, 0);
+  bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
+  bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
+  auto Known1 = DAG.computeKnownBits(S1Op, 0);
+  bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
+  bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
+
+  assert(!(S0IsUnsigned && S0IsSigned));
+  assert(!(S1IsUnsigned && S1IsSigned));
+
+  // There are 9 possible permutations of
+  // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
+
+  // In two permutations, the sign bits are known to be the same for both Ops,
+  // so simply return Signed / Unsigned corresponding to the MSB
+
+  if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
+    return S0IsSigned;
+
+  // In another two permutations, the sign bits are known to be opposite. In
+  // this case return std::nullopt to indicate a bad match.
+
+  if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
+    return std::nullopt;
+
+  // In the remaining five permutations, we don't know the value of the sign
+  // bit for at least one Op. Since we have a valid ByteProvider, we know that
+  // the upper bits must be extension bits. Thus, the only ways for the sign
+  // bit to be unknown are if it was sign extended from an unknown value or if
+  // it was any extended. In either case, it is correct to use the signed
+  // version of the signedness semantics of dot4.
+
+  // In two such permutations, we know the sign bit is set for
+  // one op, and the other is unknown. It is okay to use the signed version of
+  // dot4.
+  if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
+      ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
+    return true;
+
+  // In one such permutation, we don't know either of the sign bits. It is okay
+  // to use the signed version of dot4.
+  if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
+    return true;
+
+  // In two such permutations, we know the sign bit is unset for
+  // one op, and the other is unknown. Return std::nullopt to indicate a
+  // bad match.
+ if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) || + ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned)))) + return std::nullopt; + + llvm_unreachable("Fully covered condition"); +} + SDValue SITargetLowering::performAddCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -12216,14 +13110,146 @@ SDValue SITargetLowering::performAddCombine(SDNode *N, if (SDValue Folded = tryFoldToMad64_32(N, DCI)) return Folded; } - - return SDValue(); } if (SDValue V = reassociateScalarOps(N, DAG)) { return V; } + if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() && + (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) { + SDValue TempNode(N, 0); + std::optional<bool> IsSigned; + SmallVector<std::pair<SDValue, unsigned>, 4> Src0s; + SmallVector<std::pair<SDValue, unsigned>, 4> Src1s; + SmallVector<SDValue, 4> Src2s; + + // Match the v_dot4 tree, while collecting src nodes. + int ChainLength = 0; + for (int I = 0; I < 4; I++) { + auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1; + if (MulIdx == -1) + break; + auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0)); + if (!Src0) + break; + auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1)); + if (!Src1) + break; + + auto IterIsSigned = checkDot4MulSignedness( + TempNode->getOperand(MulIdx), *Src0, *Src1, + TempNode->getOperand(MulIdx)->getOperand(0), + TempNode->getOperand(MulIdx)->getOperand(1), DAG); + if (!IterIsSigned) + break; + if (!IsSigned) + IsSigned = *IterIsSigned; + if (*IterIsSigned != *IsSigned) + break; + placeSources(*Src0, *Src1, Src0s, Src1s, I); + auto AddIdx = 1 - MulIdx; + // Allow the special case where add (add (mul24, 0), mul24) became -> + // add (mul24, mul24). + if (I == 2 && isMul(TempNode->getOperand(AddIdx))) { + Src2s.push_back(TempNode->getOperand(AddIdx)); + auto Src0 = + handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0)); + if (!Src0) + break; + auto Src1 = + handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1)); + if (!Src1) + break; + auto IterIsSigned = checkDot4MulSignedness( + TempNode->getOperand(AddIdx), *Src0, *Src1, + TempNode->getOperand(AddIdx)->getOperand(0), + TempNode->getOperand(AddIdx)->getOperand(1), DAG); + if (!IterIsSigned) + break; + assert(IsSigned); + if (*IterIsSigned != *IsSigned) + break; + placeSources(*Src0, *Src1, Src0s, Src1s, I + 1); + Src2s.push_back(DAG.getConstant(0, SL, MVT::i32)); + ChainLength = I + 2; + break; + } + + TempNode = TempNode->getOperand(AddIdx); + Src2s.push_back(TempNode); + ChainLength = I + 1; + if (TempNode->getNumOperands() < 2) + break; + LHS = TempNode->getOperand(0); + RHS = TempNode->getOperand(1); + } + + if (ChainLength < 2) + return SDValue(); + + // Masks were constructed with assumption that we would find a chain of + // length 4. If not, then we need to 0 out the MSB bits (via perm mask of + // 0x0c) so they do not affect dot calculation. + if (ChainLength < 4) { + fixMasks(Src0s, ChainLength); + fixMasks(Src1s, ChainLength); + } + + SDValue Src0, Src1; + + // If we are just using a single source for both, and have permuted the + // bytes consistently, we can just use the sources without permuting + // (commutation). 
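checkDot4MulSignedness above reduces to a three-valued table over {signed, unsigned, unknown} per operand. A standalone restatement of that table (sketch only; the enum and helper are invented names, with KnownBits replaced by an explicit state):

#include <cassert>
#include <optional>

enum class Sign { Signed, Unsigned, Unknown };

// Restates the decision table of checkDot4MulSignedness:
//  - both known and equal       -> that signedness
//  - both known and different   -> no valid dot4 (nullopt)
//  - one signed, other unknown  -> signed dot4 is safe (extension bits)
//  - both unknown               -> signed dot4 is safe
//  - one unsigned, one unknown  -> no valid dot4 (nullopt)
static std::optional<bool> pickDot4Signedness(Sign S0, Sign S1) {
  if (S0 != Sign::Unknown && S1 != Sign::Unknown) {
    if (S0 == S1)
      return S0 == Sign::Signed;
    return std::nullopt;
  }
  if (S0 == Sign::Signed || S1 == Sign::Signed)
    return true;
  if (S0 == Sign::Unsigned || S1 == Sign::Unsigned)
    return std::nullopt;
  return true; // both unknown
}

int main() {
  assert(*pickDot4Signedness(Sign::Unsigned, Sign::Unsigned) == false);
  assert(*pickDot4Signedness(Sign::Signed, Sign::Unknown) == true);
  assert(!pickDot4Signedness(Sign::Unsigned, Sign::Unknown).has_value());
}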
+ bool UseOriginalSrc = false; + if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 && + Src0s.begin()->second == Src1s.begin()->second && + Src0s.begin()->first.getValueSizeInBits() == 32 && + Src1s.begin()->first.getValueSizeInBits() == 32) { + SmallVector<unsigned, 4> SrcBytes; + auto Src0Mask = Src0s.begin()->second; + SrcBytes.push_back(Src0Mask & 0xFF000000); + bool UniqueEntries = true; + for (auto I = 1; I < 4; I++) { + auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8)); + + if (is_contained(SrcBytes, NextByte)) { + UniqueEntries = false; + break; + } + SrcBytes.push_back(NextByte); + } + + if (UniqueEntries) { + UseOriginalSrc = true; + // Must be 32 bits to enter above conditional. + assert(Src0s.begin()->first.getValueSizeInBits() == 32); + assert(Src1s.begin()->first.getValueSizeInBits() == 32); + Src0 = DAG.getBitcast(MVT::getIntegerVT(32), Src0s.begin()->first); + Src1 = DAG.getBitcast(MVT::getIntegerVT(32), Src1s.begin()->first); + } + } + + if (!UseOriginalSrc) { + Src0 = resolveSources(DAG, SL, Src0s, false, true); + Src1 = resolveSources(DAG, SL, Src1s, false, true); + } + + assert(IsSigned); + SDValue Src2 = + DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32); + + SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4 + : Intrinsic::amdgcn_udot4, + SL, MVT::i64); + + assert(!VT.isVector()); + auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0, + Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1)); + + return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT); + } + if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG()) return SDValue(); @@ -12295,8 +13321,7 @@ SDValue SITargetLowering::performSubCombine(SDNode *N, if (LHS.getOpcode() == ISD::USUBO_CARRY) { // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc - auto C = dyn_cast<ConstantSDNode>(LHS.getOperand(1)); - if (!C || !C->isZero()) + if (!isNullConstant(LHS.getOperand(1))) return SDValue(); SDValue Args[] = { LHS.getOperand(0), RHS, LHS.getOperand(2) }; return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args); @@ -12417,6 +13442,41 @@ SDValue SITargetLowering::performFSubCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performFDivCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDLoc SL(N); + EVT VT = N->getValueType(0); + if (VT != MVT::f16 || !Subtarget->has16BitInsts()) + return SDValue(); + + SDValue LHS = N->getOperand(0); + SDValue RHS = N->getOperand(1); + + SDNodeFlags Flags = N->getFlags(); + SDNodeFlags RHSFlags = RHS->getFlags(); + if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() || + !RHS->hasOneUse()) + return SDValue(); + + if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) { + bool IsNegative = false; + if (CLHS->isExactlyValue(1.0) || + (IsNegative = CLHS->isExactlyValue(-1.0))) { + // fdiv contract 1.0, (sqrt contract x) -> rsq for f16 + // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16 + if (RHS.getOpcode() == ISD::FSQRT) { + // TODO: Or in RHS flags, somehow missing from SDNodeFlags + SDValue Rsq = + DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags); + return IsNegative ? 
DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq; + } + } + } + + return SDValue(); +} + SDValue SITargetLowering::performFMACombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -12666,7 +13726,7 @@ SDValue SITargetLowering::performClampCombine(SDNode *N, SDValue SITargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { - if (getTargetMachine().getOptLevel() == CodeGenOpt::None) + if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None) return SDValue(); switch (N->getOpcode()) { case ISD::ADD: @@ -12680,6 +13740,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performFAddCombine(N, DCI); case ISD::FSUB: return performFSubCombine(N, DCI); + case ISD::FDIV: + return performFDivCombine(N, DCI); case ISD::SETCC: return performSetCCCombine(N, DCI); case ISD::FMAXNUM: @@ -12699,6 +13761,14 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, return performAndCombine(N, DCI); case ISD::OR: return performOrCombine(N, DCI); + case ISD::FSHR: { + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + if (N->getValueType(0) == MVT::i32 && N->isDivergent() && + TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) { + return matchPERM(N, DCI); + } + break; + } case ISD::XOR: return performXorCombine(N, DCI); case ISD::ZERO_EXTEND: @@ -12943,7 +14013,11 @@ SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node, continue; } else { SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32); - DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); + SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op); + if (NewUser != User) { + DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0)); + DAG.RemoveDeadNode(User); + } } switch (Idx) { @@ -13183,7 +14257,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const { const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); + MachineFunction *MF = MI.getParent()->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>(); if (TII->isVOP3(MI.getOpcode())) { // Make sure constant bus requirements are respected. @@ -13194,11 +14270,16 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, // use between vgpr and agpr as agpr tuples tend to be big. if (!MI.getDesc().operands().empty()) { unsigned Opc = MI.getOpcode(); + bool HasAGPRs = Info->mayNeedAGPRs(); const SIRegisterInfo *TRI = Subtarget->getRegisterInfo(); - for (auto I : { AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), - AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1) }) { + int16_t Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2); + for (auto I : + {AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0), + AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1), Src2Idx}) { if (I == -1) break; + if ((I == Src2Idx) && (HasAGPRs)) + break; MachineOperand &Op = MI.getOperand(I); if (!Op.isReg() || !Op.getReg().isVirtual()) continue; @@ -13216,6 +14297,9 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI, MRI.setRegClass(Op.getReg(), NewRC); } + if (!HasAGPRs) + return; + // Resolve the rest of AV operands to AGPRs. 
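The add-chain combine above ultimately emits llvm.amdgcn.sdot4 or llvm.amdgcn.udot4, which compute a four-way byte dot product plus an accumulator; the combine passes the final clamp operand as 0. A scalar reference model of that semantics (clamp ignored, names invented):

#include <cstdint>
#include <cstdio>

// Scalar model of the 4x8-bit dot product the combine targets:
//   result = a0*b0 + a1*b1 + a2*b2 + a3*b3 + acc
// udot4 treats the bytes as unsigned, sdot4 as signed.
static uint32_t udot4(uint32_t A, uint32_t B, uint32_t Acc) {
  uint32_t Sum = Acc;
  for (int I = 0; I < 4; ++I)
    Sum += ((A >> (8 * I)) & 0xff) * ((B >> (8 * I)) & 0xff);
  return Sum;
}

static int32_t sdot4(uint32_t A, uint32_t B, int32_t Acc) {
  int32_t Sum = Acc;
  for (int I = 0; I < 4; ++I)
    Sum += int8_t((A >> (8 * I)) & 0xff) * int8_t((B >> (8 * I)) & 0xff);
  return Sum;
}

int main() {
  // {1, 2, 3, 4} . {5, 6, 7, 8} + 10 = 5 + 12 + 21 + 32 + 10 = 80
  std::printf("%u\n", udot4(0x04030201, 0x08070605, 10));
  std::printf("%d\n", sdot4(0x04030201, 0x08070605, 10));
}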
if (auto *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2)) { if (Src2->isReg() && Src2->getReg().isVirtual()) { @@ -13467,7 +14551,7 @@ static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) { } void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op, - std::string &Constraint, + StringRef Constraint, std::vector<SDValue> &Ops, SelectionDAG &DAG) const { if (isImmConstraint(Constraint)) { @@ -13516,8 +14600,7 @@ bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const { return false; } -bool SITargetLowering::checkAsmConstraintVal(SDValue Op, - const std::string &Constraint, +bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const { if (Constraint.size() == 1) { switch (Constraint[0]) { @@ -13735,8 +14818,9 @@ void SITargetLowering::computeKnownBitsForTargetInstr( const MachineRegisterInfo &MRI, unsigned Depth) const { const MachineInstr *MI = MRI.getVRegDef(R); switch (MI->getOpcode()) { - case AMDGPU::G_INTRINSIC: { - switch (MI->getIntrinsicID()) { + case AMDGPU::G_INTRINSIC: + case AMDGPU::G_INTRINSIC_CONVERGENT: { + switch (cast<GIntrinsic>(MI)->getIntrinsicID()) { case Intrinsic::amdgcn_workitem_id_x: knownBitsForWorkitemID(*getSubtarget(), KB, Known, 0); break; @@ -13801,21 +14885,16 @@ Align SITargetLowering::computeKnownAlignForTargetInstr( GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI, unsigned Depth) const { const MachineInstr *MI = MRI.getVRegDef(R); - switch (MI->getOpcode()) { - case AMDGPU::G_INTRINSIC: - case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: { + if (auto *GI = dyn_cast<GIntrinsic>(MI)) { // FIXME: Can this move to generic code? What about the case where the call // site specifies a lower alignment? - Intrinsic::ID IID = MI->getIntrinsicID(); + Intrinsic::ID IID = GI->getIntrinsicID(); LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext(); AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID); if (MaybeAlign RetAlign = Attrs.getRetAlignment()) return *RetAlign; - return Align(1); - } - default: - return Align(1); } + return Align(1); } Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { |
