Diffstat (limited to 'lib/Target/AMDGPU/SIISelLowering.cpp')
-rw-r--r-- | lib/Target/AMDGPU/SIISelLowering.cpp | 1052 |
1 file changed, 675 insertions(+), 377 deletions(-)
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index db0782e2bf3e..56ebf9c06741 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -20,11 +20,11 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "SIDefines.h"
 #include "SIInstrInfo.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
@@ -35,6 +35,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/DAGCombine.h"
@@ -44,6 +45,7 @@
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineOperand.h"
@@ -115,7 +117,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
   addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
 
-  addRegisterClass(MVT::i32, &AMDGPU::SReg_32_XM0RegClass);
+  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
   addRegisterClass(MVT::f32, &AMDGPU::VGPR_32RegClass);
 
   addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
@@ -125,10 +127,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
   addRegisterClass(MVT::v3f32, &AMDGPU::VReg_96RegClass);
 
-  addRegisterClass(MVT::v2i64, &AMDGPU::SReg_128RegClass);
-  addRegisterClass(MVT::v2f64, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
+  addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
 
-  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
+  addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
 
   addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
@@ -141,12 +143,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
 
   if (Subtarget->has16BitInsts()) {
-    addRegisterClass(MVT::i16, &AMDGPU::SReg_32_XM0RegClass);
-    addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
+    addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
+    addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
 
     // Unless there are also VOP3P operations, not operations are really legal.
- addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass); - addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass); + addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass); + addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass); addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass); addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass); } @@ -178,6 +180,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::STORE, MVT::v32i32, Custom); setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand); + setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand); setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand); setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand); @@ -215,31 +218,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom); + setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v3i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); - - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); - - setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); - setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); - setOperationAction(ISD::BRCOND, MVT::Other, Custom); setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::BR_CC, MVT::i32, Expand); @@ -653,6 +635,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::FADD, MVT::v4f16, Custom); setOperationAction(ISD::FMUL, MVT::v4f16, Custom); + setOperationAction(ISD::FMA, MVT::v4f16, Custom); setOperationAction(ISD::FMAXNUM, MVT::v2f16, Custom); setOperationAction(ISD::FMINNUM, MVT::v2f16, Custom); @@ -687,6 +670,33 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setOperationAction(ISD::SELECT, VT, Custom); } + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom); + + 
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v4i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::v8f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i8, Custom); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::v4i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::f16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i16, Custom); + setOperationAction(ISD::INTRINSIC_VOID, MVT::i8, Custom); + setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::ADDCARRY); setTargetDAGCombine(ISD::SUB); @@ -768,19 +778,22 @@ bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const { MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - // TODO: Consider splitting all arguments into 32-bit pieces. - if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + if (CC == CallingConv::AMDGPU_KERNEL) + return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); + + if (VT.isVector()) { EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); if (Size == 32) return ScalarVT.getSimpleVT(); - if (Size == 64) + if (Size > 32) return MVT::i32; if (Size == 16 && Subtarget->has16BitInsts()) return VT.isInteger() ? 
MVT::v2i16 : MVT::v2f16; - } + } else if (VT.getSizeInBits() > 32) + return MVT::i32; return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); } @@ -788,7 +801,10 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const { - if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) { + if (CC == CallingConv::AMDGPU_KERNEL) + return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); + + if (VT.isVector()) { unsigned NumElts = VT.getVectorNumElements(); EVT ScalarVT = VT.getScalarType(); unsigned Size = ScalarVT.getSizeInBits(); @@ -796,12 +812,13 @@ unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, if (Size == 32) return NumElts; - if (Size == 64) - return 2 * NumElts; + if (Size > 32) + return NumElts * ((Size + 31) / 32); if (Size == 16 && Subtarget->has16BitInsts()) - return (VT.getVectorNumElements() + 1) / 2; - } + return (NumElts + 1) / 2; + } else if (VT.getSizeInBits() > 32) + return (VT.getSizeInBits() + 31) / 32; return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); } @@ -821,10 +838,10 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv( return NumIntermediates; } - if (Size == 64) { + if (Size > 32) { RegisterVT = MVT::i32; IntermediateVT = RegisterVT; - NumIntermediates = 2 * NumElts; + NumIntermediates = NumElts * ((Size + 31) / 32); return NumIntermediates; } @@ -901,7 +918,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = MFI->getImagePSV( *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), CI.getArgOperand(RsrcIntr->RsrcArg)); - Info.align = 0; + Info.align.reset(); } else { Info.ptrVal = MFI->getBufferPSV( *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), @@ -947,7 +964,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4)); @@ -964,7 +981,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = MFI->getBufferPSV( *MF.getSubtarget<GCNSubtarget>().getInstrInfo(), CI.getArgOperand(1)); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); @@ -978,7 +995,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVT(CI.getOperand(0)->getType() ->getPointerElementType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; return true; @@ -988,7 +1005,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.opc = ISD::INTRINSIC_W_CHAIN; Info.memVT = MVT::getVT(CI.getType()); Info.ptrVal = CI.getOperand(0); - Info.align = 0; + Info.align.reset(); Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore; const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1)); @@ -1012,7 +1029,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, // This is an abstract access, but we need to specify a type and size. 
Info.memVT = MVT::i32; Info.size = 4; - Info.align = 4; + Info.align = Align(4); Info.flags = MachineMemOperand::MOStore; if (IntrID == Intrinsic::amdgcn_ds_gws_barrier) @@ -1215,21 +1232,12 @@ bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, return true; } -bool SITargetLowering::allowsMisalignedMemoryAccesses( - EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, - bool *IsFast) const { +bool SITargetLowering::allowsMisalignedMemoryAccessesImpl( + unsigned Size, unsigned AddrSpace, unsigned Align, + MachineMemOperand::Flags Flags, bool *IsFast) const { if (IsFast) *IsFast = false; - // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, - // which isn't a simple VT. - // Until MVT is extended to handle this, simply check for the size and - // rely on the condition below: allow accesses if the size is a multiple of 4. - if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && - VT.getStoreSize() > 16)) { - return false; - } - if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS || AddrSpace == AMDGPUAS::REGION_ADDRESS) { // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte @@ -1268,7 +1276,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( } // Smaller than dword value must be aligned. - if (VT.bitsLT(MVT::i32)) + if (Size < 32) return false; // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the @@ -1277,7 +1285,26 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses( if (IsFast) *IsFast = true; - return VT.bitsGT(MVT::i32) && Align % 4 == 0; + return Size >= 32 && Align >= 4; +} + +bool SITargetLowering::allowsMisalignedMemoryAccesses( + EVT VT, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *IsFast) const { + if (IsFast) + *IsFast = false; + + // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96, + // which isn't a simple VT. + // Until MVT is extended to handle this, simply check for the size and + // rely on the condition below: allow accesses if the size is a multiple of 4. + if (VT == MVT::Other || (VT != MVT::Other && VT.getSizeInBits() > 1024 && + VT.getStoreSize() > 16)) { + return false; + } + + return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace, + Align, Flags, IsFast); } EVT SITargetLowering::getOptimalMemOpType( @@ -1336,9 +1363,9 @@ bool SITargetLowering::isMemOpUniform(const SDNode *N) const { TargetLoweringBase::LegalizeTypeAction SITargetLowering::getPreferredVectorAction(MVT VT) const { - if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16)) - return TypeSplitVector; - + int NumElts = VT.getVectorNumElements(); + if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16)) + return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } @@ -1562,7 +1589,8 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, // entire split argument. if (Arg->Flags.isSplit()) { while (!Arg->Flags.isSplitEnd()) { - assert(!Arg->VT.isVector() && + assert((!Arg->VT.isVector() || + Arg->VT.getScalarSizeInBits() == 16) && "unexpected vector split in ps argument type"); if (!SkipArg) Splits.push_back(*Arg); @@ -1589,29 +1617,32 @@ static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits, } // Allocate special inputs passed in VGPRs. 
-static void allocateSpecialEntryInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialEntryInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { + const LLT S32 = LLT::scalar(32); + MachineRegisterInfo &MRI = MF.getRegInfo(); + if (Info.hasWorkItemIDX()) { - unsigned Reg = AMDGPU::VGPR0; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR0; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDY()) { - unsigned Reg = AMDGPU::VGPR1; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR1; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg)); } if (Info.hasWorkItemIDZ()) { - unsigned Reg = AMDGPU::VGPR2; - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register Reg = AMDGPU::VGPR2; + MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32); CCInfo.AllocateReg(Reg); Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg)); @@ -1642,7 +1673,8 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u, assert(Reg != AMDGPU::NoRegister); MachineFunction &MF = CCInfo.getMachineFunction(); - MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass); + MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32)); return ArgDescriptor::createRegister(Reg, Mask); } @@ -1671,10 +1703,10 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) { return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16); } -static void allocateSpecialInputVGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { const unsigned Mask = 0x3ff; ArgDescriptor Arg; @@ -1692,10 +1724,11 @@ static void allocateSpecialInputVGPRs(CCState &CCInfo, Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg)); } -static void allocateSpecialInputSGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateSpecialInputSGPRs( + CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { auto &ArgInfo = Info.getArgInfo(); // TODO: Unify handling with private memory pointers. @@ -1728,10 +1761,10 @@ static void allocateSpecialInputSGPRs(CCState &CCInfo, } // Allocate special inputs passed in user SGPRs. 
-static void allocateHSAUserSGPRs(CCState &CCInfo, - MachineFunction &MF, - const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info) { +void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, + MachineFunction &MF, + const SIRegisterInfo &TRI, + SIMachineFunctionInfo &Info) const { if (Info.hasImplicitBufferPtr()) { unsigned ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI); MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass); @@ -1758,9 +1791,12 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, } if (Info.hasKernargSegmentPtr()) { - unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI); - MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); + MachineRegisterInfo &MRI = MF.getRegInfo(); + Register InputPtrReg = Info.addKernargSegmentPtr(TRI); CCInfo.AllocateReg(InputPtrReg); + + Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass); + MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); } if (Info.hasDispatchID()) { @@ -1780,32 +1816,32 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, } // Allocate special input registers that are initialized per-wave. -static void allocateSystemSGPRs(CCState &CCInfo, - MachineFunction &MF, - SIMachineFunctionInfo &Info, - CallingConv::ID CallConv, - bool IsShader) { +void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, + MachineFunction &MF, + SIMachineFunctionInfo &Info, + CallingConv::ID CallConv, + bool IsShader) const { if (Info.hasWorkGroupIDX()) { unsigned Reg = Info.addWorkGroupIDX(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDY()) { unsigned Reg = Info.addWorkGroupIDY(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupIDZ()) { unsigned Reg = Info.addWorkGroupIDZ(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } if (Info.hasWorkGroupInfo()) { unsigned Reg = Info.addWorkGroupInfo(); - MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass); + MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); CCInfo.AllocateReg(Reg); } @@ -1860,7 +1896,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // resource. For the Code Object V2 ABI, this will be the first 4 user // SGPR inputs. We can reserve those and use them directly. - unsigned PrivateSegmentBufferReg = + Register PrivateSegmentBufferReg = Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER); Info.setScratchRSrcReg(PrivateSegmentBufferReg); } else { @@ -1921,7 +1957,7 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, // // FIXME: Should not do this if inline asm is reading/writing these // registers. - unsigned PreloadedSP = Info.getPreloadedReg( + Register PreloadedSP = Info.getPreloadedReg( AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET); Info.setStackPtrOffsetReg(PreloadedSP); @@ -1971,7 +2007,7 @@ void SITargetLowering::insertCopiesSplitCSR( else llvm_unreachable("Unexpected register class in CSRsViaCopy!"); - unsigned NewVR = MRI->createVirtualRegister(RC); + Register NewVR = MRI->createVirtualRegister(RC); // Create copy from CSR to a virtual register. 
Entry->addLiveIn(*I); BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) @@ -2134,7 +2170,7 @@ SDValue SITargetLowering::LowerFormalArguments( assert(VA.isRegLoc() && "Parameter must be in a register!"); - unsigned Reg = VA.getLocReg(); + Register Reg = VA.getLocReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); EVT ValVT = VA.getValVT(); @@ -2652,6 +2688,15 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, bool IsThisReturn = false; MachineFunction &MF = DAG.getMachineFunction(); + if (Callee.isUndef() || isNullConstant(Callee)) { + if (!CLI.IsTailCall) { + for (unsigned I = 0, E = CLI.Ins.size(); I != E; ++I) + InVals.push_back(DAG.getUNDEF(CLI.Ins[I].VT)); + } + + return Chain; + } + if (IsVarArg) { return lowerUnhandledCall(CLI, InVals, "unsupported call to variadic function "); @@ -2782,7 +2827,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, int32_t Offset = LocMemOffset; SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT); - unsigned Align = 0; + MaybeAlign Alignment; if (IsTailCall) { ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; @@ -2790,8 +2835,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, Flags.getByValSize() : VA.getValVT().getStoreSize(); // FIXME: We can have better than the minimum byval required alignment. - Align = Flags.isByVal() ? Flags.getByValAlign() : - MinAlign(Subtarget->getStackAlignment(), Offset); + Alignment = + Flags.isByVal() + ? MaybeAlign(Flags.getByValAlign()) + : commonAlignment(Subtarget->getStackAlignment(), Offset); Offset = Offset + FPDiff; int FI = MFI.CreateFixedObject(OpSize, Offset, true); @@ -2810,7 +2857,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, } else { DstAddr = PtrOff; DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset); - Align = MinAlign(Subtarget->getStackAlignment(), LocMemOffset); + Alignment = + commonAlignment(Subtarget->getStackAlignment(), LocMemOffset); } if (Outs[i].Flags.isByVal()) { @@ -2825,7 +2873,8 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, MemOpChains.push_back(Cpy); } else { - SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Align); + SDValue Store = DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, + Alignment ? Alignment->value() : 0); MemOpChains.push_back(Store); } } @@ -2937,9 +2986,9 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI, IsThisReturn ? OutVals[0] : SDValue()); } -unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, - SelectionDAG &DAG) const { - unsigned Reg = StringSwitch<unsigned>(RegName) +Register SITargetLowering::getRegisterByName(const char* RegName, EVT VT, + const MachineFunction &MF) const { + Register Reg = StringSwitch<Register>(RegName) .Case("m0", AMDGPU::M0) .Case("exec", AMDGPU::EXEC) .Case("exec_lo", AMDGPU::EXEC_LO) @@ -2947,7 +2996,7 @@ unsigned SITargetLowering::getRegisterByName(const char* RegName, EVT VT, .Case("flat_scratch", AMDGPU::FLAT_SCR) .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO) .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI) - .Default(AMDGPU::NoRegister); + .Default(Register()); if (Reg == AMDGPU::NoRegister) { report_fatal_error(Twine("invalid register name \"" @@ -3055,6 +3104,20 @@ splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop) { return std::make_pair(LoopBB, RemainderBB); } +/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it. 
+void SITargetLowering::bundleInstWithWaitcnt(MachineInstr &MI) const { + MachineBasicBlock *MBB = MI.getParent(); + const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); + auto I = MI.getIterator(); + auto E = std::next(I); + + BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); + + MIBundleBuilder Bundler(*MBB, I, E); + finalizeBundle(*MBB, Bundler.begin()); +} + MachineBasicBlock * SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3066,12 +3129,13 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *RemainderBB; const SIInstrInfo *TII = getSubtarget()->getInstrInfo(); - MachineBasicBlock::iterator Prev = std::prev(MI.getIterator()); + // Apparently kill flags are only valid if the def is in the same block? + if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0)) + Src->setIsKill(false); std::tie(LoopBB, RemainderBB) = splitBlockForLoop(MI, *BB, true); MachineBasicBlock::iterator I = LoopBB->end(); - MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0); const unsigned EncodedReg = AMDGPU::Hwreg::encodeHwreg( AMDGPU::Hwreg::ID_TRAPSTS, AMDGPU::Hwreg::OFFSET_MEM_VIOL, 1); @@ -3081,23 +3145,9 @@ SITargetLowering::emitGWSMemViolTestLoop(MachineInstr &MI, .addImm(0) .addImm(EncodedReg); - // This is a pain, but we're not allowed to have physical register live-ins - // yet. Insert a pair of copies if the VGPR0 hack is necessary. - if (Src && TargetRegisterInfo::isPhysicalRegister(Src->getReg())) { - unsigned Data0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - BuildMI(*BB, std::next(Prev), DL, TII->get(AMDGPU::COPY), Data0) - .add(*Src); + bundleInstWithWaitcnt(MI); - BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::COPY), Src->getReg()) - .addReg(Data0); - - MRI.setSimpleHint(Data0, Src->getReg()); - } - - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_WAITCNT)) - .addImm(0); - - unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); // Load and check TRAP_STS.MEM_VIOL BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg) @@ -3138,10 +3188,10 @@ static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop( MachineBasicBlock::iterator I = LoopBB.begin(); const TargetRegisterClass *BoolRC = TRI->getBoolRC(); - unsigned PhiExec = MRI.createVirtualRegister(BoolRC); - unsigned NewExec = MRI.createVirtualRegister(BoolRC); - unsigned CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); - unsigned CondReg = MRI.createVirtualRegister(BoolRC); + Register PhiExec = MRI.createVirtualRegister(BoolRC); + Register NewExec = MRI.createVirtualRegister(BoolRC); + Register CurrentIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register CondReg = MRI.createVirtualRegister(BoolRC); BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg) .addReg(InitReg) @@ -3240,9 +3290,9 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock::iterator I(&MI); const auto *BoolXExecRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned SaveExec = MRI.createVirtualRegister(BoolXExecRC); - unsigned TmpExec = MRI.createVirtualRegister(BoolXExecRC); + Register DstReg = MI.getOperand(0).getReg(); + Register SaveExec = MRI.createVirtualRegister(BoolXExecRC); + Register TmpExec = MRI.createVirtualRegister(BoolXExecRC); unsigned Exec = ST.isWave32() ? 
AMDGPU::EXEC_LO : AMDGPU::EXEC; unsigned MovExecOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; @@ -3315,7 +3365,7 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII, SetOn->getOperand(3).setIsUndef(); } else { - unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp) .add(*Idx) .addImm(Offset); @@ -3351,8 +3401,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg(); int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm(); const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg); @@ -3390,8 +3440,8 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); MachineBasicBlock::iterator I(&MI); - unsigned PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg); @@ -3442,7 +3492,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, MachineFunction *MF = MBB.getParent(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned Dst = MI.getOperand(0).getReg(); + Register Dst = MI.getOperand(0).getReg(); const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src); const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx); const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val); @@ -3505,7 +3555,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI, const DebugLoc &DL = MI.getDebugLoc(); - unsigned PhiReg = MRI.createVirtualRegister(VecRC); + Register PhiReg = MRI.createVirtualRegister(VecRC); auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset, UseGPRIdxMode, false); @@ -3564,22 +3614,22 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( MachineOperand &Src0 = MI.getOperand(1); MachineOperand &Src1 = MI.getOperand(2); - unsigned DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - unsigned DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); + Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass); MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, Src0, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, Src0, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI, Src1, BoolRC, AMDGPU::sub0, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI, Src1, BoolRC, AMDGPU::sub1, - &AMDGPU::SReg_32_XM0RegClass); + &AMDGPU::SReg_32RegClass); bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO); @@ -3632,8 +3682,8 @@ MachineBasicBlock 
*SITargetLowering::EmitInstrWithCustomInserter( // S_CMOV_B64 exec, -1 MachineInstr *FirstMI = &*BB->begin(); MachineRegisterInfo &MRI = MF->getRegInfo(); - unsigned InputReg = MI.getOperand(0).getReg(); - unsigned CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); + Register InputReg = MI.getOperand(0).getReg(); + Register CountReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass); bool Found = false; // Move the COPY of the input reg to the beginning, so that we can use it. @@ -3707,16 +3757,16 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - unsigned Dst = MI.getOperand(0).getReg(); - unsigned Src0 = MI.getOperand(1).getReg(); - unsigned Src1 = MI.getOperand(2).getReg(); + Register Dst = MI.getOperand(0).getReg(); + Register Src0 = MI.getOperand(1).getReg(); + Register Src1 = MI.getOperand(2).getReg(); const DebugLoc &DL = MI.getDebugLoc(); - unsigned SrcCond = MI.getOperand(3).getReg(); + Register SrcCond = MI.getOperand(3).getReg(); - unsigned DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); - unsigned DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); const auto *CondRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID); - unsigned SrcCondCopy = MRI.createVirtualRegister(CondRC); + Register SrcCondCopy = MRI.createVirtualRegister(CondRC); BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy) .addReg(SrcCond); @@ -3814,8 +3864,12 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( case AMDGPU::DS_GWS_SEMA_P: case AMDGPU::DS_GWS_SEMA_RELEASE_ALL: case AMDGPU::DS_GWS_BARRIER: - if (getSubtarget()->hasGWSAutoReplay()) + // A s_waitcnt 0 is required to be the instruction immediately following. 
+ if (getSubtarget()->hasGWSAutoReplay()) { + bundleInstWithWaitcnt(MI); return BB; + } + return emitGWSMemViolTestLoop(MI, BB); default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); @@ -3939,6 +3993,30 @@ SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op, return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); } +SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op, + SelectionDAG &DAG) const { + unsigned Opc = Op.getOpcode(); + EVT VT = Op.getValueType(); + assert(VT == MVT::v4i16 || VT == MVT::v4f16); + + SDValue Lo0, Hi0; + std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0); + SDValue Lo1, Hi1; + std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1); + SDValue Lo2, Hi2; + std::tie(Lo2, Hi2) = DAG.SplitVectorOperand(Op.getNode(), 2); + + SDLoc SL(Op); + + SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Lo2, + Op->getFlags()); + SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Hi2, + Op->getFlags()); + + return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi); +} + + SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); @@ -3991,6 +4069,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FMINNUM: case ISD::FMAXNUM: return lowerFMINNUM_FMAXNUM(Op, DAG); + case ISD::FMA: + return splitTernaryVectorOp(Op, DAG); case ISD::SHL: case ISD::SRA: case ISD::SRL: @@ -4070,6 +4150,41 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, return DAG.getMergeValues({ Adjusted, Load.getValue(1) }, DL); } +SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat, + SelectionDAG &DAG, + ArrayRef<SDValue> Ops) const { + SDLoc DL(M); + EVT LoadVT = M->getValueType(0); + EVT EltType = LoadVT.getScalarType(); + EVT IntVT = LoadVT.changeTypeToInteger(); + + bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16); + + unsigned Opc = + IsFormat ? 
AMDGPUISD::BUFFER_LOAD_FORMAT : AMDGPUISD::BUFFER_LOAD; + + if (IsD16) { + return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops); + } + + // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics + if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32) + return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); + + if (isTypeLegal(LoadVT)) { + return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT, + M->getMemOperand(), DAG); + } + + EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT); + SDVTList VTList = DAG.getVTList(CastVT, MVT::Other); + SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT, + M->getMemOperand(), DAG); + return DAG.getMergeValues( + {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)}, + DL); +} + static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); @@ -4196,8 +4311,14 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::INTRINSIC_W_CHAIN: { if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) { - Results.push_back(Res); - Results.push_back(Res.getValue(1)); + if (Res.getOpcode() == ISD::MERGE_VALUES) { + // FIXME: Hacky + Results.push_back(Res.getOperand(0)); + Results.push_back(Res.getOperand(1)); + } else { + Results.push_back(Res); + Results.push_back(Res.getValue(1)); + } return; } @@ -4935,11 +5056,8 @@ buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, // of the s_add_u32 instruction, we end up with an offset that is 4 bytes too // small. This requires us to add 4 to the global variable offset in order to // compute the correct address. - unsigned LoFlags = GAFlags; - if (LoFlags == SIInstrInfo::MO_NONE) - LoFlags = SIInstrInfo::MO_REL32; SDValue PtrLo = - DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, LoFlags); + DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset + 4, GAFlags); SDValue PtrHi; if (GAFlags == SIInstrInfo::MO_NONE) { PtrHi = DAG.getTargetConstant(0, DL, MVT::i32); @@ -5563,14 +5681,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue}); unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue(); SDValue Ops[] = { - DAG.getEntryNode(), // Chain - Rsrc, // rsrc - DAG.getConstant(0, DL, MVT::i32), // vindex - {}, // voffset - {}, // soffset - {}, // offset - DAG.getConstant(CachePolicy, DL, MVT::i32), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getEntryNode(), // Chain + Rsrc, // rsrc + DAG.getConstant(0, DL, MVT::i32), // vindex + {}, // voffset + {}, // soffset + {}, // offset + DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; // Use the alignment to ensure that the required offsets will fit into the @@ -5579,7 +5697,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue(); for (unsigned i = 0; i < NumLoads; ++i) { - Ops[5] = DAG.getConstant(InstOffset + 16 * i, DL, MVT::i32); + Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32); Loads.push_back(DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops, LoadVT, MMO)); } @@ -5758,45 +5876,31 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } case Intrinsic::amdgcn_fdiv_fast: return lowerFDIV_FAST(Op, DAG); - case Intrinsic::amdgcn_interp_mov: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, 
Op.getOperand(4)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Glue); - } - case Intrinsic::amdgcn_interp_p1: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4)); - SDValue Glue = M0.getValue(1); - return DAG.getNode(AMDGPUISD::INTERP_P1, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Glue); - } - case Intrinsic::amdgcn_interp_p2: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); - SDValue Glue = SDValue(M0.getNode(), 1); - return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, Op.getOperand(1), - Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), - Glue); - } case Intrinsic::amdgcn_interp_p1_f16: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(5)); - SDValue Glue = M0.getValue(1); + SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, + Op.getOperand(5), SDValue()); if (getSubtarget()->getLDSBankCount() == 16) { // 16 bank LDS - SDValue S = DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32, - DAG.getConstant(2, DL, MVT::i32), // P0 - Op.getOperand(2), // Attrchan - Op.getOperand(3), // Attr - Glue); + + // FIXME: This implicitly will insert a second CopyToReg to M0. + SDValue S = DAG.getNode( + ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32, + DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32), + DAG.getConstant(2, DL, MVT::i32), // P0 + Op.getOperand(2), // Attrchan + Op.getOperand(3), // Attr + Op.getOperand(5)); // m0 + SDValue Ops[] = { Op.getOperand(1), // Src0 Op.getOperand(2), // Attrchan Op.getOperand(3), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers S, // Src2 - holds two f16 values selected by high - DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers Op.getOperand(4), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - DAG.getConstant(0, DL, MVT::i32) // $omod + DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + DAG.getTargetConstant(0, DL, MVT::i32) // $omod }; return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops); } else { @@ -5805,28 +5909,28 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), // Src0 Op.getOperand(2), // Attrchan Op.getOperand(3), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers Op.getOperand(4), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - DAG.getConstant(0, DL, MVT::i32), // $omod - Glue + DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + DAG.getTargetConstant(0, DL, MVT::i32), // $omod + ToM0.getValue(1) }; return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops); } } case Intrinsic::amdgcn_interp_p2_f16: { - SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(6)); - SDValue Glue = SDValue(M0.getNode(), 1); + SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0, + Op.getOperand(6), SDValue()); SDValue Ops[] = { Op.getOperand(2), // Src0 Op.getOperand(3), // Attrchan Op.getOperand(4), // Attr - DAG.getConstant(0, DL, MVT::i32), // $src0_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers Op.getOperand(1), // Src2 - DAG.getConstant(0, DL, MVT::i32), // $src2_modifiers + DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers Op.getOperand(5), // high - DAG.getConstant(0, DL, MVT::i1), // $clamp - Glue + 
DAG.getTargetConstant(0, DL, MVT::i1), // $clamp + ToM0.getValue(1) }; return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops); } @@ -5947,16 +6051,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(1), Op.getOperand(2)); return DAG.getNode(ISD::BITCAST, DL, VT, Node); } - case Intrinsic::amdgcn_wqm: { - SDValue Src = Op.getOperand(1); - return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src), - 0); - } - case Intrinsic::amdgcn_wwm: { - SDValue Src = Op.getOperand(1); - return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src), - 0); - } case Intrinsic::amdgcn_fmad_ftz: return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); @@ -5977,6 +6071,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; } + case Intrinsic::amdgcn_is_shared: + case Intrinsic::amdgcn_is_private: { + SDLoc SL(Op); + unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared) ? + AMDGPUAS::LOCAL_ADDRESS : AMDGPUAS::PRIVATE_ADDRESS; + SDValue Aperture = getSegmentAperture(AS, SL, DAG); + SDValue SrcVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, + Op.getOperand(1)); + + SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec, + DAG.getConstant(1, SL, MVT::i32)); + return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) @@ -5986,6 +6093,30 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } +// This function computes an appropriate offset to pass to +// MachineMemOperand::setOffset() based on the offset inputs to +// an intrinsic. If any of the offsets are non-contstant or +// if VIndex is non-zero then this function returns 0. Otherwise, +// it returns the sum of VOffset, SOffset, and Offset. +static unsigned getBufferOffsetForMMO(SDValue VOffset, + SDValue SOffset, + SDValue Offset, + SDValue VIndex = SDValue()) { + + if (!isa<ConstantSDNode>(VOffset) || !isa<ConstantSDNode>(SOffset) || + !isa<ConstantSDNode>(Offset)) + return 0; + + if (VIndex) { + if (!isa<ConstantSDNode>(VIndex) || !cast<ConstantSDNode>(VIndex)->isNullValue()) + return 0; + } + + return cast<ConstantSDNode>(VOffset)->getSExtValue() + + cast<ConstantSDNode>(SOffset)->getSExtValue() + + cast<ConstantSDNode>(Offset)->getSExtValue(); +} + SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); @@ -6128,17 +6259,22 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); + unsigned Offset = setBufferOffsets(Op.getOperand(4), DAG, &Ops[3]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; + unsigned Opc = (IntrID == Intrinsic::amdgcn_buffer_load) ? 
AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; EVT VT = Op.getValueType(); EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); EVT LoadVT = Op.getValueType(); if (LoadVT.getScalarType() == MVT::f16) @@ -6155,6 +6291,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, } case Intrinsic::amdgcn_raw_buffer_load: case Intrinsic::amdgcn_raw_buffer_load_format: { + const bool IsFormat = IntrID == Intrinsic::amdgcn_raw_buffer_load_format; + auto Offsets = splitBufferOffsets(Op.getOperand(3), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6163,32 +6301,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(4), // soffset Offsets.second, // offset - Op.getOperand(5), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(5), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; - unsigned Opc = (IntrID == Intrinsic::amdgcn_raw_buffer_load) ? - AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || - LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5])); + return lowerIntrinsicLoad(M, IsFormat, DAG, Ops); } case Intrinsic::amdgcn_struct_buffer_load: case Intrinsic::amdgcn_struct_buffer_load_format: { + const bool IsFormat = IntrID == Intrinsic::amdgcn_struct_buffer_load_format; + auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6197,29 +6321,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; - unsigned Opc = (IntrID == Intrinsic::amdgcn_struct_buffer_load) ? 
- AMDGPUISD::BUFFER_LOAD : AMDGPUISD::BUFFER_LOAD_FORMAT; - - EVT VT = Op.getValueType(); - EVT IntVT = VT.changeTypeToInteger(); auto *M = cast<MemSDNode>(Op); - EVT LoadVT = Op.getValueType(); - - if (LoadVT.getScalarType() == MVT::f16) - return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, - M, DAG, Ops); - - // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics - if (LoadVT.getScalarType() == MVT::i8 || - LoadVT.getScalarType() == MVT::i16) - return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M); - - return getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, - M->getMemOperand(), DAG); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[3], Ops[4], Ops[5], + Ops[2])); + return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops); } case Intrinsic::amdgcn_tbuffer_load: { MemSDNode *M = cast<MemSDNode>(Op); @@ -6239,9 +6348,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // voffset Op.getOperand(5), // soffset Op.getOperand(6), // offset - DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1) // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6264,8 +6373,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // soffset Offsets.second, // offset Op.getOperand(5), // format - Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + Op.getOperand(6), // cachepolicy, swizzled buffer + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6288,8 +6397,8 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + Op.getOperand(7), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; if (LoadVT.getScalarType() == MVT::f16) @@ -6321,13 +6430,17 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]); + // We don't know the offset if vindex is non-zero, so clear it. 
+ if (IdxEn) + Offset = 0; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); unsigned Opcode = 0; switch (IntrID) { @@ -6377,7 +6490,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_umax: case Intrinsic::amdgcn_raw_buffer_atomic_and: case Intrinsic::amdgcn_raw_buffer_atomic_or: - case Intrinsic::amdgcn_raw_buffer_atomic_xor: { + case Intrinsic::amdgcn_raw_buffer_atomic_xor: + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + case Intrinsic::amdgcn_raw_buffer_atomic_dec: { auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6388,11 +6503,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6])); unsigned Opcode = 0; switch (IntrID) { @@ -6426,6 +6542,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_raw_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_raw_buffer_atomic_inc: + Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; + break; + case Intrinsic::amdgcn_raw_buffer_atomic_dec: + Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6442,7 +6564,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_umax: case Intrinsic::amdgcn_struct_buffer_atomic_and: case Intrinsic::amdgcn_struct_buffer_atomic_or: - case Intrinsic::amdgcn_struct_buffer_atomic_xor: { + case Intrinsic::amdgcn_struct_buffer_atomic_xor: + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + case Intrinsic::amdgcn_struct_buffer_atomic_dec: { auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG); SDValue Ops[] = { Op.getOperand(0), // Chain @@ -6453,11 +6577,13 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6], + Ops[3])); unsigned Opcode = 0; switch (IntrID) { @@ -6491,6 +6617,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, case Intrinsic::amdgcn_struct_buffer_atomic_xor: Opcode = AMDGPUISD::BUFFER_ATOMIC_XOR; break; + case Intrinsic::amdgcn_struct_buffer_atomic_inc: + Opcode = AMDGPUISD::BUFFER_ATOMIC_INC; + break; + case Intrinsic::amdgcn_struct_buffer_atomic_dec: + Opcode = AMDGPUISD::BUFFER_ATOMIC_DEC; + break; default: llvm_unreachable("unhandled atomic opcode"); } @@ -6512,12 +6644,16 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SDValue(), // voffset -- will be set by setBufferOffsets SDValue(), // soffset -- will be set by setBufferOffsets SDValue(), // offset -- will be set by setBufferOffsets - DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idxen + DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen }; - setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + unsigned Offset = 
setBufferOffsets(Op.getOperand(6), DAG, &Ops[5]); + // We don't know the offset if vindex is non-zero, so clear it. + if (IdxEn) + Offset = 0; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(Offset); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6534,10 +6670,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // cachepolicy - DAG.getConstant(0, DL, MVT::i1), // idxen + DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7])); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6554,10 +6691,12 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(7), // soffset Offsets.second, // offset Op.getOperand(8), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idxen + DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; EVT VT = Op.getValueType(); auto *M = cast<MemSDNode>(Op); + M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[5], Ops[6], Ops[7], + Ops[4])); return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL, Op->getVTList(), Ops, VT, M->getMemOperand()); @@ -6686,23 +6825,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE; return DAG.getNode(Opc, DL, Op->getVTList(), Ops); } - case Intrinsic::amdgcn_s_sendmsg: - case Intrinsic::amdgcn_s_sendmsghalt: { - unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ? - AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT; - Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3)); - SDValue Glue = Chain.getValue(1); - return DAG.getNode(NodeOp, DL, MVT::Other, Chain, - Op.getOperand(2), Glue); - } - case Intrinsic::amdgcn_init_exec: { - return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain, - Op.getOperand(2)); - } - case Intrinsic::amdgcn_init_exec_from_input: { - return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain, - Op.getOperand(2), Op.getOperand(3)); - } case Intrinsic::amdgcn_s_barrier: { if (getTargetMachine().getOptLevel() > CodeGenOpt::None) { const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>(); @@ -6733,9 +6855,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(5), // voffset Op.getOperand(6), // soffset Op.getOperand(7), // offset - DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format - DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy - DAG.getConstant(IdxEn, DL, MVT::i1), // idexen + DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format + DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy + DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : AMDGPUISD::TBUFFER_STORE_FORMAT; @@ -6759,8 +6881,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // format - Op.getOperand(8), // cachepolicy - DAG.getConstant(1, DL, MVT::i1), // idexen + Op.getOperand(8), // cachepolicy, swizzled buffer + DAG.getTargetConstant(1, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? 
@@ -6686,23 +6825,6 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
 return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
 }
- case Intrinsic::amdgcn_s_sendmsg:
- case Intrinsic::amdgcn_s_sendmsghalt: {
- unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
- AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
- Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
- SDValue Glue = Chain.getValue(1);
- return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
- Op.getOperand(2), Glue);
- }
- case Intrinsic::amdgcn_init_exec: {
- return DAG.getNode(AMDGPUISD::INIT_EXEC, DL, MVT::Other, Chain,
- Op.getOperand(2));
- }
- case Intrinsic::amdgcn_init_exec_from_input: {
- return DAG.getNode(AMDGPUISD::INIT_EXEC_FROM_INPUT, DL, MVT::Other, Chain,
- Op.getOperand(2), Op.getOperand(3));
- }
 case Intrinsic::amdgcn_s_barrier: {
 if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -6733,9 +6855,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 Op.getOperand(5), // voffset
 Op.getOperand(6), // soffset
 Op.getOperand(7), // offset
- DAG.getConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
- DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idexen
+ DAG.getTargetConstant(Dfmt | (Nfmt << 4), DL, MVT::i32), // format
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idexen
 };
 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
 AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -6759,8 +6881,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 Op.getOperand(6), // soffset
 Offsets.second, // offset
 Op.getOperand(7), // format
- Op.getOperand(8), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idexen
+ Op.getOperand(8), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idexen
 };
 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
 AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -6784,8 +6906,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 Op.getOperand(5), // soffset
 Offsets.second, // offset
 Op.getOperand(6), // format
- Op.getOperand(7), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idexen
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idexen
 };
 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 :
 AMDGPUISD::TBUFFER_STORE_FORMAT;
@@ -6813,14 +6935,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 SDValue(), // voffset -- will be set by setBufferOffsets
 SDValue(), // soffset -- will be set by setBufferOffsets
 SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Glc | (Slc << 1), DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
 };
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_buffer_store ?
 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
 MemSDNode *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
 EVT VDataType = VData.getValueType().getScalarType();
@@ -6833,10 +6959,22 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 case Intrinsic::amdgcn_raw_buffer_store:
 case Intrinsic::amdgcn_raw_buffer_store_format: {
+ const bool IsFormat =
+ IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format;
+
 SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ EVT VDataVT = VData.getValueType();
+ EVT EltType = VDataVT.getScalarType();
+ bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
 if (IsD16)
 VData = handleD16VData(VData, DAG);
+
+ if (!isTypeLegal(VDataVT)) {
+ VData =
+ DAG.getNode(ISD::BITCAST, DL,
+ getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
+ }
+
 auto Offsets = splitBufferOffsets(Op.getOperand(4), DAG);
 SDValue Ops[] = {
 Chain,
@@ -6846,18 +6984,18 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 Offsets.first, // voffset
 Op.getOperand(5), // soffset
 Offsets.second, // offset
- Op.getOperand(6), // cachepolicy
- DAG.getConstant(0, DL, MVT::i1), // idxen
+ Op.getOperand(6), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(0, DL, MVT::i1), // idxen
 };
- unsigned Opc = IntrinsicID == Intrinsic::amdgcn_raw_buffer_store ?
- AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
+ unsigned Opc =
+ IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
 MemSDNode *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6]));
 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
- EVT VDataType = VData.getValueType().getScalarType();
- if (VDataType == MVT::i8 || VDataType == MVT::i16)
- return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
+ if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
+ return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
 M->getMemoryVT(), M->getMemOperand());
@@ -6865,10 +7003,23 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 case Intrinsic::amdgcn_struct_buffer_store:
 case Intrinsic::amdgcn_struct_buffer_store_format: {
+ const bool IsFormat =
+ IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format;
+
 SDValue VData = Op.getOperand(2);
- bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
+ EVT VDataVT = VData.getValueType();
+ EVT EltType = VDataVT.getScalarType();
+ bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
+
 if (IsD16)
 VData = handleD16VData(VData, DAG);
+
+ if (!isTypeLegal(VDataVT)) {
+ VData =
+ DAG.getNode(ISD::BITCAST, DL,
+ getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
+ }
+
 auto Offsets = splitBufferOffsets(Op.getOperand(5), DAG);
 SDValue Ops[] = {
 Chain,
@@ -6878,17 +7029,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 Offsets.first, // voffset
 Op.getOperand(6), // soffset
 Offsets.second, // offset
- Op.getOperand(7), // cachepolicy
- DAG.getConstant(1, DL, MVT::i1), // idxen
+ Op.getOperand(7), // cachepolicy, swizzled buffer
+ DAG.getTargetConstant(1, DL, MVT::i1), // idxen
 };
 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ?
 AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
 MemSDNode *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(getBufferOffsetForMMO(Ops[4], Ops[5], Ops[6],
+ Ops[3]));
 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
 EVT VDataType = VData.getValueType().getScalarType();
- if (VDataType == MVT::i8 || VDataType == MVT::i16)
+ if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
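// Illustrative summary, not part of the patch: with IsFormat threaded through,
// the store path picked for raw/struct buffer stores is roughly
//   IsFormat with 16-bit elements        -> BUFFER_STORE_FORMAT_D16 (via handleD16VData)
//   scalar i8/i16/f16, not format/D16    -> handleByteShortBufferStores (BYTE/SHORT)
//   vector type that is not legal        -> bitcast to getEquivalentMemType first
//   everything else                      -> BUFFER_STORE / BUFFER_STORE_FORMAT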
@@ -6908,13 +7061,17 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
 SDValue(), // voffset -- will be set by setBufferOffsets
 SDValue(), // soffset -- will be set by setBufferOffsets
 SDValue(), // offset -- will be set by setBufferOffsets
- DAG.getConstant(Slc << 1, DL, MVT::i32), // cachepolicy
- DAG.getConstant(IdxEn, DL, MVT::i1), // idxen
+ DAG.getTargetConstant(Slc << 1, DL, MVT::i32), // cachepolicy
+ DAG.getTargetConstant(IdxEn, DL, MVT::i1), // idxen
 };
- setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ unsigned Offset = setBufferOffsets(Op.getOperand(5), DAG, &Ops[4]);
+ // We don't know the offset if vindex is non-zero, so clear it.
+ if (IdxEn)
+ Offset = 0;
 EVT VT = Op.getOperand(2).getValueType();
 auto *M = cast<MemSDNode>(Op);
+ M->getMemOperand()->setOffset(Offset);
 unsigned Opcode = VT.isVector() ?
 AMDGPUISD::BUFFER_ATOMIC_PK_FADD : AMDGPUISD::BUFFER_ATOMIC_FADD;
@@ -6987,7 +7144,7 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
 Overflow += ImmOffset;
 ImmOffset = 0;
 }
- C1 = cast<ConstantSDNode>(DAG.getConstant(ImmOffset, DL, MVT::i32));
+ C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
 if (Overflow) {
 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
 if (!N0)
@@ -7001,14 +7158,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
 if (!N0)
 N0 = DAG.getConstant(0, DL, MVT::i32);
 if (!C1)
- C1 = cast<ConstantSDNode>(DAG.getConstant(0, DL, MVT::i32));
+ C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
 return {N0, SDValue(C1, 0)};
 }
 // Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
 // three offsets (voffset, soffset and instoffset) into the SDValue[3] array
 // pointed to by Offsets.
-void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
+unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
 SelectionDAG &DAG, SDValue *Offsets,
 unsigned Align) const {
 SDLoc DL(CombinedOffset);
@@ -7018,8 +7175,8 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
 if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
- Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
- return;
+ Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
+ return SOffset + ImmOffset;
 }
 }
 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
@@ -7031,13 +7188,14 @@ void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
 Subtarget, Align)) {
 Offsets[0] = N0;
 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
- Offsets[2] = DAG.getConstant(ImmOffset, DL, MVT::i32);
- return;
+ Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
+ return 0;
 }
 }
 Offsets[0] = CombinedOffset;
 Offsets[1] = DAG.getConstant(0, DL, MVT::i32);
- Offsets[2] = DAG.getConstant(0, DL, MVT::i32);
+ Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
+ return 0;
 }
 // Handle 8 bit and 16 bit buffer loads
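// Illustrative usage of the new return value, not part of the patch: the three
// SDValues written to Offsets[] are voffset, soffset and the target-constant
// instruction offset, and the unsigned result is the total constant byte
// offset when the combined offset folds away, otherwise 0:
//   SDValue Parts[3];
//   unsigned Known = setBufferOffsets(CombinedOffset, DAG, Parts);
//   // Known == SOffset + ImmOffset for a constant CombinedOffset, 0 otherwise;
//   // callers hand it to MachineMemOperand::setOffset(), clearing it if idxen.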
@@ -7053,9 +7211,10 @@ SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
 SDValue BufferLoad = DAG.getMemIntrinsicNode(Opc, DL, ResList,
 Ops, IntVT,
 M->getMemOperand());
- SDValue BufferLoadTrunc = DAG.getNode(ISD::TRUNCATE, DL,
- LoadVT.getScalarType(), BufferLoad);
- return DAG.getMergeValues({BufferLoadTrunc, BufferLoad.getValue(1)}, DL);
+ SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
+ LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
+
+ return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
 }
 // Handle 8 bit and 16 bit buffer stores
@@ -7063,6 +7222,9 @@ SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
 EVT VDataType, SDLoc DL,
 SDValue Ops[],
 MemSDNode *M) const {
+ if (VDataType == MVT::f16)
+ Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
+
 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
 Ops[1] = BufferStoreExt;
 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE :
@@ -7215,8 +7377,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
 "Custom lowering for non-i32 vectors hasn't been implemented.");
- if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- *Load->getMemOperand())) {
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ MemVT, *Load->getMemOperand())) {
 SDValue Ops[2];
 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
 return DAG.getMergeValues(Ops, DL);
@@ -7505,6 +7667,19 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul);
 }
+// Returns immediate value for setting the F32 denorm mode when using the
+// S_DENORM_MODE instruction.
+static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
+ const SDLoc &SL, const GCNSubtarget *ST) {
+ assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
+ int DPDenormModeDefault = ST->hasFP64Denormals()
+ ? FP_DENORM_FLUSH_NONE
+ : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+
+ int Mode = SPDenormMode | (DPDenormModeDefault << 2);
+ return DAG.getTargetConstant(Mode, SL, MVT::i32);
+}
+
 SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
 return FastLowered;
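// Illustrative check of the packing, not part of the patch, assuming the
// SIDefines.h values FP_DENORM_FLUSH_IN_FLUSH_OUT == 0 and
// FP_DENORM_FLUSH_NONE == 3: the S_DENORM_MODE immediate carries the SP mode
// in bits [1:0] and the DP/f16 mode in bits [3:2], so on a target with FP64
// denormals enabled
//   getSPDenormModeValue(FP_DENORM_FLUSH_NONE, ...)         -> 3 | (3 << 2) == 15
//   getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, ...) -> 0 | (3 << 2) == 12
//   static_assert((3 | (3 << 2)) == 15 && (0 | (3 << 2)) == 12, "mode packing");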
@@ -7531,16 +7706,26 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
 const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
 (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
 (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
- const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
 if (!Subtarget->hasFP32Denormals()) {
 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
- SL, MVT::i32);
- SDValue EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
- DAG.getEntryNode(),
- EnableDenormValue, BitField);
+
+ SDValue EnableDenorm;
+ if (Subtarget->hasDenormModeInst()) {
+ const SDValue EnableDenormValue =
+ getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
+
+ EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
+ DAG.getEntryNode(), EnableDenormValue);
+ } else {
+ const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
+ SL, MVT::i32);
+ EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
+ DAG.getEntryNode(), EnableDenormValue,
+ BitField);
+ }
+
 SDValue Ops[3] = {
 NegDivScale0,
 EnableDenorm.getValue(0),
@@ -7562,19 +7747,29 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
 NumeratorScaled, Mul);
- SDValue Fma3 = getFPTernOp(DAG, ISD::FMA,SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+ SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
 NumeratorScaled, Fma3);
 if (!Subtarget->hasFP32Denormals()) {
- const SDValue DisableDenormValue =
- DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
- SDValue DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
- Fma4.getValue(1),
- DisableDenormValue,
- BitField,
- Fma4.getValue(2));
+
+ SDValue DisableDenorm;
+ if (Subtarget->hasDenormModeInst()) {
+ const SDValue DisableDenormValue =
+ getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
+
+ DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
+ Fma4.getValue(1), DisableDenormValue,
+ Fma4.getValue(2));
+ } else {
+ const SDValue DisableDenormValue =
+ DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+
+ DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
+ Fma4.getValue(1), DisableDenormValue,
+ BitField, Fma4.getValue(2));
+ }
 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
 DisableDenorm, DAG.getRoot());
@@ -7684,8 +7879,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 assert(VT.isVector() &&
 Store->getValue().getValueType().getScalarType() == MVT::i32);
- if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
- *Store->getMemOperand())) {
+ if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
+ VT, *Store->getMemOperand())) {
 return expandUnalignedStore(Store, DAG);
 }
@@ -10065,7 +10260,7 @@ SDNode *SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
 // to try understanding copies to physical registers.
 if (SrcVal.getValueType() == MVT::i1 &&
- TargetRegisterInfo::isPhysicalRegister(DestReg->getReg())) {
+ Register::isPhysicalRegister(DestReg->getReg())) {
 SDLoc SL(Node);
 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
 SDValue VReg = DAG.getRegister(
@@ -10218,7 +10413,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
 MachineOperand &Op = MI.getOperand(I);
 if ((OpInfo[I].RegClass != llvm::AMDGPU::AV_64RegClassID &&
 OpInfo[I].RegClass != llvm::AMDGPU::AV_32RegClassID) ||
- !TargetRegisterInfo::isVirtualRegister(Op.getReg()) ||
+ !Register::isVirtualRegister(Op.getReg()) ||
 !TRI->isAGPR(MRI, Op.getReg()))
 continue;
 auto *Src = MRI.getUniqueVRegDef(Op.getReg());
@@ -10256,7 +10451,7 @@ void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
 Node->use_begin()->isMachineOpcode() &&
 Node->use_begin()->getMachineOpcode() == AMDGPU::EXTRACT_SUBREG &&
 !Node->use_begin()->hasAnyUseOfValue(0))) {
- unsigned Def = MI.getOperand(0).getReg();
+ Register Def = MI.getOperand(0).getReg();
 // Change this into a noret atomic.
 MI.setDesc(TII->get(NoRetAtomicOp));
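// Illustrative note, not part of the patch: the register hunks above and below
// move from the deprecated TargetRegisterInfo::is{Physical,Virtual}Register
// statics to the llvm::Register helpers and use Register instead of unsigned
// for virtual-register values, e.g.:
//   Register Def = MI.getOperand(0).getReg();
//   if (Register::isVirtualRegister(Def))
//     Src = MRI.getUniqueVRegDef(Def);  // Src is a hypothetical local here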
@@ -10300,7 +10495,7 @@ MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
 // Combine the constants and the pointer.
 const SDValue Ops1[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
 Ptr,
 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32),
 SubRegHi,
@@ -10330,7 +10525,7 @@ MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
 const SDValue Ops[] = {
- DAG.getTargetConstant(AMDGPU::SReg_128RegClassID, DL, MVT::i32),
+ DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
 PtrLo,
 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
 PtrHi,
@@ -10364,7 +10559,7 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
 return std::make_pair(0U, nullptr);
 case 32:
 case 16:
- RC = &AMDGPU::SReg_32_XM0RegClass;
+ RC = &AMDGPU::SReg_32RegClass;
 break;
 case 64:
 RC = &AMDGPU::SGPR_64RegClass;
@@ -10373,7 +10568,7 @@
 RC = &AMDGPU::SReg_96RegClass;
 break;
 case 128:
- RC = &AMDGPU::SReg_128RegClass;
+ RC = &AMDGPU::SGPR_128RegClass;
 break;
 case 160:
 RC = &AMDGPU::SReg_160RegClass;
@@ -10415,6 +10610,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
 }
 break;
 case 'a':
+ if (!Subtarget->hasMAIInsts())
+ break;
 switch (VT.getSizeInBits()) {
 default:
 return std::make_pair(0U, nullptr);
@@ -10548,9 +10745,9 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
 }
-unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
- const unsigned PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
- const unsigned CacheLineAlign = 6; // log2(64)
+Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
+ const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
+ const Align CacheLineAlign = Align(64);
 // Pre-GFX10 target did not benefit from loop alignment
 if (!ML || DisableLoopAlignment ||
@@ -10578,7 +10775,7 @@ unsigned SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
 // If inner loop block is aligned assume in average half of the alignment
 // size to be added as nops.
 if (MBB != Header)
- LoopSize += (1 << MBB->getAlignment()) / 2;
+ LoopSize += MBB->getAlignment().value() / 2;
 for (const MachineInstr &MI : *MBB) {
 LoopSize += TII->getInstSizeInBytes(MI);
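// Illustrative note, not part of the patch: getPrefLoopAlignment now returns
// llvm::Align, so the 64-byte cache line is spelled Align(64) rather than the
// log2 value 6, and the loop-size estimate reads bytes directly:
//   assert(Align(64).value() == 64 && Log2(Align(64)) == 6);
//   LoopSize += MBB->getAlignment().value() / 2;  // bytes, was (1 << log2) / 2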
@@ -10644,7 +10841,7 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
 const MachineRegisterInfo &MRI = MF->getRegInfo();
 const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
 unsigned Reg = R->getReg();
- if (TRI.isPhysicalRegister(Reg))
+ if (Register::isPhysicalRegister(Reg))
 return !TRI.isSGPRReg(MRI, Reg);
 if (MRI.isLiveIn(Reg)) {
@@ -10683,12 +10880,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
 case ISD::INTRINSIC_W_CHAIN:
 return AMDGPU::isIntrinsicSourceOfDivergence(
 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
- // In some cases intrinsics that are a source of divergence have been
- // lowered to AMDGPUISD so we also need to check those too.
- case AMDGPUISD::INTERP_MOV:
- case AMDGPUISD::INTERP_P1:
- case AMDGPUISD::INTERP_P2:
- return true;
 }
 return false;
 }
@@ -10748,3 +10939,110 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
 return AMDGPUTargetLowering::shouldExpandAtomicRMWInIR(RMW);
 }
+
+const TargetRegisterClass *
+SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
+ const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
+ return Subtarget->getWavefrontSize() == 64 ? &AMDGPU::SReg_64RegClass
+ : &AMDGPU::SReg_32RegClass;
+ if (!TRI->isSGPRClass(RC) && !isDivergent)
+ return TRI->getEquivalentSGPRClass(RC);
+ else if (TRI->isSGPRClass(RC) && isDivergent)
+ return TRI->getEquivalentVGPRClass(RC);
+
+ return RC;
+}
+
+static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+ if (!Visited.insert(V).second)
+ return false;
+ bool Result = false;
+ for (auto U : V->users()) {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
+ if (V == U->getOperand(1)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ Result = false;
+ break;
+ case Intrinsic::amdgcn_if_break:
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else:
+ Result = true;
+ break;
+ }
+ }
+ if (V == U->getOperand(0)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ Result = false;
+ break;
+ case Intrinsic::amdgcn_end_cf:
+ case Intrinsic::amdgcn_loop:
+ Result = true;
+ break;
+ }
+ }
+ } else {
+ Result = hasCFUser(U, Visited);
+ }
+ if (Result)
+ break;
+ }
+ return Result;
+}
+
+bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
+ const Value *V) const {
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_if_break:
+ return true;
+ }
+ }
+ if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
+ if (const IntrinsicInst *Intrinsic =
+ dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else: {
+ ArrayRef<unsigned> Indices = ExtValue->getIndices();
+ if (Indices.size() == 1 && Indices[0] == 1) {
+ return true;
+ }
+ }
+ }
+ }
+ }
+ if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+ if (isa<InlineAsm>(CI->getCalledValue())) {
+ const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
+ ImmutableCallSite CS(CI);
+ TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
+ MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+ for (auto &TC : TargetConstraints) {
+ if (TC.Type == InlineAsm::isOutput) {
+ ComputeConstraintToUse(TC, SDValue());
+ unsigned AssignedReg;
+ const TargetRegisterClass *RC;
+ std::tie(AssignedReg, RC) = getRegForInlineAsmConstraint(
+ SIRI, TC.ConstraintCode, TC.ConstraintVT);
+ if (RC) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (AssignedReg != 0 && SIRI->isSGPRReg(MRI, AssignedReg))
+ return true;
+ else if (SIRI->isSGPRClass(RC))
+ return true;
+ }
+ }
+ }
+ }
+ }
+ SmallPtrSet<const Value *, 16> Visited;
+ return hasCFUser(V, Visited);
+}
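// Illustrative note, not part of the patch: the divergence-aware overload lets
// uniform values use scalar register classes while divergent ones stay in
// vector registers, and requiresUniformRegister() forces values consumed by
// the control-flow intrinsics (amdgcn.if/else/if.break/loop/end.cf) or by
// inline asm with an SGPR output constraint to be treated as uniform. A
// hypothetical caller, assuming a LegacyDivergenceAnalysis *DA is available:
//   bool Divergent = DA->isDivergent(V) && !TLI->requiresUniformRegister(MF, V);
//   const TargetRegisterClass *RC = TLI->getRegClassFor(VT, Divergent);
//   // e.g. a uniform i1 in wave64 lands in SReg_64 rather than VReg_1.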