Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU')
15 files changed, 218 insertions, 46 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
index 060fb66d38f7..d2a325d5ad89 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1100,8 +1100,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
    FeatureVOP3Literal, FeatureDPP8,
    FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
    FeatureA16, FeatureFastDenormalF32, FeatureG16,
-   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
-   FeatureGWS, FeatureTrue16BitInsts
+   FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
+   FeatureTrue16BitInsts
   ]
 >;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 0a17b1536040..4462cd8a31f1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -75,8 +75,9 @@ enum class SchedGroupMask {
   DS = 1u << 7,
   DS_READ = 1u << 8,
   DS_WRITE = 1u << 9,
+  TRANS = 1u << 10,
   ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
-        DS_READ | DS_WRITE,
+        DS_READ | DS_WRITE | TRANS,
   LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
 };
 
@@ -1435,11 +1436,12 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
     Result = false;
 
   else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
-           (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI)))
+           (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
+            TII->isTRANS(MI)))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
-           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI))
+           TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
     Result = true;
 
   else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
@@ -1476,6 +1478,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
            MI.mayStore() && TII->isDS(MI))
     Result = true;
 
+  else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
+           TII->isTRANS(MI))
+    Result = true;
+
   LLVM_DEBUG(
       dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
              << (Result ? " could classify " : " unable to classify ") << MI);
@@ -1635,10 +1641,13 @@ void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
   // Remove all existing edges from the SCHED_BARRIER that were added due to the
   // instruction having side effects.
   resetEdges(SchedBarrier, DAG);
+  LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
+                    << MI.getOperand(0).getImm() << "\n");
   auto InvertedMask =
       invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
   SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
   SG.initSchedGroup();
+
   // Preserve original instruction ordering relative to the SCHED_BARRIER.
   SG.link(
       SchedBarrier,
@@ -1652,14 +1661,15 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
   // allowed past the SCHED_BARRIER.
   SchedGroupMask InvertedMask = ~Mask;
 
-  // ALU implies VALU, SALU, MFMA.
+  // ALU implies VALU, SALU, MFMA, TRANS.
   if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
-    InvertedMask &=
-        ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
-  // VALU, SALU, MFMA implies ALU.
+    InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
+                    ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
+  // VALU, SALU, MFMA, TRANS implies ALU.
   else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
            (InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
-           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
+           (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
+           (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::ALU;
 
   // VMEM implies VMEM_READ, VMEM_WRITE.
@@ -1678,6 +1688,9 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
       (InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
     InvertedMask &= ~SchedGroupMask::DS;
 
+  LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
+                    << "\n");
+
   return InvertedMask;
 }
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9d7443012e3d..541a5b62450d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -169,11 +169,17 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
 
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
@@ -185,10 +191,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
 
   setOperationAction(ISD::STORE, MVT::f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
@@ -506,9 +517,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
   AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
 
-  // There are no libcalls of any kind.
-  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
-    setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+  // Disable most libcalls.
+  for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
+    if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
+      setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+  }
 
   setSchedulingPreference(Sched::RegPressure);
   setJumpIsExpensive(true);
@@ -556,6 +569,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                        ISD::FSUB, ISD::FNEG, ISD::FABS, ISD::AssertZext,
                        ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
+
+  setMaxAtomicSizeInBitsSupported(64);
 }
 
 bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
@@ -3055,18 +3070,26 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
   bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
                    Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
+  bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
 
-  if (Src.getValueType() == MVT::i32) {
+  if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
     // (ctlz hi:lo) -> (umin (ffbh src), 32)
     // (cttz hi:lo) -> (umin (ffbl src), 32)
     // (ctlz_zero_undef src) -> (ffbh src)
     // (cttz_zero_undef src) -> (ffbl src)
+
+    // 64-bit scalar version produce 32-bit result
+    // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
+    // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
+    // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
+    // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
     SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
     if (!ZeroUndef) {
-      const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
-      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
+      const SDValue ConstVal = DAG.getConstant(
+          Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
+      NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
     }
-    return NewOpr;
+    return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
   }
 
   SDValue Lo, Hi;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index ee93d9eb4c0a..2bb7b6bd0674 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1241,6 +1241,10 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
     ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
     unsigned DMaskVal = DMask->getZExtValue() & 0xf;
 
+    // dmask 0 has special semantics, do not simplify.
+    if (DMaskVal == 0)
+      return nullptr;
+
     // Mask off values that are undefined because the dmask doesn't cover them
     DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
 
@@ -1261,7 +1265,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
   unsigned NewNumElts = DemandedElts.popcount();
   if (!NewNumElts)
-    return UndefValue::get(IIVTy);
+    return PoisonValue::get(IIVTy);
 
   if (NewNumElts >= VWidth && DemandedElts.isMask()) {
     if (DMaskIdx >= 0)
@@ -1299,7 +1303,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
 
   if (IsLoad) {
     if (NewNumElts == 1) {
-      return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
+      return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
                                             DemandedElts.countr_zero());
     }
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 0c21382e5c22..f03e6b8915b1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1050,8 +1050,7 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
                   CF->isNegative();
     } else {
       needlog = true;
-      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
-                               (!CF || CF->isNegative());
+      needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
     }
   } else {
     ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 1bed516fb5c7..5e73411cae9b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -530,6 +530,15 @@ static Value *promoteAllocaUserToVector(
     return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
   }
 
+  if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
+    if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
+      Intr->replaceAllUsesWith(
+          Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
+                          DL.getTypeAllocSize(VectorTy)));
+      return nullptr;
+    }
+  }
+
   llvm_unreachable("Unsupported call when promoting alloca to vector");
 }
 
@@ -773,8 +782,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
       continue;
     }
 
+    if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
+      if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
+        WorkList.push_back(Inst);
+        continue;
+      }
+    }
+
     // Ignore assume-like intrinsics and comparisons used in assumes.
     if (isAssumeLikeIntrinsic(Inst)) {
+      if (!Inst->use_empty())
+        return RejectUser(Inst, "assume-like intrinsic cannot have any users");
       UsersToRemove.push_back(Inst);
       continue;
     }
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e8c04ecf39ba..fdc2077868cf 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -345,6 +345,11 @@ static cl::opt<bool> EnableImageIntrinsicOptimizer(
     cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
     cl::Hidden);
 
+static cl::opt<bool>
+    EnableLoopPrefetch("amdgpu-loop-prefetch",
+                       cl::desc("Enable loop data prefetch on AMDGPU"),
+                       cl::Hidden, cl::init(false));
+
 static cl::opt<bool> EnableMaxIlpSchedStrategy(
     "amdgpu-enable-max-ilp-scheduling-strategy",
     cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
@@ -982,6 +987,8 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
 }
 
 void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+  if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
+    addPass(createLoopDataPrefetchPass());
   addPass(createSeparateConstOffsetFromGEPPass());
   // ReassociateGEPs exposes more opportunities for SLSR. See
   // the example in reassociate-geps-and-slsr.ll.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index f1da1a61bf4d..ebe0b8551b23 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1345,3 +1345,11 @@ GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
   Cost.first += (Size + 255) / 256;
   return Cost;
 }
+
+unsigned GCNTTIImpl::getPrefetchDistance() const {
+  return ST->hasPrefetch() ? 128 : 0;
+}
+
+bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
+  return AMDGPU::isFlatGlobalAddrSpace(AS);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 1e6c5bbfc0d7..cd8e9fd10bbf 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -254,6 +254,16 @@ public:
   InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
                                          FastMathFlags FMF,
                                          TTI::TargetCostKind CostKind);
+
+  /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
+  unsigned getCacheLineSize() const override { return 128; }
+
+  /// How much before a load we should place the prefetch instruction.
+  /// This is currently measured in number of IR instructions.
+  unsigned getPrefetchDistance() const override;
+
+  /// \return if target want to issue a prefetch in address space \p AS.
+  bool shouldPrefetchAddressSpace(unsigned AS) const override;
 };
 
 } // end namespace llvm
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
index 3a895923fa4b..bc9049b4ef33 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1147,7 +1147,8 @@ def : GCNPat <
 >;
 } // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
 
-def : Pat <
+let OtherPredicates = [HasGDS] in
+def : GCNPat <
   (SIds_ordered_count i32:$value, i16:$offset),
   (DS_ORDERED_COUNT $value, (as_i16imm $offset))
 >;
@@ -1189,7 +1190,8 @@ def : GCNPat <
 //===----------------------------------------------------------------------===//
 
 class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
-                                               string opName = ps.Mnemonic>
+                                               string opName = ps.Mnemonic,
+                                               bit hasGFX12Enc = 0>
   : DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> {
 
   let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
@@ -1201,6 +1203,8 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
   let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0);
   let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0);
   let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0);
+
+  let gds = !if(hasGFX12Enc, 0, ?);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1212,7 +1216,7 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in {
     defvar ps = !cast<DS_Pseudo>(NAME);
     def _gfx12 :
         Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX12,
-                                                 ps.Mnemonic>;
+                                                 ps.Mnemonic, 1>;
   }
 
   multiclass DS_Real_Renamed_gfx12<bits<8> op, DS_Pseudo backing_pseudo,
@@ -1220,7 +1224,7 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in {
     def _gfx12 :
         Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, backing_pseudo,
                                                  SIEncodingFamily.GFX12,
-                                                 real_name>,
+                                                 real_name, 1>,
         MnemonicAlias<backing_pseudo.Mnemonic, real_name>,
         Requires<[isGFX12Plus]>;
   }
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ed2e7e4f189e..7939d0036568 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -702,6 +702,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                             AMDGPU::OpName::src2_modifiers);
   }
 
+  if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
+      !AMDGPU::hasGDS(STI)) {
+    insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
+  }
+
   if (Res && (MCII->get(MI.getOpcode()).TSFlags &
               (SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
     int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 34826809c1a6..fc119aa61d01 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -540,10 +540,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
                        MVT::f16, Custom);
 
     setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
-
-    setOperationAction(
-        {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
-        MVT::f16, Promote);
+    setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
 
     // F16 - VOP2 Actions.
     setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
@@ -1145,11 +1142,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                   MachineMemOperand::MOStore |
                   MachineMemOperand::MODereferenceable;
 
-    // XXX - Should this be volatile without known ordering?
-    Info.flags |= MachineMemOperand::MOVolatile;
-
     switch (IntrID) {
     default:
+      // XXX - Should this be volatile without known ordering?
+      Info.flags |= MachineMemOperand::MOVolatile;
       break;
     case Intrinsic::amdgcn_raw_buffer_load_lds:
     case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
    case Intrinsic::amdgcn_struct_buffer_load_lds:
    case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
      unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
      Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+     Info.ptrVal = CI.getArgOperand(1);
      return true;
    }
    }
@@ -1289,8 +1286,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.opc = ISD::INTRINSIC_VOID;
     unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
     Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
-    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
-                  MachineMemOperand::MOVolatile;
+    Info.ptrVal = CI.getArgOperand(1);
+    Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
     return true;
   }
   case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9231,7 +9228,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
 
     MachinePointerInfo StorePtrI = LoadPtrI;
-    StorePtrI.V = nullptr;
+    LoadPtrI.V = PoisonValue::get(
+        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+    LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
 
     auto F = LoadMMO->getFlags() &
@@ -9309,6 +9308,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
     LoadPtrI.Offset = Op->getConstantOperandVal(5);
     MachinePointerInfo StorePtrI = LoadPtrI;
+    LoadPtrI.V = PoisonValue::get(
+        PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
     LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
     StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
     auto F = LoadMMO->getFlags() &
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8415a3d77d3b..55ddb540c51e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -238,7 +238,7 @@ public:
 
   bool merge(const WaitcntBrackets &Other);
 
-  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
+  RegInterval getRegInterval(const MachineInstr *MI,
                              const MachineRegisterInfo *MRI,
                              const SIRegisterInfo *TRI, unsigned OpNo) const;
 
@@ -500,7 +500,6 @@ public:
 } // end anonymous namespace
 
 RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
-                                            const SIInstrInfo *TII,
                                             const MachineRegisterInfo *MRI,
                                             const SIRegisterInfo *TRI,
                                             unsigned OpNo) const {
@@ -534,7 +533,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
   else
     return {-1, -1};
 
-  const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
+  const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
   unsigned Size = TRI->getRegSizeInBits(*RC);
   Result.second = Result.first + ((Size + 16) / 32);
 
@@ -546,7 +545,7 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
                                   const SIRegisterInfo *TRI,
                                   const MachineRegisterInfo *MRI, unsigned OpNo,
                                   unsigned Val) {
-  RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
+  RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
   assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
   for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
     setRegScore(RegNo, EXP_CNT, Val);
@@ -674,7 +673,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
                Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
       MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
       unsigned OpNo;//TODO: find the OpNo for this operand;
-      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
+      RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
       for (int RegNo = Interval.first; RegNo < Interval.second;
            ++RegNo) {
         setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
@@ -686,7 +685,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
         auto &Op = Inst.getOperand(I);
         if (!Op.isReg() || !Op.isDef())
           continue;
-        RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
+        RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
         if (T == VM_CNT) {
           if (Interval.first >= NUM_ALL_VGPRS)
             continue;
@@ -1140,7 +1139,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
 
       if (MI.getOperand(CallAddrOpIdx).isReg()) {
         RegInterval CallAddrOpInterval =
-            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
+            ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
 
         for (int RegNo = CallAddrOpInterval.first;
              RegNo < CallAddrOpInterval.second; ++RegNo)
@@ -1150,7 +1149,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
             AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
         if (RtnAddrOpIdx != -1) {
           RegInterval RtnAddrOpInterval =
-              ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
+              ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
 
           for (int RegNo = RtnAddrOpInterval.first;
                RegNo < RtnAddrOpInterval.second; ++RegNo)
@@ -1202,8 +1201,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
         if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
           continue;
 
-        RegInterval Interval =
-            ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
+        RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
 
         const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
@@ -1782,7 +1780,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
       MachineOperand &Op = MI.getOperand(I);
       if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
         continue;
-      RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
+      RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
       // Vgpr use
       if (Op.isUse()) {
         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 70ef1fff274a..ebe23a5eac57 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -245,6 +245,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
   if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
     return false;
 
+  // A mayLoad instruction without a def is not a load. Likely a prefetch.
+  if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
+    return false;
+
   if (isDS(Opc0) && isDS(Opc1)) {
 
     // FIXME: Handle this case:
@@ -3654,6 +3658,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
   if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
     return false;
 
+  if (isLDSDMA(MIa) || isLDSDMA(MIb))
+    return false;
+
   // TODO: Should we check the address space from the MachineMemOperand? That
   // would allow us to distinguish objects we know don't alias based on the
   // underlying address space, even if it was lowered to a different one,
@@ -4976,6 +4983,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
     }
   }
 
+  if (isDS(MI) && !ST.hasGDS()) {
+    const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
+    if (GDSOp && GDSOp->getImm() != 0) {
+      ErrInfo = "GDS is not supported on this subtarget";
+      return false;
+    }
+  }
+
   if (isImage(MI)) {
     const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
     if (DimOp) {
@@ -6897,6 +6912,15 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     Inst.eraseFromParent();
     return;
 
+  case AMDGPU::S_FLBIT_I32_B64:
+    splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
+    Inst.eraseFromParent();
+    return;
+  case AMDGPU::S_FF1_I32_B64:
+    splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
+    Inst.eraseFromParent();
+    return;
+
   case AMDGPU::S_LSHL_B32:
     if (ST.hasOnlyRevVALUShifts()) {
       NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
@@ -7830,6 +7854,61 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
   addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
 }
 
+void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
+                                          MachineInstr &Inst, unsigned Opcode,
+                                          MachineDominatorTree *MDT) const {
+  // (S_FLBIT_I32_B64 hi:lo) ->
+  // -> (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
+  // (S_FF1_I32_B64 hi:lo) ->
+  // ->(umin (uaddsat (V_FFBL_B32_e32 hi), 32) (V_FFBL_B32_e32 lo))
+
+  MachineBasicBlock &MBB = *Inst.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  MachineBasicBlock::iterator MII = Inst;
+  const DebugLoc &DL = Inst.getDebugLoc();
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MachineOperand &Src = Inst.getOperand(1);
+
+  const MCInstrDesc &InstDesc = get(Opcode);
+
+  bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
+  unsigned OpcodeAdd =
+      ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
+
+  const TargetRegisterClass *SrcRC =
+      Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
+  const TargetRegisterClass *SrcSubRC =
+      RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+  MachineOperand SrcRegSub0 =
+      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
+  MachineOperand SrcRegSub1 =
+      buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
+
+  Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+  BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
+
+  BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
+
+  BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
+      .addReg(IsCtlz ? MidReg1 : MidReg2)
+      .addImm(32)
+      .addImm(1); // enable clamp
+
+  BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
+      .addReg(MidReg3)
+      .addReg(IsCtlz ? MidReg2 : MidReg1);
+
+  MRI.replaceRegWith(Dest.getReg(), MidReg4);
+
+  addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
+}
+
 void SIInstrInfo::addUsersToMoveToVALUWorklist(
     Register DstReg, MachineRegisterInfo &MRI,
     SIInstrWorklist &Worklist) const {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index affe52046752..46eee6fae0a5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -144,6 +144,9 @@ private:
   void splitScalar64BitBCNT(SIInstrWorklist &Worklist,
                             MachineInstr &Inst) const;
   void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+  void splitScalar64BitCountOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
+                               unsigned Opcode,
+                               MachineDominatorTree *MDT = nullptr) const;
   void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI,
                       MachineInstr &Inst) const;