Diffstat (limited to 'contrib/llvm-project/llvm/lib/Target/AMDGPU')
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td                           |  4
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp                  | 29
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp              | 37
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp      |  8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp                  |  3
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp             | 18
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp             |  7
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp       |  8
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h         | 10
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td                   | 12
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp |  5
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp                  | 21
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp                | 20
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp                     | 79
-rw-r--r--  contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h                       |  3
15 files changed, 218 insertions, 46 deletions
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
index 060fb66d38f7..d2a325d5ad89 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1100,8 +1100,8 @@ def FeatureGFX12 : GCNSubtargetFeatureGeneration<"GFX12",
FeatureVOP3Literal, FeatureDPP8,
FeatureNoDataDepHazard, FeaturePkFmacF16Inst,
FeatureA16, FeatureFastDenormalF32, FeatureG16,
- FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess, FeatureGDS,
- FeatureGWS, FeatureTrue16BitInsts
+ FeatureUnalignedBufferAccess, FeatureUnalignedDSAccess,
+ FeatureTrue16BitInsts
]
>;
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
index 0a17b1536040..4462cd8a31f1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUIGroupLP.cpp
@@ -75,8 +75,9 @@ enum class SchedGroupMask {
DS = 1u << 7,
DS_READ = 1u << 8,
DS_WRITE = 1u << 9,
+ TRANS = 1u << 10,
ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
- DS_READ | DS_WRITE,
+ DS_READ | DS_WRITE | TRANS,
LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
};
@@ -1435,11 +1436,12 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
Result = false;
else if (((SGMask & SchedGroupMask::ALU) != SchedGroupMask::NONE) &&
- (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI)))
+ (TII->isVALU(MI) || TII->isMFMAorWMMA(MI) || TII->isSALU(MI) ||
+ TII->isTRANS(MI)))
Result = true;
else if (((SGMask & SchedGroupMask::VALU) != SchedGroupMask::NONE) &&
- TII->isVALU(MI) && !TII->isMFMAorWMMA(MI))
+ TII->isVALU(MI) && !TII->isMFMAorWMMA(MI) && !TII->isTRANS(MI))
Result = true;
else if (((SGMask & SchedGroupMask::SALU) != SchedGroupMask::NONE) &&
@@ -1476,6 +1478,10 @@ bool SchedGroup::canAddMI(const MachineInstr &MI) const {
MI.mayStore() && TII->isDS(MI))
Result = true;
+ else if (((SGMask & SchedGroupMask::TRANS) != SchedGroupMask::NONE) &&
+ TII->isTRANS(MI))
+ Result = true;
+
LLVM_DEBUG(
dbgs() << "For SchedGroup with mask " << format_hex((int)SGMask, 10, true)
<< (Result ? " could classify " : " unable to classify ") << MI);
@@ -1635,10 +1641,13 @@ void IGroupLPDAGMutation::addSchedBarrierEdges(SUnit &SchedBarrier) {
// Remove all existing edges from the SCHED_BARRIER that were added due to the
// instruction having side effects.
resetEdges(SchedBarrier, DAG);
+ LLVM_DEBUG(dbgs() << "Building SchedGroup for SchedBarrier with Mask: "
+ << MI.getOperand(0).getImm() << "\n");
auto InvertedMask =
invertSchedBarrierMask((SchedGroupMask)MI.getOperand(0).getImm());
SchedGroup SG(InvertedMask, std::nullopt, DAG, TII);
SG.initSchedGroup();
+
// Preserve original instruction ordering relative to the SCHED_BARRIER.
SG.link(
SchedBarrier,
@@ -1652,14 +1661,15 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
// allowed past the SCHED_BARRIER.
SchedGroupMask InvertedMask = ~Mask;
- // ALU implies VALU, SALU, MFMA.
+ // ALU implies VALU, SALU, MFMA, TRANS.
if ((InvertedMask & SchedGroupMask::ALU) == SchedGroupMask::NONE)
- InvertedMask &=
- ~SchedGroupMask::VALU & ~SchedGroupMask::SALU & ~SchedGroupMask::MFMA;
- // VALU, SALU, MFMA implies ALU.
+ InvertedMask &= ~SchedGroupMask::VALU & ~SchedGroupMask::SALU &
+ ~SchedGroupMask::MFMA & ~SchedGroupMask::TRANS;
+ // VALU, SALU, MFMA, TRANS implies ALU.
else if ((InvertedMask & SchedGroupMask::VALU) == SchedGroupMask::NONE ||
(InvertedMask & SchedGroupMask::SALU) == SchedGroupMask::NONE ||
- (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE)
+ (InvertedMask & SchedGroupMask::MFMA) == SchedGroupMask::NONE ||
+ (InvertedMask & SchedGroupMask::TRANS) == SchedGroupMask::NONE)
InvertedMask &= ~SchedGroupMask::ALU;
// VMEM implies VMEM_READ, VMEM_WRITE.
@@ -1678,6 +1688,9 @@ IGroupLPDAGMutation::invertSchedBarrierMask(SchedGroupMask Mask) const {
(InvertedMask & SchedGroupMask::DS_WRITE) == SchedGroupMask::NONE)
InvertedMask &= ~SchedGroupMask::DS;
+ LLVM_DEBUG(dbgs() << "After Inverting, SchedGroup Mask: " << (int)InvertedMask
+ << "\n");
+
return InvertedMask;
}
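
Note: the inverted-mask handling above can be sanity-checked with a small standalone C++ model. This is an illustrative sketch only; the bit values of the lower groups are assumptions (only DS, DS_READ, DS_WRITE and the new TRANS bit are visible in the hunk), and the helper below is not the LLVM implementation.

    #include <cassert>
    #include <cstdint>

    // Assumed bit layout; only DS..TRANS appear in the hunk above.
    enum : uint32_t {
      ALU = 1u << 0, VALU = 1u << 1, SALU = 1u << 2, MFMA = 1u << 3,
      VMEM = 1u << 4, VMEM_READ = 1u << 5, VMEM_WRITE = 1u << 6,
      DS = 1u << 7, DS_READ = 1u << 8, DS_WRITE = 1u << 9, TRANS = 1u << 10,
      ALL = ALU | VALU | SALU | MFMA | VMEM | VMEM_READ | VMEM_WRITE | DS |
            DS_READ | DS_WRITE | TRANS
    };

    // Sketch of invertSchedBarrierMask() with the new TRANS handling: groups
    // named in a SCHED_BARRIER mask may cross the barrier, so the SchedGroup
    // pinned to the barrier is built from the complement, with the
    // ALU <-> {VALU, SALU, MFMA, TRANS} implications applied afterwards.
    static uint32_t invertBarrierMask(uint32_t Mask) {
      uint32_t Inv = ~Mask & ALL;
      if ((Inv & ALU) == 0)
        Inv &= ~(VALU | SALU | MFMA | TRANS); // ALU cleared -> clear subclasses
      else if ((Inv & (VALU | SALU | MFMA | TRANS)) != (VALU | SALU | MFMA | TRANS))
        Inv &= ~ALU;                          // any subclass cleared -> clear ALU
      if ((Inv & VMEM) == 0)
        Inv &= ~(VMEM_READ | VMEM_WRITE);
      else if ((Inv & (VMEM_READ | VMEM_WRITE)) != (VMEM_READ | VMEM_WRITE))
        Inv &= ~VMEM;
      if ((Inv & DS) == 0)
        Inv &= ~(DS_READ | DS_WRITE);
      else if ((Inv & (DS_READ | DS_WRITE)) != (DS_READ | DS_WRITE))
        Inv &= ~DS;
      return Inv;
    }

    int main() {
      // A barrier that lets only TRANS cross must also keep ALU out of the
      // complementary group, because ALU now implies TRANS.
      assert((invertBarrierMask(TRANS) & (TRANS | ALU)) == 0);
    }
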
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 9d7443012e3d..541a5b62450d 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -169,11 +169,17 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
@@ -185,10 +191,15 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
@@ -506,9 +517,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, MVT::v12f32, Promote);
AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
- // There are no libcalls of any kind.
- for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I)
- setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+ // Disable most libcalls.
+ for (int I = 0; I < RTLIB::UNKNOWN_LIBCALL; ++I) {
+ if (I < RTLIB::ATOMIC_LOAD || I > RTLIB::ATOMIC_FETCH_NAND_16)
+ setLibcallName(static_cast<RTLIB::Libcall>(I), nullptr);
+ }
setSchedulingPreference(Sched::RegPressure);
setJumpIsExpensive(true);
@@ -556,6 +569,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
ISD::FSUB, ISD::FNEG,
ISD::FABS, ISD::AssertZext,
ISD::AssertSext, ISD::INTRINSIC_WO_CHAIN});
+
+ setMaxAtomicSizeInBitsSupported(64);
}
bool AMDGPUTargetLowering::mayIgnoreSignedZero(SDValue Op) const {
@@ -3055,18 +3070,26 @@ SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) cons
bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
+ bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
- if (Src.getValueType() == MVT::i32) {
+ if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
// (ctlz hi:lo) -> (umin (ffbh src), 32)
// (cttz hi:lo) -> (umin (ffbl src), 32)
// (ctlz_zero_undef src) -> (ffbh src)
// (cttz_zero_undef src) -> (ffbl src)
+
+  // The 64-bit scalar version produces a 32-bit result:
+ // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
+ // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
+ // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
+ // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
if (!ZeroUndef) {
- const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
- NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const32);
+ const SDValue ConstVal = DAG.getConstant(
+ Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
+ NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
}
- return NewOpr;
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
}
SDValue Lo, Hi;
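
The new 64-bit scalar path can be checked with ordinary integer arithmetic. A minimal reference model, assuming S_FLBIT_I32_B64 / S_FF1_I32_B64 return the leading/trailing-zero count of the full 64-bit source and 0xFFFFFFFF for a zero source (this model is an illustration, not DAG code):

    #include <cassert>
    #include <cstdint>

    // Assumed scalar-instruction semantics (32-bit result, -1 on zero input).
    static uint32_t s_flbit_i32_b64(uint64_t v) {
      return v ? uint32_t(__builtin_clzll(v)) : 0xFFFFFFFFu;
    }
    static uint32_t s_ff1_i32_b64(uint64_t v) {
      return v ? uint32_t(__builtin_ctzll(v)) : 0xFFFFFFFFu;
    }

    // Model of the non-zero-undef lowering:
    // (ctlz src) -> (zext (umin (S_FLBIT_I32_B64 src), 64))
    static uint64_t lower_ctlz_i64(uint64_t Src) {
      uint32_t N = s_flbit_i32_b64(Src);
      return uint64_t(N < 64 ? N : 64); // umin with the scalar bit width (64)
    }
    static uint64_t lower_cttz_i64(uint64_t Src) {
      uint32_t N = s_ff1_i32_b64(Src);
      return uint64_t(N < 64 ? N : 64);
    }

    int main() {
      assert(lower_ctlz_i64(0) == 64 && lower_cttz_i64(0) == 64);
      assert(lower_ctlz_i64(1) == 63 && lower_cttz_i64(1) == 0);
      assert(lower_ctlz_i64(0x0000000100000000ull) == 31);
      assert(lower_cttz_i64(0x0000000100000000ull) == 32);
    }
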
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index ee93d9eb4c0a..2bb7b6bd0674 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -1241,6 +1241,10 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
+ // dmask 0 has special semantics, do not simplify.
+ if (DMaskVal == 0)
+ return nullptr;
+
// Mask off values that are undefined because the dmask doesn't cover them
DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
@@ -1261,7 +1265,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
unsigned NewNumElts = DemandedElts.popcount();
if (!NewNumElts)
- return UndefValue::get(IIVTy);
+ return PoisonValue::get(IIVTy);
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
if (DMaskIdx >= 0)
@@ -1299,7 +1303,7 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
if (IsLoad) {
if (NewNumElts == 1) {
- return IC.Builder.CreateInsertElement(UndefValue::get(IIVTy), NewCall,
+ return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
DemandedElts.countr_zero());
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 0c21382e5c22..f03e6b8915b1 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1050,8 +1050,7 @@ bool AMDGPULibCalls::fold_pow(FPMathOperator *FPOp, IRBuilder<> &B,
CF->isNegative();
} else {
needlog = true;
- needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR &&
- (!CF || CF->isNegative());
+ needcopysign = needabs = FInfo.getId() != AMDGPULibFunc::EI_POWR;
}
} else {
ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(opr0);
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 1bed516fb5c7..5e73411cae9b 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -530,6 +530,15 @@ static Value *promoteAllocaUserToVector(
return Builder.CreateVectorSplat(VectorTy->getElementCount(), Elt);
}
+ if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
+ if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
+ Intr->replaceAllUsesWith(
+ Builder.getIntN(Intr->getType()->getIntegerBitWidth(),
+ DL.getTypeAllocSize(VectorTy)));
+ return nullptr;
+ }
+ }
+
llvm_unreachable("Unsupported call when promoting alloca to vector");
}
@@ -773,8 +782,17 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
continue;
}
+ if (auto *Intr = dyn_cast<IntrinsicInst>(Inst)) {
+ if (Intr->getIntrinsicID() == Intrinsic::objectsize) {
+ WorkList.push_back(Inst);
+ continue;
+ }
+ }
+
// Ignore assume-like intrinsics and comparisons used in assumes.
if (isAssumeLikeIntrinsic(Inst)) {
+ if (!Inst->use_empty())
+ return RejectUser(Inst, "assume-like intrinsic cannot have any users");
UsersToRemove.push_back(Inst);
continue;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index e8c04ecf39ba..fdc2077868cf 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -345,6 +345,11 @@ static cl::opt<bool> EnableImageIntrinsicOptimizer(
cl::desc("Enable image intrinsic optimizer pass"), cl::init(true),
cl::Hidden);
+static cl::opt<bool>
+ EnableLoopPrefetch("amdgpu-loop-prefetch",
+ cl::desc("Enable loop data prefetch on AMDGPU"),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool> EnableMaxIlpSchedStrategy(
"amdgpu-enable-max-ilp-scheduling-strategy",
cl::desc("Enable scheduling strategy to maximize ILP for a single wave."),
@@ -982,6 +987,8 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() {
}
void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
+ if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive))
+ addPass(createLoopDataPrefetchPass());
addPass(createSeparateConstOffsetFromGEPPass());
// ReassociateGEPs exposes more opportunities for SLSR. See
// the example in reassociate-geps-and-slsr.ll.
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index f1da1a61bf4d..ebe0b8551b23 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -1345,3 +1345,11 @@ GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
Cost.first += (Size + 255) / 256;
return Cost;
}
+
+unsigned GCNTTIImpl::getPrefetchDistance() const {
+ return ST->hasPrefetch() ? 128 : 0;
+}
+
+bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const {
+ return AMDGPU::isFlatGlobalAddrSpace(AS);
+}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 1e6c5bbfc0d7..cd8e9fd10bbf 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -254,6 +254,16 @@ public:
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
FastMathFlags FMF,
TTI::TargetCostKind CostKind);
+
+ /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12.
+ unsigned getCacheLineSize() const override { return 128; }
+
+ /// How much before a load we should place the prefetch instruction.
+ /// This is currently measured in number of IR instructions.
+ unsigned getPrefetchDistance() const override;
+
+  /// \return true if the target wants to issue a prefetch in address space \p AS.
+ bool shouldPrefetchAddressSpace(unsigned AS) const override;
};
} // end namespace llvm
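
Taken together with the new -amdgpu-loop-prefetch option in AMDGPUTargetMachine.cpp, these hooks let LoopDataPrefetch consider 128-byte-line, 128-instruction-distance prefetching only for flat/global/constant address spaces and only on subtargets that have the prefetch instruction. A rough standalone model of that gating (an illustrative assumption, not the pass itself):

    #include <cassert>

    // Models only what the hunks above expose: ST->hasPrefetch() and
    // AMDGPU::isFlatGlobalAddrSpace(AS) are reduced to booleans here.
    struct GCNPrefetchHooksModel {
      bool HasPrefetchInst; // stands in for ST->hasPrefetch() (GFX12+)
      unsigned getCacheLineSize() const { return 128; }
      unsigned getPrefetchDistance() const { return HasPrefetchInst ? 128 : 0; }
      bool shouldPrefetchAddressSpace(bool IsFlatGlobalOrConstant) const {
        return IsFlatGlobalOrConstant; // stands in for isFlatGlobalAddrSpace(AS)
      }
      // A prefetch is only worth considering when both answers are positive.
      bool wouldConsiderPrefetch(bool IsFlatGlobalOrConstant) const {
        return getPrefetchDistance() != 0 &&
               shouldPrefetchAddressSpace(IsFlatGlobalOrConstant);
      }
    };

    int main() {
      GCNPrefetchHooksModel PreGFX12{false}, GFX12{true};
      assert(!PreGFX12.wouldConsiderPrefetch(true)); // no prefetch instruction
      assert(GFX12.wouldConsiderPrefetch(true));     // flat/global access on GFX12
      assert(!GFX12.wouldConsiderPrefetch(false));   // e.g. LDS access
    }
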
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
index 3a895923fa4b..bc9049b4ef33 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1147,7 +1147,8 @@ def : GCNPat <
>;
} // End SubtargetPredicate = HasAtomicDsPkAdd16Insts
-def : Pat <
+let OtherPredicates = [HasGDS] in
+def : GCNPat <
(SIds_ordered_count i32:$value, i16:$offset),
(DS_ORDERED_COUNT $value, (as_i16imm $offset))
>;
@@ -1189,7 +1190,8 @@ def : GCNPat <
//===----------------------------------------------------------------------===//
class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
- string opName = ps.Mnemonic>
+ string opName = ps.Mnemonic,
+ bit hasGFX12Enc = 0>
: DS_Real<ps, opName>, SIMCInstr <ps.Mnemonic, ef> {
let Inst{7-0} = !if(ps.has_offset0, offset0, 0);
@@ -1201,6 +1203,8 @@ class Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<bits<8> op, DS_Pseudo ps, int ef,
let Inst{47-40} = !if(ps.has_data0, data0{7-0}, 0);
let Inst{55-48} = !if(ps.has_data1, data1{7-0}, 0);
let Inst{63-56} = !if(ps.has_vdst, vdst{7-0}, 0);
+
+ let gds = !if(hasGFX12Enc, 0, ?);
}
//===----------------------------------------------------------------------===//
@@ -1212,7 +1216,7 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in {
defvar ps = !cast<DS_Pseudo>(NAME);
def _gfx12 :
Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, ps, SIEncodingFamily.GFX12,
- ps.Mnemonic>;
+ ps.Mnemonic, 1>;
}
multiclass DS_Real_Renamed_gfx12<bits<8> op, DS_Pseudo backing_pseudo,
@@ -1220,7 +1224,7 @@ let AssemblerPredicate = isGFX12Plus, DecoderNamespace = "GFX12" in {
def _gfx12 :
Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12<op, backing_pseudo,
SIEncodingFamily.GFX12,
- real_name>,
+ real_name, 1>,
MnemonicAlias<backing_pseudo.Mnemonic, real_name>,
Requires<[isGFX12Plus]>;
}
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index ed2e7e4f189e..7939d0036568 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -702,6 +702,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
AMDGPU::OpName::src2_modifiers);
}
+ if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
+ !AMDGPU::hasGDS(STI)) {
+ insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
+ }
+
if (Res && (MCII->get(MI.getOpcode()).TSFlags &
(SIInstrFlags::MUBUF | SIInstrFlags::FLAT | SIInstrFlags::SMRD))) {
int CPolPos = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 34826809c1a6..fc119aa61d01 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -540,10 +540,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
MVT::f16, Custom);
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
-
- setOperationAction(
- {ISD::FP_TO_SINT, ISD::FP_TO_UINT, ISD::SINT_TO_FP, ISD::UINT_TO_FP},
- MVT::f16, Promote);
+ setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::f16, Promote);
// F16 - VOP2 Actions.
setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, MVT::f16, Expand);
@@ -1145,11 +1142,10 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
MachineMemOperand::MOStore |
MachineMemOperand::MODereferenceable;
- // XXX - Should this be volatile without known ordering?
- Info.flags |= MachineMemOperand::MOVolatile;
-
switch (IntrID) {
default:
+ // XXX - Should this be volatile without known ordering?
+ Info.flags |= MachineMemOperand::MOVolatile;
break;
case Intrinsic::amdgcn_raw_buffer_load_lds:
case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
@@ -1157,6 +1153,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: {
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
+ Info.ptrVal = CI.getArgOperand(1);
return true;
}
}
@@ -1289,8 +1286,8 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.opc = ISD::INTRINSIC_VOID;
unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
- MachineMemOperand::MOVolatile;
+ Info.ptrVal = CI.getArgOperand(1);
+ Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
case Intrinsic::amdgcn_ds_bvh_stack_rtn: {
@@ -9231,7 +9228,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
MachinePointerInfo StorePtrI = LoadPtrI;
- StorePtrI.V = nullptr;
+ LoadPtrI.V = PoisonValue::get(
+ PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
+ LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
@@ -9309,6 +9308,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
MachinePointerInfo LoadPtrI = LoadMMO->getPointerInfo();
LoadPtrI.Offset = Op->getConstantOperandVal(5);
MachinePointerInfo StorePtrI = LoadPtrI;
+ LoadPtrI.V = PoisonValue::get(
+ PointerType::get(*DAG.getContext(), AMDGPUAS::GLOBAL_ADDRESS));
LoadPtrI.AddrSpace = AMDGPUAS::GLOBAL_ADDRESS;
StorePtrI.AddrSpace = AMDGPUAS::LOCAL_ADDRESS;
auto F = LoadMMO->getFlags() &
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index 8415a3d77d3b..55ddb540c51e 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -238,7 +238,7 @@ public:
bool merge(const WaitcntBrackets &Other);
- RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
+ RegInterval getRegInterval(const MachineInstr *MI,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI, unsigned OpNo) const;
@@ -500,7 +500,6 @@ public:
} // end anonymous namespace
RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
- const SIInstrInfo *TII,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI,
unsigned OpNo) const {
@@ -534,7 +533,7 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
else
return {-1, -1};
- const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
+ const TargetRegisterClass *RC = TRI->getPhysRegBaseClass(Op.getReg());
unsigned Size = TRI->getRegSizeInBits(*RC);
Result.second = Result.first + ((Size + 16) / 32);
@@ -546,7 +545,7 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, unsigned OpNo,
unsigned Val) {
- RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
+ RegInterval Interval = getRegInterval(MI, MRI, TRI, OpNo);
assert(TRI->isVectorRegister(*MRI, MI->getOperand(OpNo).getReg()));
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, EXP_CNT, Val);
@@ -674,7 +673,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
unsigned OpNo;//TODO: find the OpNo for this operand;
- RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
+ RegInterval Interval = getRegInterval(&Inst, MRI, TRI, OpNo);
for (int RegNo = Interval.first; RegNo < Interval.second;
++RegNo) {
setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
@@ -686,7 +685,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
auto &Op = Inst.getOperand(I);
if (!Op.isReg() || !Op.isDef())
continue;
- RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
+ RegInterval Interval = getRegInterval(&Inst, MRI, TRI, I);
if (T == VM_CNT) {
if (Interval.first >= NUM_ALL_VGPRS)
continue;
@@ -1140,7 +1139,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (MI.getOperand(CallAddrOpIdx).isReg()) {
RegInterval CallAddrOpInterval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
+ ScoreBrackets.getRegInterval(&MI, MRI, TRI, CallAddrOpIdx);
for (int RegNo = CallAddrOpInterval.first;
RegNo < CallAddrOpInterval.second; ++RegNo)
@@ -1150,7 +1149,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
if (RtnAddrOpIdx != -1) {
RegInterval RtnAddrOpInterval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
+ ScoreBrackets.getRegInterval(&MI, MRI, TRI, RtnAddrOpIdx);
for (int RegNo = RtnAddrOpInterval.first;
RegNo < RtnAddrOpInterval.second; ++RegNo)
@@ -1202,8 +1201,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
if (Op.isTied() && Op.isUse() && TII->doesNotReadTiedSource(MI))
continue;
- RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
+ RegInterval Interval = ScoreBrackets.getRegInterval(&MI, MRI, TRI, I);
const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
@@ -1782,7 +1780,7 @@ bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
MachineOperand &Op = MI.getOperand(I);
if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
continue;
- RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
+ RegInterval Interval = Brackets.getRegInterval(&MI, MRI, TRI, I);
// Vgpr use
if (Op.isUse()) {
for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 70ef1fff274a..ebe23a5eac57 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -245,6 +245,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1,
if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad())
return false;
+ // A mayLoad instruction without a def is not a load. Likely a prefetch.
+ if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs())
+ return false;
+
if (isDS(Opc0) && isDS(Opc1)) {
// FIXME: Handle this case:
@@ -3654,6 +3658,9 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (MIa.hasOrderedMemoryRef() || MIb.hasOrderedMemoryRef())
return false;
+ if (isLDSDMA(MIa) || isLDSDMA(MIb))
+ return false;
+
// TODO: Should we check the address space from the MachineMemOperand? That
// would allow us to distinguish objects we know don't alias based on the
// underlying address space, even if it was lowered to a different one,
@@ -4976,6 +4983,14 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
}
+ if (isDS(MI) && !ST.hasGDS()) {
+ const MachineOperand *GDSOp = getNamedOperand(MI, AMDGPU::OpName::gds);
+ if (GDSOp && GDSOp->getImm() != 0) {
+ ErrInfo = "GDS is not supported on this subtarget";
+ return false;
+ }
+ }
+
if (isImage(MI)) {
const MachineOperand *DimOp = getNamedOperand(MI, AMDGPU::OpName::dim);
if (DimOp) {
@@ -6897,6 +6912,15 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
Inst.eraseFromParent();
return;
+ case AMDGPU::S_FLBIT_I32_B64:
+ splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBH_U32_e32);
+ Inst.eraseFromParent();
+ return;
+ case AMDGPU::S_FF1_I32_B64:
+ splitScalar64BitCountOp(Worklist, Inst, AMDGPU::V_FFBL_B32_e32);
+ Inst.eraseFromParent();
+ return;
+
case AMDGPU::S_LSHL_B32:
if (ST.hasOnlyRevVALUShifts()) {
NewOpcode = AMDGPU::V_LSHLREV_B32_e64;
@@ -7830,6 +7854,61 @@ void SIInstrInfo::splitScalar64BitBFE(SIInstrWorklist &Worklist,
addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
}
+void SIInstrInfo::splitScalar64BitCountOp(SIInstrWorklist &Worklist,
+ MachineInstr &Inst, unsigned Opcode,
+ MachineDominatorTree *MDT) const {
+  // (S_FLBIT_I32_B64 hi:lo) ->
+  //   (umin (V_FFBH_U32_e32 hi), (uaddsat (V_FFBH_U32_e32 lo), 32))
+  // (S_FF1_I32_B64 hi:lo) ->
+  //   (umin (uaddsat (V_FFBL_B32_e32 hi), 32), (V_FFBL_B32_e32 lo))
+
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ const DebugLoc &DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src = Inst.getOperand(1);
+
+ const MCInstrDesc &InstDesc = get(Opcode);
+
+ bool IsCtlz = Opcode == AMDGPU::V_FFBH_U32_e32;
+ unsigned OpcodeAdd =
+ ST.hasAddNoCarry() ? AMDGPU::V_ADD_U32_e64 : AMDGPU::V_ADD_CO_U32_e32;
+
+ const TargetRegisterClass *SrcRC =
+ Src.isReg() ? MRI.getRegClass(Src.getReg()) : &AMDGPU::SGPR_32RegClass;
+ const TargetRegisterClass *SrcSubRC =
+ RI.getSubRegisterClass(SrcRC, AMDGPU::sub0);
+
+ MachineOperand SrcRegSub0 =
+ buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub0, SrcSubRC);
+ MachineOperand SrcRegSub1 =
+ buildExtractSubRegOrImm(MII, MRI, Src, SrcRC, AMDGPU::sub1, SrcSubRC);
+
+ Register MidReg1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register MidReg2 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register MidReg3 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register MidReg4 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, MII, DL, InstDesc, MidReg1).add(SrcRegSub0);
+
+ BuildMI(MBB, MII, DL, InstDesc, MidReg2).add(SrcRegSub1);
+
+ BuildMI(MBB, MII, DL, get(OpcodeAdd), MidReg3)
+ .addReg(IsCtlz ? MidReg1 : MidReg2)
+ .addImm(32)
+ .addImm(1); // enable clamp
+
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_MIN_U32_e64), MidReg4)
+ .addReg(MidReg3)
+ .addReg(IsCtlz ? MidReg2 : MidReg1);
+
+ MRI.replaceRegWith(Dest.getReg(), MidReg4);
+
+ addUsersToMoveToVALUWorklist(MidReg4, MRI, Worklist);
+}
+
void SIInstrInfo::addUsersToMoveToVALUWorklist(
Register DstReg, MachineRegisterInfo &MRI,
SIInstrWorklist &Worklist) const {
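
The VALU expansion emitted by splitScalar64BitCountOp relies on an identity over the 32-bit halves that is easy to verify in plain C++. A minimal sketch, assuming V_FFBH_U32 / V_FFBL_B32 return 0xFFFFFFFF for a zero input and that V_ADD with clamp saturates (uaddsat):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    // Assumed VALU semantics: bit position counted from the MSB (ffbh) or
    // LSB (ffbl) within 32 bits, 0xFFFFFFFF when the input is zero.
    static uint32_t ffbh(uint32_t v) { return v ? uint32_t(__builtin_clz(v)) : 0xFFFFFFFFu; }
    static uint32_t ffbl(uint32_t v) { return v ? uint32_t(__builtin_ctz(v)) : 0xFFFFFFFFu; }
    static uint32_t uaddsat(uint32_t a, uint32_t b) { // V_ADD_U32 with clamp
      uint64_t S = uint64_t(a) + b;
      return S > 0xFFFFFFFFu ? 0xFFFFFFFFu : uint32_t(S);
    }

    // The two expansions built above, written over hi:lo halves:
    //   S_FLBIT_I32_B64 -> umin(ffbh(hi), uaddsat(ffbh(lo), 32))
    //   S_FF1_I32_B64   -> umin(uaddsat(ffbl(hi), 32), ffbl(lo))
    static uint32_t ctlz64_split(uint64_t x) {
      uint32_t Lo = uint32_t(x), Hi = uint32_t(x >> 32);
      return std::min(ffbh(Hi), uaddsat(ffbh(Lo), 32));
    }
    static uint32_t cttz64_split(uint64_t x) {
      uint32_t Lo = uint32_t(x), Hi = uint32_t(x >> 32);
      return std::min(uaddsat(ffbl(Hi), 32), ffbl(Lo));
    }

    int main() {
      for (uint64_t Bit = 0; Bit < 64; ++Bit) {
        uint64_t V = 1ull << Bit;
        assert(ctlz64_split(V) == 63 - Bit);
        assert(cttz64_split(V) == Bit);
      }
      // Zero input keeps the "no bit found" value of the scalar originals.
      assert(ctlz64_split(0) == 0xFFFFFFFFu && cttz64_split(0) == 0xFFFFFFFFu);
    }
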
diff --git a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index affe52046752..46eee6fae0a5 100644
--- a/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/contrib/llvm-project/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -144,6 +144,9 @@ private:
void splitScalar64BitBCNT(SIInstrWorklist &Worklist,
MachineInstr &Inst) const;
void splitScalar64BitBFE(SIInstrWorklist &Worklist, MachineInstr &Inst) const;
+ void splitScalar64BitCountOp(SIInstrWorklist &Worklist, MachineInstr &Inst,
+ unsigned Opcode,
+ MachineDominatorTree *MDT = nullptr) const;
void movePackToVALU(SIInstrWorklist &Worklist, MachineRegisterInfo &MRI,
MachineInstr &Inst) const;