12 files changed, 194 insertions, 60 deletions
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/AliasAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/AliasAnalysis.h
index d6f732d35fd4..e8e4f491be5a 100644
--- a/contrib/llvm-project/llvm/include/llvm/Analysis/AliasAnalysis.h
+++ b/contrib/llvm-project/llvm/include/llvm/Analysis/AliasAnalysis.h
@@ -287,6 +287,10 @@ public:
   ///   store %l, ...
   bool MayBeCrossIteration = false;
 
+  /// Whether alias analysis is allowed to use the dominator tree, for use by
+  /// passes that lazily update the DT while performing AA queries.
+  bool UseDominatorTree = true;
+
   AAQueryInfo(AAResults &AAR, CaptureInfo *CI) : AAR(AAR), CI(CI) {}
 };
 
@@ -668,6 +672,9 @@ public:
   void enableCrossIterationMode() {
     AAQI.MayBeCrossIteration = true;
   }
+
+  /// Disable the use of the dominator tree during alias analysis queries.
+  void disableDominatorTree() { AAQI.UseDominatorTree = false; }
 };
 
 /// Temporary typedef for legacy code that uses a generic \c AliasAnalysis
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/BasicAliasAnalysis.h b/contrib/llvm-project/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
index afc1811239f2..7eca82729430 100644
--- a/contrib/llvm-project/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
+++ b/contrib/llvm-project/llvm/include/llvm/Analysis/BasicAliasAnalysis.h
@@ -43,20 +43,26 @@ class BasicAAResult : public AAResultBase {
   const Function &F;
   const TargetLibraryInfo &TLI;
   AssumptionCache &AC;
-  DominatorTree *DT;
+  /// Use getDT() instead of accessing this member directly, in order to
+  /// respect the AAQI.UseDominatorTree option.
+  DominatorTree *DT_;
+
+  DominatorTree *getDT(const AAQueryInfo &AAQI) const {
+    return AAQI.UseDominatorTree ? DT_ : nullptr;
+  }
 
 public:
   BasicAAResult(const DataLayout &DL, const Function &F,
                 const TargetLibraryInfo &TLI, AssumptionCache &AC,
                 DominatorTree *DT = nullptr)
-      : DL(DL), F(F), TLI(TLI), AC(AC), DT(DT) {}
+      : DL(DL), F(F), TLI(TLI), AC(AC), DT_(DT) {}
 
   BasicAAResult(const BasicAAResult &Arg)
       : AAResultBase(Arg), DL(Arg.DL), F(Arg.F), TLI(Arg.TLI), AC(Arg.AC),
-        DT(Arg.DT) {}
+        DT_(Arg.DT_) {}
   BasicAAResult(BasicAAResult &&Arg)
       : AAResultBase(std::move(Arg)), DL(Arg.DL), F(Arg.F), TLI(Arg.TLI),
-        AC(Arg.AC), DT(Arg.DT) {}
+        AC(Arg.AC), DT_(Arg.DT_) {}
 
   /// Handle invalidation events in the new pass manager.
   bool invalidate(Function &Fn, const PreservedAnalyses &PA,
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/contrib/llvm-project/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
index 6b9d17818201..91e1872e9bd6 100644
--- a/contrib/llvm-project/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/contrib/llvm-project/llvm/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -122,16 +122,26 @@ public:
   }
 
   BranchProbabilityInfo(BranchProbabilityInfo &&Arg)
-      : Probs(std::move(Arg.Probs)), LastF(Arg.LastF),
-        EstimatedBlockWeight(std::move(Arg.EstimatedBlockWeight)) {}
+      : Handles(std::move(Arg.Handles)), Probs(std::move(Arg.Probs)),
+        LastF(Arg.LastF),
+        EstimatedBlockWeight(std::move(Arg.EstimatedBlockWeight)) {
+    // Handles carry a back-pointer to their owning BPI; retarget it to this.
+    for (auto &Handle : Handles)
+      Handle.setBPI(this);
+  }
 
   BranchProbabilityInfo(const BranchProbabilityInfo &) = delete;
   BranchProbabilityInfo &operator=(const BranchProbabilityInfo &) = delete;
 
   BranchProbabilityInfo &operator=(BranchProbabilityInfo &&RHS) {
     releaseMemory();
+    Handles = std::move(RHS.Handles);
     Probs = std::move(RHS.Probs);
+    LastF = RHS.LastF; // Match the move constructor, which also takes LastF.
     EstimatedBlockWeight = std::move(RHS.EstimatedBlockWeight);
+    // Handles carry a back-pointer to their owning BPI; retarget it to this.
+    for (auto &Handle : Handles)
+      Handle.setBPI(this);
     return *this;
   }
 
@@ -279,6 +289,8 @@ private:
     }
 
   public:
+    void setBPI(BranchProbabilityInfo *BPI) { this->BPI = BPI; }
+
     BasicBlockCallbackVH(const Value *V, BranchProbabilityInfo *BPI = nullptr)
         : CallbackVH(const_cast<Value *>(V)), BPI(BPI) {}
   };
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/Loads.h b/contrib/llvm-project/llvm/include/llvm/Analysis/Loads.h
index 2880ed33a34c..0926093bba99 100644
--- a/contrib/llvm-project/llvm/include/llvm/Analysis/Loads.h
+++ b/contrib/llvm-project/llvm/include/llvm/Analysis/Loads.h
@@ -18,7 +18,7 @@
 
 namespace llvm {
 
-class AAResults;
+class BatchAAResults;
 class AssumptionCache;
 class DataLayout;
 class DominatorTree;
@@ -129,11 +129,10 @@ extern cl::opt<unsigned> DefMaxInstsToScan;
 /// location in memory, as opposed to the value operand of a store.
 ///
 /// \returns The found value, or nullptr if no value is found.
-Value *FindAvailableLoadedValue(LoadInst *Load,
-                                BasicBlock *ScanBB,
+Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
                                 BasicBlock::iterator &ScanFrom,
                                 unsigned MaxInstsToScan = DefMaxInstsToScan,
-                                AAResults *AA = nullptr,
+                                BatchAAResults *AA = nullptr,
                                 bool *IsLoadCSE = nullptr,
                                 unsigned *NumScanedInst = nullptr);
 
@@ -141,7 +140,8 @@ Value *FindAvailableLoadedValue(LoadInst *Load,
 /// FindAvailableLoadedValue() for the case where we are not interested in
 /// finding the closest clobbering instruction if no available load is found.
 /// This overload cannot be used to scan across multiple blocks.
-Value *FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, bool *IsLoadCSE,
+Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
+                                bool *IsLoadCSE,
                                 unsigned MaxInstsToScan = DefMaxInstsToScan);
 
 /// Scan backwards to see if we have the value of the given pointer available
@@ -170,7 +170,7 @@ Value *FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, bool *IsLoadCSE,
 Value *findAvailablePtrLoadStore(const MemoryLocation &Loc, Type *AccessTy,
                                  bool AtLeastAtomic, BasicBlock *ScanBB,
                                  BasicBlock::iterator &ScanFrom,
-                                 unsigned MaxInstsToScan, AAResults *AA,
+                                 unsigned MaxInstsToScan, BatchAAResults *AA,
                                  bool *IsLoadCSE, unsigned *NumScanedInst);
 
 /// Returns true if a pointer value \p A can be replace with another pointer
diff --git a/contrib/llvm-project/llvm/include/llvm/Analysis/VecFuncs.def b/contrib/llvm-project/llvm/include/llvm/Analysis/VecFuncs.def
index f09e12f3038c..07edf68c667a 100644
--- a/contrib/llvm-project/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/contrib/llvm-project/llvm/include/llvm/Analysis/VecFuncs.def
@@ -771,8 +771,8 @@ TLI_DEFINE_VECFUNC("log2f", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("llvm.log2.f64", "_ZGVsMxv_log2", SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("llvm.log2.f32", "_ZGVsMxv_log2f", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("modf", "_ZGVsMxvl8_modf", SCALABLE(2), MASKED, "_ZGVsMxvl8")
-TLI_DEFINE_VECFUNC("modff", "_ZGVsMxvl4_modff", SCALABLE(4), MASKED, "_ZGVsMxvl4")
+TLI_DEFINE_VECFUNC("modf", "_ZGVsNxvl8_modf", SCALABLE(2), NOMASK, "_ZGVsNxvl8")
+TLI_DEFINE_VECFUNC("modff", "_ZGVsNxvl4_modff", SCALABLE(4), NOMASK, "_ZGVsNxvl4")
 
 TLI_DEFINE_VECFUNC("nextafter", "_ZGVsMxvv_nextafter", SCALABLE(2), MASKED, "_ZGVsMxvv")
 TLI_DEFINE_VECFUNC("nextafterf", "_ZGVsMxvv_nextafterf", SCALABLE(4), MASKED, "_ZGVsMxvv")
@@ -787,11 +787,11 @@ TLI_DEFINE_VECFUNC("sinf", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("llvm.sin.f64", "_ZGVsMxv_sin", SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("llvm.sin.f32", "_ZGVsMxv_sinf", SCALABLE(4), MASKED, "_ZGVsMxv")
 
-TLI_DEFINE_VECFUNC("sincos", "_ZGVsMxvl8l8_sincos", SCALABLE(2), MASKED, "_ZGVsMxvl8l8")
-TLI_DEFINE_VECFUNC("sincosf", "_ZGVsMxvl4l4_sincosf", SCALABLE(4), MASKED, "_ZGVsMxvl4l4")
+TLI_DEFINE_VECFUNC("sincos", "_ZGVsNxvl8l8_sincos", SCALABLE(2), NOMASK, "_ZGVsNxvl8l8")
+TLI_DEFINE_VECFUNC("sincosf", "_ZGVsNxvl4l4_sincosf", SCALABLE(4), NOMASK, "_ZGVsNxvl4l4")
 
-TLI_DEFINE_VECFUNC("sincospi", "_ZGVsMxvl8l8_sincospi", SCALABLE(2), MASKED, "_ZGVsMxvl8l8")
-TLI_DEFINE_VECFUNC("sincospif", "_ZGVsMxvl4l4_sincospif", SCALABLE(4), MASKED, "_ZGVsMxvl4l4")
+TLI_DEFINE_VECFUNC("sincospi", "_ZGVsNxvl8l8_sincospi", SCALABLE(2), NOMASK, "_ZGVsNxvl8l8")
+TLI_DEFINE_VECFUNC("sincospif", "_ZGVsNxvl4l4_sincospif", SCALABLE(4), NOMASK, "_ZGVsNxvl4l4")
 
 TLI_DEFINE_VECFUNC("sinh", "_ZGVsMxv_sinh",  SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("sinhf", "_ZGVsMxv_sinhf", SCALABLE(4), MASKED, "_ZGVsMxv")
@@ -1005,8 +1005,6 @@ TLI_DEFINE_VECFUNC("llvm.log2.f32", "armpl_svlog2_f32_x", SCALABLE(4), MASKED, "
 
 TLI_DEFINE_VECFUNC("modf", "armpl_vmodfq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vl8")
 TLI_DEFINE_VECFUNC("modff", "armpl_vmodfq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4")
-TLI_DEFINE_VECFUNC("modf", "armpl_svmodf_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvl8")
-TLI_DEFINE_VECFUNC("modff", "armpl_svmodf_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvl4")
 
 TLI_DEFINE_VECFUNC("nextafter", "armpl_vnextafterq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
 TLI_DEFINE_VECFUNC("nextafterf", "armpl_vnextafterq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
@@ -1035,13 +1033,9 @@ TLI_DEFINE_VECFUNC("llvm.sin.f32", "armpl_svsin_f32_x", SCALABLE(4), MASKED, "_Z
 
 TLI_DEFINE_VECFUNC("sincos", "armpl_vsincosq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vl8l8")
 TLI_DEFINE_VECFUNC("sincosf", "armpl_vsincosq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4l4")
-TLI_DEFINE_VECFUNC("sincos", "armpl_svsincos_f64_x",  SCALABLE(2), MASKED, "_ZGVsMxvl8l8")
-TLI_DEFINE_VECFUNC("sincosf", "armpl_svsincos_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvl4l4")
 
 TLI_DEFINE_VECFUNC("sincospi", "armpl_vsincospiq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vl8l8")
 TLI_DEFINE_VECFUNC("sincospif", "armpl_vsincospiq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vl4l4")
-TLI_DEFINE_VECFUNC("sincospi", "armpl_svsincospi_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvl8l8")
-TLI_DEFINE_VECFUNC("sincospif", "armpl_svsincospi_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvl4l4")
 
 TLI_DEFINE_VECFUNC("sinh", "armpl_vsinhq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("sinhf", "armpl_vsinhq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/LivePhysRegs.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/LivePhysRegs.h
index 76bb34d270a2..1d40b1cbb0ea 100644
--- a/contrib/llvm-project/llvm/include/llvm/CodeGen/LivePhysRegs.h
+++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/LivePhysRegs.h
@@ -193,11 +193,18 @@ void addLiveIns(MachineBasicBlock &MBB, const LivePhysRegs &LiveRegs);
 void computeAndAddLiveIns(LivePhysRegs &LiveRegs,
                           MachineBasicBlock &MBB);
 
-/// Convenience function for recomputing live-in's for \p MBB.
-static inline void recomputeLiveIns(MachineBasicBlock &MBB) {
+/// Recompute the live-ins of \p MBB from scratch. Returns true if the new list
+/// (after sortUniqueLiveIns()) compares unequal to the list held on entry.
+static inline bool recomputeLiveIns(MachineBasicBlock &MBB) {
   LivePhysRegs LPR;
+  const auto OldLiveIns = MBB.getLiveIns(); // Snapshot (copy) before clearing.
+
   MBB.clearLiveIns();
   computeAndAddLiveIns(LPR, MBB);
+  MBB.sortUniqueLiveIns();
+
+  const auto &NewLiveIns = MBB.getLiveIns();
+  return OldLiveIns != NewLiveIns;
 }
 
 } // end namespace llvm
diff --git a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBasicBlock.h
index c84fd281c6a5..dc2035fa598c 100644
--- a/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/contrib/llvm-project/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -111,6 +111,10 @@ public:
 
     RegisterMaskPair(MCPhysReg PhysReg, LaneBitmask LaneMask)
         : PhysReg(PhysReg), LaneMask(LaneMask) {}
+
+    bool operator==(const RegisterMaskPair &other) const {
+      return PhysReg == other.PhysReg && LaneMask == other.LaneMask;
+    }
   };
 
 private:
@@ -473,6 +477,9 @@ public:
   /// Remove entry from the livein set and return iterator to the next.
   livein_iterator removeLiveIn(livein_iterator I);
 
+  /// Returns the live-in list by const reference; copy it to keep a snapshot.
+  const std::vector<RegisterMaskPair> &getLiveIns() const { return LiveIns; }
+
   class liveout_iterator {
   public:
     using iterator_category = std::input_iterator_tag;
diff --git a/contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e6db9da5526a..c5f43d17d1c1 100644
--- a/contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/contrib/llvm-project/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2601,6 +2601,11 @@ def int_amdgcn_ds_bvh_stack_rtn :
     [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
   >;
 
+def int_amdgcn_s_wait_event_export_ready :
+  ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
+  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
+>;
+
 // WMMA (Wave Matrix Multiply-Accumulate) intrinsics
 //
 // These operations perform a matrix multiplication and accumulation of
@@ -2608,10 +2613,10 @@ def int_amdgcn_ds_bvh_stack_rtn :
 
 class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
   Intrinsic<
-    [CD],               // %D
+    [CD], // %D
     [
       AB,               // %A
-      AB,               // %B
+      LLVMMatchType<1>, // %B
       LLVMMatchType<0>, // %C
     ],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
@@ -2619,49 +2624,50 @@ class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
 
 class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
   Intrinsic<
-    [CD],               // %D
+    [CD], // %D
     [
       AB,               // %A
-      AB,               // %B
+      LLVMMatchType<1>, // %B
       LLVMMatchType<0>, // %C
-      llvm_i1_ty,       // %high
+      llvm_i1_ty,       // %high (op_sel) for GFX11, 0 for GFX12
     ],
     [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
 class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
   Intrinsic<
-    [CD],               // %D
+    [CD], // %D
     [
       llvm_i1_ty,       // %A_sign
       AB,               // %A
       llvm_i1_ty,       // %B_sign
-      AB,               // %B
+      LLVMMatchType<1>, // %B
       LLVMMatchType<0>, // %C
       llvm_i1_ty,       // %clamp
     ],
     [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
-def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
-// The regular, untied f16/bf16 wmma intrinsics only write to one half
-// of the registers (set via the op_sel bit).
-// The content of the other 16-bit of the registers is undefined.
-def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
-// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix
-// registers to the input accumulator registers.
-// Essentially, the content of the other 16-bit is preserved from the input.
-def int_amdgcn_wmma_f16_16x16x16_f16_tied   : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
-def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
-def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
+// WMMA GFX11Only
 
-def int_amdgcn_s_wait_event_export_ready :
-  ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
-  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
->;
+// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
+// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
+// The content of the other 16-bit half is preserved from the input.
+def int_amdgcn_wmma_f16_16x16x16_f16_tied   : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
+
+// WMMA GFX11Plus
+
+def int_amdgcn_wmma_f32_16x16x16_f16   : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf16  : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_i32_16x16x16_iu8   : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_wmma_i32_16x16x16_iu4   : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+
+// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
+//        The content of the other 16-bit half is undefined.
+// GFX12: The op_sel bit must be 0.
+def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
 
 //===----------------------------------------------------------------------===//
 // GFX12 Intrinsics
@@ -2681,6 +2687,65 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
             [IntrNoMem, IntrConvergent, IntrWillReturn,
              ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
 
+
+// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
+//
+// These operations perform a matrix multiplication and accumulation of
+// the form: D = A * B + C .
+
+// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by LLVM, we use <2 x i32>.
+def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+// A and B are <16 x iu4>.
+def int_amdgcn_wmma_i32_16x16x32_iu4     : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+
+// SWMMAC (Sparse Wave Matrix Multiply-Accumulate) intrinsics
+//
+// These operations perform a sparse matrix multiplication and accumulation of
+// the form: D = A * B + C.
+// A is a sparse matrix, half the size of B, and is expanded using the sparsity index.
+
+class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
+  Intrinsic<
+    [CD],               // %D
+    [
+      A,                // %A
+      B,                // %B
+      LLVMMatchType<0>, // %C
+      Index             // %Sparsity index for A
+    ],
+    [IntrNoMem, IntrConvergent, IntrWillReturn]
+>;
+
+class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
+  Intrinsic<
+    [CD],               // %D
+    [
+      llvm_i1_ty,       // %A_sign
+      A,                // %A
+      llvm_i1_ty,       // %B_sign
+      B,                // %B
+      LLVMMatchType<0>, // %C
+      Index,            // %Sparsity index for A
+      llvm_i1_ty,       // %clamp
+    ],
+    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
+>;
+
+def int_amdgcn_swmmac_f32_16x16x32_f16     : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_bf16    : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f16_16x16x32_f16     : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_bf16_16x16x32_bf16   : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_i32_16x16x32_iu8     : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_i32_16x16x32_iu4     : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_i32_16x16x64_iu4     : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+
 def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
 
 def int_amdgcn_flat_atomic_fmin_num   : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
@@ -2712,6 +2777,10 @@ class AMDGPULoadTr<LLVMType ptr_ty>:
 
 def int_amdgcn_global_load_tr : AMDGPULoadTr<global_ptr_ty>;
 
+// i32 @llvm.amdgcn.wave.id()
+def int_amdgcn_wave_id :
+  DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrNoMem, IntrSpeculatable]>;
+
 //===----------------------------------------------------------------------===//
 // Deep learning intrinsics.
 //===----------------------------------------------------------------------===//
diff --git a/contrib/llvm-project/llvm/include/llvm/Support/X86FoldTablesUtils.h b/contrib/llvm-project/llvm/include/llvm/Support/X86FoldTablesUtils.h
index ed244febc38d..77d32cc7fb37 100644
--- a/contrib/llvm-project/llvm/include/llvm/Support/X86FoldTablesUtils.h
+++ b/contrib/llvm-project/llvm/include/llvm/Support/X86FoldTablesUtils.h
@@ -46,11 +46,12 @@ enum {
   // Broadcast type.
   // (stored in bits 12 - 14)
   TB_BCAST_TYPE_SHIFT = TB_ALIGN_SHIFT + 3,
-  TB_BCAST_D = 0 << TB_BCAST_TYPE_SHIFT,
-  TB_BCAST_Q = 1 << TB_BCAST_TYPE_SHIFT,
-  TB_BCAST_SS = 2 << TB_BCAST_TYPE_SHIFT,
-  TB_BCAST_SD = 3 << TB_BCAST_TYPE_SHIFT,
-  TB_BCAST_SH = 4 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_W = 0 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_D = 1 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_Q = 2 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_SS = 3 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_SD = 4 << TB_BCAST_TYPE_SHIFT,
+  TB_BCAST_SH = 5 << TB_BCAST_TYPE_SHIFT,
   TB_BCAST_MASK = 0x7 << TB_BCAST_TYPE_SHIFT,
 
   // Unused bits 15-16
diff --git a/contrib/llvm-project/llvm/include/llvm/Target/TargetInstrPredicate.td b/contrib/llvm-project/llvm/include/llvm/Target/TargetInstrPredicate.td
index 82c4c7b23a49..b5419cb9f386 100644
--- a/contrib/llvm-project/llvm/include/llvm/Target/TargetInstrPredicate.td
+++ b/contrib/llvm-project/llvm/include/llvm/Target/TargetInstrPredicate.td
@@ -152,6 +152,34 @@ class CheckImmOperand_s<int Index, string Value> : CheckOperandBase<Index> {
   string ImmVal = Value;
 }
 
+// Check that the operand at position `Index` is less than `Imm`.
+// If field `FunctionMapper` is a non-empty string, then function
+// `FunctionMapper` is applied to the operand value, and the return value is then
+// compared against `Imm`.
+class CheckImmOperandLT<int Index, int Imm> : CheckOperandBase<Index> {
+  int ImmVal = Imm;
+}
+
+// Check that the operand at position `Index` is greater than `Imm`.
+// If field `FunctionMapper` is a non-empty string, then function
+// `FunctionMapper` is applied to the operand value, and the return value is then
+// compared against `Imm`.
+class CheckImmOperandGT<int Index, int Imm> : CheckOperandBase<Index> {
+  int ImmVal = Imm;
+}
+
+// Check that the operand at position `Index` is less than or equal to `Imm`.
+// If field `FunctionMapper` is a non-empty string, then function
+// `FunctionMapper` is applied to the operand value, and the return value is then
+// compared against `Imm`.
+class CheckImmOperandLE<int Index, int Imm> : CheckNot<CheckImmOperandGT<Index, Imm>>;
+
+// Check that the operand at position `Index` is greater than or equal to `Imm`.
+// If field `FunctionMapper` is a non-empty string, then function
+// `FunctionMapper` is applied to the operand value, and the return value is then
+// compared against `Imm`.
+class CheckImmOperandGE<int Index, int Imm> : CheckNot<CheckImmOperandLT<Index, Imm>>;
+
 // Expands to a call to `FunctionMapper` if field `FunctionMapper` is set.
 // Otherwise, it expands to a CheckNot<CheckInvalidRegOperand<Index>>.
 class CheckRegOperandSimple<int Index> : CheckOperandBase<Index>;
@@ -203,6 +231,12 @@ class CheckAll<list<MCInstPredicate> Sequence>
 class CheckAny<list<MCInstPredicate> Sequence>
     : CheckPredicateSequence<Sequence>;
 
+// Check that the operand at position `Index` is in range [Start, End].
+// If field `FunctionMapper` is a non-empty string, then function
+// `FunctionMapper` is applied to the operand value, and the return value is then
+// compared against range [Start, End].
+class CheckImmOperandRange<int Index, int Start, int End>
+  : CheckAll<[CheckImmOperandGE<Index, Start>, CheckImmOperandLE<Index, End>]>;
 
 // Used to expand the body of a function predicate. See the definition of
 // TIIPredicate below.
diff --git a/contrib/llvm-project/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/contrib/llvm-project/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index 623fdc21ba65..6d82748d8004 100644
--- a/contrib/llvm-project/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/contrib/llvm-project/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -813,7 +813,8 @@ struct CpuAlias {
   StringRef Name;
 };
 
-inline constexpr CpuAlias CpuAliases[] = {{"grace", "neoverse-v2"}};
+inline constexpr CpuAlias CpuAliases[] = {{"cobalt-100", "neoverse-n2"},
+                                          {"grace", "neoverse-v2"}};
 
 bool getExtensionFeatures(
     const AArch64::ExtensionBitset &Extensions,
diff --git a/contrib/llvm-project/llvm/include/llvm/TargetParser/Triple.h b/contrib/llvm-project/llvm/include/llvm/TargetParser/Triple.h
index 870dc75b1c1f..49ec8de9c528 100644
--- a/contrib/llvm-project/llvm/include/llvm/TargetParser/Triple.h
+++ b/contrib/llvm-project/llvm/include/llvm/TargetParser/Triple.h
@@ -1033,11 +1033,11 @@ public:
            isWindowsCygwinEnvironment() || isOHOSFamily();
   }
 
-  /// Tests whether the target uses TLS Descriptor by default.
+  /// True if the target supports both general-dynamic and TLSDESC, and TLSDESC
+  /// is enabled by default.
   bool hasDefaultTLSDESC() const {
     // TODO: Improve check for other platforms, like Android, and RISC-V
-    // Note: This is currently only used on RISC-V.
-    return isOSBinFormatELF() && isAArch64();
+    return false;
   }
 
   /// Tests whether the target uses -data-sections as default.