Diffstat (limited to 'clang/lib/Headers')
-rw-r--r--  clang/lib/Headers/__clang_cuda_texture_intrinsics.h      2
-rw-r--r--  clang/lib/Headers/__clang_hip_libdevice_declares.h       5
-rw-r--r--  clang/lib/Headers/__clang_hip_math.h                    23
-rw-r--r--  clang/lib/Headers/__clang_hip_runtime_wrapper.h          1
-rw-r--r--  clang/lib/Headers/__clang_hip_stdlib.h                  43
-rw-r--r--  clang/lib/Headers/altivec.h                             28
-rw-r--r--  clang/lib/Headers/amxfp16intrin.h                       58
-rw-r--r--  clang/lib/Headers/amxintrin.h                           32
-rw-r--r--  clang/lib/Headers/arm_acle.h                           151
-rw-r--r--  clang/lib/Headers/arm_neon_sve_bridge.h                  2
-rw-r--r--  clang/lib/Headers/avx512bf16intrin.h                    33
-rw-r--r--  clang/lib/Headers/avx512fintrin.h                        4
-rw-r--r--  clang/lib/Headers/avx512fp16intrin.h                    15
-rw-r--r--  clang/lib/Headers/avx512ifmavlintrin.h                  40
-rw-r--r--  clang/lib/Headers/avx512vlbf16intrin.h                  69
-rw-r--r--  clang/lib/Headers/avx512vlbwintrin.h                   352
-rw-r--r--  clang/lib/Headers/avx512vlfp16intrin.h                   3
-rw-r--r--  clang/lib/Headers/avxifmaintrin.h                      177
-rw-r--r--  clang/lib/Headers/avxintrin.h                           14
-rw-r--r--  clang/lib/Headers/avxneconvertintrin.h                 484
-rw-r--r--  clang/lib/Headers/avxvnniint8intrin.h                  471
-rw-r--r--  clang/lib/Headers/cmpccxaddintrin.h                     70
-rw-r--r--  clang/lib/Headers/cpuid.h                               13
-rw-r--r--  clang/lib/Headers/cuda_wrappers/cmath                   90
-rw-r--r--  clang/lib/Headers/emmintrin.h                           12
-rw-r--r--  clang/lib/Headers/float.h                               27
-rw-r--r--  clang/lib/Headers/gfniintrin.h                          12
-rw-r--r--  clang/lib/Headers/hlsl/hlsl_basic_types.h                3
-rw-r--r--  clang/lib/Headers/hlsl/hlsl_intrinsics.h               208
-rw-r--r--  clang/lib/Headers/immintrin.h                           48
-rw-r--r--  clang/lib/Headers/larchintrin.h                        234
-rw-r--r--  clang/lib/Headers/limits.h                               5
-rw-r--r--  clang/lib/Headers/opencl-c-base.h                       19
-rw-r--r--  clang/lib/Headers/opencl-c.h                           320
-rw-r--r--  clang/lib/Headers/openmp_wrappers/stdlib.h              29
-rw-r--r--  clang/lib/Headers/ppc_wrappers/emmintrin.h               4
-rw-r--r--  clang/lib/Headers/ppc_wrappers/mm_malloc.h               2
-rw-r--r--  clang/lib/Headers/ppc_wrappers/mmintrin.h                4
-rw-r--r--  clang/lib/Headers/ppc_wrappers/pmmintrin.h               4
-rw-r--r--  clang/lib/Headers/ppc_wrappers/smmintrin.h               4
-rw-r--r--  clang/lib/Headers/ppc_wrappers/tmmintrin.h               4
-rw-r--r--  clang/lib/Headers/ppc_wrappers/xmmintrin.h               4
-rw-r--r--  clang/lib/Headers/prfchiintrin.h                        61
-rw-r--r--  clang/lib/Headers/raointintrin.h                       203
-rw-r--r--  clang/lib/Headers/smmintrin.h                            2
-rw-r--r--  clang/lib/Headers/stdarg.h                              30
-rw-r--r--  clang/lib/Headers/stdatomic.h                            9
-rw-r--r--  clang/lib/Headers/stdbool.h                              4
-rw-r--r--  clang/lib/Headers/stddef.h                               9
-rw-r--r--  clang/lib/Headers/stdint.h                             198
-rw-r--r--  clang/lib/Headers/stdnoreturn.h                          2
-rw-r--r--  clang/lib/Headers/unwind.h                               3
-rw-r--r--  clang/lib/Headers/velintrin.h                            2
-rw-r--r--  clang/lib/Headers/x86gprintrin.h                        26
-rw-r--r--  clang/lib/Headers/xmmintrin.h                            3
55 files changed, 3227 insertions, 448 deletions
diff --git a/clang/lib/Headers/__clang_cuda_texture_intrinsics.h b/clang/lib/Headers/__clang_cuda_texture_intrinsics.h
index 3c0f0026f1f0..a71952211237 100644
--- a/clang/lib/Headers/__clang_cuda_texture_intrinsics.h
+++ b/clang/lib/Headers/__clang_cuda_texture_intrinsics.h
@@ -666,6 +666,7 @@ __device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
__tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...));
}
+#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference. However,
// there's no way to convert them to cudaTextureObject_t on C++ level. So, we
// cheat a bit and use inline assembly to do it. It costs us an extra register
@@ -713,6 +714,7 @@ __tex_fetch(__DataT *, __RetT *__ptr,
__tex_fetch_v4<__op>::template __run<__FetchT>(
__tex_handle_to_obj(__handle), __args...));
}
+#endif // CUDA_VERSION
} // namespace __cuda_tex
} // namespace
#pragma pop_macro("__ASM_OUT")
diff --git a/clang/lib/Headers/__clang_hip_libdevice_declares.h b/clang/lib/Headers/__clang_hip_libdevice_declares.h
index 8be848ba2aa3..be25f4b4a050 100644
--- a/clang/lib/Headers/__clang_hip_libdevice_declares.h
+++ b/clang/lib/Headers/__clang_hip_libdevice_declares.h
@@ -288,12 +288,17 @@ __llvm_amdgcn_rsq_f64(double __x) {
__device__ __attribute__((const)) _Float16 __ocml_ceil_f16(_Float16);
__device__ _Float16 __ocml_cos_f16(_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_cvtrtn_f16_f32(float);
+__device__ __attribute__((const)) _Float16 __ocml_cvtrtp_f16_f32(float);
+__device__ __attribute__((const)) _Float16 __ocml_cvtrtz_f16_f32(float);
__device__ __attribute__((pure)) _Float16 __ocml_exp_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp10_f16(_Float16);
__device__ __attribute__((pure)) _Float16 __ocml_exp2_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_floor_f16(_Float16);
__device__ __attribute__((const)) _Float16 __ocml_fma_f16(_Float16, _Float16,
_Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fmax_f16(_Float16, _Float16);
+__device__ __attribute__((const)) _Float16 __ocml_fmin_f16(_Float16, _Float16);
__device__ __attribute__((const)) _Float16 __ocml_fabs_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isinf_f16(_Float16);
__device__ __attribute__((const)) int __ocml_isnan_f16(_Float16);
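
A minimal usage sketch for the new declarations (not part of the patch; the
helper names are illustrative, and a HIP device compile is assumed). The
directed-rounding conversions let a float interval be narrowed to _Float16
without shrinking it, and the new min/max pair gives a natural clamp:

__device__ void __interval_to_f16(float __lo, float __hi,
                                  _Float16 *__rlo, _Float16 *__rhi) {
  *__rlo = __ocml_cvtrtn_f16_f32(__lo); /* round toward -inf */
  *__rhi = __ocml_cvtrtp_f16_f32(__hi); /* round toward +inf */
}

__device__ _Float16 __clamp_f16(_Float16 __x, _Float16 __lo, _Float16 __hi) {
  return __ocml_fmin_f16(__ocml_fmax_f16(__x, __lo), __hi);
}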
diff --git a/clang/lib/Headers/__clang_hip_math.h b/clang/lib/Headers/__clang_hip_math.h
index ef7e087b832c..537dd0fca870 100644
--- a/clang/lib/Headers/__clang_hip_math.h
+++ b/clang/lib/Headers/__clang_hip_math.h
@@ -70,9 +70,9 @@ __DEVICE__ void __static_assert_equal_size() {
#endif
__DEVICE__
-uint64_t __make_mantissa_base8(const char *__tagp) {
+uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) {
uint64_t __r = 0;
- while (__tagp) {
+ while (*__tagp != '\0') {
char __tmp = *__tagp;
if (__tmp >= '0' && __tmp <= '7')
@@ -87,9 +87,9 @@ uint64_t __make_mantissa_base8(const char *__tagp) {
}
__DEVICE__
-uint64_t __make_mantissa_base10(const char *__tagp) {
+uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) {
uint64_t __r = 0;
- while (__tagp) {
+ while (*__tagp != '\0') {
char __tmp = *__tagp;
if (__tmp >= '0' && __tmp <= '9')
@@ -104,9 +104,9 @@ uint64_t __make_mantissa_base10(const char *__tagp) {
}
__DEVICE__
-uint64_t __make_mantissa_base16(const char *__tagp) {
+uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) {
uint64_t __r = 0;
- while (__tagp) {
+ while (*__tagp != '\0') {
char __tmp = *__tagp;
if (__tmp >= '0' && __tmp <= '9')
@@ -125,10 +125,7 @@ uint64_t __make_mantissa_base16(const char *__tagp) {
}
__DEVICE__
-uint64_t __make_mantissa(const char *__tagp) {
- if (!__tagp)
- return 0u;
-
+uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) {
if (*__tagp == '0') {
++__tagp;
@@ -233,7 +230,7 @@ __DEVICE__
float expm1f(float __x) { return __ocml_expm1_f32(__x); }
__DEVICE__
-float fabsf(float __x) { return __ocml_fabs_f32(__x); }
+float fabsf(float __x) { return __builtin_fabsf(__x); }
__DEVICE__
float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); }
@@ -359,7 +356,7 @@ float modff(float __x, float *__iptr) {
}
__DEVICE__
-float nanf(const char *__tagp) {
+float nanf(const char *__tagp __attribute__((nonnull))) {
union {
float val;
struct ieee_float {
@@ -792,7 +789,7 @@ __DEVICE__
double expm1(double __x) { return __ocml_expm1_f64(__x); }
__DEVICE__
-double fabs(double __x) { return __ocml_fabs_f64(__x); }
+double fabs(double __x) { return __builtin_fabs(__x); }
__DEVICE__
double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); }
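
The loop fix above matters: the old condition, while (__tagp), tested the
pointer, which never becomes null as it is advanced, so the parsers walked
past the NUL terminator; testing *__tagp stops at the end of the string, and
the null-pointer case is now handled by the nonnull attribute instead of a
runtime check. A host-side analogue of the fixed base-8 parser, for
illustration only (names are not from the header):

#include <stdint.h>
static uint64_t make_mantissa_base8(const char *tagp) {
  uint64_t r = 0;
  while (*tagp != '\0') { /* stop at the terminator, not on a null pointer */
    char c = *tagp;
    if (c >= '0' && c <= '7')
      r = (r * 8u) + (uint64_t)(c - '0');
    else
      return 0; /* non-octal digit: invalid tag */
    ++tagp;
  }
  return r;
}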
diff --git a/clang/lib/Headers/__clang_hip_runtime_wrapper.h b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
index 10cec58ed12f..0508731de106 100644
--- a/clang/lib/Headers/__clang_hip_runtime_wrapper.h
+++ b/clang/lib/Headers/__clang_hip_runtime_wrapper.h
@@ -113,6 +113,7 @@ __attribute__((weak)) inline __device__ void free(void *__ptr) {
#include <__clang_hip_libdevice_declares.h>
#include <__clang_hip_math.h>
+#include <__clang_hip_stdlib.h>
#if defined(__HIPCC_RTC__)
#include <__clang_hip_cmath.h>
diff --git a/clang/lib/Headers/__clang_hip_stdlib.h b/clang/lib/Headers/__clang_hip_stdlib.h
new file mode 100644
index 000000000000..bd770e2415f9
--- /dev/null
+++ b/clang/lib/Headers/__clang_hip_stdlib.h
@@ -0,0 +1,43 @@
+/*===---- __clang_hip_stdlib.h - Device-side HIP math support --------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __CLANG_HIP_STDLIB_H__
+#define __CLANG_HIP_STDLIB_H__
+
+#if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__)
+#error "This file is for HIP and OpenMP AMDGCN device compilation only."
+#endif
+
+#if !defined(__cplusplus)
+
+#include <limits.h>
+
+#ifdef __OPENMP_AMDGCN__
+#define __DEVICE__ static inline __attribute__((always_inline, nothrow))
+#else
+#define __DEVICE__ static __device__ inline __attribute__((always_inline))
+#endif
+
+__DEVICE__
+int abs(int __x) {
+ int __sgn = __x >> (sizeof(int) * CHAR_BIT - 1);
+ return (__x ^ __sgn) - __sgn;
+}
+__DEVICE__
+long labs(long __x) {
+ long __sgn = __x >> (sizeof(long) * CHAR_BIT - 1);
+ return (__x ^ __sgn) - __sgn;
+}
+__DEVICE__
+long long llabs(long long __x) {
+ long long __sgn = __x >> (sizeof(long long) * CHAR_BIT - 1);
+ return (__x ^ __sgn) - __sgn;
+}
+
+#endif // !defined(__cplusplus)
+
+#endif // __CLANG_HIP_STDLIB_H__
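
The abs/labs/llabs implementations above use a branchless sign trick; a
stand-alone sketch for illustration (assumes clang's arithmetic right shift
for negative signed values, as the header itself does):

#include <limits.h>
static int abs_branchless(int x) {
  int sgn = x >> (sizeof(int) * CHAR_BIT - 1); /* 0 if x >= 0, -1 if x < 0 */
  /* For negative x: (x ^ -1) - (-1) == ~x + 1 == -x; non-negative x is
     unchanged. (abs(INT_MIN) remains undefined, as in the C standard.) */
  return (x ^ sgn) - sgn;
}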
diff --git a/clang/lib/Headers/altivec.h b/clang/lib/Headers/altivec.h
index 0b1e76e81cc7..f50466ec9637 100644
--- a/clang/lib/Headers/altivec.h
+++ b/clang/lib/Headers/altivec.h
@@ -17323,32 +17323,32 @@ provided.
#define vec_ncipherlast_be __builtin_altivec_crypto_vncipherlast
#ifdef __VSX__
-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vsbox(vector unsigned long long __a) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vsbox(vector unsigned char __a) {
return __builtin_altivec_crypto_vsbox(__a);
}
-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vcipher(vector unsigned long long __a,
- vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vcipher(vector unsigned char __a,
+ vector unsigned char __b) {
return __builtin_altivec_crypto_vcipher(__a, __b);
}
-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vcipherlast(vector unsigned long long __a,
- vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vcipherlast(vector unsigned char __a,
+ vector unsigned char __b) {
return __builtin_altivec_crypto_vcipherlast(__a, __b);
}
-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vncipher(vector unsigned long long __a,
- vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vncipher(vector unsigned char __a,
+ vector unsigned char __b) {
return __builtin_altivec_crypto_vncipher(__a, __b);
}
-static __inline__ vector unsigned long long __attribute__((__always_inline__))
-__builtin_crypto_vncipherlast(vector unsigned long long __a,
- vector unsigned long long __b) {
+static __inline__ vector unsigned char __attribute__((__always_inline__))
+__builtin_crypto_vncipherlast(vector unsigned char __a,
+ vector unsigned char __b) {
return __builtin_altivec_crypto_vncipherlast(__a, __b);
}
#endif /* __VSX__ */
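
A hedged sketch of the corrected wrappers in use (requires a POWER8 or newer
target with VSX and the crypto extension; the function name is illustrative).
The byte-element type matches the AES state the instructions operate on:

#include <altivec.h>
vector unsigned char aes_round(vector unsigned char state,
                               vector unsigned char round_key) {
  return __builtin_crypto_vcipher(state, round_key);
}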
diff --git a/clang/lib/Headers/amxfp16intrin.h b/clang/lib/Headers/amxfp16intrin.h
new file mode 100644
index 000000000000..ed798245d41e
--- /dev/null
+++ b/clang/lib/Headers/amxfp16intrin.h
@@ -0,0 +1,58 @@
+/*===------------- amxfp16intrin.h - AMX_FP16 intrinsics -*- C++ -*---------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===------------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <amxfp16intrin.h> directly; use <immintrin.h> instead."
+#endif /* __IMMINTRIN_H */
+
+#ifndef __AMX_FP16INTRIN_H
+#define __AMX_FP16INTRIN_H
+#ifdef __x86_64__
+
+/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles \a a
+/// and \a b, accumulating the intermediate single-precision (32-bit)
+/// floating-point elements with elements in \a dst, and store the 32-bit
+/// result back to tile \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// void _tile_dpfp16ps (__tile dst, __tile a, __tile b)
+/// \endcode
+///
+/// \code{.operation}
+/// FOR m := 0 TO dst.rows - 1
+/// tmp := dst.row[m]
+/// FOR k := 0 TO (a.colsb / 4) - 1
+/// FOR n := 0 TO (dst.colsb / 4) - 1
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+0]) *
+/// FP32(b.row[k].fp16[2*n+0])
+/// tmp.fp32[n] += FP32(a.row[m].fp16[2*k+1]) *
+/// FP32(b.row[k].fp16[2*n+1])
+/// ENDFOR
+/// ENDFOR
+/// write_row_and_zero(dst, m, tmp, dst.colsb)
+/// ENDFOR
+/// zero_upper_rows(dst, dst.rows)
+/// zero_tileconfig_start()
+/// \endcode
+///
+/// This intrinsic corresponds to the \c TDPFP16PS instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param a
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param b
+/// The 2nd source tile. Max size is 1024 Bytes.
+#define _tile_dpfp16ps(dst, a, b) \
+ __builtin_ia32_tdpfp16ps(dst, a, b)
+
+#endif /* __x86_64__ */
+#endif /* __AMX_FP16INTRIN_H */
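
A hedged usage sketch (not from the patch; tile numbers, shapes, and the
configuration buffer are illustrative). _tile_loadconfig, _tile_loadd,
_tile_stored, and _tile_release are the existing helpers from amxintrin.h;
an amx-fp16 + amx-tile target is assumed:

#include <immintrin.h>
void dp_fp16_tiles(const void *cfg, const void *a, const void *b, void *c,
                   long stride) {
  _tile_loadconfig(cfg);      /* 64-byte tile configuration */
  _tile_loadd(1, a, stride);  /* FP16 source tile */
  _tile_loadd(2, b, stride);  /* FP16 source tile */
  _tile_loadd(0, c, stride);  /* FP32 accumulator tile */
  _tile_dpfp16ps(0, 1, 2);    /* tile0 += dot(tile1, tile2) */
  _tile_stored(0, c, stride);
  _tile_release();
}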
diff --git a/clang/lib/Headers/amxintrin.h b/clang/lib/Headers/amxintrin.h
index ec67a87e39ca..baa56f5b28e8 100644
--- a/clang/lib/Headers/amxintrin.h
+++ b/clang/lib/Headers/amxintrin.h
@@ -22,6 +22,8 @@
__attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
#define __DEFAULT_FN_ATTRS_BF16 \
__attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
+#define __DEFAULT_FN_ATTRS_FP16 \
+ __attribute__((__always_inline__, __nodebug__, __target__("amx-fp16")))
/// Load tile configuration from a 64-byte memory location specified by
/// "mem_addr". The tile configuration includes the tile type palette, the
@@ -290,6 +292,13 @@ _tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
}
+/// This is an internal intrinsic. C/C++ users should avoid calling it directly.
+static __inline__ _tile1024i __DEFAULT_FN_ATTRS_FP16
+_tile_dpfp16ps_internal(unsigned short m, unsigned short n, unsigned short k,
+ _tile1024i dst, _tile1024i src1, _tile1024i src2) {
+ return __builtin_ia32_tdpfp16ps_internal(m, n, k, dst, src1, src2);
+}
+
/// This struct packs the shape and tile data together for the user. We suggest
/// initializing the struct as early as possible, because the compiler depends
/// on the shape information to do the configuration. The constant value is preferred
@@ -484,9 +493,32 @@ static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
src0.tile, src1.tile);
}
+/// Compute dot-product of FP16 (16-bit) floating-point pairs in tiles src0 and
+/// src1, accumulating the intermediate single-precision (32-bit) floating-point
+/// elements with elements in "dst", and store the 32-bit result back to tile
+/// "dst".
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the <c> TDPFP16PS </c> instruction.
+///
+/// \param dst
+/// The destination tile. Max size is 1024 Bytes.
+/// \param src0
+/// The 1st source tile. Max size is 1024 Bytes.
+/// \param src1
+/// The 2nd source tile. Max size is 1024 Bytes.
+__DEFAULT_FN_ATTRS_FP16
+static __inline__ void __tile_dpfp16ps(__tile1024i *dst, __tile1024i src0,
+ __tile1024i src1) {
+ dst->tile = _tile_dpfp16ps_internal(src0.row, src1.col, src0.col, dst->tile,
+ src0.tile, src1.tile);
+}
+
#undef __DEFAULT_FN_ATTRS_TILE
#undef __DEFAULT_FN_ATTRS_INT8
#undef __DEFAULT_FN_ATTRS_BF16
+#undef __DEFAULT_FN_ATTRS_FP16
#endif /* __x86_64__ */
#endif /* __AMXINTRIN_H */
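
The same operation through the __tile1024i wrapper added above, as a hedged
sketch (shapes and buffers are illustrative; __tile_loadd and __tile_stored
already exist in this header):

#include <immintrin.h>
void dp_fp16_wrapped(const void *a, const void *b, void *c, long stride) {
  __tile1024i ta = {16, 64}; /* 16 rows x 64 bytes: 32 FP16 per row */
  __tile1024i tb = {16, 64};
  __tile1024i tc = {16, 64}; /* 16 FP32 per row */
  __tile_loadd(&ta, a, stride);
  __tile_loadd(&tb, b, stride);
  __tile_loadd(&tc, c, stride);
  __tile_dpfp16ps(&tc, ta, tb);
  __tile_stored(c, stride, tc);
}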
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
index 1cfc1403276d..e086f1f02dad 100644
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -64,7 +64,7 @@ static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(v
}
#endif
-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif
@@ -82,7 +82,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
/* 8.6.1 Data prefetch */
#define __pld(addr) __pldx(0, 0, 0, addr)
-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __pldx(access_kind, cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, access_kind, 1)
#else
@@ -93,7 +93,7 @@ __swp(uint32_t __x, volatile uint32_t *__p) {
/* 8.6.2 Instruction prefetch */
#define __pli(addr) __plix(0, 0, addr)
-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
__builtin_arm_prefetch(addr, 0, 0)
#else
@@ -140,17 +140,17 @@ __rorl(unsigned long __x, uint32_t __y) {
/* CLZ */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
- return __builtin_clz(__t);
+ return (uint32_t)__builtin_clz(__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
- return __builtin_clzl(__t);
+ return (unsigned long)__builtin_clzl(__t);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
- return __builtin_clzll(__t);
+ return (uint64_t)__builtin_clzll(__t);
}
/* CLS */
@@ -201,7 +201,7 @@ __rev16(uint32_t __t) {
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
- return (((uint64_t)__rev16(__t >> 32)) << 32) | __rev16(__t);
+ return (((uint64_t)__rev16(__t >> 32)) << 32) | (uint64_t)__rev16((uint32_t)__t);
}
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
@@ -216,7 +216,7 @@ __rev16l(unsigned long __t) {
/* REVSH */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
- return __builtin_bswap16(__t);
+ return (int16_t)__builtin_bswap16((uint16_t)__t);
}
/* RBIT */
@@ -227,7 +227,7 @@ __rbit(uint32_t __t) {
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
-#if __ARM_32BIT_STATE
+#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
__builtin_arm_rbit(__t >> 32);
#else
@@ -247,7 +247,7 @@ __rbitl(unsigned long __t) {
/*
* 9.3 16-bit multiplications
*/
-#if __ARM_FEATURE_DSP
+#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
return __builtin_arm_smulbb(__a, __b);
@@ -277,17 +277,17 @@ __smulwt(int32_t __a, int32_t __b) {
/*
* 9.4 Saturating intrinsics
*
- * FIXME: Change guard to their corrosponding __ARM_FEATURE flag when Q flag
+ * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
* intrinsics are implemented and the flag is enabled.
*/
/* 9.4.1 Width-specified saturation intrinsics */
-#if __ARM_FEATURE_SAT
+#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif
/* 9.4.2 Saturating addition and subtraction intrinsics */
-#if __ARM_FEATURE_DSP
+#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
return __builtin_arm_qadd(__t, __v);
@@ -305,7 +305,7 @@ __qdbl(int32_t __t) {
#endif
/* 9.4.3 Accumulating multiplications */
-#if __ARM_FEATURE_DSP
+#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
return __builtin_arm_smlabb(__a, __b, __c);
@@ -334,13 +334,13 @@ __smlawt(int32_t __a, int32_t __b, int32_t __c) {
/* 9.5.4 Parallel 16-bit saturation */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif
/* 9.5.5 Packing and unpacking */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
@@ -365,7 +365,7 @@ __uxtb16(int8x4_t __a) {
#endif
/* 9.5.6 Parallel selection */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_sel(__a, __b);
@@ -373,7 +373,7 @@ __sel(uint8x4_t __a, uint8x4_t __b) {
#endif
/* 9.5.7 Parallel 8-bit addition and subtraction */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
return __builtin_arm_qadd8(__a, __b);
@@ -425,7 +425,7 @@ __usub8(uint8x4_t __a, uint8x4_t __b) {
#endif
/* 9.5.8 Sum of 8-bit absolute differences */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
return __builtin_arm_usad8(__a, __b);
@@ -437,7 +437,7 @@ __usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
#endif
/* 9.5.9 Parallel 16-bit addition and subtraction */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
return __builtin_arm_qadd16(__a, __b);
@@ -537,7 +537,7 @@ __usub16(uint16x2_t __a, uint16x2_t __b) {
#endif
/* 9.5.10 Parallel 16-bit multiplications */
-#if __ARM_FEATURE_SIMD32
+#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
return __builtin_arm_smlad(__a, __b, __c);
@@ -589,155 +589,156 @@ __smusdx(int16x2_t __a, int16x2_t __b) {
#endif
/* 9.7 CRC32 intrinsics */
-#if __ARM_FEATURE_CRC32
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
+ (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
return __builtin_arm_crc32b(__a, __b);
}
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
return __builtin_arm_crc32h(__a, __b);
}
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
return __builtin_arm_crc32w(__a, __b);
}
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
return __builtin_arm_crc32d(__a, __b);
}
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
return __builtin_arm_crc32cb(__a, __b);
}
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
return __builtin_arm_crc32ch(__a, __b);
}
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
return __builtin_arm_crc32cw(__a, __b);
}
-static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
return __builtin_arm_crc32cd(__a, __b);
}
#endif
/* Armv8.3-A Javascript conversion intrinsic */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_JCVT)
-static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
return __builtin_arm_jcvt(__a);
}
#endif
/* Armv8.5-A FP rounding intrinsics */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_FRINT)
-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint32zf(float __a) {
- return __builtin_arm_frint32zf(__a);
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32zf(float __a) {
+ return __builtin_arm_rint32zf(__a);
}
-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint32z(double __a) {
- return __builtin_arm_frint32z(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32z(double __a) {
+ return __builtin_arm_rint32z(__a);
}
-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint64zf(float __a) {
- return __builtin_arm_frint64zf(__a);
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64zf(float __a) {
+ return __builtin_arm_rint64zf(__a);
}
-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint64z(double __a) {
- return __builtin_arm_frint64z(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64z(double __a) {
+ return __builtin_arm_rint64z(__a);
}
-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint32xf(float __a) {
- return __builtin_arm_frint32xf(__a);
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32xf(float __a) {
+ return __builtin_arm_rint32xf(__a);
}
-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint32x(double __a) {
- return __builtin_arm_frint32x(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint32x(double __a) {
+ return __builtin_arm_rint32x(__a);
}
-static __inline__ float __attribute__((__always_inline__, __nodebug__))
-__frint64xf(float __a) {
- return __builtin_arm_frint64xf(__a);
+static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64xf(float __a) {
+ return __builtin_arm_rint64xf(__a);
}
-static __inline__ double __attribute__((__always_inline__, __nodebug__))
-__frint64x(double __a) {
- return __builtin_arm_frint64x(__a);
+static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
+__rint64x(double __a) {
+ return __builtin_arm_rint64x(__a);
}
#endif
/* Armv8.7-A load/store 64-byte intrinsics */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64)
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
typedef struct {
uint64_t val[8];
} data512_t;
-static __inline__ data512_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
- data512_t __value;
- __builtin_arm_ld64b(__addr, __value.val);
- return __value;
+ data512_t __value;
+ __builtin_arm_ld64b(__addr, __value.val);
+ return __value;
}
-static __inline__ void __attribute__((__always_inline__, __nodebug__))
+static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
- __builtin_arm_st64b(__addr, __value.val);
+ __builtin_arm_st64b(__addr, __value.val);
}
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
- return __builtin_arm_st64bv(__addr, __value.val);
+ return __builtin_arm_st64bv(__addr, __value.val);
}
-static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
- return __builtin_arm_st64bv0(__addr, __value.val);
+ return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif
/* 10.1 Special register intrinsics */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
+#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
+#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
/* Memory Tagging Extensions (MTE) Intrinsics */
-#if __ARM_FEATURE_MEMORY_TAGGING
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)
-#endif
/* Memory Operations Intrinsics */
-#if __ARM_FEATURE_MOPS && __ARM_FEATURE_MEMORY_TAGGING
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
__builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif
/* Transactional Memory Extension (TME) Intrinsics */
-#if __ARM_FEATURE_TME
+#if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME
#define _TMFAILURE_REASON 0x00007fffu
#define _TMFAILURE_RTRY 0x00008000u
@@ -759,12 +760,12 @@ __arm_st64bv0(void *__addr, data512_t __value) {
#endif /* __ARM_FEATURE_TME */
/* Armv8.5-A Random number generation intrinsics */
-#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_RNG)
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
+static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
return __builtin_arm_rndr(__p);
}
-static __inline__ int __attribute__((__always_inline__, __nodebug__))
+static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
return __builtin_arm_rndrrs(__p);
}
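
With the CRC32 declarations now gated by a per-function target("crc")
attribute instead of a translation-unit-wide feature macro, a caller can opt
in locally. A hedged sketch (the function is illustrative; an AArch64 target
is assumed):

#include <arm_acle.h>
#include <stddef.h>
#include <stdint.h>
__attribute__((target("crc")))
uint32_t crc32_bytes(uint32_t seed, const uint8_t *p, size_t n) {
  uint32_t crc = seed;
  for (size_t i = 0; i < n; ++i)
    crc = __crc32b(crc, p[i]); /* one byte per step */
  return crc;
}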
diff --git a/clang/lib/Headers/arm_neon_sve_bridge.h b/clang/lib/Headers/arm_neon_sve_bridge.h
index 17699d8d11dd..a9fbdbaf4bb9 100644
--- a/clang/lib/Headers/arm_neon_sve_bridge.h
+++ b/clang/lib/Headers/arm_neon_sve_bridge.h
@@ -159,7 +159,6 @@ svfloat32_t svdup_neonq_f32(float32x4_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_f64)))
svfloat64_t svdup_neonq_f64(float64x2_t);
-#if defined(__ARM_FEATURE_SVE_BF16)
__aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
svbfloat16_t svset_neonq(svbfloat16_t, bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svset_neonq_bf16)))
@@ -172,7 +171,6 @@ __aio __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq(bfloat16x8_t);
__ai __attribute__((__clang_arm_builtin_alias(__builtin_sve_svdup_neonq_bf16)))
svbfloat16_t svdup_neonq_bf16(bfloat16x8_t);
-#endif // defined(__ARM_FEATURE_SVE_BF16)
#undef __ai
#undef __aio
diff --git a/clang/lib/Headers/avx512bf16intrin.h b/clang/lib/Headers/avx512bf16intrin.h
index 09653738d40a..a864c1e3350b 100644
--- a/clang/lib/Headers/avx512bf16intrin.h
+++ b/clang/lib/Headers/avx512bf16intrin.h
@@ -10,12 +10,14 @@
#error "Never use <avx512bf16intrin.h> directly; include <immintrin.h> instead."
#endif
+#ifdef __SSE2__
+
#ifndef __AVX512BF16INTRIN_H
#define __AVX512BF16INTRIN_H
-typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
-typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
-typedef unsigned short __bfloat16;
+typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64)));
+typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64)));
+typedef __bf16 __bfloat16 __attribute__((deprecated("use __bf16 instead")));
#define __DEFAULT_FN_ATTRS512 \
__attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \
@@ -33,7 +35,7 @@ typedef unsigned short __bfloat16;
/// A bfloat value.
/// \returns A float value whose sign and exponent fields are unchanged and
/// whose fraction field is extended to 23 bits.
-static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) {
+static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bf16 __A) {
return __builtin_ia32_cvtsbf162ss_32(__A);
}
@@ -74,9 +76,9 @@ _mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) {
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
- return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
- (__v32hi)__W);
+ return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
+ (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
+ (__v32bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -96,9 +98,9 @@ _mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) {
/// conversion of __B, and higher 256 bits come from conversion of __A.
static __inline__ __m512bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
- return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U,
- (__v32hi)_mm512_cvtne2ps_pbh(__A, __B),
- (__v32hi)_mm512_setzero_si512());
+ return (__m512bh)__builtin_ia32_selectpbf_512((__mmask32)__U,
+ (__v32bf)_mm512_cvtne2ps_pbh(__A, __B),
+ (__v32bf)_mm512_setzero_si512());
}
/// Convert Packed Single Data to Packed BF16 Data.
@@ -113,7 +115,7 @@ _mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_cvtneps_pbh(__m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
- (__v16hi)_mm256_undefined_si256(),
+ (__v16bf)_mm256_undefined_si256(),
(__mmask16)-1);
}
@@ -134,7 +136,7 @@ _mm512_cvtneps_pbh(__m512 __A) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
- (__v16hi)__W,
+ (__v16bf)__W,
(__mmask16)__U);
}
@@ -153,7 +155,7 @@ _mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) {
static __inline__ __m256bh __DEFAULT_FN_ATTRS512
_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A,
- (__v16hi)_mm256_setzero_si256(),
+ (__v16bf)_mm256_setzero_si256(),
(__mmask16)__U);
}
@@ -174,8 +176,8 @@ _mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) {
static __inline__ __m512 __DEFAULT_FN_ATTRS512
_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) {
return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D,
- (__v16si) __A,
- (__v16si) __B);
+ (__v32bf) __A,
+ (__v32bf) __B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@@ -277,3 +279,4 @@ _mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) {
#undef __DEFAULT_FN_ATTRS512
#endif
+#endif
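
A hedged sketch of the retyped BF16 vectors in use (an avx512bf16 target is
assumed; the function name is illustrative). Each of the 16 f32 lanes of the
result accumulates two bf16 products:

#include <immintrin.h>
__m512 bf16_fma32(__m512 acc, __m512 a_lo, __m512 a_hi,
                  __m512 b_lo, __m512 b_hi) {
  __m512bh a = _mm512_cvtne2ps_pbh(a_hi, a_lo); /* low 256 bits from a_lo */
  __m512bh b = _mm512_cvtne2ps_pbh(b_hi, b_lo);
  return _mm512_dpbf16_ps(acc, a, b);
}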
diff --git a/clang/lib/Headers/avx512fintrin.h b/clang/lib/Headers/avx512fintrin.h
index 61bc89c2b895..b19d2fb90ff5 100644
--- a/clang/lib/Headers/avx512fintrin.h
+++ b/clang/lib/Headers/avx512fintrin.h
@@ -256,8 +256,8 @@ _mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A)
static __inline __m512 __DEFAULT_FN_ATTRS512
_mm512_setzero_ps(void)
{
- return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,
- 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 };
+ return __extension__ (__m512){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}
#define _mm512_setzero _mm512_setzero_ps
diff --git a/clang/lib/Headers/avx512fp16intrin.h b/clang/lib/Headers/avx512fp16intrin.h
index 99409a31b32b..5cdc37fde629 100644
--- a/clang/lib/Headers/avx512fp16intrin.h
+++ b/clang/lib/Headers/avx512fp16intrin.h
@@ -10,6 +10,8 @@
#error "Never use <avx512fp16intrin.h> directly; include <immintrin.h> instead."
#endif
+#ifdef __SSE2__
+
#ifndef __AVX512FP16INTRIN_H
#define __AVX512FP16INTRIN_H
@@ -17,12 +19,6 @@
typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64)));
typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1)));
-typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
-typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
-typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
-typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS512 \
@@ -829,7 +825,7 @@ static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) {
struct __mm_load_sh_struct {
_Float16 __u;
} __attribute__((__packed__, __may_alias__));
- _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u;
+ _Float16 __u = ((const struct __mm_load_sh_struct *)__dp)->__u;
return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0};
}
@@ -838,13 +834,13 @@ _mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) {
__m128h src = (__v8hf)__builtin_shufflevector(
(__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8);
- return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1);
+ return (__m128h)__builtin_ia32_loadsh128_mask((const __v8hf *)__A, src, __U & 1);
}
static __inline__ __m128h __DEFAULT_FN_ATTRS128
_mm_maskz_load_sh(__mmask8 __U, const void *__A) {
return (__m128h)__builtin_ia32_loadsh128_mask(
- (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
+ (const __v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1);
}
static __inline__ __m512h __DEFAULT_FN_ATTRS512
@@ -3347,3 +3343,4 @@ _mm512_permutexvar_ph(__m512i __A, __m512h __B) {
#undef __DEFAULT_FN_ATTRS512
#endif
+#endif
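
A hedged sketch of the const-corrected scalar loads (an avx512fp16 target is
assumed; the function name is illustrative). The pointer parameter may now be
const-qualified without a cast:

#include <immintrin.h>
__m128h load_first_half(const _Float16 *p, __mmask8 valid) {
  /* Loads element 0 when bit 0 of valid is set, otherwise zeroes it. */
  return _mm_maskz_load_sh(valid, p);
}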
diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h
index 5889401d1055..3284ee182004 100644
--- a/clang/lib/Headers/avx512ifmavlintrin.h
+++ b/clang/lib/Headers/avx512ifmavlintrin.h
@@ -18,14 +18,21 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256)))
+#define _mm_madd52hi_epu64(X, Y, Z) \
+ ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \
+ (__v2di)(Z)))
+#define _mm256_madd52hi_epu64(X, Y, Z) \
+ ((__m256i)__builtin_ia32_vpmadd52huq256((__v4di)(X), (__v4di)(Y), \
+ (__v4di)(Z)))
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-{
- return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y,
- (__v2di) __Z);
-}
+#define _mm_madd52lo_epu64(X, Y, Z) \
+ ((__m128i)__builtin_ia32_vpmadd52luq128((__v2di)(X), (__v2di)(Y), \
+ (__v2di)(Z)))
+
+#define _mm256_madd52lo_epu64(X, Y, Z) \
+ ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \
+ (__v4di)(Z)))
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
@@ -44,13 +51,6 @@ _mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-{
- return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
- (__v4di)__Z);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
@@ -67,13 +67,6 @@ _mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
-_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
-{
- return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
- (__v2di)__Z);
-}
-
-static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y)
{
return (__m128i)__builtin_ia32_selectq_128(__M,
@@ -90,13 +83,6 @@ _mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z)
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
-_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
-{
- return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
- (__v4di)__Z);
-}
-
-static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y)
{
return (__m256i)__builtin_ia32_selectq_256(__M,
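
A hedged sketch of the IFMA operations after their conversion to macros
(avx512ifma + avx512vl assumed; names are illustrative). Each 64-bit lane
holds a 52-bit limb, and the low and high halves of the 104-bit products are
accumulated separately:

#include <immintrin.h>
void limb_mul_acc(__m128i *lo_acc, __m128i *hi_acc, __m128i x, __m128i y) {
  *lo_acc = _mm_madd52lo_epu64(*lo_acc, x, y); /* acc += low 52 bits */
  *hi_acc = _mm_madd52hi_epu64(*hi_acc, x, y); /* acc += high 52 bits */
}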
diff --git a/clang/lib/Headers/avx512vlbf16intrin.h b/clang/lib/Headers/avx512vlbf16intrin.h
index 1cdbb28484ac..f5b8911fac2a 100644
--- a/clang/lib/Headers/avx512vlbf16intrin.h
+++ b/clang/lib/Headers/avx512vlbf16intrin.h
@@ -10,11 +10,11 @@
#error "Never use <avx512vlbf16intrin.h> directly; include <immintrin.h> instead."
#endif
+#ifdef __SSE2__
+
#ifndef __AVX512VLBF16INTRIN_H
#define __AVX512VLBF16INTRIN_H
-typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
-
#define __DEFAULT_FN_ATTRS128 \
__attribute__((__always_inline__, __nodebug__, \
__target__("avx512vl, avx512bf16"), __min_vector_width__(128)))
@@ -59,9 +59,9 @@ _mm_cvtne2ps_pbh(__m128 __A, __m128 __B) {
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
- (__v8hi)__W);
+ return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
+ (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
+ (__v8bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -81,9 +81,9 @@ _mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) {
/// conversion of __B, and higher 64 bits come from conversion of __A.
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) {
- return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U,
- (__v8hi)_mm_cvtne2ps_pbh(__A, __B),
- (__v8hi)_mm_setzero_si128());
+ return (__m128bh)__builtin_ia32_selectpbf_128((__mmask8)__U,
+ (__v8bf)_mm_cvtne2ps_pbh(__A, __B),
+ (__v8bf)_mm_setzero_si128());
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -123,9 +123,9 @@ _mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) {
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
- (__v16hi)__W);
+ return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
+ (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
+ (__v16bf)__W);
}
/// Convert Two Packed Single Data to One Packed BF16 Data.
@@ -145,9 +145,9 @@ _mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) {
/// conversion of __B, and higher 128 bits come from conversion of __A.
static __inline__ __m256bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
- return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U,
- (__v16hi)_mm256_cvtne2ps_pbh(__A, __B),
- (__v16hi)_mm256_setzero_si256());
+ return (__m256bh)__builtin_ia32_selectpbf_256((__mmask16)__U,
+ (__v16bf)_mm256_cvtne2ps_pbh(__A, __B),
+ (__v16bf)_mm256_setzero_si256());
}
/// Convert Packed Single Data to Packed BF16 Data.
@@ -160,12 +160,8 @@ _mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) {
/// A 128-bit vector of [4 x float].
/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from
/// conversion of __A, and higher 64 bits are 0.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS128
-_mm_cvtneps_pbh(__m128 __A) {
- return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
- (__v8hi)_mm_undefined_si128(),
- (__mmask8)-1);
-}
+#define _mm_cvtneps_pbh(A) \
+ ((__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)(A)))
/// Convert Packed Single Data to Packed BF16 Data.
///
@@ -185,7 +181,7 @@ _mm_cvtneps_pbh(__m128 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
- (__v8hi)__W,
+ (__v8bf)__W,
(__mmask8)__U);
}
@@ -205,7 +201,7 @@ _mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS128
_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A,
- (__v8hi)_mm_setzero_si128(),
+ (__v8bf)_mm_setzero_si128(),
(__mmask8)__U);
}
@@ -218,12 +214,8 @@ _mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) {
/// \param __A
/// A 256-bit vector of [8 x float].
/// \returns A 128-bit vector of [8 x bfloat] that comes from the conversion of __A.
-static __inline__ __m128bh __DEFAULT_FN_ATTRS256
-_mm256_cvtneps_pbh(__m256 __A) {
- return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
- (__v8hi)_mm_undefined_si128(),
- (__mmask8)-1);
-}
+#define _mm256_cvtneps_pbh(A) \
+ ((__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)(A)))
/// Convert Packed Single Data to Packed BF16 Data.
///
@@ -242,7 +234,7 @@ _mm256_cvtneps_pbh(__m256 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
- (__v8hi)__W,
+ (__v8bf)__W,
(__mmask8)__U);
}
@@ -261,7 +253,7 @@ _mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) {
static __inline__ __m128bh __DEFAULT_FN_ATTRS256
_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A,
- (__v8hi)_mm_setzero_si128(),
+ (__v8bf)_mm_setzero_si128(),
(__mmask8)__U);
}
@@ -282,8 +274,8 @@ _mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) {
static __inline__ __m128 __DEFAULT_FN_ATTRS128
_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) {
return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D,
- (__v4si)__A,
- (__v4si)__B);
+ (__v8bf)__A,
+ (__v8bf)__B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@@ -351,8 +343,8 @@ _mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) {
static __inline__ __m256 __DEFAULT_FN_ATTRS256
_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) {
return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D,
- (__v8si)__A,
- (__v8si)__B);
+ (__v16bf)__A,
+ (__v16bf)__B);
}
/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision.
@@ -413,11 +405,11 @@ _mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) {
/// A float value.
/// \returns A bf16 value whose sign and exponent fields are unchanged and
/// whose fraction field is truncated to 7 bits.
-static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
+static __inline__ __bf16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) {
__v4sf __V = {__A, 0, 0, 0};
- __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask(
- (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1);
- return (__bfloat16)__R[0];
+ __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask(
+ (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1);
+ return (__bf16)__R[0];
}
/// Convert Packed BF16 Data to Packed float Data.
@@ -520,3 +512,4 @@ _mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) {
#undef __DEFAULT_FN_ATTRS256
#endif
+#endif
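
A hedged sketch of the retyped scalar conversion (avx512vl + avx512bf16
assumed; the function name is illustrative). _mm_cvtness_sbh now returns the
arithmetic __bf16 type rather than an integer typedef:

#include <immintrin.h>
float bf16_roundtrip(float x) {
  __bf16 b = _mm_cvtness_sbh(x); /* fraction truncated to 7 bits */
  return _mm_cvtsbh_ss(b);       /* sign and exponent preserved */
}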
diff --git a/clang/lib/Headers/avx512vlbwintrin.h b/clang/lib/Headers/avx512vlbwintrin.h
index 521ccab27e04..148af5ab9a34 100644
--- a/clang/lib/Headers/avx512vlbwintrin.h
+++ b/clang/lib/Headers/avx512vlbwintrin.h
@@ -2803,6 +2803,358 @@ _mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A,
(__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \
(__v16hi)_mm256_setzero_si256()))
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_add_epi16(__m128i __W) {
+ return __builtin_reduce_add((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_mul_epi16(__m128i __W) {
+ return __builtin_reduce_mul((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_and_epi16(__m128i __W) {
+ return __builtin_reduce_and((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_or_epi16(__m128i __W) {
+ return __builtin_reduce_or((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_add_epi16( __mmask8 __M, __m128i __W) {
+ __W = _mm_maskz_mov_epi16(__M, __W);
+ return __builtin_reduce_add((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_mul_epi16( __mmask8 __M, __m128i __W) {
+ __W = _mm_mask_mov_epi16(_mm_set1_epi16(1), __M, __W);
+ return __builtin_reduce_mul((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_and_epi16( __mmask8 __M, __m128i __W) {
+ __W = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __W);
+ return __builtin_reduce_and((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_or_epi16(__mmask8 __M, __m128i __W) {
+ __W = _mm_maskz_mov_epi16(__M, __W);
+ return __builtin_reduce_or((__v8hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_max_epi16(__m128i __V) {
+ return __builtin_reduce_max((__v8hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS128
+_mm_reduce_max_epu16(__m128i __V) {
+ return __builtin_reduce_max((__v8hu)__V);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_reduce_min_epi16(__m128i __V) {
+ return __builtin_reduce_min((__v8hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS128
+_mm_reduce_min_epu16(__m128i __V) {
+ return __builtin_reduce_min((__v8hu)__V);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_max_epi16(__mmask16 __M, __m128i __V) {
+ __V = _mm_mask_mov_epi16(_mm_set1_epi16(-32767-1), __M, __V);
+ return __builtin_reduce_max((__v8hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_max_epu16(__mmask16 __M, __m128i __V) {
+ __V = _mm_maskz_mov_epi16(__M, __V);
+ return __builtin_reduce_max((__v8hu)__V);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_min_epi16(__mmask16 __M, __m128i __V) {
+ __V = _mm_mask_mov_epi16(_mm_set1_epi16(32767), __M, __V);
+ return __builtin_reduce_min((__v8hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_min_epu16(__mmask16 __M, __m128i __V) {
+ __V = _mm_mask_mov_epi16(_mm_set1_epi16(-1), __M, __V);
+ return __builtin_reduce_min((__v8hu)__V);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_add_epi16(__m256i __W) {
+ return __builtin_reduce_add((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_mul_epi16(__m256i __W) {
+ return __builtin_reduce_mul((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_and_epi16(__m256i __W) {
+ return __builtin_reduce_and((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_or_epi16(__m256i __W) {
+ return __builtin_reduce_or((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_add_epi16( __mmask16 __M, __m256i __W) {
+ __W = _mm256_maskz_mov_epi16(__M, __W);
+ return __builtin_reduce_add((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_mul_epi16( __mmask16 __M, __m256i __W) {
+ __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(1), __M, __W);
+ return __builtin_reduce_mul((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_and_epi16( __mmask16 __M, __m256i __W) {
+ __W = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __W);
+ return __builtin_reduce_and((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_or_epi16(__mmask16 __M, __m256i __W) {
+ __W = _mm256_maskz_mov_epi16(__M, __W);
+ return __builtin_reduce_or((__v16hi)__W);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_max_epi16(__m256i __V) {
+ return __builtin_reduce_max((__v16hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS256
+_mm256_reduce_max_epu16(__m256i __V) {
+ return __builtin_reduce_max((__v16hu)__V);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_reduce_min_epi16(__m256i __V) {
+ return __builtin_reduce_min((__v16hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS256
+_mm256_reduce_min_epu16(__m256i __V) {
+ return __builtin_reduce_min((__v16hu)__V);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_max_epi16(__mmask16 __M, __m256i __V) {
+ __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-32767-1), __M, __V);
+ return __builtin_reduce_max((__v16hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_max_epu16(__mmask16 __M, __m256i __V) {
+ __V = _mm256_maskz_mov_epi16(__M, __V);
+ return __builtin_reduce_max((__v16hu)__V);
+}
+
+static __inline__ short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_min_epi16(__mmask16 __M, __m256i __V) {
+ __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(32767), __M, __V);
+ return __builtin_reduce_min((__v16hi)__V);
+}
+
+static __inline__ unsigned short __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_min_epu16(__mmask16 __M, __m256i __V) {
+ __V = _mm256_mask_mov_epi16(_mm256_set1_epi16(-1), __M, __V);
+ return __builtin_reduce_min((__v16hu)__V);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_reduce_add_epi8(__m128i __W) {
+ return __builtin_reduce_add((__v16qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_reduce_mul_epi8(__m128i __W) {
+ return __builtin_reduce_mul((__v16qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_reduce_and_epi8(__m128i __W) {
+ return __builtin_reduce_and((__v16qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_reduce_or_epi8(__m128i __W) {
+ return __builtin_reduce_or((__v16qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_add_epi8(__mmask16 __M, __m128i __W) {
+ __W = _mm_maskz_mov_epi8(__M, __W);
+ return __builtin_reduce_add((__v16qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_mul_epi8(__mmask16 __M, __m128i __W) {
+ __W = _mm_mask_mov_epi8(_mm_set1_epi8(1), __M, __W);
+ return __builtin_reduce_mul((__v16qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_and_epi8(__mmask16 __M, __m128i __W) {
+ __W = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __W);
+ return __builtin_reduce_and((__v16qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_or_epi8(__mmask16 __M, __m128i __W) {
+ __W = _mm_maskz_mov_epi8(__M, __W);
+ return __builtin_reduce_or((__v16qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_reduce_max_epi8(__m128i __V) {
+ return __builtin_reduce_max((__v16qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS128
+_mm_reduce_max_epu8(__m128i __V) {
+ return __builtin_reduce_max((__v16qu)__V);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_reduce_min_epi8(__m128i __V) {
+ return __builtin_reduce_min((__v16qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS128
+_mm_reduce_min_epu8(__m128i __V) {
+ return __builtin_reduce_min((__v16qu)__V);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_max_epi8(__mmask16 __M, __m128i __V) {
+ __V = _mm_mask_mov_epi8(_mm_set1_epi8(-127-1), __M, __V);
+ return __builtin_reduce_max((__v16qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_max_epu8(__mmask16 __M, __m128i __V) {
+ __V = _mm_maskz_mov_epi8(__M, __V);
+ return __builtin_reduce_max((__v16qu)__V);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_min_epi8(__mmask16 __M, __m128i __V) {
+ __V = _mm_mask_mov_epi8(_mm_set1_epi8(127), __M, __V);
+ return __builtin_reduce_min((__v16qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS128
+_mm_mask_reduce_min_epu8(__mmask16 __M, __m128i __V) {
+ __V = _mm_mask_mov_epi8(_mm_set1_epi8(-1), __M, __V);
+ return __builtin_reduce_min((__v16qu)__V);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_reduce_add_epi8(__m256i __W) {
+ return __builtin_reduce_add((__v32qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_reduce_mul_epi8(__m256i __W) {
+ return __builtin_reduce_mul((__v32qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_reduce_and_epi8(__m256i __W) {
+ return __builtin_reduce_and((__v32qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_reduce_or_epi8(__m256i __W) {
+ return __builtin_reduce_or((__v32qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_add_epi8(__mmask32 __M, __m256i __W) {
+ __W = _mm256_maskz_mov_epi8(__M, __W);
+ return __builtin_reduce_add((__v32qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_mul_epi8(__mmask32 __M, __m256i __W) {
+ __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(1), __M, __W);
+ return __builtin_reduce_mul((__v32qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_and_epi8(__mmask32 __M, __m256i __W) {
+ __W = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __W);
+ return __builtin_reduce_and((__v32qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_or_epi8(__mmask32 __M, __m256i __W) {
+ __W = _mm256_maskz_mov_epi8(__M, __W);
+ return __builtin_reduce_or((__v32qs)__W);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_reduce_max_epi8(__m256i __V) {
+ return __builtin_reduce_max((__v32qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS256
+_mm256_reduce_max_epu8(__m256i __V) {
+ return __builtin_reduce_max((__v32qu)__V);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_reduce_min_epi8(__m256i __V) {
+ return __builtin_reduce_min((__v32qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS256
+_mm256_reduce_min_epu8(__m256i __V) {
+ return __builtin_reduce_min((__v32qu)__V);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_max_epi8(__mmask32 __M, __m256i __V) {
+ __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-127-1), __M, __V);
+ return __builtin_reduce_max((__v32qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_max_epu8(__mmask32 __M, __m256i __V) {
+ __V = _mm256_maskz_mov_epi8(__M, __V);
+ return __builtin_reduce_max((__v32qu)__V);
+}
+
+static __inline__ signed char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_min_epi8(__mmask32 __M, __m256i __V) {
+ __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(127), __M, __V);
+ return __builtin_reduce_min((__v32qs)__V);
+}
+
+static __inline__ unsigned char __DEFAULT_FN_ATTRS256
+_mm256_mask_reduce_min_epu8(__mmask32 __M, __m256i __V) {
+ __V = _mm256_mask_mov_epi8(_mm256_set1_epi8(-1), __M, __V);
+ return __builtin_reduce_min((__v32qu)__V);
+}
+
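+// A usage sketch (illustrative only, not part of this header): the masked
+// reductions above substitute each operation's identity element into
+// masked-off lanes (0 for add, 1 for mul, all-ones for and, the type's
+// minimum/maximum for signed max/min), so reducing a partial vector is just:
+//
+//   static inline short max_low7(__m256i __v) {
+//     // Maximum of the low 7 words; lanes 7..15 become INT16_MIN.
+//     return _mm256_mask_reduce_max_epi16((__mmask16)0x7F, __v);
+//   }
+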
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
diff --git a/clang/lib/Headers/avx512vlfp16intrin.h b/clang/lib/Headers/avx512vlfp16intrin.h
index 3d27853ad964..d4a7d1b1c53e 100644
--- a/clang/lib/Headers/avx512vlfp16intrin.h
+++ b/clang/lib/Headers/avx512vlfp16intrin.h
@@ -11,6 +11,8 @@
"Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead."
#endif
+#ifdef __SSE2__
+
#ifndef __AVX512VLFP16INTRIN_H
#define __AVX512VLFP16INTRIN_H
@@ -2066,3 +2068,4 @@ _mm_reduce_min_ph(__m128h __V) {
#undef __DEFAULT_FN_ATTRS256
#endif
+#endif
diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h
new file mode 100644
index 000000000000..5c782d2a5b86
--- /dev/null
+++ b/clang/lib/Headers/avxifmaintrin.h
@@ -0,0 +1,177 @@
+/*===----------------- avxifmaintrin.h - IFMA intrinsics -------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error "Never use <avxifmaintrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXIFMAINTRIN_H
+#define __AVXIFMAINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
+ __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxifma"), \
+ __min_vector_width__(256)))
+
+// These intrinsics must always use the VEX encoding of the VPMADD52
+// instructions; the EVEX-encoded forms belong to AVX512-IFMA.
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
+/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the corresponding
+/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i
+/// _mm_madd52hi_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
+///
+/// \param __X
+///    A 128-bit vector of [2 x i64].
+/// \param __Y
+///    A 128-bit vector of [2 x i64].
+/// \param __Z
+///    A 128-bit vector of [2 x i64].
+/// \returns
+///    A 128-bit vector of [2 x i64] containing the result.
+///
+/// \code{.operation}
+/// FOR j := 0 to 1
+/// i := j*64
+/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
+/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_madd52hi_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+ return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y,
+ (__v2di)__Z);
+}
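+
+// Scalar model of one lane (illustrative only, not part of this header): only
+// the low 52 bits of each source element participate, and the high 52 bits of
+// the 104-bit product are added to the accumulator:
+//
+//   static inline unsigned long long
+//   madd52hi_lane(unsigned long long __x, unsigned long long __y,
+//                 unsigned long long __z) {
+//     unsigned __int128 __p = (unsigned __int128)(__y & ((1ULL << 52) - 1)) *
+//                             (__z & ((1ULL << 52) - 1));
+//     return __x + (unsigned long long)(__p >> 52);
+//   }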
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
+/// and \a __Z to form a 104-bit intermediate result. Add the high 52-bit
+/// unsigned integer from the intermediate result with the corresponding
+/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i
+/// _mm256_madd52hi_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPMADD52HUQ instruction.
+///
+/// \param __X
+///    A 256-bit vector of [4 x i64].
+/// \param __Y
+///    A 256-bit vector of [4 x i64].
+/// \param __Z
+///    A 256-bit vector of [4 x i64].
+/// \returns
+///    A 256-bit vector of [4 x i64] containing the result.
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// i := j*64
+/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
+/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[103:52])
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd52hi_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+ return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y,
+ (__v4di)__Z);
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
+/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the corresponding
+/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m128i
+/// _mm_madd52lo_avx_epu64 (__m128i __X, __m128i __Y, __m128i __Z)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
+///
+/// \param __X
+///    A 128-bit vector of [2 x i64].
+/// \param __Y
+///    A 128-bit vector of [2 x i64].
+/// \param __Z
+///    A 128-bit vector of [2 x i64].
+/// \returns
+///    A 128-bit vector of [2 x i64] containing the result.
+///
+/// \code{.operation}
+/// FOR j := 0 to 1
+/// i := j*64
+/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
+/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128
+_mm_madd52lo_avx_epu64(__m128i __X, __m128i __Y, __m128i __Z) {
+ return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y,
+ (__v2di)__Z);
+}
+
+/// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y
+/// and \a __Z to form a 104-bit intermediate result. Add the low 52-bit
+/// unsigned integer from the intermediate result with the corresponding
+/// unsigned 64-bit integer in \a __X, and store the results in \a dst.
+///
+/// \headerfile <immintrin.h>
+///
+/// \code
+/// __m256i
+/// _mm256_madd52lo_avx_epu64 (__m256i __X, __m256i __Y, __m256i __Z)
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPMADD52LUQ instruction.
+///
+/// \param __X
+///    A 256-bit vector of [4 x i64].
+/// \param __Y
+///    A 256-bit vector of [4 x i64].
+/// \param __Z
+///    A 256-bit vector of [4 x i64].
+/// \returns
+///    A 256-bit vector of [4 x i64] containing the result.
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// i := j*64
+/// tmp[127:0] := ZeroExtend64(__Y[i+51:i]) * ZeroExtend64(__Z[i+51:i])
+/// dst[i+63:i] := __X[i+63:i] + ZeroExtend64(tmp[51:0])
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_madd52lo_avx_epu64(__m256i __X, __m256i __Y, __m256i __Z) {
+ return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y,
+ (__v4di)__Z);
+}
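+
+// A usage sketch (hypothetical helper, not part of this header): the madd52
+// intrinsics perform one limb step of 52-bit multi-precision arithmetic,
+// accumulating the low and high halves of each 52 x 52 -> 104-bit product
+// into separate 64-bit accumulators:
+//
+//   static inline void
+//   limb_step(__m128i *__lo, __m128i *__hi, __m128i __a, __m128i __b) {
+//     *__lo = _mm_madd52lo_avx_epu64(*__lo, __a, __b);
+//     *__hi = _mm_madd52hi_avx_epu64(*__hi, __a, __b);
+//   }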
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXIFMAINTRIN_H
diff --git a/clang/lib/Headers/avxintrin.h b/clang/lib/Headers/avxintrin.h
index a8f953c260c2..ee31569c1623 100644
--- a/clang/lib/Headers/avxintrin.h
+++ b/clang/lib/Headers/avxintrin.h
@@ -39,6 +39,16 @@ typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1)));
typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1)));
typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1)));
+#ifdef __SSE2__
+/* Both _Float16 and __bf16 require SSE2 to be enabled. */
+typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32)));
+typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32)));
+typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1)));
+
+typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32)));
+typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32)));
+#endif
+
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128)))
@@ -4288,7 +4298,7 @@ _mm256_set1_epi64x(long long __q)
static __inline __m256d __DEFAULT_FN_ATTRS
_mm256_setzero_pd(void)
{
- return __extension__ (__m256d){ 0, 0, 0, 0 };
+ return __extension__ (__m256d){ 0.0, 0.0, 0.0, 0.0 };
}
/// Constructs a 256-bit floating-point vector of [8 x float] with all
@@ -4302,7 +4312,7 @@ _mm256_setzero_pd(void)
static __inline __m256 __DEFAULT_FN_ATTRS
_mm256_setzero_ps(void)
{
- return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 };
+ return __extension__ (__m256){ 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f };
}
/// Constructs a 256-bit integer vector initialized to zero.
diff --git a/clang/lib/Headers/avxneconvertintrin.h b/clang/lib/Headers/avxneconvertintrin.h
new file mode 100644
index 000000000000..1bef1c893787
--- /dev/null
+++ b/clang/lib/Headers/avxneconvertintrin.h
@@ -0,0 +1,484 @@
+/*===-------------- avxneconvertintrin.h - AVXNECONVERT --------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avxneconvertintrin.h> directly; include <immintrin.h> instead."
+#endif // __IMMINTRIN_H
+
+#ifdef __SSE2__
+
+#ifndef __AVXNECONVERTINTRIN_H
+#define __AVXNECONVERTINTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
+ __min_vector_width__(128)))
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxneconvert"), \
+ __min_vector_width__(256)))
+
+/// Convert the scalar BF16 (16-bit) floating-point element stored at memory
+///    location \a __A to a single-precision (32-bit) floating-point value,
+///    broadcast it to every packed single-precision (32-bit) floating-point
+///    element, and store the results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_bcstnebf16_ps(const void *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
+///
+/// \param __A
+/// A pointer to a 16-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns
+/// A 128-bit vector of [4 x float].
+///
+/// \code{.operation}
+/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
+/// FOR j := 0 to 3
+/// m := j*32
+/// dst[m+31:m] := b
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_bcstnebf16_ps(const void *__A) {
+ return (__m128)__builtin_ia32_vbcstnebf162ps128((const __bf16 *)__A);
+}
+
+/// Convert the scalar BF16 (16-bit) floating-point element stored at memory
+///    location \a __A to a single-precision (32-bit) floating-point value,
+///    broadcast it to every packed single-precision (32-bit) floating-point
+///    element, and store the results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_bcstnebf16_ps(const void *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VBCSTNEBF162PS instruction.
+///
+/// \param __A
+/// A pointer to a 16-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns
+/// A 256-bit vector of [8 x float].
+///
+/// \code{.operation}
+/// b := Convert_BF16_To_FP32(MEM[__A+15:__A])
+/// FOR j := 0 to 7
+/// m := j*32
+/// dst[m+31:m] := b
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_bcstnebf16_ps(const void *__A) {
+ return (__m256)__builtin_ia32_vbcstnebf162ps256((const __bf16 *)__A);
+}
+
+/// Convert the scalar half-precision (16-bit) floating-point element stored
+///    at memory location \a __A to a single-precision (32-bit) floating-point
+///    value, broadcast it to every packed single-precision (32-bit)
+///    floating-point element, and store the results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_bcstnesh_ps(const void *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
+///
+/// \param __A
+/// A pointer to a 16-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns
+/// A 128-bit vector of [4 x float].
+///
+/// \code{.operation}
+/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
+/// FOR j := 0 to 3
+/// m := j*32
+/// dst[m+31:m] := b
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_bcstnesh_ps(const void *__A) {
+ return (__m128)__builtin_ia32_vbcstnesh2ps128((const _Float16 *)__A);
+}
+
+/// Convert the scalar half-precision (16-bit) floating-point element stored
+///    at memory location \a __A to a single-precision (32-bit) floating-point
+///    value, broadcast it to every packed single-precision (32-bit)
+///    floating-point element, and store the results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_bcstnesh_ps(const void *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VBCSTNESH2PS instruction.
+///
+/// \param __A
+/// A pointer to a 16-bit memory location. The address of the memory
+/// location does not have to be aligned.
+/// \returns
+/// A 256-bit vector of [8 x float].
+///
+/// \code{.operation}
+/// b := Convert_FP16_To_FP32(MEM[__A+15:__A])
+/// FOR j := 0 to 7
+/// m := j*32
+/// dst[m+31:m] := b
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_bcstnesh_ps(const void *__A) {
+ return (__m256)__builtin_ia32_vbcstnesh2ps256((const _Float16 *)__A);
+}
+
+/// Convert packed BF16 (16-bit) floating-point even-indexed elements
+/// stored at memory locations starting at location \a __A to packed
+/// single-precision (32-bit) floating-point elements, and store the results in
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_cvtneebf16_ps(const __m128bh *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
+///
+/// \param __A
+/// A pointer to a 128-bit memory location containing 8 consecutive
+/// BF16 (16-bit) floating-point values.
+/// \returns
+/// A 128-bit vector of [4 x float].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// k := j*2
+/// i := k*16
+/// m := j*32
+/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_cvtneebf16_ps(const __m128bh *__A) {
+ return (__m128)__builtin_ia32_vcvtneebf162ps128((const __v8bf *)__A);
+}
+
+/// Convert packed BF16 (16-bit) floating-point even-indexed elements
+/// stored at memory locations starting at location \a __A to packed
+/// single-precision (32-bit) floating-point elements, and store the results in
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_cvtneebf16_ps(const __m256bh *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTNEEBF162PS instruction.
+///
+/// \param __A
+/// A pointer to a 256-bit memory location containing 16 consecutive
+/// BF16 (16-bit) floating-point values.
+/// \returns
+/// A 256-bit vector of [8 x float].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// k := j*2
+/// i := k*16
+/// m := j*32
+/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_cvtneebf16_ps(const __m256bh *__A) {
+ return (__m256)__builtin_ia32_vcvtneebf162ps256((const __v16bf *)__A);
+}
+
+/// Convert packed half-precision (16-bit) floating-point even-indexed elements
+/// stored at memory locations starting at location \a __A to packed
+/// single-precision (32-bit) floating-point elements, and store the results in
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_cvtneeph_ps(const __m128h *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
+///
+/// \param __A
+/// A pointer to a 128-bit memory location containing 8 consecutive
+/// half-precision (16-bit) floating-point values.
+/// \returns
+/// A 128-bit vector of [4 x float].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// k := j*2
+/// i := k*16
+/// m := j*32
+/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_cvtneeph_ps(const __m128h *__A) {
+ return (__m128)__builtin_ia32_vcvtneeph2ps128((const __v8hf *)__A);
+}
+
+/// Convert packed half-precision (16-bit) floating-point even-indexed elements
+/// stored at memory locations starting at location \a __A to packed
+/// single-precision (32-bit) floating-point elements, and store the results in
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_cvtneeph_ps(const __m256h *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTNEEPH2PS instruction.
+///
+/// \param __A
+/// A pointer to a 256-bit memory location containing 16 consecutive
+/// half-precision (16-bit) floating-point values.
+/// \returns
+/// A 256-bit vector of [8 x float].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// k := j*2
+/// i := k*16
+/// m := j*32
+/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_cvtneeph_ps(const __m256h *__A) {
+ return (__m256)__builtin_ia32_vcvtneeph2ps256((const __v16hf *)__A);
+}
+
+/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
+/// stored at memory locations starting at location \a __A to packed
+/// single-precision (32-bit) floating-point elements, and store the results in
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_cvtneobf16_ps(const __m128bh *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
+///
+/// \param __A
+/// A pointer to a 128-bit memory location containing 8 consecutive
+/// BF16 (16-bit) floating-point values.
+/// \returns
+/// A 128-bit vector of [4 x float].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// k := j*2+1
+/// i := k*16
+/// m := j*32
+/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_cvtneobf16_ps(const __m128bh *__A) {
+ return (__m128)__builtin_ia32_vcvtneobf162ps128((const __v8bf *)__A);
+}
+
+/// Convert packed BF16 (16-bit) floating-point odd-indexed elements
+/// stored at memory locations starting at location \a __A to packed
+/// single-precision (32-bit) floating-point elements, and store the results in
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_cvtneobf16_ps(const __m256bh *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTNEOBF162PS instruction.
+///
+/// \param __A
+/// A pointer to a 256-bit memory location containing 16 consecutive
+/// BF16 (16-bit) floating-point values.
+/// \returns
+/// A 256-bit vector of [8 x float].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// k := j*2+1
+/// i := k*16
+/// m := j*32
+/// dst[m+31:m] := Convert_BF16_To_FP32(MEM[__A+i+15:__A+i])
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_cvtneobf16_ps(const __m256bh *__A) {
+ return (__m256)__builtin_ia32_vcvtneobf162ps256((const __v16bf *)__A);
+}
+
+/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
+/// stored at memory locations starting at location \a __A to packed
+/// single-precision (32-bit) floating-point elements, and store the results in
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_cvtneoph_ps(const __m128h *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
+///
+/// \param __A
+/// A pointer to a 128-bit memory location containing 8 consecutive
+/// half-precision (16-bit) floating-point values.
+/// \returns
+/// A 128-bit vector of [4 x float].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// k := j*2+1
+/// i := k*16
+/// m := j*32
+/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128 __DEFAULT_FN_ATTRS128
+_mm_cvtneoph_ps(const __m128h *__A) {
+ return (__m128)__builtin_ia32_vcvtneoph2ps128((const __v8hf *)__A);
+}
+
+/// Convert packed half-precision (16-bit) floating-point odd-indexed elements
+/// stored at memory locations starting at location \a __A to packed
+/// single-precision (32-bit) floating-point elements, and store the results in
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_cvtneoph_ps(const __m256h *__A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTNEOPH2PS instruction.
+///
+/// \param __A
+/// A pointer to a 256-bit memory location containing 16 consecutive
+/// half-precision (16-bit) floating-point values.
+/// \returns
+/// A 256-bit vector of [8 x float].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// k := j*2+1
+/// i := k*16
+/// m := j*32
+/// dst[m+31:m] := Convert_FP16_To_FP32(MEM[__A+i+15:__A+i])
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256 __DEFAULT_FN_ATTRS256
+_mm256_cvtneoph_ps(const __m256h *__A) {
+ return (__m256)__builtin_ia32_vcvtneoph2ps256((const __v16hf *)__A);
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in \a __A
+/// to packed BF16 (16-bit) floating-point elements, and store the results in
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_cvtneps_avx_pbh(__m128 __A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
+///
+/// \param __A
+/// A 128-bit vector of [4 x float].
+/// \returns
+/// A 128-bit vector of [8 x bfloat].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128
+_mm_cvtneps_avx_pbh(__m128 __A) {
+ return (__m128bh)__builtin_ia32_vcvtneps2bf16128((__v4sf)__A);
+}
+
+/// Convert packed single-precision (32-bit) floating-point elements in \a __A
+/// to packed BF16 (16-bit) floating-point elements, and store the results in
+/// \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_cvtneps_avx_pbh(__m256 __A);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VCVTNEPS2BF16 instruction.
+///
+/// \param __A
+/// A 256-bit vector of [8 x float].
+/// \returns
+/// A 128-bit vector of [8 x bfloat].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// dst.word[j] := Convert_FP32_To_BF16(__A.fp32[j])
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128bh __DEFAULT_FN_ATTRS256
+_mm256_cvtneps_avx_pbh(__m256 __A) {
+ return (__m128bh)__builtin_ia32_vcvtneps2bf16256((__v8sf)__A);
+}
+
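+// A usage sketch (illustrative only, not part of this header): the even- and
+// odd-indexed loads pair up to widen one 128-bit vector of eight BF16 values
+// into two float vectors without a separate shuffle:
+//
+//   static inline void
+//   load_bf16x8_as_f32(const __m128bh *__p, __m128 *__even, __m128 *__odd) {
+//     *__even = _mm_cvtneebf16_ps(__p); // elements 0, 2, 4, 6
+//     *__odd  = _mm_cvtneobf16_ps(__p); // elements 1, 3, 5, 7
+//   }
+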
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXNECONVERTINTRIN_H
+#endif // __SSE2__
diff --git a/clang/lib/Headers/avxvnniint8intrin.h b/clang/lib/Headers/avxvnniint8intrin.h
new file mode 100644
index 000000000000..b0b6cb853f71
--- /dev/null
+++ b/clang/lib/Headers/avxvnniint8intrin.h
@@ -0,0 +1,471 @@
+/*===-------- avxvnniint8intrin.h - AVXVNNIINT8 intrinsics -----------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __IMMINTRIN_H
+#error \
+ "Never use <avxvnniint8intrin.h> directly; include <immintrin.h> instead."
+#endif
+
+#ifndef __AVXVNNIINT8INTRIN_H
+#define __AVXVNNIINT8INTRIN_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS256 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
+ __min_vector_width__(256)))
+#define __DEFAULT_FN_ATTRS128 \
+ __attribute__((__always_inline__, __nodebug__, __target__("avxvnniint8"), \
+ __min_vector_width__(128)))
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbssd_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+///
+/// \param __A
+/// A 128-bit vector of [16 x char].
+/// \param __B
+/// A 128-bit vector of [16 x char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssd_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbssd128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
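+
+// Scalar model of one dword lane (illustrative only, not part of this
+// header): four sign-extended byte products are summed into the accumulator:
+//
+//   static inline int dpbssd_lane(int __w, const signed char __a[4],
+//                                 const signed char __b[4]) {
+//     int __sum = __w;
+//     for (int __k = 0; __k < 4; ++__k)
+//       __sum += (short)__a[__k] * (short)__b[__k]; // 16-bit products
+//     return __sum;
+//   }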
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSD instruction.
+///
+/// \param __A
+/// A 256-bit vector of [32 x char].
+/// \param __B
+/// A 256-bit vector of [32 x char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbssd_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbssd256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbssds_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
+///
+/// \param __A
+/// A 128-bit vector of [16 x char].
+/// \param __B
+/// A 128-bit vector of [16 x char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbssds_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbssds128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSSDS instruction.
+///
+/// \param __A
+/// A 256-bit vector of [32 x char].
+/// \param __B
+/// A 256-bit vector of [32 x char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := SignExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])
+/// tmp2.word := SignExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])
+/// tmp3.word := SignExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])
+/// tmp4.word := SignExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbssds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbssds256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbsud_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUD instruction.
+///
+/// \param __A
+/// A 128-bit vector of [16 x char].
+/// \param __B
+/// A 128-bit vector of [16 x unsigned char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsud_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbsud128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUD instruction.
+///
+/// \param __A
+/// A 256-bit vector of [32 x char].
+/// \param __B
+/// A 256-bit vector of [32 x unsigned char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbsud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbsud256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbsuds_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
+///
+/// \param __A
+/// A 128-bit vector of [16 x char].
+/// \param __B
+/// A 128-bit vector of [16 x unsigned char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbsuds_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbsuds128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// signed 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with signed saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBSUDS instruction.
+///
+/// \param __A
+/// A 256-bit vector of [32 x char].
+/// \param __B
+/// A 256-bit vector of [32 x unsigned char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := Signed(SignExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j]))
+/// tmp2.word := Signed(SignExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1]))
+/// tmp3.word := Signed(SignExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2]))
+/// tmp4.word := Signed(SignExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3]))
+/// dst.dword[j] := SIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbsuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbsuds256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbuud_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUD instruction.
+///
+/// \param __A
+/// A 128-bit vector of [16 x unsigned char].
+/// \param __B
+/// A 128-bit vector of [16 x unsigned char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuud_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbuud128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W, and store the packed 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUD instruction.
+///
+/// \param __A
+/// A 256-bit vector of [32 x unsigned char].
+/// \param __B
+/// A 256-bit vector of [32 x unsigned char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := __W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbuud_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbuud256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with unsigned saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm_dpbuuds_epi32(__m128i __W, __m128i __A, __m128i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
+///
+/// \param __A
+/// A 128-bit vector of [16 x unsigned char].
+/// \param __B
+/// A 128-bit vector of [16 x unsigned char].
+/// \returns
+/// A 128-bit vector of [4 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 3
+/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:128] := 0
+/// \endcode
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_dpbuuds_epi32(__m128i __W,
+ __m128i __A,
+ __m128i __B) {
+ return (__m128i)__builtin_ia32_vpdpbuuds128((__v4si)__W, (__v4si)__A,
+ (__v4si)__B);
+}
+
+/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
+/// corresponding unsigned 8-bit integers in \a __B, producing 4 intermediate
+/// unsigned 16-bit results. Sum these 4 results with the corresponding
+/// 32-bit integer in \a __W with unsigned saturation, and store the packed
+/// 32-bit results in \a dst.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// _mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c VPDPBUUDS instruction.
+///
+/// \param __A
+/// A 256-bit vector of [32 x unsigned char].
+/// \param __B
+/// A 256-bit vector of [32 x unsigned char].
+/// \returns
+/// A 256-bit vector of [8 x int].
+///
+/// \code{.operation}
+/// FOR j := 0 to 7
+/// tmp1.word := ZeroExtend16(__A.byte[4*j]) * ZeroExtend16(__B.byte[4*j])
+/// tmp2.word := ZeroExtend16(__A.byte[4*j+1]) * ZeroExtend16(__B.byte[4*j+1])
+/// tmp3.word := ZeroExtend16(__A.byte[4*j+2]) * ZeroExtend16(__B.byte[4*j+2])
+/// tmp4.word := ZeroExtend16(__A.byte[4*j+3]) * ZeroExtend16(__B.byte[4*j+3])
+/// dst.dword[j] := UNSIGNED_DWORD_SATURATE(__W.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
+/// ENDFOR
+/// dst[MAX:256] := 0
+/// \endcode
+static __inline__ __m256i __DEFAULT_FN_ATTRS256
+_mm256_dpbuuds_epi32(__m256i __W, __m256i __A, __m256i __B) {
+ return (__m256i)__builtin_ia32_vpdpbuuds256((__v8si)__W, (__v8si)__A,
+ (__v8si)__B);
+}
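+
+// A usage sketch (hypothetical loop, not part of this header): each intrinsic
+// above performs one step of an 8-bit integer dot product, folding four byte
+// products into every 32-bit accumulator lane:
+//
+//   static inline __m256i
+//   dot_s8(__m256i __acc, const __m256i *__a, const __m256i *__b, int __n) {
+//     for (int __i = 0; __i < __n; ++__i)
+//       __acc = _mm256_dpbssd_epi32(__acc, __a[__i], __b[__i]);
+//     return __acc;
+//   }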
+#undef __DEFAULT_FN_ATTRS128
+#undef __DEFAULT_FN_ATTRS256
+
+#endif // __AVXVNNIINT8INTRIN_H
diff --git a/clang/lib/Headers/cmpccxaddintrin.h b/clang/lib/Headers/cmpccxaddintrin.h
new file mode 100644
index 000000000000..6957498996c8
--- /dev/null
+++ b/clang/lib/Headers/cmpccxaddintrin.h
@@ -0,0 +1,70 @@
+/*===--------------- cmpccxaddintrin.h - CMPCCXADD intrinsics--------------===
+ *
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+#ifndef __X86GPRINTRIN_H
+#error \
+ "Never use <cmpccxaddintrin.h> directly; include <x86gprintrin.h> instead."
+#endif // __X86GPRINTRIN_H
+
+#ifndef __CMPCCXADDINTRIN_H
+#define __CMPCCXADDINTRIN_H
+#ifdef __x86_64__
+
+typedef enum {
+ _CMPCCX_O, /* Overflow. */
+ _CMPCCX_NO, /* No overflow. */
+ _CMPCCX_B, /* Below. */
+ _CMPCCX_NB, /* Not below. */
+ _CMPCCX_Z, /* Zero. */
+ _CMPCCX_NZ, /* Not zero. */
+ _CMPCCX_BE, /* Below or equal. */
+ _CMPCCX_NBE, /* Neither below nor equal. */
+ _CMPCCX_S, /* Sign. */
+ _CMPCCX_NS, /* No sign. */
+ _CMPCCX_P, /* Parity. */
+ _CMPCCX_NP, /* No parity. */
+ _CMPCCX_L, /* Less. */
+ _CMPCCX_NL, /* Not less. */
+ _CMPCCX_LE, /* Less or equal. */
+ _CMPCCX_NLE, /* Neither less nor equal. */
+} _CMPCCX_ENUM;
+
+/// Compares the value at the memory location \a __A with the value of \a __B.
+/// If the specified condition \a __D is met, adds the third operand \a __C to
+/// the value at \a __A and writes the sum back to \a __A; otherwise the memory
+/// is left unchanged. Returns the original value at \a __A.
+///
+/// \headerfile <immintrin.h>
+///
+/// This intrinsic corresponds to the \c CMPCCXADD instructions.
+///
+/// \param __A
+///    A pointer to the memory location to be compared and conditionally
+///    updated.
+///
+/// \param __B
+///    An integer operand to compare against.
+///
+/// \param __C
+///    An integer operand to add when the condition is met.
+///
+/// \param __D
+///    The condition code to test.
+///
+/// \returns An integer holding the original value of the first operand.
+
+#define _cmpccxadd_epi32(__A, __B, __C, __D) \
+ ((int)(__builtin_ia32_cmpccxadd32((void *)(__A), (int)(__B), (int)(__C), \
+ (int)(__D))))
+
+#define _cmpccxadd_epi64(__A, __B, __C, __D) \
+ ((long long)(__builtin_ia32_cmpccxadd64((void *)(__A), (long long)(__B), \
+ (long long)(__C), (int)(__D))))
+
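+// A usage sketch (hypothetical, not part of this header): CMPCCXADD allows a
+// bounded increment to be expressed as a single atomic operation instead of a
+// compare-and-swap loop:
+//
+//   static inline int bounded_inc(int *__ctr, int __limit) {
+//     // Adds 1 only while *__ctr compares less than __limit; returns the
+//     // value *__ctr held before the operation.
+//     return _cmpccxadd_epi32(__ctr, __limit, 1, _CMPCCX_L);
+//   }
+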
+#endif // __x86_64__
+#endif // __CMPCCXADDINTRIN_H
diff --git a/clang/lib/Headers/cpuid.h b/clang/lib/Headers/cpuid.h
index 5d262a60735f..1ad6853a97c9 100644
--- a/clang/lib/Headers/cpuid.h
+++ b/clang/lib/Headers/cpuid.h
@@ -200,9 +200,18 @@
#define bit_AMXINT8 0x02000000
/* Features in %eax for leaf 7 sub-leaf 1 */
+#define bit_RAOINT 0x00000008
#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
+#define bit_CMPCCXADD 0x00000080
+#define bit_AMXFP16 0x00200000
#define bit_HRESET 0x00400000
+#define bit_AVXIFMA 0x00800000
+
+/* Features in %edx for leaf 7 sub-leaf 1 */
+#define bit_AVXVNNIINT8 0x00000010
+#define bit_AVXNECONVERT 0x00000020
+#define bit_PREFETCHI 0x00004000
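+
+/* A detection sketch (illustrative, not part of this header): the bits above
+   live in CPUID leaf 7 sub-leaf 1, so they are queried with
+   __get_cpuid_count rather than __get_cpuid, e.g.:
+
+     unsigned int __eax, __ebx, __ecx, __edx;
+     int __has_avxifma =
+         __get_cpuid_count(7, 1, &__eax, &__ebx, &__ecx, &__edx) &&
+         (__eax & bit_AVXIFMA);
+*/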
/* Features in %eax for leaf 13 sub-leaf 1 */
#define bit_XSAVEOPT 0x00000001
@@ -232,6 +241,7 @@
/* Features in %ebx for leaf 0x80000008 */
#define bit_CLZERO 0x00000001
+#define bit_RDPRU 0x00000010
#define bit_WBNOINVD 0x00000200
@@ -260,7 +270,8 @@
: "0"(__leaf), "2"(__count))
#endif
-static __inline int __get_cpuid_max (unsigned int __leaf, unsigned int *__sig)
+static __inline unsigned int __get_cpuid_max (unsigned int __leaf,
+ unsigned int *__sig)
{
unsigned int __eax, __ebx, __ecx, __edx;
#if __i386__
diff --git a/clang/lib/Headers/cuda_wrappers/cmath b/clang/lib/Headers/cuda_wrappers/cmath
new file mode 100644
index 000000000000..45f89beec9b4
--- /dev/null
+++ b/clang/lib/Headers/cuda_wrappers/cmath
@@ -0,0 +1,90 @@
+/*===---- cmath - CUDA wrapper for <cmath> ---------------------------------===
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_CUDA_WRAPPERS_CMATH
+#define __CLANG_CUDA_WRAPPERS_CMATH
+
+#include_next <cmath>
+
+#if defined(_LIBCPP_STD_VER)
+
+// libc++ will need long double variants of these functions, but CUDA does not
+// provide them. We'll provide their declarations, which should allow the
+// headers to parse, but would not allow accidental use of them on a GPU.
+
+__attribute__((device)) long double logb(long double);
+__attribute__((device)) long double scalbn(long double, int);
+
+namespace std {
+
+// For __constexpr_fmin/fmax we only need device-side overloads before C++14
+// where they are not constexpr.
+#if _LIBCPP_STD_VER < 14
+
+__attribute__((device))
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 float __constexpr_fmax(float __x, float __y) _NOEXCEPT {
+ return __builtin_fmaxf(__x, __y);
+}
+
+__attribute__((device))
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 double __constexpr_fmax(double __x, double __y) _NOEXCEPT {
+ return __builtin_fmax(__x, __y);
+}
+
+__attribute__((device))
+inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 long double
+__constexpr_fmax(long double __x, long double __y) _NOEXCEPT {
+ return __builtin_fmaxl(__x, __y);
+}
+
+template <class _Tp, class _Up, __enable_if_t<is_arithmetic<_Tp>::value && is_arithmetic<_Up>::value, int> = 0>
+__attribute__((device))
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 typename __promote<_Tp, _Up>::type
+__constexpr_fmax(_Tp __x, _Up __y) _NOEXCEPT {
+ using __result_type = typename __promote<_Tp, _Up>::type;
+ return std::__constexpr_fmax(static_cast<__result_type>(__x), static_cast<__result_type>(__y));
+}
+#endif // _LIBCPP_STD_VER < 14
+
+// For logb/scalbn templates we must always provide device overloads because
+// libc++ implementation uses __builtin_XXX which gets translated into a libcall
+// which we can't handle on GPU. We need to forward those to CUDA-provided
+// implementations.
+
+template <class _Tp>
+__attribute__((device))
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __constexpr_logb(_Tp __x) {
+ return ::logb(__x);
+}
+
+template <class _Tp>
+__attribute__((device))
+_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp __constexpr_scalbn(_Tp __x, int __exp) {
+ return ::scalbn(__x, __exp);
+}
+
+} // namespace std
+
+#endif // _LIBCPP_STD_VER
+
+#endif // include guard
diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h
index a3f56e832b32..064d97493659 100644
--- a/clang/lib/Headers/emmintrin.h
+++ b/clang/lib/Headers/emmintrin.h
@@ -38,6 +38,16 @@ typedef unsigned char __v16qu __attribute__((__vector_size__(16)));
* appear in the interface though. */
typedef signed char __v16qs __attribute__((__vector_size__(16)));
+#ifdef __SSE2__
+/* Both _Float16 and __bf16 require SSE2 to be enabled. */
+typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16)));
+typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16)));
+typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1)));
+
+typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16)));
+typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16)));
+#endif
+
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS \
__attribute__((__always_inline__, __nodebug__, __target__("sse2"), \
@@ -1809,7 +1819,7 @@ static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setr_pd(double __w,
/// \returns An initialized 128-bit floating-point vector of [2 x double] with
/// all elements set to zero.
static __inline__ __m128d __DEFAULT_FN_ATTRS _mm_setzero_pd(void) {
- return __extension__(__m128d){0, 0};
+ return __extension__(__m128d){0.0, 0.0};
}
/// Constructs a 128-bit floating-point vector of [2 x double]. The lower
diff --git a/clang/lib/Headers/float.h b/clang/lib/Headers/float.h
index c6a6cc08462d..0e73bca0a2d6 100644
--- a/clang/lib/Headers/float.h
+++ b/clang/lib/Headers/float.h
@@ -38,9 +38,10 @@
# undef FLT_MANT_DIG
# undef DBL_MANT_DIG
# undef LDBL_MANT_DIG
-# if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
- __cplusplus >= 201103L || \
- (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ !defined(__STRICT_ANSI__) || \
+ (defined(__cplusplus) && __cplusplus >= 201103L) || \
+ (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef DECIMAL_DIG
# endif
# undef FLT_DIG
@@ -67,9 +68,10 @@
# undef FLT_MIN
# undef DBL_MIN
# undef LDBL_MIN
-# if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
- __cplusplus >= 201703L || \
- (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
+ !defined(__STRICT_ANSI__) || \
+ (defined(__cplusplus) && __cplusplus >= 201703L) || \
+ (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# undef FLT_TRUE_MIN
# undef DBL_TRUE_MIN
# undef LDBL_TRUE_MIN
@@ -84,7 +86,10 @@
/* Characteristics of floating point types, C99 5.2.4.2.2 */
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ (defined(__cplusplus) && __cplusplus >= 201103L)
#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__
+#endif
#define FLT_ROUNDS (__builtin_flt_rounds())
#define FLT_RADIX __FLT_RADIX__
@@ -92,8 +97,9 @@
#define DBL_MANT_DIG __DBL_MANT_DIG__
#define LDBL_MANT_DIG __LDBL_MANT_DIG__
-#if __STDC_VERSION__ >= 199901L || !defined(__STRICT_ANSI__) || \
- __cplusplus >= 201103L || \
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ !defined(__STRICT_ANSI__) || \
+ (defined(__cplusplus) && __cplusplus >= 201103L) || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define DECIMAL_DIG __DECIMAL_DIG__
#endif
@@ -130,8 +136,9 @@
#define DBL_MIN __DBL_MIN__
#define LDBL_MIN __LDBL_MIN__
-#if __STDC_VERSION__ >= 201112L || !defined(__STRICT_ANSI__) || \
- __cplusplus >= 201703L || \
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
+ !defined(__STRICT_ANSI__) || \
+ (defined(__cplusplus) && __cplusplus >= 201703L) || \
(__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE))
# define FLT_TRUE_MIN __FLT_DENORM_MIN__
# define DBL_TRUE_MIN __DBL_DENORM_MIN__
diff --git a/clang/lib/Headers/gfniintrin.h b/clang/lib/Headers/gfniintrin.h
index a59238b0b131..5ec53c54fc4e 100644
--- a/clang/lib/Headers/gfniintrin.h
+++ b/clang/lib/Headers/gfniintrin.h
@@ -20,10 +20,12 @@
/* Default attributes for YMM unmasked form. */
#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256)))
-/* Default attributes for ZMM forms. */
-#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
+/* Default attributes for ZMM unmasked forms. */
+#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512f,gfni"), __min_vector_width__(512)))
+/* Default attributes for ZMM masked forms. */
+#define __DEFAULT_FN_ATTRS_Z_MASK __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512)))
-/* Default attributes for VLX forms. */
+/* Default attributes for VLX masked forms. */
#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256)))
@@ -99,7 +101,7 @@ _mm512_gf2p8mul_epi8(__m512i __A, __m512i __B)
(__v64qi) __B);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
{
return (__m512i) __builtin_ia32_selectb_512(__U,
@@ -107,7 +109,7 @@ _mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B)
(__v64qi) __S);
}
-static __inline__ __m512i __DEFAULT_FN_ATTRS_Z
+static __inline__ __m512i __DEFAULT_FN_ATTRS_Z_MASK
_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B)
{
return _mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(),
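The attribute split above means the unmasked 512-bit GFNI forms now only require AVX512F, while the masked forms (which need the __mmask64 byte selects) still require AVX512BW. A hedged sketch of the distinction, assuming a GFNI-capable toolchain:

/* gfni_demo.c - illustrative only.
   The unmasked ZMM form compiles with just -mavx512f -mgfni;
   the masked form additionally needs -mavx512bw. */
#include <immintrin.h>

__m512i mul_gf2p8(__m512i a, __m512i b) {
  return _mm512_gf2p8mul_epi8(a, b);              /* avx512f + gfni */
}

__m512i mul_gf2p8_masked(__m512i src, __mmask64 m, __m512i a, __m512i b) {
  return _mm512_mask_gf2p8mul_epi8(src, m, a, b); /* avx512bw + gfni */
}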
diff --git a/clang/lib/Headers/hlsl/hlsl_basic_types.h b/clang/lib/Headers/hlsl/hlsl_basic_types.h
index e68715f1a6a4..9ea605cfa840 100644
--- a/clang/lib/Headers/hlsl/hlsl_basic_types.h
+++ b/clang/lib/Headers/hlsl/hlsl_basic_types.h
@@ -9,6 +9,7 @@
#ifndef _HLSL_HLSL_BASIC_TYPES_H_
#define _HLSL_HLSL_BASIC_TYPES_H_
+namespace hlsl {
// built-in scalar data types:
#ifdef __HLSL_ENABLE_16_BIT
@@ -61,4 +62,6 @@ typedef vector<double, 2> double2;
typedef vector<double, 3> double3;
typedef vector<double, 4> double4;
+} // namespace hlsl
+
#endif //_HLSL_HLSL_BASIC_TYPES_H_
diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
index b5cdb8b44970..d811a28a4335 100644
--- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h
+++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h
@@ -9,7 +9,215 @@
#ifndef _HLSL_HLSL_INTRINSICS_H_
#define _HLSL_HLSL_INTRINSICS_H_
+namespace hlsl {
+
+__attribute__((availability(shadermodel, introduced = 6.0)))
__attribute__((clang_builtin_alias(__builtin_hlsl_wave_active_count_bits))) uint
WaveActiveCountBits(bool bBit);
+// abs builtins
+#ifdef __HLSL_ENABLE_16_BIT
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+int16_t abs(int16_t);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+int16_t2 abs(int16_t2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+int16_t3 abs(int16_t3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+int16_t4 abs(int16_t4);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs))) half abs(half);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+half2 abs(half2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+half3 abs(half3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+half4 abs(half4);
+#endif
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs))) int abs(int);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs))) int2 abs(int2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs))) int3 abs(int3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs))) int4 abs(int4);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs))) float
+abs(float);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+float2 abs(float2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+float3 abs(float3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+float4 abs(float4);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+int64_t abs(int64_t);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+int64_t2 abs(int64_t2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+int64_t3 abs(int64_t3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+int64_t4 abs(int64_t4);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs))) double
+abs(double);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+double2 abs(double2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+double3 abs(double3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_abs)))
+double4 abs(double4);
+
+// sqrt builtins
+__attribute__((clang_builtin_alias(__builtin_sqrt))) double sqrt(double In);
+__attribute__((clang_builtin_alias(__builtin_sqrtf))) float sqrt(float In);
+
+#ifdef __HLSL_ENABLE_16_BIT
+__attribute__((clang_builtin_alias(__builtin_sqrtf16))) half sqrt(half In);
+#endif
+
+// ceil builtins
+#ifdef __HLSL_ENABLE_16_BIT
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil)))
+half ceil(half);
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil)))
+half2 ceil(half2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil)))
+half3 ceil(half3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil)))
+half4 ceil(half4);
+#endif
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil))) float
+ceil(float);
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil)))
+float2 ceil(float2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil)))
+float3 ceil(float3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil)))
+float4 ceil(float4);
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil))) double
+ceil(double);
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil)))
+double2 ceil(double2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil)))
+double3 ceil(double3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_ceil)))
+double4 ceil(double4);
+
+// floor builtins
+#ifdef __HLSL_ENABLE_16_BIT
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor)))
+half floor(half);
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor)))
+half2 floor(half2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor)))
+half3 floor(half3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor)))
+half4 floor(half4);
+#endif
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor))) float
+floor(float);
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor)))
+float2 floor(float2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor)))
+float3 floor(float3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor)))
+float4 floor(float4);
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor))) double
+floor(double);
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor)))
+double2 floor(double2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor)))
+double3 floor(double3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_floor)))
+double4 floor(double4);
+
+// cos builtins
+#ifdef __HLSL_ENABLE_16_BIT
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos))) half cos(half);
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos)))
+half2 cos(half2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos)))
+half3 cos(half3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos)))
+half4 cos(half4);
+#endif
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos))) float
+cos(float);
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos)))
+float2 cos(float2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos)))
+float3 cos(float3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos)))
+float4 cos(float4);
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos))) double
+cos(double);
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos)))
+double2 cos(double2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos)))
+double3 cos(double3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_cos)))
+double4 cos(double4);
+
+// sin builtins
+#ifdef __HLSL_ENABLE_16_BIT
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin))) half sin(half);
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin)))
+half2 sin(half2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin)))
+half3 sin(half3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin)))
+half4 sin(half4);
+#endif
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin))) float
+sin(float);
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin)))
+float2 sin(float2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin)))
+float3 sin(float3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin)))
+float4 sin(float4);
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin))) double
+sin(double);
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin)))
+double2 sin(double2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin)))
+double3 sin(double3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_sin)))
+double4 sin(double4);
+
+// trunc builtins
+#ifdef __HLSL_ENABLE_16_BIT
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc)))
+half trunc(half);
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc)))
+half2 trunc(half2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc)))
+half3 trunc(half3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc)))
+half4 trunc(half4);
+#endif
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc))) float
+trunc(float);
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc)))
+float2 trunc(float2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc)))
+float3 trunc(float3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc)))
+float4 trunc(float4);
+
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc))) double
+trunc(double);
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc)))
+double2 trunc(double2);
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc)))
+double3 trunc(double3);
+__attribute__((clang_builtin_alias(__builtin_elementwise_trunc)))
+double4 trunc(double4);
+
+} // namespace hlsl
#endif //_HLSL_HLSL_INTRINSICS_H_
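Each overload above is a plain declaration that clang_builtin_alias redirects to a clang builtin, so a call like abs(int4) lowers straight to __builtin_elementwise_abs. A hedged C sketch of what those aliases ultimately lower to (clang-only; int4 and demo are illustrative names):

/* elementwise_demo.c - what the aliased HLSL overloads resolve to. */
typedef int int4 __attribute__((ext_vector_type(4)));

int4 demo(int4 v) {
  return __builtin_elementwise_abs(v); /* per-lane |v| */
}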
diff --git a/clang/lib/Headers/immintrin.h b/clang/lib/Headers/immintrin.h
index e4d7a799b1ca..6967b46fdb24 100644
--- a/clang/lib/Headers/immintrin.h
+++ b/clang/lib/Headers/immintrin.h
@@ -190,6 +190,11 @@
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVXIFMA__)
+#include <avxifmaintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX512VBMI__)
#include <avx512vbmiintrin.h>
#endif
@@ -214,17 +219,13 @@
#include <avx512pfintrin.h>
#endif
-/*
- * FIXME: _Float16 type is legal only when HW support float16 operation.
- * We use __AVX512FP16__ to identify if float16 is supported or not, so
- * when float16 is not supported, the related header is not included.
- *
- */
-#if defined(__AVX512FP16__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVX512FP16__)
#include <avx512fp16intrin.h>
#endif
-#if defined(__AVX512FP16__) && defined(__AVX512VL__)
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ (defined(__AVX512VL__) && defined(__AVX512FP16__))
#include <avx512vlfp16intrin.h>
#endif
@@ -259,6 +260,16 @@
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVXVNNIINT8__)
+#include <avxvnniint8intrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AVXNECONVERT__)
+#include <avxneconvertintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__RDPID__)
/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103).
///
@@ -291,6 +302,23 @@ _rdrand64_step(unsigned long long *__p)
{
return (int)__builtin_ia32_rdrand64_step(__p);
}
+#else
+// We need to emulate the functionality of 64-bit rdrand with 2 32-bit
+// rdrand instructions.
+static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd")))
+_rdrand64_step(unsigned long long *__p)
+{
+ unsigned int __lo, __hi;
+ unsigned int __res_lo = __builtin_ia32_rdrand32_step(&__lo);
+ unsigned int __res_hi = __builtin_ia32_rdrand32_step(&__hi);
+ if (__res_lo && __res_hi) {
+ *__p = ((unsigned long long)__hi << 32) | (unsigned long long)__lo;
+ return 1;
+ } else {
+ *__p = 0;
+ return 0;
+ }
+}
#endif
#endif /* __RDRND__ */
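With the emulation above, _rdrand64_step is now usable on 32-bit x86 as well, where it is synthesized from two 32-bit RDRAND steps. A hedged usage sketch (assumes an x86 target built with -mrdrnd); since RDRAND can transiently fail, callers should retry:

/* rdrand_demo.c - illustrative only. */
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned long long r;
  int tries = 10;
  while (tries-- && !_rdrand64_step(&r))
    ; /* _rdrand64_step returns 0 on (rare) hardware underflow */
  if (tries >= 0)
    printf("random: %llu\n", r);
  return 0;
}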
@@ -495,6 +523,10 @@ _storebe_i64(void * __P, long long __D) {
defined(__INVPCID__)
#include <invpcidintrin.h>
#endif
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__AMXFP16__)
+#include <amxfp16intrin.h>
+#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__KL__) || defined(__WIDEKL__)
diff --git a/clang/lib/Headers/larchintrin.h b/clang/lib/Headers/larchintrin.h
new file mode 100644
index 000000000000..c5c533ee0b8c
--- /dev/null
+++ b/clang/lib/Headers/larchintrin.h
@@ -0,0 +1,234 @@
+/*===------------ larchintrin.h - LoongArch intrinsics ---------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef _LOONGARCH_BASE_INTRIN_H
+#define _LOONGARCH_BASE_INTRIN_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct rdtime {
+ unsigned int value;
+ unsigned int timeid;
+} __rdtime_t;
+
+#if __loongarch_grlen == 64
+typedef struct drdtime {
+ unsigned long dvalue;
+ unsigned long dtimeid;
+} __drdtime_t;
+
+extern __inline __drdtime_t
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __rdtime_d(void) {
+ __drdtime_t __drdtime;
+ __asm__ volatile(
+ "rdtime.d %[val], %[tid]\n\t"
+ : [val] "=&r"(__drdtime.dvalue), [tid] "=&r"(__drdtime.dtimeid));
+ return __drdtime;
+}
+#endif
+
+extern __inline __rdtime_t
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __rdtimeh_w(void) {
+ __rdtime_t __rdtime;
+ __asm__ volatile("rdtimeh.w %[val], %[tid]\n\t"
+ : [val] "=&r"(__rdtime.value), [tid] "=&r"(__rdtime.timeid));
+ return __rdtime;
+}
+
+extern __inline __rdtime_t
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __rdtimel_w(void) {
+ __rdtime_t __rdtime;
+ __asm__ volatile("rdtimel.w %[val], %[tid]\n\t"
+ : [val] "=&r"(__rdtime.value), [tid] "=&r"(__rdtime.timeid));
+ return __rdtime;
+}
+
+#if __loongarch_grlen == 64
+extern __inline int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __crc_w_b_w(char _1, int _2) {
+ return (int)__builtin_loongarch_crc_w_b_w((char)_1, (int)_2);
+}
+
+extern __inline int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __crc_w_h_w(short _1, int _2) {
+ return (int)__builtin_loongarch_crc_w_h_w((short)_1, (int)_2);
+}
+
+extern __inline int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __crc_w_w_w(int _1, int _2) {
+ return (int)__builtin_loongarch_crc_w_w_w((int)_1, (int)_2);
+}
+
+extern __inline int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __crc_w_d_w(long int _1, int _2) {
+ return (int)__builtin_loongarch_crc_w_d_w((long int)_1, (int)_2);
+}
+
+extern __inline int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __crcc_w_b_w(char _1, int _2) {
+ return (int)__builtin_loongarch_crcc_w_b_w((char)_1, (int)_2);
+}
+
+extern __inline int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __crcc_w_h_w(short _1, int _2) {
+ return (int)__builtin_loongarch_crcc_w_h_w((short)_1, (int)_2);
+}
+
+extern __inline int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __crcc_w_w_w(int _1, int _2) {
+ return (int)__builtin_loongarch_crcc_w_w_w((int)_1, (int)_2);
+}
+
+extern __inline int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __crcc_w_d_w(long int _1, int _2) {
+ return (int)__builtin_loongarch_crcc_w_d_w((long int)_1, (int)_2);
+}
+#endif
+
+#define __break(/*ui15*/ _1) __builtin_loongarch_break((_1))
+
+#if __loongarch_grlen == 32
+#define __cacop_w(/*uimm5*/ _1, /*unsigned int*/ _2, /*simm12*/ _3) \
+ ((void)__builtin_loongarch_cacop_w((_1), (unsigned int)(_2), (_3)))
+#endif
+
+#if __loongarch_grlen == 64
+#define __cacop_d(/*uimm5*/ _1, /*unsigned long int*/ _2, /*simm12*/ _3) \
+ ((void)__builtin_loongarch_cacop_d((_1), (unsigned long int)(_2), (_3)))
+#endif
+
+#define __dbar(/*ui15*/ _1) __builtin_loongarch_dbar((_1))
+
+#define __ibar(/*ui15*/ _1) __builtin_loongarch_ibar((_1))
+
+#define __movfcsr2gr(/*ui5*/ _1) __builtin_loongarch_movfcsr2gr((_1))
+
+#define __movgr2fcsr(/*ui5*/ _1, _2) \
+  __builtin_loongarch_movgr2fcsr((_1), (unsigned int)(_2))
+
+#define __syscall(/*ui15*/ _1) __builtin_loongarch_syscall((_1))
+
+#define __csrrd_w(/*ui14*/ _1) ((unsigned int)__builtin_loongarch_csrrd_w((_1)))
+
+#define __csrwr_w(/*unsigned int*/ _1, /*ui14*/ _2) \
+ ((unsigned int)__builtin_loongarch_csrwr_w((unsigned int)(_1), (_2)))
+
+#define __csrxchg_w(/*unsigned int*/ _1, /*unsigned int*/ _2, /*ui14*/ _3) \
+ ((unsigned int)__builtin_loongarch_csrxchg_w((unsigned int)(_1), \
+ (unsigned int)(_2), (_3)))
+
+#if __loongarch_grlen == 64
+#define __csrrd_d(/*ui14*/ _1) \
+ ((unsigned long int)__builtin_loongarch_csrrd_d((_1)))
+
+#define __csrwr_d(/*unsigned long int*/ _1, /*ui14*/ _2) \
+ ((unsigned long int)__builtin_loongarch_csrwr_d((unsigned long int)(_1), \
+ (_2)))
+
+#define __csrxchg_d(/*unsigned long int*/ _1, /*unsigned long int*/ _2, \
+ /*ui14*/ _3) \
+ ((unsigned long int)__builtin_loongarch_csrxchg_d( \
+ (unsigned long int)(_1), (unsigned long int)(_2), (_3)))
+#endif
+
+extern __inline unsigned char
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __iocsrrd_b(unsigned int _1) {
+ return (unsigned char)__builtin_loongarch_iocsrrd_b((unsigned int)_1);
+}
+
+extern __inline unsigned short
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __iocsrrd_h(unsigned int _1) {
+ return (unsigned short)__builtin_loongarch_iocsrrd_h((unsigned int)_1);
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __iocsrrd_w(unsigned int _1) {
+ return (unsigned int)__builtin_loongarch_iocsrrd_w((unsigned int)_1);
+}
+
+#if __loongarch_grlen == 64
+extern __inline unsigned long int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __iocsrrd_d(unsigned int _1) {
+ return (unsigned long int)__builtin_loongarch_iocsrrd_d((unsigned int)_1);
+}
+#endif
+
+extern __inline void
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __iocsrwr_b(unsigned char _1, unsigned int _2) {
+ __builtin_loongarch_iocsrwr_b((unsigned char)_1, (unsigned int)_2);
+}
+
+extern __inline void
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __iocsrwr_h(unsigned short _1, unsigned int _2) {
+ __builtin_loongarch_iocsrwr_h((unsigned short)_1, (unsigned int)_2);
+}
+
+extern __inline void
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __iocsrwr_w(unsigned int _1, unsigned int _2) {
+ __builtin_loongarch_iocsrwr_w((unsigned int)_1, (unsigned int)_2);
+}
+
+extern __inline unsigned int
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __cpucfg(unsigned int _1) {
+ return (unsigned int)__builtin_loongarch_cpucfg((unsigned int)_1);
+}
+
+#if __loongarch_grlen == 64
+extern __inline void
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __iocsrwr_d(unsigned long int _1, unsigned int _2) {
+ __builtin_loongarch_iocsrwr_d((unsigned long int)_1, (unsigned int)_2);
+}
+
+extern __inline void
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __asrtgt_d(long int _1, long int _2) {
+ __builtin_loongarch_asrtgt_d((long int)_1, (long int)_2);
+}
+
+extern __inline void
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+ __asrtle_d(long int _1, long int _2) {
+ __builtin_loongarch_asrtle_d((long int)_1, (long int)_2);
+}
+#endif
+
+#if __loongarch_grlen == 64
+#define __lddir_d(/*long int*/ _1, /*ui5*/ _2) \
+ ((long int)__builtin_loongarch_lddir_d((long int)(_1), (_2)))
+
+#define __ldpte_d(/*long int*/ _1, /*ui5*/ _2) \
+ ((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+#endif /* _LOONGARCH_BASE_INTRIN_H */
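A hedged usage sketch of the new LoongArch base intrinsics (illustrative only; requires a loongarch64 target, e.g. --target=loongarch64-unknown-linux-gnu):

/* larch_demo.c */
#include <larchintrin.h>
#include <stdio.h>

int main(void) {
  __drdtime_t t = __rdtime_d();         /* stable counter value + counter id */
  int crc = __crc_w_w_w(0x12345678, 0); /* CRC32 accumulate one word */
  __dbar(0);                            /* full data barrier */
  printf("counter=%lu id=%lu crc=%d\n", t.dvalue, t.dtimeid, crc);
  return 0;
}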
diff --git a/clang/lib/Headers/limits.h b/clang/lib/Headers/limits.h
index cfd23a219ee5..32cc901b26be 100644
--- a/clang/lib/Headers/limits.h
+++ b/clang/lib/Headers/limits.h
@@ -65,7 +65,7 @@
/* C2x 5.2.4.2.1 */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define BOOL_WIDTH __BOOL_WIDTH__
#define CHAR_WIDTH CHAR_BIT
#define SCHAR_WIDTH CHAR_BIT
@@ -93,7 +93,8 @@
/* C99 5.2.4.2.1: Added long long.
C++11 18.3.3.2: same contents as the Standard C Library header <limits.h>.
*/
-#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ (defined(__cplusplus) && __cplusplus >= 201103L)
#undef LLONG_MIN
#undef LLONG_MAX
diff --git a/clang/lib/Headers/opencl-c-base.h b/clang/lib/Headers/opencl-c-base.h
index c433b4f7eb1a..fad2f9c0272b 100644
--- a/clang/lib/Headers/opencl-c-base.h
+++ b/clang/lib/Headers/opencl-c-base.h
@@ -74,6 +74,25 @@
#define __opencl_c_atomic_scope_all_devices 1
#define __opencl_c_read_write_images 1
#endif // defined(__SPIR__)
+
+// Undefine any feature macros that have been explicitly disabled using
+// an __undef_<feature> macro.
+#ifdef __undef___opencl_c_work_group_collective_functions
+#undef __opencl_c_work_group_collective_functions
+#endif
+#ifdef __undef___opencl_c_atomic_order_seq_cst
+#undef __opencl_c_atomic_order_seq_cst
+#endif
+#ifdef __undef___opencl_c_atomic_scope_device
+#undef __opencl_c_atomic_scope_device
+#endif
+#ifdef __undef___opencl_c_atomic_scope_all_devices
+#undef __opencl_c_atomic_scope_all_devices
+#endif
+#ifdef __undef___opencl_c_read_write_images
+#undef __opencl_c_read_write_images
+#endif
+
#endif // (__OPENCL_CPP_VERSION__ == 202100 || __OPENCL_C_VERSION__ == 300)
#if !defined(__opencl_c_generic_address_space)
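The __undef_ hooks above let a frontend or driver mask an optional OpenCL 3.0 feature even on targets that normally report it. A hedged sketch of the effect (illustrative compile line; the kernel name is hypothetical): building with -cl-std=CL3.0 -D__undef___opencl_c_read_write_images removes the read-write image declarations, so kernels can feature-test instead of assuming support.

#if defined(__opencl_c_read_write_images)
kernel void touch(read_write image2d_t img) {
  int2 c = (int2)(0, 0);
  write_imagef(img, c, read_imagef(img, c));
}
#endif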
diff --git a/clang/lib/Headers/opencl-c.h b/clang/lib/Headers/opencl-c.h
index 72a6bfeafd6a..288bb18bc654 100644
--- a/clang/lib/Headers/opencl-c.h
+++ b/clang/lib/Headers/opencl-c.h
@@ -12396,11 +12396,11 @@ void __ovld vstorea_half16_rtn(double16, size_t, __private half *);
* image objects and then want to read the updated data.
*/
-void __ovld __conv barrier(cl_mem_fence_flags flags);
+void __ovld __conv barrier(cl_mem_fence_flags);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
-void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope);
-void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags, memory_scope);
+void __ovld __conv work_group_barrier(cl_mem_fence_flags);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
// OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions
@@ -12418,7 +12418,7 @@ void __ovld __conv work_group_barrier(cl_mem_fence_flags flags);
* CLK_LOCAL_MEM_FENCE
* CLK_GLOBAL_MEM_FENCE.
*/
-void __ovld mem_fence(cl_mem_fence_flags flags);
+void __ovld mem_fence(cl_mem_fence_flags);
/**
* Read memory barrier that orders only
@@ -12430,7 +12430,7 @@ void __ovld mem_fence(cl_mem_fence_flags flags);
* CLK_LOCAL_MEM_FENCE
* CLK_GLOBAL_MEM_FENCE.
*/
-void __ovld read_mem_fence(cl_mem_fence_flags flags);
+void __ovld read_mem_fence(cl_mem_fence_flags);
/**
* Write memory barrier that orders only
@@ -12442,7 +12442,7 @@ void __ovld read_mem_fence(cl_mem_fence_flags flags);
* CLK_LOCAL_MEM_FENCE
* CLK_GLOBAL_MEM_FENCE.
*/
-void __ovld write_mem_fence(cl_mem_fence_flags flags);
+void __ovld write_mem_fence(cl_mem_fence_flags);
// OpenCL v2.0 s6.13.9 - Address Space Qualifier Functions
@@ -12891,29 +12891,29 @@ void __ovld prefetch(const __global half16 *, size_t);
* (old + val) and store result at location
* pointed by p. The function returns old.
*/
-int __ovld atomic_add(volatile __global int *p, int val);
-uint __ovld atomic_add(volatile __global uint *p, uint val);
-int __ovld atomic_add(volatile __local int *p, int val);
-uint __ovld atomic_add(volatile __local uint *p, uint val);
+int __ovld atomic_add(volatile __global int *, int);
+uint __ovld atomic_add(volatile __global uint *, uint);
+int __ovld atomic_add(volatile __local int *, int);
+uint __ovld atomic_add(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
-int __ovld atomic_add(volatile int *p, int val);
-uint __ovld atomic_add(volatile uint *p, uint val);
+int __ovld atomic_add(volatile int *, int);
+uint __ovld atomic_add(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_base_atomics)
-int __ovld atom_add(volatile __global int *p, int val);
-uint __ovld atom_add(volatile __global uint *p, uint val);
+int __ovld atom_add(volatile __global int *, int);
+uint __ovld atom_add(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
-int __ovld atom_add(volatile __local int *p, int val);
-uint __ovld atom_add(volatile __local uint *p, uint val);
+int __ovld atom_add(volatile __local int *, int);
+uint __ovld atom_add(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_base_atomics)
-long __ovld atom_add(volatile __global long *p, long val);
-ulong __ovld atom_add(volatile __global ulong *p, ulong val);
-long __ovld atom_add(volatile __local long *p, long val);
-ulong __ovld atom_add(volatile __local ulong *p, ulong val);
+long __ovld atom_add(volatile __global long *, long);
+ulong __ovld atom_add(volatile __global ulong *, ulong);
+long __ovld atom_add(volatile __local long *, long);
+ulong __ovld atom_add(volatile __local ulong *, ulong);
#endif
/**
@@ -12921,29 +12921,29 @@ ulong __ovld atom_add(volatile __local ulong *p, ulong val);
* Compute (old - val) and store result at location pointed by p. The function
* returns old.
*/
-int __ovld atomic_sub(volatile __global int *p, int val);
-uint __ovld atomic_sub(volatile __global uint *p, uint val);
-int __ovld atomic_sub(volatile __local int *p, int val);
-uint __ovld atomic_sub(volatile __local uint *p, uint val);
+int __ovld atomic_sub(volatile __global int *, int);
+uint __ovld atomic_sub(volatile __global uint *, uint);
+int __ovld atomic_sub(volatile __local int *, int);
+uint __ovld atomic_sub(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
-int __ovld atomic_sub(volatile int *p, int val);
-uint __ovld atomic_sub(volatile uint *p, uint val);
+int __ovld atomic_sub(volatile int *, int);
+uint __ovld atomic_sub(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_base_atomics)
-int __ovld atom_sub(volatile __global int *p, int val);
-uint __ovld atom_sub(volatile __global uint *p, uint val);
+int __ovld atom_sub(volatile __global int *, int);
+uint __ovld atom_sub(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
-int __ovld atom_sub(volatile __local int *p, int val);
-uint __ovld atom_sub(volatile __local uint *p, uint val);
+int __ovld atom_sub(volatile __local int *, int);
+uint __ovld atom_sub(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_base_atomics)
-long __ovld atom_sub(volatile __global long *p, long val);
-ulong __ovld atom_sub(volatile __global ulong *p, ulong val);
-long __ovld atom_sub(volatile __local long *p, long val);
-ulong __ovld atom_sub(volatile __local ulong *p, ulong val);
+long __ovld atom_sub(volatile __global long *, long);
+ulong __ovld atom_sub(volatile __global ulong *, ulong);
+long __ovld atom_sub(volatile __local long *, long);
+ulong __ovld atom_sub(volatile __local ulong *, ulong);
#endif
/**
@@ -12951,32 +12951,32 @@ ulong __ovld atom_sub(volatile __local ulong *p, ulong val);
* with new value given by val. Returns old
* value.
*/
-int __ovld atomic_xchg(volatile __global int *p, int val);
-uint __ovld atomic_xchg(volatile __global uint *p, uint val);
-int __ovld atomic_xchg(volatile __local int *p, int val);
-uint __ovld atomic_xchg(volatile __local uint *p, uint val);
-float __ovld atomic_xchg(volatile __global float *p, float val);
-float __ovld atomic_xchg(volatile __local float *p, float val);
+int __ovld atomic_xchg(volatile __global int *, int);
+uint __ovld atomic_xchg(volatile __global uint *, uint);
+int __ovld atomic_xchg(volatile __local int *, int);
+uint __ovld atomic_xchg(volatile __local uint *, uint);
+float __ovld atomic_xchg(volatile __global float *, float);
+float __ovld atomic_xchg(volatile __local float *, float);
#ifdef __OPENCL_CPP_VERSION__
-int __ovld atomic_xchg(volatile int *p, int val);
-uint __ovld atomic_xchg(volatile uint *p, uint val);
-float __ovld atomic_xchg(volatile float *p, float val);
+int __ovld atomic_xchg(volatile int *, int);
+uint __ovld atomic_xchg(volatile uint *, uint);
+float __ovld atomic_xchg(volatile float *, float);
#endif
#if defined(cl_khr_global_int32_base_atomics)
-int __ovld atom_xchg(volatile __global int *p, int val);
-uint __ovld atom_xchg(volatile __global uint *p, uint val);
+int __ovld atom_xchg(volatile __global int *, int);
+uint __ovld atom_xchg(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
-int __ovld atom_xchg(volatile __local int *p, int val);
-uint __ovld atom_xchg(volatile __local uint *p, uint val);
+int __ovld atom_xchg(volatile __local int *, int);
+uint __ovld atom_xchg(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_base_atomics)
-long __ovld atom_xchg(volatile __global long *p, long val);
-long __ovld atom_xchg(volatile __local long *p, long val);
-ulong __ovld atom_xchg(volatile __global ulong *p, ulong val);
-ulong __ovld atom_xchg(volatile __local ulong *p, ulong val);
+long __ovld atom_xchg(volatile __global long *, long);
+long __ovld atom_xchg(volatile __local long *, long);
+ulong __ovld atom_xchg(volatile __global ulong *, ulong);
+ulong __ovld atom_xchg(volatile __local ulong *, ulong);
#endif
/**
@@ -13048,29 +13048,29 @@ ulong __ovld atom_dec(volatile __local ulong *);
* location pointed by p. The function
* returns old.
*/
-int __ovld atomic_cmpxchg(volatile __global int *p, int cmp, int val);
-uint __ovld atomic_cmpxchg(volatile __global uint *p, uint cmp, uint val);
-int __ovld atomic_cmpxchg(volatile __local int *p, int cmp, int val);
-uint __ovld atomic_cmpxchg(volatile __local uint *p, uint cmp, uint val);
+int __ovld atomic_cmpxchg(volatile __global int *, int, int);
+uint __ovld atomic_cmpxchg(volatile __global uint *, uint, uint);
+int __ovld atomic_cmpxchg(volatile __local int *, int, int);
+uint __ovld atomic_cmpxchg(volatile __local uint *, uint, uint);
#ifdef __OPENCL_CPP_VERSION__
-int __ovld atomic_cmpxchg(volatile int *p, int cmp, int val);
-uint __ovld atomic_cmpxchg(volatile uint *p, uint cmp, uint val);
+int __ovld atomic_cmpxchg(volatile int *, int, int);
+uint __ovld atomic_cmpxchg(volatile uint *, uint, uint);
#endif
#if defined(cl_khr_global_int32_base_atomics)
-int __ovld atom_cmpxchg(volatile __global int *p, int cmp, int val);
-uint __ovld atom_cmpxchg(volatile __global uint *p, uint cmp, uint val);
+int __ovld atom_cmpxchg(volatile __global int *, int, int);
+uint __ovld atom_cmpxchg(volatile __global uint *, uint, uint);
#endif
#if defined(cl_khr_local_int32_base_atomics)
-int __ovld atom_cmpxchg(volatile __local int *p, int cmp, int val);
-uint __ovld atom_cmpxchg(volatile __local uint *p, uint cmp, uint val);
+int __ovld atom_cmpxchg(volatile __local int *, int, int);
+uint __ovld atom_cmpxchg(volatile __local uint *, uint, uint);
#endif
#if defined(cl_khr_int64_base_atomics)
-long __ovld atom_cmpxchg(volatile __global long *p, long cmp, long val);
-ulong __ovld atom_cmpxchg(volatile __global ulong *p, ulong cmp, ulong val);
-long __ovld atom_cmpxchg(volatile __local long *p, long cmp, long val);
-ulong __ovld atom_cmpxchg(volatile __local ulong *p, ulong cmp, ulong val);
+long __ovld atom_cmpxchg(volatile __global long *, long, long);
+ulong __ovld atom_cmpxchg(volatile __global ulong *, ulong, ulong);
+long __ovld atom_cmpxchg(volatile __local long *, long, long);
+ulong __ovld atom_cmpxchg(volatile __local ulong *, ulong, ulong);
#endif
/**
@@ -13080,29 +13080,29 @@ ulong __ovld atom_cmpxchg(volatile __local ulong *p, ulong cmp, ulong val);
* location pointed by p. The function
* returns old.
*/
-int __ovld atomic_min(volatile __global int *p, int val);
-uint __ovld atomic_min(volatile __global uint *p, uint val);
-int __ovld atomic_min(volatile __local int *p, int val);
-uint __ovld atomic_min(volatile __local uint *p, uint val);
+int __ovld atomic_min(volatile __global int *, int);
+uint __ovld atomic_min(volatile __global uint *, uint);
+int __ovld atomic_min(volatile __local int *, int);
+uint __ovld atomic_min(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
-int __ovld atomic_min(volatile int *p, int val);
-uint __ovld atomic_min(volatile uint *p, uint val);
+int __ovld atomic_min(volatile int *, int);
+uint __ovld atomic_min(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
-int __ovld atom_min(volatile __global int *p, int val);
-uint __ovld atom_min(volatile __global uint *p, uint val);
+int __ovld atom_min(volatile __global int *, int);
+uint __ovld atom_min(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
-int __ovld atom_min(volatile __local int *p, int val);
-uint __ovld atom_min(volatile __local uint *p, uint val);
+int __ovld atom_min(volatile __local int *, int);
+uint __ovld atom_min(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_extended_atomics)
-long __ovld atom_min(volatile __global long *p, long val);
-ulong __ovld atom_min(volatile __global ulong *p, ulong val);
-long __ovld atom_min(volatile __local long *p, long val);
-ulong __ovld atom_min(volatile __local ulong *p, ulong val);
+long __ovld atom_min(volatile __global long *, long);
+ulong __ovld atom_min(volatile __global ulong *, ulong);
+long __ovld atom_min(volatile __local long *, long);
+ulong __ovld atom_min(volatile __local ulong *, ulong);
#endif
/**
@@ -13112,29 +13112,29 @@ ulong __ovld atom_min(volatile __local ulong *p, ulong val);
* location pointed by p. The function
* returns old.
*/
-int __ovld atomic_max(volatile __global int *p, int val);
-uint __ovld atomic_max(volatile __global uint *p, uint val);
-int __ovld atomic_max(volatile __local int *p, int val);
-uint __ovld atomic_max(volatile __local uint *p, uint val);
+int __ovld atomic_max(volatile __global int *, int);
+uint __ovld atomic_max(volatile __global uint *, uint);
+int __ovld atomic_max(volatile __local int *, int);
+uint __ovld atomic_max(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
-int __ovld atomic_max(volatile int *p, int val);
-uint __ovld atomic_max(volatile uint *p, uint val);
+int __ovld atomic_max(volatile int *, int);
+uint __ovld atomic_max(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
-int __ovld atom_max(volatile __global int *p, int val);
-uint __ovld atom_max(volatile __global uint *p, uint val);
+int __ovld atom_max(volatile __global int *, int);
+uint __ovld atom_max(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
-int __ovld atom_max(volatile __local int *p, int val);
-uint __ovld atom_max(volatile __local uint *p, uint val);
+int __ovld atom_max(volatile __local int *, int);
+uint __ovld atom_max(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_extended_atomics)
-long __ovld atom_max(volatile __global long *p, long val);
-ulong __ovld atom_max(volatile __global ulong *p, ulong val);
-long __ovld atom_max(volatile __local long *p, long val);
-ulong __ovld atom_max(volatile __local ulong *p, ulong val);
+long __ovld atom_max(volatile __global long *, long);
+ulong __ovld atom_max(volatile __global ulong *, ulong);
+long __ovld atom_max(volatile __local long *, long);
+ulong __ovld atom_max(volatile __local ulong *, ulong);
#endif
/**
@@ -13143,29 +13143,29 @@ ulong __ovld atom_max(volatile __local ulong *p, ulong val);
* (old & val) and store result at location
* pointed by p. The function returns old.
*/
-int __ovld atomic_and(volatile __global int *p, int val);
-uint __ovld atomic_and(volatile __global uint *p, uint val);
-int __ovld atomic_and(volatile __local int *p, int val);
-uint __ovld atomic_and(volatile __local uint *p, uint val);
+int __ovld atomic_and(volatile __global int *, int);
+uint __ovld atomic_and(volatile __global uint *, uint);
+int __ovld atomic_and(volatile __local int *, int);
+uint __ovld atomic_and(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
-int __ovld atomic_and(volatile int *p, int val);
-uint __ovld atomic_and(volatile uint *p, uint val);
+int __ovld atomic_and(volatile int *, int);
+uint __ovld atomic_and(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
-int __ovld atom_and(volatile __global int *p, int val);
-uint __ovld atom_and(volatile __global uint *p, uint val);
+int __ovld atom_and(volatile __global int *, int);
+uint __ovld atom_and(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
-int __ovld atom_and(volatile __local int *p, int val);
-uint __ovld atom_and(volatile __local uint *p, uint val);
+int __ovld atom_and(volatile __local int *, int);
+uint __ovld atom_and(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_extended_atomics)
-long __ovld atom_and(volatile __global long *p, long val);
-ulong __ovld atom_and(volatile __global ulong *p, ulong val);
-long __ovld atom_and(volatile __local long *p, long val);
-ulong __ovld atom_and(volatile __local ulong *p, ulong val);
+long __ovld atom_and(volatile __global long *, long);
+ulong __ovld atom_and(volatile __global ulong *, ulong);
+long __ovld atom_and(volatile __local long *, long);
+ulong __ovld atom_and(volatile __local ulong *, ulong);
#endif
/**
@@ -13174,29 +13174,29 @@ ulong __ovld atom_and(volatile __local ulong *p, ulong val);
* (old | val) and store result at location
* pointed by p. The function returns old.
*/
-int __ovld atomic_or(volatile __global int *p, int val);
-uint __ovld atomic_or(volatile __global uint *p, uint val);
-int __ovld atomic_or(volatile __local int *p, int val);
-uint __ovld atomic_or(volatile __local uint *p, uint val);
+int __ovld atomic_or(volatile __global int *, int);
+uint __ovld atomic_or(volatile __global uint *, uint);
+int __ovld atomic_or(volatile __local int *, int);
+uint __ovld atomic_or(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
-int __ovld atomic_or(volatile int *p, int val);
-uint __ovld atomic_or(volatile uint *p, uint val);
+int __ovld atomic_or(volatile int *, int);
+uint __ovld atomic_or(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
-int __ovld atom_or(volatile __global int *p, int val);
-uint __ovld atom_or(volatile __global uint *p, uint val);
+int __ovld atom_or(volatile __global int *, int);
+uint __ovld atom_or(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
-int __ovld atom_or(volatile __local int *p, int val);
-uint __ovld atom_or(volatile __local uint *p, uint val);
+int __ovld atom_or(volatile __local int *, int);
+uint __ovld atom_or(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_extended_atomics)
-long __ovld atom_or(volatile __global long *p, long val);
-ulong __ovld atom_or(volatile __global ulong *p, ulong val);
-long __ovld atom_or(volatile __local long *p, long val);
-ulong __ovld atom_or(volatile __local ulong *p, ulong val);
+long __ovld atom_or(volatile __global long *, long);
+ulong __ovld atom_or(volatile __global ulong *, ulong);
+long __ovld atom_or(volatile __local long *, long);
+ulong __ovld atom_or(volatile __local ulong *, ulong);
#endif
/**
@@ -13205,29 +13205,29 @@ ulong __ovld atom_or(volatile __local ulong *p, ulong val);
* (old ^ val) and store result at location
* pointed by p. The function returns old.
*/
-int __ovld atomic_xor(volatile __global int *p, int val);
-uint __ovld atomic_xor(volatile __global uint *p, uint val);
-int __ovld atomic_xor(volatile __local int *p, int val);
-uint __ovld atomic_xor(volatile __local uint *p, uint val);
+int __ovld atomic_xor(volatile __global int *, int);
+uint __ovld atomic_xor(volatile __global uint *, uint);
+int __ovld atomic_xor(volatile __local int *, int);
+uint __ovld atomic_xor(volatile __local uint *, uint);
#ifdef __OPENCL_CPP_VERSION__
-int __ovld atomic_xor(volatile int *p, int val);
-uint __ovld atomic_xor(volatile uint *p, uint val);
+int __ovld atomic_xor(volatile int *, int);
+uint __ovld atomic_xor(volatile uint *, uint);
#endif
#if defined(cl_khr_global_int32_extended_atomics)
-int __ovld atom_xor(volatile __global int *p, int val);
-uint __ovld atom_xor(volatile __global uint *p, uint val);
+int __ovld atom_xor(volatile __global int *, int);
+uint __ovld atom_xor(volatile __global uint *, uint);
#endif
#if defined(cl_khr_local_int32_extended_atomics)
-int __ovld atom_xor(volatile __local int *p, int val);
-uint __ovld atom_xor(volatile __local uint *p, uint val);
+int __ovld atom_xor(volatile __local int *, int);
+uint __ovld atom_xor(volatile __local uint *, uint);
#endif
#if defined(cl_khr_int64_extended_atomics)
-long __ovld atom_xor(volatile __global long *p, long val);
-ulong __ovld atom_xor(volatile __global ulong *p, ulong val);
-long __ovld atom_xor(volatile __local long *p, long val);
-ulong __ovld atom_xor(volatile __local ulong *p, ulong val);
+long __ovld atom_xor(volatile __global long *, long);
+ulong __ovld atom_xor(volatile __global ulong *, ulong);
+long __ovld atom_xor(volatile __local long *, long);
+ulong __ovld atom_xor(volatile __local ulong *, ulong);
#endif
#if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics)
@@ -15257,13 +15257,17 @@ float4 __ovld __purefn read_imagef(read_only image2d_t, sampler_t, float2, float
int4 __ovld __purefn read_imagei(read_only image2d_t, sampler_t, float2, float);
uint4 __ovld __purefn read_imageui(read_only image2d_t, sampler_t, float2, float);
+#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_depth_t, sampler_t, float2, float);
+#endif // cl_khr_depth_images
float4 __ovld __purefn read_imagef(read_only image2d_array_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_only image2d_array_t, sampler_t, float4, float);
uint4 __ovld __purefn read_imageui(read_only image2d_array_t, sampler_t, float4, float);
+#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_array_depth_t, sampler_t, float4, float);
+#endif // cl_khr_depth_images
float4 __ovld __purefn read_imagef(read_only image3d_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_only image3d_t, sampler_t, float4, float);
@@ -15281,13 +15285,17 @@ float4 __ovld __purefn read_imagef(read_only image2d_t, sampler_t, float2, float
int4 __ovld __purefn read_imagei(read_only image2d_t, sampler_t, float2, float2, float2);
uint4 __ovld __purefn read_imageui(read_only image2d_t, sampler_t, float2, float2, float2);
+#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_depth_t, sampler_t, float2, float2, float2);
+#endif // cl_khr_depth_images
float4 __ovld __purefn read_imagef(read_only image2d_array_t, sampler_t, float4, float2, float2);
int4 __ovld __purefn read_imagei(read_only image2d_array_t, sampler_t, float4, float2, float2);
uint4 __ovld __purefn read_imageui(read_only image2d_array_t, sampler_t, float4, float2, float2);
+#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_only image2d_array_depth_t, sampler_t, float4, float2, float2);
+#endif // cl_khr_depth_images
float4 __ovld __purefn read_imagef(read_only image3d_t, sampler_t, float4, float4, float4);
int4 __ovld __purefn read_imagei(read_only image3d_t, sampler_t, float4, float4, float4);
@@ -15380,9 +15388,11 @@ float4 __ovld __purefn read_imagef(read_write image2d_array_t, int4);
int4 __ovld __purefn read_imagei(read_write image2d_array_t, int4);
uint4 __ovld __purefn read_imageui(read_write image2d_array_t, int4);
+#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, int4);
int4 __ovld __purefn read_imagei(read_write image3d_t, int4);
uint4 __ovld __purefn read_imageui(read_write image3d_t, int4);
+#endif // cl_khr_3d_image_writes
#ifdef cl_khr_depth_images
float __ovld __purefn read_imagef(read_write image2d_depth_t, int2);
@@ -15423,9 +15433,11 @@ uint4 __ovld __purefn read_imageui(read_write image2d_array_t, sampler_t, float4
float __ovld __purefn read_imagef(read_write image2d_array_depth_t, sampler_t, float4, float);
+#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, sampler_t, float4, float);
int4 __ovld __purefn read_imagei(read_write image3d_t, sampler_t, float4, float);
uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, float);
+#endif // cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image1d_t, sampler_t, float, float, float);
int4 __ovld __purefn read_imagei(read_write image1d_t, sampler_t, float, float, float);
@@ -15447,9 +15459,11 @@ uint4 __ovld __purefn read_imageui(read_write image2d_array_t, sampler_t, float4
float __ovld __purefn read_imagef(read_write image2d_array_depth_t, sampler_t, float4, float2, float2);
+#ifdef cl_khr_3d_image_writes
float4 __ovld __purefn read_imagef(read_write image3d_t, sampler_t, float4, float4, float4);
int4 __ovld __purefn read_imagei(read_write image3d_t, sampler_t, float4, float4, float4);
uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, float4, float4);
+#endif // cl_khr_3d_image_writes
#endif //cl_khr_mipmap_image
@@ -15457,7 +15471,9 @@ uint4 __ovld __purefn read_imageui(read_write image3d_t, sampler_t, float4, floa
#ifdef cl_khr_fp16
half4 __ovld __purefn read_imageh(read_write image1d_t, int);
half4 __ovld __purefn read_imageh(read_write image2d_t, int2);
+#ifdef cl_khr_3d_image_writes
half4 __ovld __purefn read_imageh(read_write image3d_t, int4);
+#endif // cl_khr_3d_image_writes
half4 __ovld __purefn read_imageh(read_write image1d_array_t, int2);
half4 __ovld __purefn read_imageh(read_write image2d_array_t, int4);
half4 __ovld __purefn read_imageh(read_write image1d_buffer_t, int);
@@ -15727,7 +15743,9 @@ int __ovld __cnfn get_image_width(write_only image2d_array_msaa_depth_t);
int __ovld __cnfn get_image_width(read_write image1d_t);
int __ovld __cnfn get_image_width(read_write image1d_buffer_t);
int __ovld __cnfn get_image_width(read_write image2d_t);
+#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(read_write image3d_t);
+#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_width(read_write image1d_array_t);
int __ovld __cnfn get_image_width(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@@ -15777,7 +15795,9 @@ int __ovld __cnfn get_image_height(write_only image2d_array_msaa_depth_t);
#if defined(__opencl_c_read_write_images)
int __ovld __cnfn get_image_height(read_write image2d_t);
+#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_height(read_write image3d_t);
+#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_height(read_write image2d_array_t);
#ifdef cl_khr_depth_images
int __ovld __cnfn get_image_height(read_write image2d_depth_t);
@@ -15798,11 +15818,11 @@ int __ovld __cnfn get_image_depth(read_only image3d_t);
#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_depth(write_only image3d_t);
-#endif
#if defined(__opencl_c_read_write_images)
int __ovld __cnfn get_image_depth(read_write image3d_t);
#endif //defined(__opencl_c_read_write_images)
+#endif // cl_khr_3d_image_writes
// OpenCL Extension v2.0 s9.18 - Mipmaps
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
@@ -15824,24 +15844,32 @@ int __ovld get_image_num_mip_levels(write_only image3d_t);
#if defined(__opencl_c_read_write_images)
int __ovld get_image_num_mip_levels(read_write image1d_t);
int __ovld get_image_num_mip_levels(read_write image2d_t);
+#ifdef cl_khr_3d_image_writes
int __ovld get_image_num_mip_levels(read_write image3d_t);
+#endif // cl_khr_3d_image_writes
#endif //defined(__opencl_c_read_write_images)
int __ovld get_image_num_mip_levels(read_only image1d_array_t);
int __ovld get_image_num_mip_levels(read_only image2d_array_t);
+#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(read_only image2d_array_depth_t);
int __ovld get_image_num_mip_levels(read_only image2d_depth_t);
+#endif // cl_khr_depth_images
int __ovld get_image_num_mip_levels(write_only image1d_array_t);
int __ovld get_image_num_mip_levels(write_only image2d_array_t);
+#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(write_only image2d_array_depth_t);
int __ovld get_image_num_mip_levels(write_only image2d_depth_t);
+#endif // cl_khr_depth_images
#if defined(__opencl_c_read_write_images)
int __ovld get_image_num_mip_levels(read_write image1d_array_t);
int __ovld get_image_num_mip_levels(read_write image2d_array_t);
+#ifdef cl_khr_depth_images
int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t);
int __ovld get_image_num_mip_levels(read_write image2d_depth_t);
+#endif // cl_khr_depth_images
#endif //defined(__opencl_c_read_write_images)
#endif //cl_khr_mipmap_image
@@ -15906,7 +15934,9 @@ int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_dept
int __ovld __cnfn get_image_channel_data_type(read_write image1d_t);
int __ovld __cnfn get_image_channel_data_type(read_write image1d_buffer_t);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_t);
+#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_data_type(read_write image3d_t);
+#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_data_type(read_write image1d_array_t);
int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@@ -15978,7 +16008,9 @@ int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_depth_t)
int __ovld __cnfn get_image_channel_order(read_write image1d_t);
int __ovld __cnfn get_image_channel_order(read_write image1d_buffer_t);
int __ovld __cnfn get_image_channel_order(read_write image2d_t);
+#ifdef cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_order(read_write image3d_t);
+#endif // cl_khr_3d_image_writes
int __ovld __cnfn get_image_channel_order(read_write image1d_array_t);
int __ovld __cnfn get_image_channel_order(read_write image2d_array_t);
#ifdef cl_khr_depth_images
@@ -16048,10 +16080,10 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t);
int4 __ovld __cnfn get_image_dim(read_only image3d_t);
#ifdef cl_khr_3d_image_writes
int4 __ovld __cnfn get_image_dim(write_only image3d_t);
-#endif
#if defined(__opencl_c_read_write_images)
int4 __ovld __cnfn get_image_dim(read_write image3d_t);
#endif //defined(__opencl_c_read_write_images)
+#endif // cl_khr_3d_image_writes
/**
* Return the image array size.
@@ -16266,9 +16298,9 @@ uint __ovld get_enqueued_num_sub_groups(void);
uint __ovld get_sub_group_id(void);
uint __ovld get_sub_group_local_id(void);
-void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags);
+void __ovld __conv sub_group_barrier(cl_mem_fence_flags);
#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
-void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope);
+void __ovld __conv sub_group_barrier(cl_mem_fence_flags, memory_scope);
#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0)
int __ovld __conv sub_group_all(int predicate);
@@ -17847,15 +17879,13 @@ intel_sub_group_avc_sic_configure_skc(
uint skip_block_partition_type, uint skip_motion_vector_mask,
ulong motion_vectors, uchar bidirectional_weight, uchar skip_sad_adjustment,
intel_sub_group_avc_sic_payload_t payload);
-intel_sub_group_avc_sic_payload_t __ovld
-intel_sub_group_avc_sic_configure_ipe(
- uchar luma_intra_partition_mask, uchar intra_neighbour_availabilty,
+intel_sub_group_avc_sic_payload_t __ovld intel_sub_group_avc_sic_configure_ipe(
+ uchar luma_intra_partition_mask, uchar intra_neighbour_availability,
uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel,
uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels,
uchar intra_sad_adjustment, intel_sub_group_avc_sic_payload_t payload);
-intel_sub_group_avc_sic_payload_t __ovld
-intel_sub_group_avc_sic_configure_ipe(
- uchar luma_intra_partition_mask, uchar intra_neighbour_availabilty,
+intel_sub_group_avc_sic_payload_t __ovld intel_sub_group_avc_sic_configure_ipe(
+ uchar luma_intra_partition_mask, uchar intra_neighbour_availability,
uchar left_edge_luma_pixels, uchar upper_left_corner_luma_pixel,
uchar upper_edge_luma_pixels, uchar upper_right_edge_luma_pixels,
ushort left_edge_chroma_pixels, ushort upper_left_corner_chroma_pixel,
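The opencl-c.h changes above are declaration-only: parameter names are dropped and feature guards added, with no behavioral change to the atomics themselves. A hedged kernel using one of them (histogram update; assumes bins has 256 entries):

kernel void hist(global const uchar *data, global int *bins, uint n) {
  size_t i = get_global_id(0);
  if (i < n)
    (void)atomic_add(&bins[data[i]], 1); /* returns the old value */
}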
diff --git a/clang/lib/Headers/openmp_wrappers/stdlib.h b/clang/lib/Headers/openmp_wrappers/stdlib.h
new file mode 100644
index 000000000000..d607469e04f7
--- /dev/null
+++ b/clang/lib/Headers/openmp_wrappers/stdlib.h
@@ -0,0 +1,29 @@
+/*===---- openmp_wrapper/stdlib.h ------ OpenMP stdlib.h intercept --- c++ -===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __CLANG_OPENMP_STDLIB_H__
+#define __CLANG_OPENMP_STDLIB_H__
+
+#ifndef _OPENMP
+#error "This file is for OpenMP compilation only."
+#endif
+
+#include_next <stdlib.h>
+
+#ifdef __AMDGCN__
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+
+#define __OPENMP_AMDGCN__
+#include <__clang_hip_stdlib.h>
+#undef __OPENMP_AMDGCN__
+
+#pragma omp end declare variant
+#endif
+
+#endif // __CLANG_OPENMP_STDLIB_H__
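The wrapper first pulls in the host stdlib.h via include_next, then overlays AMDGCN device variants through the declare-variant block. A hedged sketch of the effect (illustrative compile line):

/* omp_demo.c - compile e.g.
   clang -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa omp_demo.c
   Inside the target region, stdlib calls such as abs() resolve to the
   device variants the wrapper declares for amdgcn. */
#include <stdlib.h>
#include <stdio.h>

int main(void) {
  int r = 0;
#pragma omp target map(tofrom : r)
  { r = abs(-42); } /* device-side abs on amdgcn */
  printf("%d\n", r);
  return 0;
}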
diff --git a/clang/lib/Headers/ppc_wrappers/emmintrin.h b/clang/lib/Headers/ppc_wrappers/emmintrin.h
index a4c458a41bcf..0814ea5593ba 100644
--- a/clang/lib/Headers/ppc_wrappers/emmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/emmintrin.h
@@ -36,7 +36,7 @@
#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_
-#if defined(__ppc64__) && \
+#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
@@ -2262,7 +2262,7 @@ extern __inline __m128d
#else
#include_next <emmintrin.h>
-#endif /* defined(__ppc64__) &&
+#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* EMMINTRIN_H_ */
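The __ppc64__ to __powerpc64__ rename (repeated across the ppc_wrappers below) matters because gcc on 64-bit PowerPC Linux does not define __ppc64__, while both gcc and clang define __powerpc64__, so the old guard could silently fall through to include_next. A quick probe, hedged as illustrative:

/* macro_probe.c - run on a powerpc64 target to see which guards fire. */
#include <stdio.h>

int main(void) {
#ifdef __powerpc64__
  puts("__powerpc64__ defined");
#endif
#ifdef __ppc64__
  puts("__ppc64__ defined");
#endif
  return 0;
}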
diff --git a/clang/lib/Headers/ppc_wrappers/mm_malloc.h b/clang/lib/Headers/ppc_wrappers/mm_malloc.h
index 65920917f3bd..7c1e625e44d5 100644
--- a/clang/lib/Headers/ppc_wrappers/mm_malloc.h
+++ b/clang/lib/Headers/ppc_wrappers/mm_malloc.h
@@ -10,7 +10,7 @@
#ifndef _MM_MALLOC_H_INCLUDED
#define _MM_MALLOC_H_INCLUDED
-#if defined(__ppc64__) && \
+#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <stdlib.h>
diff --git a/clang/lib/Headers/ppc_wrappers/mmintrin.h b/clang/lib/Headers/ppc_wrappers/mmintrin.h
index 70e8b81e11ee..0be3af2b0bd7 100644
--- a/clang/lib/Headers/ppc_wrappers/mmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/mmintrin.h
@@ -35,7 +35,7 @@
#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED
-#if defined(__ppc64__) && \
+#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
@@ -1447,7 +1447,7 @@ extern __inline __m64
#else
#include_next <mmintrin.h>
-#endif /* defined(__ppc64__) &&
+#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* _MMINTRIN_H_INCLUDED */
diff --git a/clang/lib/Headers/ppc_wrappers/pmmintrin.h b/clang/lib/Headers/ppc_wrappers/pmmintrin.h
index fda39edbaa22..db128192abfb 100644
--- a/clang/lib/Headers/ppc_wrappers/pmmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/pmmintrin.h
@@ -39,7 +39,7 @@
#ifndef PMMINTRIN_H_
#define PMMINTRIN_H_
-#if defined(__ppc64__) && \
+#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
/* We need definitions from the SSE2 and SSE header files*/
@@ -139,7 +139,7 @@ extern __inline __m128i
#else
#include_next <pmmintrin.h>
-#endif /* defined(__ppc64__) &&
+#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* PMMINTRIN_H_ */
diff --git a/clang/lib/Headers/ppc_wrappers/smmintrin.h b/clang/lib/Headers/ppc_wrappers/smmintrin.h
index 6fe6c8a93d9b..6fe6d2a157a5 100644
--- a/clang/lib/Headers/ppc_wrappers/smmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/smmintrin.h
@@ -29,7 +29,7 @@
#ifndef SMMINTRIN_H_
#define SMMINTRIN_H_
-#if defined(__ppc64__) && \
+#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
@@ -657,7 +657,7 @@ extern __inline __m128i
#else
#include_next <smmintrin.h>
-#endif /* defined(__ppc64__) &&
+#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* SMMINTRIN_H_ */
diff --git a/clang/lib/Headers/ppc_wrappers/tmmintrin.h b/clang/lib/Headers/ppc_wrappers/tmmintrin.h
index 6185ca1e7e71..92f08676d2df 100644
--- a/clang/lib/Headers/ppc_wrappers/tmmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/tmmintrin.h
@@ -25,7 +25,7 @@
#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_
-#if defined(__ppc64__) && \
+#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
#include <altivec.h>
@@ -447,7 +447,7 @@ extern __inline __m64
#else
#include_next <tmmintrin.h>
-#endif /* defined(__ppc64__) &&
+#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* TMMINTRIN_H_ */
diff --git a/clang/lib/Headers/ppc_wrappers/xmmintrin.h b/clang/lib/Headers/ppc_wrappers/xmmintrin.h
index ee0032ca159c..9dd21b65c2f7 100644
--- a/clang/lib/Headers/ppc_wrappers/xmmintrin.h
+++ b/clang/lib/Headers/ppc_wrappers/xmmintrin.h
@@ -35,7 +35,7 @@
#ifndef XMMINTRIN_H_
#define XMMINTRIN_H_
-#if defined(__ppc64__) && \
+#if defined(__powerpc64__) && \
(defined(__linux__) || defined(__FreeBSD__) || defined(_AIX))
/* Define four value permute mask */
@@ -1821,7 +1821,7 @@ extern __inline void
#else
#include_next <xmmintrin.h>
-#endif /* defined(__ppc64__) &&
+#endif /* defined(__powerpc64__) && \
* (defined(__linux__) || defined(__FreeBSD__) || defined(_AIX)) */
#endif /* XMMINTRIN_H_ */
diff --git a/clang/lib/Headers/prfchiintrin.h b/clang/lib/Headers/prfchiintrin.h
new file mode 100644
index 000000000000..36600b25aa1d
--- /dev/null
+++ b/clang/lib/Headers/prfchiintrin.h
@@ -0,0 +1,61 @@
+/*===---- prfchiintrin.h - PREFETCHI intrinsic -----------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __PRFCHIINTRIN_H
+#define __PRFCHIINTRIN_H
+
+#ifdef __x86_64__
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("prefetchi")))
+
+/// Loads an instruction sequence containing the specified memory address into
+/// all cache levels.
+///
+/// Note that the effect of this intrinsic is dependent on the processor
+/// implementation.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHIT0 instruction.
+///
+/// \param __P
+/// A pointer specifying the memory address to be prefetched.
+static __inline__ void __DEFAULT_FN_ATTRS
+_m_prefetchit0(volatile const void *__P) {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wcast-qual"
+ __builtin_ia32_prefetchi((const void *)__P, 3 /* _MM_HINT_T0 */);
+#pragma clang diagnostic pop
+}
+
+/// Loads an instruction sequence containing the specified memory address into
+/// all but the first-level cache.
+///
+/// Note that the effect of this intrinsic is dependent on the processor
+/// implementation.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c PREFETCHIT1 instruction.
+///
+/// \param __P
+/// A pointer specifying the memory address to be prefetched.
+static __inline__ void __DEFAULT_FN_ATTRS
+_m_prefetchit1(volatile const void *__P) {
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wcast-qual"
+ __builtin_ia32_prefetchi((const void *)__P, 2 /* _MM_HINT_T1 */);
+#pragma clang diagnostic pop
+}
+#endif /* __x86_64__ */
+#undef __DEFAULT_FN_ATTRS
+
+#endif /* __PRFCHIINTRIN_H */
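
As a usage sketch (not part of the patch), the PREFETCHI intrinsics hint that
the code at an address will be executed soon, for example ahead of an
indirect call. The dispatch helper below and the build flags are assumptions:

    /* Assumes a 64-bit x86 target with PREFETCHI enabled, e.g.
       clang -O2 -mprefetchi. */
    #include <x86intrin.h>

    typedef void (*handler_t)(void);

    void dispatch_soon(handler_t h) {
      /* Function-to-object pointer cast is a common extension. */
      _m_prefetchit0((const void *)h); /* warm all cache levels with code */
      /* ... unrelated work while the prefetch is in flight ... */
      h();
    }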
diff --git a/clang/lib/Headers/raointintrin.h b/clang/lib/Headers/raointintrin.h
new file mode 100644
index 000000000000..d3290eb62abf
--- /dev/null
+++ b/clang/lib/Headers/raointintrin.h
@@ -0,0 +1,203 @@
+/*===----------------------- raointintrin.h - RAOINT ------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __X86GPRINTRIN_H
+#error "Never use <raointintrin.h> directly; include <x86gprintrin.h> instead."
+#endif // __X86GPRINTRIN_H
+
+#ifndef __RAOINTINTRIN_H
+#define __RAOINTINTRIN_H
+
+#define __DEFAULT_FN_ATTRS \
+ __attribute__((__always_inline__, __nodebug__, __target__("raoint")))
+
+/// Atomically add a 32-bit value at memory operand \a __A and a 32-bit \a __B,
+/// and store the result to the same memory location.
+///
+/// This intrinsic is intended for use under contention or where only weak
+/// memory ordering is required. It may perform poorly on hot data that is
+/// used by only a single thread.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c AADD instruction.
+///
+/// \param __A
+/// A pointer to a 32-bit memory location.
+/// \param __B
+/// A 32-bit integer value.
+///
+/// \code{.operation}
+/// MEM[__A+31:__A] := MEM[__A+31:__A] + __B[31:0]
+/// \endcode
+static __inline__ void __DEFAULT_FN_ATTRS _aadd_i32(int *__A, int __B) {
+ __builtin_ia32_aadd32((int *)__A, __B);
+}
+
+/// Atomically AND the 32-bit value at memory operand \a __A with a 32-bit
+/// \a __B, and store the result to the same memory location.
+///
+/// This intrinsic is intended for use under contention or where only weak
+/// memory ordering is required. It may perform poorly on hot data that is
+/// used by only a single thread.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c AAND instruction.
+///
+/// \param __A
+/// A pointer to a 32-bit memory location.
+/// \param __B
+/// A 32-bit integer value.
+///
+/// \code{.operation}
+/// MEM[__A+31:__A] := MEM[__A+31:__A] AND __B[31:0]
+/// \endcode
+static __inline__ void __DEFAULT_FN_ATTRS _aand_i32(int *__A, int __B) {
+ __builtin_ia32_aand32((int *)__A, __B);
+}
+
+/// Atomically OR the 32-bit value at memory operand \a __A with a 32-bit
+/// \a __B, and store the result to the same memory location.
+///
+/// This intrinsic is intended for use under contention or where only weak
+/// memory ordering is required. It may perform poorly on hot data that is
+/// used by only a single thread.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c AOR instruction.
+///
+/// \param __A
+/// A pointer to a 32-bit memory location.
+/// \param __B
+/// A 32-bit integer value.
+///
+/// \code{.operation}
+/// MEM[__A+31:__A] := MEM[__A+31:__A] OR __B[31:0]
+/// \endcode
+static __inline__ void __DEFAULT_FN_ATTRS _aor_i32(int *__A, int __B) {
+ __builtin_ia32_aor32((int *)__A, __B);
+}
+
+/// Atomically XOR the 32-bit value at memory operand \a __A with a 32-bit
+/// \a __B, and store the result to the same memory location.
+///
+/// This intrinsic is intended for use under contention or where only weak
+/// memory ordering is required. It may perform poorly on hot data that is
+/// used by only a single thread.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c AXOR instruction.
+///
+/// \param __A
+/// A pointer to a 32-bit memory location.
+/// \param __B
+/// A 32-bit integer value.
+///
+/// \code{.operation}
+/// MEM[__A+31:__A] := MEM[__A+31:__A] XOR __B[31:0]
+/// \endcode
+static __inline__ void __DEFAULT_FN_ATTRS _axor_i32(int *__A, int __B) {
+ __builtin_ia32_axor32((int *)__A, __B);
+}
+
+#ifdef __x86_64__
+/// Atomically add a 64-bit value at memory operand \a __A and a 64-bit \a __B,
+/// and store the result to the same memory location.
+///
+/// This intrinsic is intended for use under contention or where only weak
+/// memory ordering is required. It may perform poorly on hot data that is
+/// used by only a single thread.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c AADD instruction.
+///
+/// \param __A
+/// A pointer to a 64-bit memory location.
+/// \param __B
+/// A 64-bit integer value.
+///
+/// \code{.operation}
+/// MEM[__A+63:__A] := MEM[__A+63:__A] + __B[63:0]
+/// \endcode
+static __inline__ void __DEFAULT_FN_ATTRS _aadd_i64(long long *__A,
+ long long __B) {
+ __builtin_ia32_aadd64((long long *)__A, __B);
+}
+
+/// Atomically AND the 64-bit value at memory operand \a __A with a 64-bit
+/// \a __B, and store the result to the same memory location.
+///
+/// This intrinsic is intended for use under contention or where only weak
+/// memory ordering is required. It may perform poorly on hot data that is
+/// used by only a single thread.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c AAND instruction.
+///
+/// \param __A
+/// A pointer to a 64-bit memory location.
+/// \param __B
+/// A 64-bit integer value.
+///
+/// \code{.operation}
+/// MEM[__A+63:__A] := MEM[__A+63:__A] AND __B[63:0]
+/// \endcode
+static __inline__ void __DEFAULT_FN_ATTRS _aand_i64(long long *__A,
+ long long __B) {
+ __builtin_ia32_aand64((long long *)__A, __B);
+}
+
+/// Atomically OR the 64-bit value at memory operand \a __A with a 64-bit
+/// \a __B, and store the result to the same memory location.
+///
+/// This intrinsic is intended for use under contention or where only weak
+/// memory ordering is required. It may perform poorly on hot data that is
+/// used by only a single thread.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c AOR instruction.
+///
+/// \param __A
+/// A pointer to a 64-bit memory location.
+/// \param __B
+/// A 64-bit integer value.
+///
+/// \code{.operation}
+/// MEM[__A+63:__A] := MEM[__A+63:__A] OR __B[63:0]
+/// \endcode
+static __inline__ void __DEFAULT_FN_ATTRS _aor_i64(long long *__A,
+ long long __B) {
+ __builtin_ia32_aor64((long long *)__A, __B);
+}
+
+/// Atomically XOR the 64-bit value at memory operand \a __A with a 64-bit
+/// \a __B, and store the result to the same memory location.
+///
+/// This intrinsic is intended for use under contention or where only weak
+/// memory ordering is required. It may perform poorly on hot data that is
+/// used by only a single thread.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the \c AXOR instruction.
+///
+/// \param __A
+/// A pointer to a 64-bit memory location.
+/// \param __B
+/// A 64-bit integer value.
+///
+/// \code{.operation}
+/// MEM[__A+63:__A] := MEM[__A+63:__A] XOR __B[63:0]
+/// \endcode
+static __inline__ void __DEFAULT_FN_ATTRS _axor_i64(long long *__A,
+ long long __B) {
+ __builtin_ia32_axor64((long long *)__A, __B);
+}
+#endif // __x86_64__
+
+#undef __DEFAULT_FN_ATTRS
+#endif // __RAOINTINTRIN_H
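
A hedged usage sketch for these remotely executed atomics: fire-and-forget
counter updates where no return value is read back and many threads may hit
the same cache lines. The histogram below is illustrative:

    /* Assumes a 64-bit x86 target with RAO-INT enabled, e.g. clang -mraoint. */
    #include <x86gprintrin.h>

    static long long hist[256];

    void record(unsigned char bucket) {
      /* No result is needed, so the weakly ordered remote atomic fits. */
      _aadd_i64(&hist[bucket], 1);
    }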
diff --git a/clang/lib/Headers/smmintrin.h b/clang/lib/Headers/smmintrin.h
index 46fb7bcd4e09..2111c24f31a6 100644
--- a/clang/lib/Headers/smmintrin.h
+++ b/clang/lib/Headers/smmintrin.h
@@ -818,7 +818,7 @@ static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_max_epu32(__m128i __V1,
/// parameter, is copied to the result.
/// \param N
/// Specifies which bits from operand \a Y will be copied, which bits in the
-/// result they will be be copied to, and which bits in the result will be
+/// result they will be copied to, and which bits in the result will be
/// cleared. The following assignments are made: \n
/// Bits [7:6] specify the bits to copy from operand \a Y: \n
/// 00: Selects bits [31:0] from operand \a Y. \n
diff --git a/clang/lib/Headers/stdarg.h b/clang/lib/Headers/stdarg.h
index 0bc39408c1e5..ba978721f1f3 100644
--- a/clang/lib/Headers/stdarg.h
+++ b/clang/lib/Headers/stdarg.h
@@ -8,13 +8,30 @@
*/
#ifndef __STDARG_H
-#define __STDARG_H
+#ifndef __GNUC_VA_LIST
+#define __GNUC_VA_LIST
+typedef __builtin_va_list __gnuc_va_list;
+#endif
+
+#ifdef __need___va_list
+#undef __need___va_list
+#else
+#define __STDARG_H
#ifndef _VA_LIST
typedef __builtin_va_list va_list;
#define _VA_LIST
#endif
+
+/* FIXME: This is using the placeholder dates Clang produces for these macros
+ in C2x mode; switch to the correct values once they've been published. */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+/* C2x does not require the second parameter for va_start. */
+#define va_start(ap, ...) __builtin_va_start(ap, 0)
+#else
+/* Versions before C2x do require the second parameter. */
#define va_start(ap, param) __builtin_va_start(ap, param)
+#endif
#define va_end(ap) __builtin_va_end(ap)
#define va_arg(ap, type) __builtin_va_arg(ap, type)
@@ -23,13 +40,12 @@ typedef __builtin_va_list va_list;
*/
#define __va_copy(d,s) __builtin_va_copy(d,s)
-#if __STDC_VERSION__ >= 199901L || __cplusplus >= 201103L || !defined(__STRICT_ANSI__)
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \
+ (defined(__cplusplus) && __cplusplus >= 201103L) || \
+ !defined(__STRICT_ANSI__)
#define va_copy(dest, src) __builtin_va_copy(dest, src)
#endif
-#ifndef __GNUC_VA_LIST
-#define __GNUC_VA_LIST 1
-typedef __builtin_va_list __gnuc_va_list;
-#endif
-
#endif /* __STDARG_H */
+
+#endif /* not __STDARG_H */
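
The effect of the new branch is that C2x code may invoke va_start with only
the va_list argument, while earlier language modes still require the last
named parameter. A minimal sketch that works in both modes (the version
check mirrors the placeholder date used above):

    #include <stdarg.h>

    int sum(int n, ...) {
      va_list ap;
    #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
      va_start(ap);    /* C2x form: trailing arguments are ignored */
    #else
      va_start(ap, n); /* traditional form */
    #endif
      int total = 0;
      for (int i = 0; i < n; ++i)
        total += va_arg(ap, int);
      va_end(ap);
      return total;
    }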
diff --git a/clang/lib/Headers/stdatomic.h b/clang/lib/Headers/stdatomic.h
index 318c7ca56e41..0f893beea6ca 100644
--- a/clang/lib/Headers/stdatomic.h
+++ b/clang/lib/Headers/stdatomic.h
@@ -15,10 +15,12 @@
*
* Exclude the MSVC path as well as the MSVC header as of the 14.31.30818
* explicitly disallows `stdatomic.h` in the C mode via an `#error`. Fallback
- * to the clang resource header until that is fully supported.
+ * to the clang resource header until that is fully supported. The MSVC
+ * `stdatomic.h` header requires C++23 or newer.
*/
#if __STDC_HOSTED__ && \
- __has_include_next(<stdatomic.h>) && !(defined(_MSC_VER) && !defined(__cplusplus))
+ __has_include_next(<stdatomic.h>) && \
+ (!defined(_MSC_VER) || (defined(__cplusplus) && __cplusplus >= 202002L))
# include_next <stdatomic.h>
#else
@@ -45,7 +47,8 @@ extern "C" {
/* 7.17.2 Initialization */
#define ATOMIC_VAR_INIT(value) (value)
-#if (__STDC_VERSION__ >= 201710L || __cplusplus >= 202002L) && \
+#if ((defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201710L) || \
+ (defined(__cplusplus) && __cplusplus >= 202002L)) && \
!defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* ATOMIC_VAR_INIT was deprecated in C17 and C++20. */
#pragma clang deprecated(ATOMIC_VAR_INIT)
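
A minimal sketch of what the deprecation means for users: ATOMIC_VAR_INIT
now warns in C17/C++20 modes, and plain initialization is the replacement.

    #include <stdatomic.h>

    atomic_int old_style = ATOMIC_VAR_INIT(42); /* deprecation warning */
    atomic_int new_style = 42;                  /* preferred form */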
diff --git a/clang/lib/Headers/stdbool.h b/clang/lib/Headers/stdbool.h
index f0e588532e16..9406aab0ca72 100644
--- a/clang/lib/Headers/stdbool.h
+++ b/clang/lib/Headers/stdbool.h
@@ -12,7 +12,7 @@
#define __bool_true_false_are_defined 1
-#if __STDC_VERSION__ > 201710L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L
/* FIXME: We should be issuing a deprecation warning here, but cannot yet due
* to system headers which include this header file unconditionally.
*/
@@ -23,7 +23,7 @@
#elif defined(__GNUC__) && !defined(__STRICT_ANSI__)
/* Define _Bool as a GNU extension. */
#define _Bool bool
-#if __cplusplus < 201103L
+#if defined(__cplusplus) && __cplusplus < 201103L
/* For C++98, define bool, false, true as a GNU extension. */
#define bool bool
#define false false
diff --git a/clang/lib/Headers/stddef.h b/clang/lib/Headers/stddef.h
index a15d21b55317..42815176dcd0 100644
--- a/clang/lib/Headers/stddef.h
+++ b/clang/lib/Headers/stddef.h
@@ -97,8 +97,15 @@ using ::std::nullptr_t;
#undef __need_NULL
#endif /* defined(__need_NULL) */
+/* FIXME: This is using the placeholder dates Clang produces for these macros
+ in C2x mode; switch to the correct values once they've been published. */
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
+typedef typeof(nullptr) nullptr_t;
+#endif /* defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L */
+
#if defined(__need_STDDEF_H_misc)
-#if __STDC_VERSION__ >= 201112L || __cplusplus >= 201103L
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \
+ (defined(__cplusplus) && __cplusplus >= 201103L)
#include "__stddef_max_align_t.h"
#endif
#define offsetof(t, d) __builtin_offsetof(t, d)
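
With the typedef above, C2x code gets nullptr_t from <stddef.h>. A minimal
sketch, assuming a compiler invoked in C2x mode (e.g. -std=c2x):

    #include <stddef.h>

    #if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
    void example(void) {
      nullptr_t np = nullptr; /* nullptr_t holds only the null pointer value */
      int *p = np;            /* converts to a null pointer of any type */
      (void)p;
    }
    #endif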
diff --git a/clang/lib/Headers/stdint.h b/clang/lib/Headers/stdint.h
index 4790c25a2774..a47e91be1889 100644
--- a/clang/lib/Headers/stdint.h
+++ b/clang/lib/Headers/stdint.h
@@ -96,13 +96,21 @@
typedef __INT64_TYPE__ int64_t;
# endif /* __int8_t_defined */
typedef __UINT64_TYPE__ uint64_t;
+# undef __int_least64_t
# define __int_least64_t int64_t
+# undef __uint_least64_t
# define __uint_least64_t uint64_t
+# undef __int_least32_t
# define __int_least32_t int64_t
+# undef __uint_least32_t
# define __uint_least32_t uint64_t
+# undef __int_least16_t
# define __int_least16_t int64_t
+# undef __uint_least16_t
# define __uint_least16_t uint64_t
+# undef __int_least8_t
# define __int_least8_t int64_t
+# undef __uint_least8_t
# define __uint_least8_t uint64_t
#endif /* __INT64_TYPE__ */
@@ -120,11 +128,17 @@ typedef int56_t int_least56_t;
typedef uint56_t uint_least56_t;
typedef int56_t int_fast56_t;
typedef uint56_t uint_fast56_t;
+# undef __int_least32_t
# define __int_least32_t int56_t
+# undef __uint_least32_t
# define __uint_least32_t uint56_t
+# undef __int_least16_t
# define __int_least16_t int56_t
+# undef __uint_least16_t
# define __uint_least16_t uint56_t
+# undef __int_least8_t
# define __int_least8_t int56_t
+# undef __uint_least8_t
# define __uint_least8_t uint56_t
#endif /* __INT56_TYPE__ */
@@ -136,11 +150,17 @@ typedef int48_t int_least48_t;
typedef uint48_t uint_least48_t;
typedef int48_t int_fast48_t;
typedef uint48_t uint_fast48_t;
+# undef __int_least32_t
# define __int_least32_t int48_t
+# undef __uint_least32_t
# define __uint_least32_t uint48_t
+# undef __int_least16_t
# define __int_least16_t int48_t
+# undef __uint_least16_t
# define __uint_least16_t uint48_t
+# undef __int_least8_t
# define __int_least8_t int48_t
+# undef __uint_least8_t
# define __uint_least8_t uint48_t
#endif /* __INT48_TYPE__ */
@@ -152,11 +172,17 @@ typedef int40_t int_least40_t;
typedef uint40_t uint_least40_t;
typedef int40_t int_fast40_t;
typedef uint40_t uint_fast40_t;
+# undef __int_least32_t
# define __int_least32_t int40_t
+# undef __uint_least32_t
# define __uint_least32_t uint40_t
+# undef __int_least16_t
# define __int_least16_t int40_t
+# undef __uint_least16_t
# define __uint_least16_t uint40_t
+# undef __int_least8_t
# define __int_least8_t int40_t
+# undef __uint_least8_t
# define __uint_least8_t uint40_t
#endif /* __INT40_TYPE__ */
@@ -172,11 +198,17 @@ typedef __INT32_TYPE__ int32_t;
typedef __UINT32_TYPE__ uint32_t;
# endif /* __uint32_t_defined */
+# undef __int_least32_t
# define __int_least32_t int32_t
+# undef __uint_least32_t
# define __uint_least32_t uint32_t
+# undef __int_least16_t
# define __int_least16_t int32_t
+# undef __uint_least16_t
# define __uint_least16_t uint32_t
+# undef __int_least8_t
# define __int_least8_t int32_t
+# undef __uint_least8_t
# define __uint_least8_t uint32_t
#endif /* __INT32_TYPE__ */
@@ -194,9 +226,13 @@ typedef int24_t int_least24_t;
typedef uint24_t uint_least24_t;
typedef int24_t int_fast24_t;
typedef uint24_t uint_fast24_t;
+# undef __int_least16_t
# define __int_least16_t int24_t
+# undef __uint_least16_t
# define __uint_least16_t uint24_t
+# undef __int_least8_t
# define __int_least8_t int24_t
+# undef __uint_least8_t
# define __uint_least8_t uint24_t
#endif /* __INT24_TYPE__ */
@@ -205,9 +241,13 @@ typedef uint24_t uint_fast24_t;
typedef __INT16_TYPE__ int16_t;
#endif /* __int8_t_defined */
typedef __UINT16_TYPE__ uint16_t;
+# undef __int_least16_t
# define __int_least16_t int16_t
+# undef __uint_least16_t
# define __uint_least16_t uint16_t
+# undef __int_least8_t
# define __int_least8_t int16_t
+# undef __uint_least8_t
# define __uint_least8_t uint16_t
#endif /* __INT16_TYPE__ */
@@ -224,7 +264,9 @@ typedef __uint_least16_t uint_fast16_t;
typedef __INT8_TYPE__ int8_t;
#endif /* __int8_t_defined */
typedef __UINT8_TYPE__ uint8_t;
+# undef __int_least8_t
# define __int_least8_t int8_t
+# undef __uint_least8_t
# define __uint_least8_t uint8_t
#endif /* __INT8_TYPE__ */
@@ -285,16 +327,15 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#ifdef __INT64_TYPE__
+# undef __int64_c_suffix
+# undef __int32_c_suffix
+# undef __int16_c_suffix
+# undef __int8_c_suffix
# ifdef __INT64_C_SUFFIX__
# define __int64_c_suffix __INT64_C_SUFFIX__
# define __int32_c_suffix __INT64_C_SUFFIX__
# define __int16_c_suffix __INT64_C_SUFFIX__
# define __int8_c_suffix __INT64_C_SUFFIX__
-# else
-# undef __int64_c_suffix
-# undef __int32_c_suffix
-# undef __int16_c_suffix
-# undef __int8_c_suffix
# endif /* __INT64_C_SUFFIX__ */
#endif /* __INT64_TYPE__ */
@@ -310,6 +351,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#ifdef __INT56_TYPE__
+# undef __int32_c_suffix
+# undef __int16_c_suffix
+# undef __int8_c_suffix
# ifdef __INT56_C_SUFFIX__
# define INT56_C(v) __int_c(v, __INT56_C_SUFFIX__)
# define UINT56_C(v) __uint_c(v, __INT56_C_SUFFIX__)
@@ -319,14 +363,14 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# else
# define INT56_C(v) v
# define UINT56_C(v) v ## U
-# undef __int32_c_suffix
-# undef __int16_c_suffix
-# undef __int8_c_suffix
# endif /* __INT56_C_SUFFIX__ */
#endif /* __INT56_TYPE__ */
#ifdef __INT48_TYPE__
+# undef __int32_c_suffix
+# undef __int16_c_suffix
+# undef __int8_c_suffix
# ifdef __INT48_C_SUFFIX__
# define INT48_C(v) __int_c(v, __INT48_C_SUFFIX__)
# define UINT48_C(v) __uint_c(v, __INT48_C_SUFFIX__)
@@ -336,14 +380,14 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# else
# define INT48_C(v) v
# define UINT48_C(v) v ## U
-# undef __int32_c_suffix
-# undef __int16_c_suffix
-# undef __int8_c_suffix
# endif /* __INT48_C_SUFFIX__ */
#endif /* __INT48_TYPE__ */
#ifdef __INT40_TYPE__
+# undef __int32_c_suffix
+# undef __int16_c_suffix
+# undef __int8_c_suffix
# ifdef __INT40_C_SUFFIX__
# define INT40_C(v) __int_c(v, __INT40_C_SUFFIX__)
# define UINT40_C(v) __uint_c(v, __INT40_C_SUFFIX__)
@@ -353,22 +397,18 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# else
# define INT40_C(v) v
# define UINT40_C(v) v ## U
-# undef __int32_c_suffix
-# undef __int16_c_suffix
-# undef __int8_c_suffix
# endif /* __INT40_C_SUFFIX__ */
#endif /* __INT40_TYPE__ */
#ifdef __INT32_TYPE__
+# undef __int32_c_suffix
+# undef __int16_c_suffix
+# undef __int8_c_suffix
# ifdef __INT32_C_SUFFIX__
# define __int32_c_suffix __INT32_C_SUFFIX__
# define __int16_c_suffix __INT32_C_SUFFIX__
# define __int8_c_suffix __INT32_C_SUFFIX__
-#else
-# undef __int32_c_suffix
-# undef __int16_c_suffix
-# undef __int8_c_suffix
# endif /* __INT32_C_SUFFIX__ */
#endif /* __INT32_TYPE__ */
@@ -384,6 +424,8 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#ifdef __INT24_TYPE__
+# undef __int16_c_suffix
+# undef __int8_c_suffix
# ifdef __INT24_C_SUFFIX__
# define INT24_C(v) __int_c(v, __INT24_C_SUFFIX__)
# define UINT24_C(v) __uint_c(v, __INT24_C_SUFFIX__)
@@ -392,19 +434,16 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# else
# define INT24_C(v) v
# define UINT24_C(v) v ## U
-# undef __int16_c_suffix
-# undef __int8_c_suffix
# endif /* __INT24_C_SUFFIX__ */
#endif /* __INT24_TYPE__ */
#ifdef __INT16_TYPE__
+# undef __int16_c_suffix
+# undef __int8_c_suffix
# ifdef __INT16_C_SUFFIX__
# define __int16_c_suffix __INT16_C_SUFFIX__
# define __int8_c_suffix __INT16_C_SUFFIX__
-#else
-# undef __int16_c_suffix
-# undef __int8_c_suffix
# endif /* __INT16_C_SUFFIX__ */
#endif /* __INT16_TYPE__ */
@@ -420,10 +459,9 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#ifdef __INT8_TYPE__
+# undef __int8_c_suffix
# ifdef __INT8_C_SUFFIX__
# define __int8_c_suffix __INT8_C_SUFFIX__
-#else
-# undef __int8_c_suffix
# endif /* __INT8_C_SUFFIX__ */
#endif /* __INT8_TYPE__ */
@@ -463,27 +501,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define UINT64_MAX UINT64_C(18446744073709551615)
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT64_WIDTH 64
# define INT64_WIDTH UINT64_WIDTH
# define __UINT_LEAST64_WIDTH UINT64_WIDTH
+# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT64_WIDTH
+# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT64_WIDTH
+# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT64_MAX
#endif /* __STDC_VERSION__ */
# define __INT_LEAST64_MIN INT64_MIN
# define __INT_LEAST64_MAX INT64_MAX
# define __UINT_LEAST64_MAX UINT64_MAX
+# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT64_MIN
+# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT64_MAX
+# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT64_MAX
+# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT64_MIN
+# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT64_MAX
+# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT64_MAX
+# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT64_MIN
+# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT64_MAX
+# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT64_MAX
#endif /* __INT64_TYPE__ */
@@ -497,7 +547,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST64_WIDTH __UINT_LEAST64_WIDTH
# define INT_LEAST64_WIDTH UINT_LEAST64_WIDTH
# define UINT_FAST64_WIDTH __UINT_LEAST64_WIDTH
@@ -517,27 +567,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST56_MAX INT56_MAX
# define UINT_FAST56_MAX UINT56_MAX
+# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT56_MIN
+# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT56_MAX
+# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT56_MAX
+# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT56_MIN
+# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT56_MAX
+# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT56_MAX
+# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT56_MIN
+# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT56_MAX
+# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT56_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT56_WIDTH 56
# define INT56_WIDTH UINT56_WIDTH
# define UINT_LEAST56_WIDTH UINT56_WIDTH
# define INT_LEAST56_WIDTH UINT_LEAST56_WIDTH
# define UINT_FAST56_WIDTH UINT56_WIDTH
# define INT_FAST56_WIDTH UINT_FAST56_WIDTH
+# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT56_WIDTH
+# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT56_WIDTH
+# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT56_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT56_TYPE__ */
@@ -554,27 +616,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST48_MAX INT48_MAX
# define UINT_FAST48_MAX UINT48_MAX
+# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT48_MIN
+# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT48_MAX
+# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT48_MAX
+# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT48_MIN
+# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT48_MAX
+# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT48_MAX
+# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT48_MIN
+# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT48_MAX
+# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT48_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define UINT48_WIDTH 48
#define INT48_WIDTH UINT48_WIDTH
#define UINT_LEAST48_WIDTH UINT48_WIDTH
#define INT_LEAST48_WIDTH UINT_LEAST48_WIDTH
#define UINT_FAST48_WIDTH UINT48_WIDTH
#define INT_FAST48_WIDTH UINT_FAST48_WIDTH
+#undef __UINT_LEAST32_WIDTH
#define __UINT_LEAST32_WIDTH UINT48_WIDTH
+#undef __UINT_LEAST16_WIDTH
#define __UINT_LEAST16_WIDTH UINT48_WIDTH
+#undef __UINT_LEAST8_WIDTH
#define __UINT_LEAST8_WIDTH UINT48_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT48_TYPE__ */
@@ -591,27 +665,39 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST40_MAX INT40_MAX
# define UINT_FAST40_MAX UINT40_MAX
+# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT40_MIN
+# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT40_MAX
+# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT40_MAX
+# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT40_MIN
+# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT40_MAX
+# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT40_MAX
+# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT40_MIN
+# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT40_MAX
+# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT40_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT40_WIDTH 40
# define INT40_WIDTH UINT40_WIDTH
# define UINT_LEAST40_WIDTH UINT40_WIDTH
# define INT_LEAST40_WIDTH UINT_LEAST40_WIDTH
# define UINT_FAST40_WIDTH UINT40_WIDTH
# define INT_FAST40_WIDTH UINT_FAST40_WIDTH
+# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT40_WIDTH
+# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT40_WIDTH
+# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT40_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT40_TYPE__ */
@@ -622,23 +708,35 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT32_MIN (-INT32_C(2147483647)-1)
# define UINT32_MAX UINT32_C(4294967295)
+# undef __INT_LEAST32_MIN
# define __INT_LEAST32_MIN INT32_MIN
+# undef __INT_LEAST32_MAX
# define __INT_LEAST32_MAX INT32_MAX
+# undef __UINT_LEAST32_MAX
# define __UINT_LEAST32_MAX UINT32_MAX
+# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT32_MIN
+# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT32_MAX
+# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT32_MAX
+# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT32_MIN
+# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT32_MAX
+# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT32_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT32_WIDTH 32
# define INT32_WIDTH UINT32_WIDTH
+# undef __UINT_LEAST32_WIDTH
# define __UINT_LEAST32_WIDTH UINT32_WIDTH
+# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT32_WIDTH
+# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT32_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT32_TYPE__ */
@@ -653,7 +751,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST32_WIDTH __UINT_LEAST32_WIDTH
# define INT_LEAST32_WIDTH UINT_LEAST32_WIDTH
# define UINT_FAST32_WIDTH __UINT_LEAST32_WIDTH
@@ -673,23 +771,31 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT_FAST24_MAX INT24_MAX
# define UINT_FAST24_MAX UINT24_MAX
+# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT24_MIN
+# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT24_MAX
+# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT24_MAX
+# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT24_MIN
+# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT24_MAX
+# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT24_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT24_WIDTH 24
# define INT24_WIDTH UINT24_WIDTH
# define UINT_LEAST24_WIDTH UINT24_WIDTH
# define INT_LEAST24_WIDTH UINT_LEAST24_WIDTH
# define UINT_FAST24_WIDTH UINT24_WIDTH
# define INT_FAST24_WIDTH UINT_FAST24_WIDTH
+# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT24_WIDTH
+# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT24_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT24_TYPE__ */
@@ -700,19 +806,27 @@ typedef __UINTMAX_TYPE__ uintmax_t;
#define INT16_MIN (-INT16_C(32767)-1)
#define UINT16_MAX UINT16_C(65535)
+# undef __INT_LEAST16_MIN
# define __INT_LEAST16_MIN INT16_MIN
+# undef __INT_LEAST16_MAX
# define __INT_LEAST16_MAX INT16_MAX
+# undef __UINT_LEAST16_MAX
# define __UINT_LEAST16_MAX UINT16_MAX
+# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT16_MIN
+# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT16_MAX
+# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT16_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT16_WIDTH 16
# define INT16_WIDTH UINT16_WIDTH
+# undef __UINT_LEAST16_WIDTH
# define __UINT_LEAST16_WIDTH UINT16_WIDTH
+# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT16_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT16_TYPE__ */
@@ -727,7 +841,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST16_WIDTH __UINT_LEAST16_WIDTH
# define INT_LEAST16_WIDTH UINT_LEAST16_WIDTH
# define UINT_FAST16_WIDTH __UINT_LEAST16_WIDTH
@@ -741,15 +855,19 @@ typedef __UINTMAX_TYPE__ uintmax_t;
# define INT8_MIN (-INT8_C(127)-1)
# define UINT8_MAX UINT8_C(255)
+# undef __INT_LEAST8_MIN
# define __INT_LEAST8_MIN INT8_MIN
+# undef __INT_LEAST8_MAX
# define __INT_LEAST8_MAX INT8_MAX
+# undef __UINT_LEAST8_MAX
# define __UINT_LEAST8_MAX UINT8_MAX
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT8_WIDTH 8
# define INT8_WIDTH UINT8_WIDTH
+# undef __UINT_LEAST8_WIDTH
# define __UINT_LEAST8_WIDTH UINT8_WIDTH
#endif /* __STDC_VERSION__ */
#endif /* __INT8_TYPE__ */
@@ -764,7 +882,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
# define UINT_LEAST8_WIDTH __UINT_LEAST8_WIDTH
# define INT_LEAST8_WIDTH UINT_LEAST8_WIDTH
# define UINT_FAST8_WIDTH __UINT_LEAST8_WIDTH
@@ -792,7 +910,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.2.4 Width of integer types capable of holding object pointers. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
exposes separate internal width macros. */
#define INTPTR_WIDTH __INTPTR_WIDTH__
@@ -813,7 +931,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.2.5 Width of greatest-width integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
/* NB: The C standard requires that these be the same value, but the compiler
exposes separate internal width macros. */
#define INTMAX_WIDTH __INTMAX_WIDTH__
@@ -849,7 +967,7 @@ typedef __UINTMAX_TYPE__ uintmax_t;
/* C2x 7.20.3.x Width of other integer types. */
/* FIXME: This is using the placeholder dates Clang produces for these macros
in C2x mode; switch to the correct values once they've been published. */
-#if __STDC_VERSION__ >= 202000L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202000L
#define PTRDIFF_WIDTH __PTRDIFF_WIDTH__
#define SIG_ATOMIC_WIDTH __SIG_ATOMIC_WIDTH__
#define SIZE_WIDTH __SIZE_WIDTH__
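
The pattern running through this file: each exact-width section redefines the
internal __int_leastN_t and __*_WIDTH helpers to the narrowest type seen so
far, and the #undefs added before each #define keep that cascade from
tripping the macro-redefinition rule when a wider section has already defined
them. The observable result, as a sketch:

    #include <stdint.h>

    /* int_least16_t resolves to the narrowest exact-width type of at least
       16 bits that the target provides (int16_t on most targets, int24_t
       or wider on targets without __INT16_TYPE__). */
    _Static_assert(sizeof(int_least16_t) * 8 >= 16, "least16 is >= 16 bits");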
diff --git a/clang/lib/Headers/stdnoreturn.h b/clang/lib/Headers/stdnoreturn.h
index 7d19fa7b2f2b..967be947627a 100644
--- a/clang/lib/Headers/stdnoreturn.h
+++ b/clang/lib/Headers/stdnoreturn.h
@@ -13,7 +13,7 @@
#define noreturn _Noreturn
#define __noreturn_is_defined 1
-#if __STDC_VERSION__ > 201710L && \
+#if (defined(__STDC_VERSION__) && __STDC_VERSION__ > 201710L) && \
!defined(_CLANG_DISABLE_CRT_DEPRECATION_WARNINGS)
/* The noreturn macro is deprecated in C2x. We do not mark it as such because
including the header file in C2x is also deprecated and we do not want to
diff --git a/clang/lib/Headers/unwind.h b/clang/lib/Headers/unwind.h
index 971a62da0d21..33e1792cd1fb 100644
--- a/clang/lib/Headers/unwind.h
+++ b/clang/lib/Headers/unwind.h
@@ -65,7 +65,8 @@ struct _Unwind_Context;
#if defined(__arm__) && !(defined(__USING_SJLJ_EXCEPTIONS__) || \
defined(__ARM_DWARF_EH__) || defined(__SEH__))
struct _Unwind_Control_Block;
-typedef struct _Unwind_Control_Block _Unwind_Exception; /* Alias */
+typedef struct _Unwind_Control_Block _Unwind_Control_Block;
+#define _Unwind_Exception _Unwind_Control_Block /* Alias */
#else
struct _Unwind_Exception;
typedef struct _Unwind_Exception _Unwind_Exception;
diff --git a/clang/lib/Headers/velintrin.h b/clang/lib/Headers/velintrin.h
index 69b1fba296d4..3f2bc00442e7 100644
--- a/clang/lib/Headers/velintrin.h
+++ b/clang/lib/Headers/velintrin.h
@@ -13,7 +13,7 @@
typedef double __vr __attribute__((__vector_size__(2048)));
// Vector mask registers
-#if __STDC_VERSION__ >= 199901L
+#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
// For C99
typedef _Bool __vm __attribute__((ext_vector_type(256)));
typedef _Bool __vm256 __attribute__((ext_vector_type(256)));
diff --git a/clang/lib/Headers/x86gprintrin.h b/clang/lib/Headers/x86gprintrin.h
index 2c2fbb97c9ac..f9a765be4322 100644
--- a/clang/lib/Headers/x86gprintrin.h
+++ b/clang/lib/Headers/x86gprintrin.h
@@ -25,23 +25,35 @@
#include <crc32intrin.h>
#endif
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__PRFCHI__)
+#include <prfchiintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__RAOINT__)
+#include <raointintrin.h>
+#endif
+
+#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
+ defined(__CMPCCXADD__)
+#include <cmpccxaddintrin.h>
+#endif
+
#if defined(__i386__)
-#define __FULLBX "ebx"
+#define __SAVE_GPRBX "mov {%%ebx, %%eax |eax, ebx};"
+#define __RESTORE_GPRBX "mov {%%eax, %%ebx |ebx, eax};"
#define __TMPGPR "eax"
#else
// When targeting 64-bit, a 32-bit operand generates a 32-bit result that is
// zero-extended into the destination general-purpose register. This means
// "mov x, %ebx" clobbers the upper 32 bits of rbx, so we should preserve
// the full 64-bit register rbx.
-#define __FULLBX "rbx"
+#define __SAVE_GPRBX "mov {%%rbx, %%rax |rax, rbx};"
+#define __RESTORE_GPRBX "mov {%%rax, %%rbx |rbx, rax};"
#define __TMPGPR "rax"
#endif
-#define __MOVEGPR(__r1, __r2) "mov {%%"__r1 ", %%"__r2 "|"__r2 ", "__r1"};"
-
-#define __SAVE_GPRBX __MOVEGPR(__FULLBX, __TMPGPR)
-#define __RESTORE_GPRBX __MOVEGPR(__TMPGPR, __FULLBX)
-
#define __SSC_MARK(__Tag) \
__asm__ __volatile__( __SAVE_GPRBX \
"mov {%0, %%ebx|ebx, %0}; " \
diff --git a/clang/lib/Headers/xmmintrin.h b/clang/lib/Headers/xmmintrin.h
index 4aa70d6e55a6..80aa2a817f6a 100644
--- a/clang/lib/Headers/xmmintrin.h
+++ b/clang/lib/Headers/xmmintrin.h
@@ -1906,7 +1906,7 @@ _mm_setr_ps(float __z, float __y, float __x, float __w)
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_setzero_ps(void)
{
- return __extension__ (__m128){ 0, 0, 0, 0 };
+ return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
}
/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a
@@ -3005,7 +3005,6 @@ do { \
#define _m_pavgw _mm_avg_pu16
#define _m_psadbw _mm_sad_pu8
#define _m_ _mm_
-#define _m_ _mm_
#undef __DEFAULT_FN_ATTRS
#undef __DEFAULT_FN_ATTRS_MMX