| author    | Dimitry Andric <dim@FreeBSD.org>                      | 2019-10-23 17:52:09 +0000 |
|-----------|-------------------------------------------------------|---------------------------|
| committer | Dimitry Andric <dim@FreeBSD.org>                      | 2019-10-23 17:52:09 +0000 |
| commit    | 519fc96c475680de2cc49e7811dbbfadb912cbcc (patch)      |                           |
| tree      | 310ca684459b7e9ae13c9a3b9abf308b3a634afe /lib/Headers |                           |
| parent    | 2298981669bf3bd63335a4be179bc0f96823a8f4 (diff)       |                           |
Diffstat (limited to 'lib/Headers')
| mode       | path                                  | lines changed |
|------------|---------------------------------------|---------------|
| -rw-r--r-- | lib/Headers/__clang_cuda_intrinsics.h | 10  |
| -rw-r--r-- | lib/Headers/altivec.h                 | 85  |
| -rw-r--r-- | lib/Headers/arm_acle.h                | 24  |
| -rw-r--r-- | lib/Headers/avx512fintrin.h           | 27  |
| -rw-r--r-- | lib/Headers/bmiintrin.h               | 175 |
| -rw-r--r-- | lib/Headers/cpuid.h                   | 4   |
| -rw-r--r-- | lib/Headers/emmintrin.h               | 6   |
| -rw-r--r-- | lib/Headers/ia32intrin.h              | 68  |
| -rw-r--r-- | lib/Headers/immintrin.h               | 3   |
| -rw-r--r-- | lib/Headers/opencl-c-base.h           | 19  |
| -rw-r--r-- | lib/Headers/opencl-c.h                | 212 |
| -rw-r--r-- | lib/Headers/ppc_wrappers/emmintrin.h  | 6   |
| -rw-r--r-- | lib/Headers/ppc_wrappers/mm_malloc.h  | 6   |
| -rw-r--r-- | lib/Headers/ppc_wrappers/mmintrin.h   | 7   |
| -rw-r--r-- | lib/Headers/ppc_wrappers/pmmintrin.h  | 150 |
| -rw-r--r-- | lib/Headers/ppc_wrappers/smmintrin.h  | 85  |
| -rw-r--r-- | lib/Headers/ppc_wrappers/tmmintrin.h  | 495 |
| -rw-r--r-- | lib/Headers/ppc_wrappers/xmmintrin.h  | 6   |
18 files changed, 1133 insertions, 255 deletions
diff --git a/lib/Headers/__clang_cuda_intrinsics.h b/lib/Headers/__clang_cuda_intrinsics.h index 2970d17f89ee..b67461a146fc 100644 --- a/lib/Headers/__clang_cuda_intrinsics.h +++ b/lib/Headers/__clang_cuda_intrinsics.h @@ -211,7 +211,15 @@ inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) { return __nvvm_vote_ballot_sync(mask, pred); } -inline __device__ unsigned int __activemask() { return __nvvm_vote_ballot(1); } +inline __device__ unsigned int __activemask() { +#if CUDA_VERSION < 9020 + return __nvvm_vote_ballot(1); +#else + unsigned int mask; + asm volatile("activemask.b32 %0;" : "=r"(mask)); + return mask; +#endif +} inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) { return __nvvm_fns(mask, base, offset); diff --git a/lib/Headers/altivec.h b/lib/Headers/altivec.h index 4008440b2bc5..8352f8f740c2 100644 --- a/lib/Headers/altivec.h +++ b/lib/Headers/altivec.h @@ -2761,8 +2761,8 @@ static __inline__ vector double __ATTRS_o_ai vec_xl_len(double *__a, return (vector double)__builtin_vsx_lxvl(__a, (__b << 56)); } -static __inline__ vector double __ATTRS_o_ai vec_xl_len_r(unsigned char *__a, - size_t __b) { +static __inline__ vector unsigned char __ATTRS_o_ai +vec_xl_len_r(unsigned char *__a, size_t __b) { vector unsigned char __res = (vector unsigned char)__builtin_vsx_lxvll(__a, (__b << 56)); #ifdef __LITTLE_ENDIAN__ @@ -2876,9 +2876,10 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a, #ifdef __VSX__ #define vec_ctf(__a, __b) \ _Generic((__a), vector int \ - : (vector float)__builtin_altivec_vcfsx((__a), (__b)), \ + : (vector float)__builtin_altivec_vcfsx((vector int)(__a), (__b)), \ vector unsigned int \ - : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b)), \ + : (vector float)__builtin_altivec_vcfux((vector unsigned int)(__a), \ + (__b)), \ vector unsigned long long \ : (__builtin_convertvector((vector unsigned long long)(__a), \ vector double) * \ @@ -2892,9 +2893,10 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a, #else #define vec_ctf(__a, __b) \ _Generic((__a), vector int \ - : (vector float)__builtin_altivec_vcfsx((__a), (__b)), \ + : (vector float)__builtin_altivec_vcfsx((vector int)(__a), (__b)), \ vector unsigned int \ - : (vector float)__builtin_altivec_vcfux((vector int)(__a), (__b))) + : (vector float)__builtin_altivec_vcfux((vector unsigned int)(__a), \ + (__b))) #endif /* vec_vcfsx */ @@ -2910,10 +2912,11 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a, #ifdef __VSX__ #define vec_cts(__a, __b) \ _Generic((__a), vector float \ - : __builtin_altivec_vctsxs((__a), (__b)), vector double \ + : __builtin_altivec_vctsxs((vector float)(__a), (__b)), \ + vector double \ : __extension__({ \ vector double __ret = \ - (__a) * \ + (vector double)(__a) * \ (vector double)(vector unsigned long long)((0x3ffULL + (__b)) \ << 52); \ __builtin_convertvector(__ret, vector signed long long); \ @@ -2931,10 +2934,11 @@ static __inline__ vector double __ATTRS_o_ai vec_cpsgn(vector double __a, #ifdef __VSX__ #define vec_ctu(__a, __b) \ _Generic((__a), vector float \ - : __builtin_altivec_vctuxs((__a), (__b)), vector double \ + : __builtin_altivec_vctuxs((vector float)(__a), (__b)), \ + vector double \ : __extension__({ \ vector double __ret = \ - (__a) * \ + (vector double)(__a) * \ (vector double)(vector unsigned long long)((0x3ffULL + __b) \ << 52); \ __builtin_convertvector(__ret, vector unsigned long long); \ @@ -3286,9 +3290,7 @@ static 
__inline__ vector double __ATTRS_o_ai vec_div(vector double __a, /* vec_dss */ -static __inline__ void __attribute__((__always_inline__)) vec_dss(int __a) { - __builtin_altivec_dss(__a); -} +#define vec_dss __builtin_altivec_dss /* vec_dssall */ @@ -6301,19 +6303,20 @@ static __inline__ vector float __ATTRS_o_ai vec_or(vector float __a, #ifdef __VSX__ static __inline__ vector double __ATTRS_o_ai vec_or(vector bool long long __a, vector double __b) { - return (vector unsigned long long)__a | (vector unsigned long long)__b; + return (vector double)((vector unsigned long long)__a | + (vector unsigned long long)__b); } static __inline__ vector double __ATTRS_o_ai vec_or(vector double __a, vector bool long long __b) { - return (vector unsigned long long)__a | (vector unsigned long long)__b; + return (vector double)((vector unsigned long long)__a | + (vector unsigned long long)__b); } static __inline__ vector double __ATTRS_o_ai vec_or(vector double __a, vector double __b) { - vector unsigned long long __res = - (vector unsigned long long)__a | (vector unsigned long long)__b; - return (vector double)__res; + return (vector double)((vector unsigned long long)__a | + (vector unsigned long long)__b); } static __inline__ vector signed long long __ATTRS_o_ai @@ -14781,7 +14784,7 @@ static __inline__ int __ATTRS_o_ai vec_all_ne(vector bool long long __a, static __inline__ int __ATTRS_o_ai vec_all_ne(vector float __a, vector float __b) { #ifdef __VSX__ - return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, __a, __b); + return __builtin_vsx_xvcmpeqdp_p(__CR6_EQ, (vector double)__a, (vector double)__b); #else return __builtin_altivec_vcmpeqfp_p(__CR6_EQ, __a, __b); #endif @@ -16425,27 +16428,27 @@ vec_xl(signed long long __offset, unsigned __int128 *__ptr) { #ifdef __LITTLE_ENDIAN__ static __inline__ vector signed char __ATTRS_o_ai vec_xl_be(signed long long __offset, signed char *__ptr) { - vector signed char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr); + vector signed char __vec = (vector signed char)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } static __inline__ vector unsigned char __ATTRS_o_ai vec_xl_be(signed long long __offset, unsigned char *__ptr) { - vector unsigned char __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr); + vector unsigned char __vec = (vector unsigned char)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); } static __inline__ vector signed short __ATTRS_o_ai vec_xl_be(signed long long __offset, signed short *__ptr) { - vector signed short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr); + vector signed short __vec = (vector signed short)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); } static __inline__ vector unsigned short __ATTRS_o_ai vec_xl_be(signed long long __offset, unsigned short *__ptr) { - vector unsigned short __vec = __builtin_vsx_lxvd2x_be(__offset, __ptr); + vector unsigned short __vec = (vector unsigned short)__builtin_vsx_lxvd2x_be(__offset, __ptr); return __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); } @@ -16583,7 +16586,8 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed char __vec, vector signed char __tmp = __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); - __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr); + typedef 
__attribute__((vector_size(sizeof(__tmp)))) double __vector_double; + __builtin_vsx_stxvd2x_be((__vector_double)__tmp, __offset, __ptr); } static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned char __vec, @@ -16592,7 +16596,8 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned char __vec, vector unsigned char __tmp = __builtin_shufflevector(__vec, __vec, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); - __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr); + typedef __attribute__((vector_size(sizeof(__tmp)))) double __vector_double; + __builtin_vsx_stxvd2x_be((__vector_double)__tmp, __offset, __ptr); } static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed short __vec, @@ -16600,7 +16605,8 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed short __vec, signed short *__ptr) { vector signed short __tmp = __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); - __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr); + typedef __attribute__((vector_size(sizeof(__tmp)))) double __vector_double; + __builtin_vsx_stxvd2x_be((__vector_double)__tmp, __offset, __ptr); } static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned short __vec, @@ -16608,7 +16614,8 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned short __vec, unsigned short *__ptr) { vector unsigned short __tmp = __builtin_shufflevector(__vec, __vec, 3, 2, 1, 0, 7, 6, 5, 4); - __builtin_vsx_stxvd2x_be(__tmp, __offset, __ptr); + typedef __attribute__((vector_size(sizeof(__tmp)))) double __vector_double; + __builtin_vsx_stxvd2x_be((__vector_double)__tmp, __offset, __ptr); } static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed int __vec, @@ -16620,32 +16627,32 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed int __vec, static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned int __vec, signed long long __offset, unsigned int *__ptr) { - __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr); + __builtin_vsx_stxvw4x_be((vector int)__vec, __offset, __ptr); } static __inline__ void __ATTRS_o_ai vec_xst_be(vector float __vec, signed long long __offset, float *__ptr) { - __builtin_vsx_stxvw4x_be(__vec, __offset, __ptr); + __builtin_vsx_stxvw4x_be((vector int)__vec, __offset, __ptr); } #ifdef __VSX__ static __inline__ void __ATTRS_o_ai vec_xst_be(vector signed long long __vec, signed long long __offset, signed long long *__ptr) { - __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr); + __builtin_vsx_stxvd2x_be((vector double)__vec, __offset, __ptr); } static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned long long __vec, signed long long __offset, unsigned long long *__ptr) { - __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr); + __builtin_vsx_stxvd2x_be((vector double)__vec, __offset, __ptr); } static __inline__ void __ATTRS_o_ai vec_xst_be(vector double __vec, signed long long __offset, double *__ptr) { - __builtin_vsx_stxvd2x_be(__vec, __offset, __ptr); + __builtin_vsx_stxvd2x_be((vector double)__vec, __offset, __ptr); } #endif @@ -16667,13 +16674,13 @@ static __inline__ void __ATTRS_o_ai vec_xst_be(vector unsigned __int128 __vec, #endif #ifdef __POWER9_VECTOR__ -#define vec_test_data_class(__a, __b) \ - _Generic((__a), \ - vector float: \ - (vector bool int)__builtin_vsx_xvtstdcsp((__a), (__b)), \ - vector double: \ - (vector bool long long)__builtin_vsx_xvtstdcdp((__a), (__b)) \ - ) +#define vec_test_data_class(__a, __b) \ + _Generic( \ + (__a), vector float \ + : (vector bool int)__builtin_vsx_xvtstdcsp((vector float)(__a), (__b)), \ + vector 
double \ + : (vector bool long long)__builtin_vsx_xvtstdcdp((vector double)(__a), \ + (__b))) #endif /* #ifdef __POWER9_VECTOR__ */ diff --git a/lib/Headers/arm_acle.h b/lib/Headers/arm_acle.h index 096cc261af2c..0510e6fd809f 100644 --- a/lib/Headers/arm_acle.h +++ b/lib/Headers/arm_acle.h @@ -613,7 +613,7 @@ __jcvt(double __a) { #define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v) #define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v) -// Memory Tagging Extensions (MTE) Intrinsics +/* Memory Tagging Extensions (MTE) Intrinsics */ #if __ARM_FEATURE_MEMORY_TAGGING #define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask) #define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset) @@ -623,6 +623,28 @@ __jcvt(double __a) { #define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb) #endif +/* Transactional Memory Extension (TME) Intrinsics */ +#if __ARM_FEATURE_TME + +#define _TMFAILURE_REASON 0x00007fffu +#define _TMFAILURE_RTRY 0x00008000u +#define _TMFAILURE_CNCL 0x00010000u +#define _TMFAILURE_MEM 0x00020000u +#define _TMFAILURE_IMP 0x00040000u +#define _TMFAILURE_ERR 0x00080000u +#define _TMFAILURE_SIZE 0x00100000u +#define _TMFAILURE_NEST 0x00200000u +#define _TMFAILURE_DBG 0x00400000u +#define _TMFAILURE_INT 0x00800000u +#define _TMFAILURE_TRIVIAL 0x01000000u + +#define __tstart() __builtin_arm_tstart() +#define __tcommit() __builtin_arm_tcommit() +#define __tcancel(__arg) __builtin_arm_tcancel(__arg) +#define __ttest() __builtin_arm_ttest() + +#endif /* __ARM_FEATURE_TME */ + #if defined(__cplusplus) } #endif diff --git a/lib/Headers/avx512fintrin.h b/lib/Headers/avx512fintrin.h index 132761f9ef5c..698e477fe5f3 100644 --- a/lib/Headers/avx512fintrin.h +++ b/lib/Headers/avx512fintrin.h @@ -7658,13 +7658,13 @@ _mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) #define _mm512_i32gather_ps(index, addr, scale) \ (__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ (void const *)(addr), \ - (__v16sf)(__m512)(index), \ + (__v16si)(__m512)(index), \ (__mmask16)-1, (int)(scale)) #define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \ (__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ (void const *)(addr), \ - (__v16sf)(__m512)(index), \ + (__v16si)(__m512)(index), \ (__mmask16)(mask), (int)(scale)) #define _mm512_i32gather_epi32(index, addr, scale) \ @@ -8436,7 +8436,7 @@ _store_mask16(__mmask16 *__A, __mmask16 __B) { } static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_stream_si512 (__m512i * __P, __m512i __A) +_mm512_stream_si512 (void * __P, __m512i __A) { typedef __v8di __v8di_aligned __attribute__((aligned(64))); __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P); @@ -8450,14 +8450,14 @@ _mm512_stream_load_si512 (void const *__P) } static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_stream_pd (double *__P, __m512d __A) +_mm512_stream_pd (void *__P, __m512d __A) { typedef __v8df __v8df_aligned __attribute__((aligned(64))); __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P); } static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_stream_ps (float *__P, __m512 __A) +_mm512_stream_ps (void *__P, __m512 __A) { typedef __v16sf __v16sf_aligned __attribute__((aligned(64))); __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P); @@ -9659,6 +9659,23 @@ _mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { } #undef _mm512_mask_reduce_operator +/// Moves the least significant 32 bits of a vector of [16 x 
i32] to a +/// 32-bit signed integer value. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction. +/// +/// \param __A +/// A vector of [16 x i32]. The least significant 32 bits are moved to the +/// destination. +/// \returns A 32-bit signed integer containing the moved value. +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_cvtsi512_si32(__m512i __A) { + __v16si __b = (__v16si)__A; + return __b[0]; +} + #undef __DEFAULT_FN_ATTRS512 #undef __DEFAULT_FN_ATTRS128 #undef __DEFAULT_FN_ATTRS diff --git a/lib/Headers/bmiintrin.h b/lib/Headers/bmiintrin.h index b7af62f609ae..841bd84070e8 100644 --- a/lib/Headers/bmiintrin.h +++ b/lib/Headers/bmiintrin.h @@ -14,27 +14,13 @@ #ifndef __BMIINTRIN_H #define __BMIINTRIN_H -#define _tzcnt_u16(a) (__tzcnt_u16((a))) - -#define _andn_u32(a, b) (__andn_u32((a), (b))) - -/* _bextr_u32 != __bextr_u32 */ -#define _blsi_u32(a) (__blsi_u32((a))) - -#define _blsmsk_u32(a) (__blsmsk_u32((a))) - -#define _blsr_u32(a) (__blsr_u32((a))) - -#define _tzcnt_u32(a) (__tzcnt_u32((a))) - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) - /* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT instruction behaves as BSF on non-BMI targets, there is code that expects to use it as a potentially faster version of BSF. */ #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#define _tzcnt_u16(a) (__tzcnt_u16((a))) + /// Counts the number of trailing zero bits in the operand. /// /// \headerfile <x86intrin.h> @@ -51,6 +37,94 @@ __tzcnt_u16(unsigned short __X) return __builtin_ia32_tzcnt_u16(__X); } +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. +/// +/// \param __X +/// An unsigned 32-bit integer whose trailing zeros are to be counted. +/// \returns An unsigned 32-bit integer containing the number of trailing zero +/// bits in the operand. +static __inline__ unsigned int __RELAXED_FN_ATTRS +__tzcnt_u32(unsigned int __X) +{ + return __builtin_ia32_tzcnt_u32(__X); +} + +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. +/// +/// \param __X +/// An unsigned 32-bit integer whose trailing zeros are to be counted. +/// \returns An 32-bit integer containing the number of trailing zero bits in +/// the operand. +static __inline__ int __RELAXED_FN_ATTRS +_mm_tzcnt_32(unsigned int __X) +{ + return __builtin_ia32_tzcnt_u32(__X); +} + +#define _tzcnt_u32(a) (__tzcnt_u32((a))) + +#ifdef __x86_64__ + +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose trailing zeros are to be counted. +/// \returns An unsigned 64-bit integer containing the number of trailing zero +/// bits in the operand. +static __inline__ unsigned long long __RELAXED_FN_ATTRS +__tzcnt_u64(unsigned long long __X) +{ + return __builtin_ia32_tzcnt_u64(__X); +} + +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile <x86intrin.h> +/// +/// This intrinsic corresponds to the <c> TZCNT </c> instruction. 
+/// +/// \param __X +/// An unsigned 64-bit integer whose trailing zeros are to be counted. +/// \returns An 64-bit integer containing the number of trailing zero bits in +/// the operand. +static __inline__ long long __RELAXED_FN_ATTRS +_mm_tzcnt_64(unsigned long long __X) +{ + return __builtin_ia32_tzcnt_u64(__X); +} + +#define _tzcnt_u64(a) (__tzcnt_u64((a))) + +#endif /* __x86_64__ */ + +#undef __RELAXED_FN_ATTRS + +#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__) + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) + +#define _andn_u32(a, b) (__andn_u32((a), (b))) + +/* _bextr_u32 != __bextr_u32 */ +#define _blsi_u32(a) (__blsi_u32((a))) + +#define _blsmsk_u32(a) (__blsmsk_u32((a))) + +#define _blsr_u32(a) (__blsr_u32((a))) + /// Performs a bitwise AND of the second operand with the one's /// complement of the first operand. /// @@ -169,38 +243,6 @@ __blsr_u32(unsigned int __X) return __X & (__X - 1); } -/// Counts the number of trailing zero bits in the operand. -/// -/// \headerfile <x86intrin.h> -/// -/// This intrinsic corresponds to the <c> TZCNT </c> instruction. -/// -/// \param __X -/// An unsigned 32-bit integer whose trailing zeros are to be counted. -/// \returns An unsigned 32-bit integer containing the number of trailing zero -/// bits in the operand. -static __inline__ unsigned int __RELAXED_FN_ATTRS -__tzcnt_u32(unsigned int __X) -{ - return __builtin_ia32_tzcnt_u32(__X); -} - -/// Counts the number of trailing zero bits in the operand. -/// -/// \headerfile <x86intrin.h> -/// -/// This intrinsic corresponds to the <c> TZCNT </c> instruction. -/// -/// \param __X -/// An unsigned 32-bit integer whose trailing zeros are to be counted. -/// \returns An 32-bit integer containing the number of trailing zero bits in -/// the operand. -static __inline__ int __RELAXED_FN_ATTRS -_mm_tzcnt_32(unsigned int __X) -{ - return __builtin_ia32_tzcnt_u32(__X); -} - #ifdef __x86_64__ #define _andn_u64(a, b) (__andn_u64((a), (b))) @@ -212,8 +254,6 @@ _mm_tzcnt_32(unsigned int __X) #define _blsr_u64(a) (__blsr_u64((a))) -#define _tzcnt_u64(a) (__tzcnt_u64((a))) - /// Performs a bitwise AND of the second operand with the one's /// complement of the first operand. /// @@ -332,41 +372,10 @@ __blsr_u64(unsigned long long __X) return __X & (__X - 1); } -/// Counts the number of trailing zero bits in the operand. -/// -/// \headerfile <x86intrin.h> -/// -/// This intrinsic corresponds to the <c> TZCNT </c> instruction. -/// -/// \param __X -/// An unsigned 64-bit integer whose trailing zeros are to be counted. -/// \returns An unsigned 64-bit integer containing the number of trailing zero -/// bits in the operand. -static __inline__ unsigned long long __RELAXED_FN_ATTRS -__tzcnt_u64(unsigned long long __X) -{ - return __builtin_ia32_tzcnt_u64(__X); -} - -/// Counts the number of trailing zero bits in the operand. -/// -/// \headerfile <x86intrin.h> -/// -/// This intrinsic corresponds to the <c> TZCNT </c> instruction. -/// -/// \param __X -/// An unsigned 64-bit integer whose trailing zeros are to be counted. -/// \returns An 64-bit integer containing the number of trailing zero bits in -/// the operand. 
-static __inline__ long long __RELAXED_FN_ATTRS -_mm_tzcnt_64(unsigned long long __X) -{ - return __builtin_ia32_tzcnt_u64(__X); -} - #endif /* __x86_64__ */ #undef __DEFAULT_FN_ATTRS -#undef __RELAXED_FN_ATTRS + +#endif /* !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__) */ #endif /* __BMIINTRIN_H */ diff --git a/lib/Headers/cpuid.h b/lib/Headers/cpuid.h index 02ffac26c0b3..4ddd64847c32 100644 --- a/lib/Headers/cpuid.h +++ b/lib/Headers/cpuid.h @@ -38,8 +38,8 @@ #define signature_TM2_ecx 0x3638784d /* NSC: "Geode by NSC" */ #define signature_NSC_ebx 0x646f6547 -#define signature_NSC_edx 0x43534e20 -#define signature_NSC_ecx 0x79622065 +#define signature_NSC_edx 0x79622065 +#define signature_NSC_ecx 0x43534e20 /* NEXGEN: "NexGenDriven" */ #define signature_NEXGEN_ebx 0x4778654e #define signature_NEXGEN_edx 0x72446e65 diff --git a/lib/Headers/emmintrin.h b/lib/Headers/emmintrin.h index 3d55f5f2710f..c8fefdfc792a 100644 --- a/lib/Headers/emmintrin.h +++ b/lib/Headers/emmintrin.h @@ -4029,7 +4029,7 @@ _mm_storeu_si128(__m128i_u *__p, __m128i __b) /// \param __b /// A 128-bit integer vector containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_si64(void const *__p, __m128i __b) +_mm_storeu_si64(void *__p, __m128i __b) { struct __storeu_si64 { long long __v; @@ -4050,7 +4050,7 @@ _mm_storeu_si64(void const *__p, __m128i __b) /// \param __b /// A 128-bit integer vector containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_si32(void const *__p, __m128i __b) +_mm_storeu_si32(void *__p, __m128i __b) { struct __storeu_si32 { int __v; @@ -4071,7 +4071,7 @@ _mm_storeu_si32(void const *__p, __m128i __b) /// \param __b /// A 128-bit integer vector containing the value to be stored. static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_si16(void const *__p, __m128i __b) +_mm_storeu_si16(void *__p, __m128i __b) { struct __storeu_si16 { short __v; diff --git a/lib/Headers/ia32intrin.h b/lib/Headers/ia32intrin.h index 8e38df73187d..79b7f0655cf0 100644 --- a/lib/Headers/ia32intrin.h +++ b/lib/Headers/ia32intrin.h @@ -195,6 +195,74 @@ __writeeflags(unsigned int __f) } #endif /* !__x86_64__ */ +/** Cast a 32-bit float value to a 32-bit unsigned integer value + * + * \headerfile <x86intrin.h> + * This intrinsic corresponds to the <c> VMOVD / MOVD </c> instruction in x86_64, + * and corresponds to the <c> VMOVL / MOVL </c> instruction in ia32. + * + * \param __A + * A 32-bit float value. + * \returns a 32-bit unsigned integer containing the converted value. + */ +static __inline__ unsigned int __attribute__((__always_inline__)) +_castf32_u32(float __A) { + unsigned int D; + __builtin_memcpy(&D, &__A, sizeof(__A)); + return D; +} + +/** Cast a 64-bit float value to a 64-bit unsigned integer value + * + * \headerfile <x86intrin.h> + * This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64, + * and corresponds to the <c> VMOVL / MOVL </c> instruction in ia32. + * + * \param __A + * A 64-bit float value. + * \returns a 64-bit unsigned integer containing the converted value. + */ +static __inline__ unsigned long long __attribute__((__always_inline__)) +_castf64_u64(double __A) { + unsigned long long D; + __builtin_memcpy(&D, &__A, sizeof(__A)); + return D; +} + +/** Cast a 32-bit unsigned integer value to a 32-bit float value + * + * \headerfile <x86intrin.h> + * This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64, + * and corresponds to the <c> FLDS </c> instruction in ia32. 
+ * + * \param __A + * A 32-bit unsigned integer value. + * \returns a 32-bit float value containing the converted value. + */ +static __inline__ float __attribute__((__always_inline__)) +_castu32_f32(unsigned int __A) { + float D; + __builtin_memcpy(&D, &__A, sizeof(__A)); + return D; +} + +/** Cast a 64-bit unsigned integer value to a 64-bit float value + * + * \headerfile <x86intrin.h> + * This intrinsic corresponds to the <c> VMOVQ / MOVQ </c> instruction in x86_64, + * and corresponds to the <c> FLDL </c> instruction in ia32. + * + * \param __A + * A 64-bit unsigned integer value. + * \returns a 64-bit float value containing the converted value. + */ +static __inline__ double __attribute__((__always_inline__)) +_castu64_f64(unsigned long long __A) { + double D; + __builtin_memcpy(&D, &__A, sizeof(__A)); + return D; +} + /** Adds the unsigned integer operand to the CRC-32C checksum of the * unsigned char operand. * diff --git a/lib/Headers/immintrin.h b/lib/Headers/immintrin.h index 7555ad82fac7..ae900ee85b76 100644 --- a/lib/Headers/immintrin.h +++ b/lib/Headers/immintrin.h @@ -64,9 +64,8 @@ #include <vpclmulqdqintrin.h> #endif -#if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI__) +/* No feature check desired due to internal checks */ #include <bmiintrin.h> -#endif #if !defined(_MSC_VER) || __has_feature(modules) || defined(__BMI2__) #include <bmi2intrin.h> diff --git a/lib/Headers/opencl-c-base.h b/lib/Headers/opencl-c-base.h index a82954ddd326..430e07d36f62 100644 --- a/lib/Headers/opencl-c-base.h +++ b/lib/Headers/opencl-c-base.h @@ -126,7 +126,7 @@ typedef double double8 __attribute__((ext_vector_type(8))); typedef double double16 __attribute__((ext_vector_type(16))); #endif -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #define NULL ((void*)0) #endif @@ -276,7 +276,7 @@ typedef uint cl_mem_fence_flags; */ #define CLK_GLOBAL_MEM_FENCE 0x02 -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) typedef enum memory_scope { memory_scope_work_item = __OPENCL_MEMORY_SCOPE_WORK_ITEM, @@ -288,9 +288,6 @@ typedef enum memory_scope { #endif } memory_scope; -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 - -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 /** * Queue a memory fence to ensure correct ordering of memory * operations between work-items of a work-group to @@ -313,7 +310,7 @@ typedef enum memory_order memory_order_seq_cst = __ATOMIC_SEQ_CST } memory_order; -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // OpenCL v1.1 s6.11.3, v1.2 s6.12.14, v2.0 s6.13.14 - Image Read and Write Functions @@ -389,14 +386,10 @@ typedef enum memory_order #endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 // OpenCL v2.0 s6.13.16 - Pipe Functions -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #define CLK_NULL_RESERVE_ID (__builtin_astype(((void*)(__SIZE_MAX__)), reserve_id_t)) -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 - // OpenCL v2.0 s6.13.17 - Enqueue Kernels -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 - #define CL_COMPLETE 0x0 #define CL_RUNNING 0x1 #define CL_SUBMITTED 0x2 @@ -413,7 +406,7 @@ typedef enum memory_order #define CLK_OUT_OF_RESOURCES -5 #define CLK_NULL_QUEUE 0 -#define CLK_NULL_EVENT (__builtin_astype(((void*)(__SIZE_MAX__)), clk_event_t)) +#define 
CLK_NULL_EVENT (__builtin_astype(((__SIZE_MAX__)), clk_event_t)) // execution model related definitions #define CLK_ENQUEUE_FLAGS_NO_WAIT 0x0 @@ -435,7 +428,7 @@ typedef struct { size_t localWorkSize[MAX_WORK_DIM]; } ndrange_t; -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #ifdef cl_intel_device_side_avc_motion_estimation #pragma OPENCL EXTENSION cl_intel_device_side_avc_motion_estimation : begin diff --git a/lib/Headers/opencl-c.h b/lib/Headers/opencl-c.h index 4207c53ccedb..06c5ab6a72f0 100644 --- a/lib/Headers/opencl-c.h +++ b/lib/Headers/opencl-c.h @@ -11,11 +11,11 @@ #include "opencl-c-base.h" -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #ifndef cl_khr_depth_images #define cl_khr_depth_images #endif //cl_khr_depth_images -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #if __OPENCL_C_VERSION__ < CL_VERSION_2_0 #ifdef cl_khr_3d_image_writes @@ -23,10 +23,10 @@ #endif //cl_khr_3d_image_writes #endif //__OPENCL_C_VERSION__ < CL_VERSION_2_0 -#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2) #pragma OPENCL EXTENSION cl_intel_planar_yuv : begin #pragma OPENCL EXTENSION cl_intel_planar_yuv : end -#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2) #define __ovld __attribute__((overloadable)) #define __conv __attribute__((convergent)) @@ -6517,11 +6517,11 @@ size_t __ovld __cnfn get_group_id(uint dimindx); */ size_t __ovld __cnfn get_global_offset(uint dimindx); -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) size_t __ovld get_enqueued_local_size(uint dimindx); size_t __ovld get_global_linear_id(void); size_t __ovld get_local_linear_id(void); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // OpenCL v1.1 s6.11.2, v1.2 s6.12.2, v2.0 s6.13.2 - Math functions @@ -7352,7 +7352,7 @@ half16 __ovld __cnfn fmod(half16 x, half16 y); * Returns fmin(x - floor (x), 0x1.fffffep-1f ). * floor(x) is returned in iptr. */ -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) float __ovld fract(float x, float *iptr); float2 __ovld fract(float2 x, float2 *iptr); float3 __ovld fract(float3 x, float3 *iptr); @@ -7434,7 +7434,7 @@ half4 __ovld fract(half4 x, __private half4 *iptr); half8 __ovld fract(half8 x, __private half8 *iptr); half16 __ovld fract(half16 x, __private half16 *iptr); #endif //cl_khr_fp16 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Extract mantissa and exponent from x. For each @@ -7442,7 +7442,7 @@ half16 __ovld fract(half16 x, __private half16 *iptr); * magnitude in the interval [1/2, 1) or 0. Each * component of x equals mantissa returned * 2^exp. 
*/ -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) float __ovld frexp(float x, int *exp); float2 __ovld frexp(float2 x, int2 *exp); float3 __ovld frexp(float3 x, int3 *exp); @@ -7524,7 +7524,7 @@ half4 __ovld frexp(half4 x, __private int4 *exp); half8 __ovld frexp(half8 x, __private int8 *exp); half16 __ovld frexp(half16 x, __private int16 *exp); #endif //cl_khr_fp16 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Compute the value of the square root of x^2 + y^2 @@ -7649,7 +7649,7 @@ half8 __ovld __cnfn lgamma(half8 x); half16 __ovld __cnfn lgamma(half16 x); #endif //cl_khr_fp16 -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) float __ovld lgamma_r(float x, int *signp); float2 __ovld lgamma_r(float2 x, int2 *signp); float3 __ovld lgamma_r(float3 x, int3 *signp); @@ -7731,7 +7731,7 @@ half4 __ovld lgamma_r(half4 x, __private int4 *signp); half8 __ovld lgamma_r(half8 x, __private int8 *signp); half16 __ovld lgamma_r(half16 x, __private int16 *signp); #endif //cl_khr_fp16 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Compute natural logarithm. @@ -7955,7 +7955,7 @@ half16 __ovld __cnfn minmag(half16 x, half16 y); * the argument. It stores the integral part in the object * pointed to by iptr. */ -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) float __ovld modf(float x, float *iptr); float2 __ovld modf(float2 x, float2 *iptr); float3 __ovld modf(float3 x, float3 *iptr); @@ -8037,7 +8037,7 @@ half4 __ovld modf(half4 x, __private half4 *iptr); half8 __ovld modf(half8 x, __private half8 *iptr); half16 __ovld modf(half16 x, __private half16 *iptr); #endif //cl_khr_fp16 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Returns a quiet NaN. The nancode may be placed @@ -8215,7 +8215,7 @@ half16 __ovld __cnfn remainder(half16 x, half16 y); * sign as x/y. It stores this signed value in the object * pointed to by quo. */ -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) float __ovld remquo(float x, float y, int *quo); float2 __ovld remquo(float2 x, float2 y, int2 *quo); float3 __ovld remquo(float3 x, float3 y, int3 *quo); @@ -8298,7 +8298,7 @@ half4 __ovld remquo(half4 x, half4 y, __private int4 *quo); half8 __ovld remquo(half8 x, half8 y, __private int8 *quo); half16 __ovld remquo(half16 x, half16 y, __private int16 *quo); #endif //cl_khr_fp16 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Round to integral value (using round to nearest * even rounding mode) in floating-point format. @@ -8439,7 +8439,7 @@ half16 __ovld __cnfn sin(half16); * is the return value and computed cosine is returned * in cosval. 
*/ -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) float __ovld sincos(float x, float *cosval); float2 __ovld sincos(float2 x, float2 *cosval); float3 __ovld sincos(float3 x, float3 *cosval); @@ -8521,7 +8521,7 @@ half4 __ovld sincos(half4 x, __private half4 *cosval); half8 __ovld sincos(half8 x, __private half8 *cosval); half16 __ovld sincos(half16 x, __private half16 *cosval); #endif //cl_khr_fp16 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Compute hyperbolic sine. @@ -9446,7 +9446,7 @@ ulong16 __ovld __cnfn clz(ulong16 x); * returns the size in bits of the type of x or * component type of x, if x is a vector. */ -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) char __ovld ctz(char x); uchar __ovld ctz(uchar x); char2 __ovld ctz(char2 x); @@ -9495,7 +9495,7 @@ long8 __ovld ctz(long8 x); ulong8 __ovld ctz(ulong8 x); long16 __ovld ctz(long16 x); ulong16 __ovld ctz(ulong16 x); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Returns mul_hi(a, b) + c. @@ -11340,7 +11340,7 @@ half8 __ovld vload8(size_t offset, const __constant half *p); half16 __ovld vload16(size_t offset, const __constant half *p); #endif //cl_khr_fp16 -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) char2 __ovld vload2(size_t offset, const char *p); uchar2 __ovld vload2(size_t offset, const uchar *p); short2 __ovld vload2(size_t offset, const short *p); @@ -11578,9 +11578,9 @@ half4 __ovld vload4(size_t offset, const __private half *p); half8 __ovld vload8(size_t offset, const __private half *p); half16 __ovld vload16(size_t offset, const __private half *p); #endif //cl_khr_fp16 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld vstore2(char2 data, size_t offset, char *p); void __ovld vstore2(uchar2 data, size_t offset, uchar *p); void __ovld vstore2(short2 data, size_t offset, short *p); @@ -11814,7 +11814,7 @@ void __ovld vstore4(half4 data, size_t offset, __private half *p); void __ovld vstore8(half8 data, size_t offset, __private half *p); void __ovld vstore16(half16 data, size_t offset, __private half *p); #endif //cl_khr_fp16 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Read sizeof (half) bytes of data from address @@ -11825,13 +11825,13 @@ void __ovld vstore16(half16 data, size_t offset, __private half *p); * must be 16-bit aligned. 
*/ float __ovld vload_half(size_t offset, const __constant half *p); -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) float __ovld vload_half(size_t offset, const half *p); #else float __ovld vload_half(size_t offset, const __global half *p); float __ovld vload_half(size_t offset, const __local half *p); float __ovld vload_half(size_t offset, const __private half *p); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Read sizeof (halfn) bytes of data from address @@ -11846,7 +11846,7 @@ float3 __ovld vload_half3(size_t offset, const __constant half *p); float4 __ovld vload_half4(size_t offset, const __constant half *p); float8 __ovld vload_half8(size_t offset, const __constant half *p); float16 __ovld vload_half16(size_t offset, const __constant half *p); -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) float2 __ovld vload_half2(size_t offset, const half *p); float3 __ovld vload_half3(size_t offset, const half *p); float4 __ovld vload_half4(size_t offset, const half *p); @@ -11868,7 +11868,7 @@ float3 __ovld vload_half3(size_t offset, const __private half *p); float4 __ovld vload_half4(size_t offset, const __private half *p); float8 __ovld vload_half8(size_t offset, const __private half *p); float16 __ovld vload_half16(size_t offset, const __private half *p); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * The float value given by data is first @@ -11881,7 +11881,7 @@ float16 __ovld vload_half16(size_t offset, const __private half *p); * The default current rounding mode is round to * nearest even. */ -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld vstore_half(float data, size_t offset, half *p); void __ovld vstore_half_rte(float data, size_t offset, half *p); void __ovld vstore_half_rtz(float data, size_t offset, half *p); @@ -11927,7 +11927,7 @@ void __ovld vstore_half_rtz(double data, size_t offset, __private half *p); void __ovld vstore_half_rtp(double data, size_t offset, __private half *p); void __ovld vstore_half_rtn(double data, size_t offset, __private half *p); #endif //cl_khr_fp64 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * The floatn value given by data is converted to @@ -11940,7 +11940,7 @@ void __ovld vstore_half_rtn(double data, size_t offset, __private half *p); * The default current rounding mode is round to * nearest even. 
*/ -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld vstore_half2(float2 data, size_t offset, half *p); void __ovld vstore_half3(float3 data, size_t offset, half *p); void __ovld vstore_half4(float4 data, size_t offset, half *p); @@ -12146,7 +12146,7 @@ void __ovld vstore_half4_rtn(double4 data, size_t offset, __private half *p); void __ovld vstore_half8_rtn(double8 data, size_t offset, __private half *p); void __ovld vstore_half16_rtn(double16 data, size_t offset, __private half *p); #endif //cl_khr_fp64 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * For n = 1, 2, 4, 8 and 16 read sizeof (halfn) @@ -12167,7 +12167,7 @@ float3 __ovld vloada_half3(size_t offset, const __constant half *p); float4 __ovld vloada_half4(size_t offset, const __constant half *p); float8 __ovld vloada_half8(size_t offset, const __constant half *p); float16 __ovld vloada_half16(size_t offset, const __constant half *p); -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) float __ovld vloada_half(size_t offset, const half *p); float2 __ovld vloada_half2(size_t offset, const half *p); float3 __ovld vloada_half3(size_t offset, const half *p); @@ -12193,7 +12193,7 @@ float3 __ovld vloada_half3(size_t offset, const __private half *p); float4 __ovld vloada_half4(size_t offset, const __private half *p); float8 __ovld vloada_half8(size_t offset, const __private half *p); float16 __ovld vloada_half16(size_t offset, const __private half *p); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * The floatn value given by data is converted to @@ -12211,7 +12211,7 @@ float16 __ovld vloada_half16(size_t offset, const __private half *p); * mode. The default current rounding mode is * round to nearest even. 
*/ -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld vstorea_half(float data, size_t offset, half *p); void __ovld vstorea_half2(float2 data, size_t offset, half *p); void __ovld vstorea_half3(float3 data, size_t offset, half *p); @@ -12496,7 +12496,7 @@ void __ovld vstorea_half4_rtn(double4 data,size_t offset, __private half *p); void __ovld vstorea_half8_rtn(double8 data,size_t offset, __private half *p); void __ovld vstorea_half16_rtn(double16 data,size_t offset, __private half *p); #endif //cl_khr_fp64 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // OpenCL v1.1 s6.11.8, v1.2 s6.12.8, v2.0 s6.13.8 - Synchronization Functions @@ -12532,10 +12532,10 @@ void __ovld vstorea_half16_rtn(double16 data,size_t offset, __private half *p); void __ovld __conv barrier(cl_mem_fence_flags flags); -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld __conv work_group_barrier(cl_mem_fence_flags flags, memory_scope scope); void __ovld __conv work_group_barrier(cl_mem_fence_flags flags); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // OpenCL v1.1 s6.11.9, v1.2 s6.12.9 - Explicit Memory Fence Functions @@ -12580,7 +12580,7 @@ void __ovld write_mem_fence(cl_mem_fence_flags flags); // OpenCL v2.0 s6.13.9 - Address Space Qualifier Functions -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) cl_mem_fence_flags __ovld get_fence(const void *ptr); cl_mem_fence_flags __ovld get_fence(void *ptr); @@ -12591,7 +12591,7 @@ cl_mem_fence_flags __ovld get_fence(void *ptr); * where gentype is builtin type or user defined type. 
*/ -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // OpenCL v1.1 s6.11.10, v1.2 s6.12.10, v2.0 s6.13.10 - Async Copies from Global to Local Memory, Local to Global Memory, and Prefetch @@ -13371,7 +13371,7 @@ unsigned long __ovld atom_xor(volatile __local unsigned long *p, unsigned long v // OpenCL v2.0 s6.13.11 - Atomics Functions -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // double atomics support requires extensions cl_khr_int64_base_atomics and cl_khr_int64_extended_atomics #if defined(cl_khr_int64_base_atomics) && defined(cl_khr_int64_extended_atomics) @@ -13692,7 +13692,7 @@ void __ovld atomic_flag_clear(volatile atomic_flag *object); void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order); void __ovld atomic_flag_clear_explicit(volatile atomic_flag *object, memory_order order, memory_scope scope); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // OpenCL v1.1 s6.11.12, v1.2 s6.12.12, v2.0 s6.13.12 - Miscellaneous Vector Functions @@ -14186,7 +14186,7 @@ half16 __ovld __cnfn shuffle2(half8 x, half8 y, ushort16 mask); half16 __ovld __cnfn shuffle2(half16 x, half16 y, ushort16 mask); #endif //cl_khr_fp16 -#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2) // OpenCL v1.2 s6.12.13, v2.0 s6.13.13 - printf int printf(__constant const char* st, ...) __attribute__((format(printf, 1, 2))); @@ -14307,7 +14307,7 @@ int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, f uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, int4 coord); uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord); -#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2) float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, int4 coord); float4 __purefn __ovld read_imagef(read_only image2d_array_t image_array, sampler_t sampler, float4 coord); @@ -14315,7 +14315,7 @@ int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_ int4 __purefn __ovld read_imagei(read_only image2d_array_t image_array, sampler_t sampler, float4 coord); uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, int4 coord); uint4 __purefn __ovld read_imageui(read_only image2d_array_t image_array, sampler_t sampler, float4 coord); -#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2) float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, int coord); float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord); @@ -14325,7 +14325,7 @@ int4 __purefn __ovld read_imagei(read_only image1d_t image, sampler_t sampler, f uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, int coord); uint4 __purefn __ovld read_imageui(read_only image1d_t image, sampler_t sampler, float coord); -#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2) float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, 
sampler_t sampler, int2 coord); float4 __purefn __ovld read_imagef(read_only image1d_array_t image_array, sampler_t sampler, float2 coord); @@ -14333,7 +14333,7 @@ int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_ int4 __purefn __ovld read_imagei(read_only image1d_array_t image_array, sampler_t sampler, float2 coord); uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, int2 coord); uint4 __purefn __ovld read_imageui(read_only image1d_array_t image_array, sampler_t sampler, float2 coord); -#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2) #ifdef cl_khr_depth_images float __purefn __ovld read_imagef(read_only image2d_depth_t image, sampler_t sampler, float2 coord); @@ -14358,7 +14358,7 @@ float __purefn __ovld read_imagef(read_only image2d_array_msaa_depth_t image, in #endif //cl_khr_gl_msaa_sharing // OpenCL Extension v2.0 s9.18 - Mipmaps -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #ifdef cl_khr_mipmap_image float4 __purefn __ovld read_imagef(read_only image1d_t image, sampler_t sampler, float coord, float lod); @@ -14410,9 +14410,9 @@ int4 __purefn __ovld read_imagei(read_only image3d_t image, sampler_t sampler, f uint4 __purefn __ovld read_imageui(read_only image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY); #endif //cl_khr_mipmap_image -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) -#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2) /** * Sampler-less Image Access @@ -14447,7 +14447,7 @@ float4 __purefn __ovld read_imagef(read_only image3d_t image, int4 coord); int4 __purefn __ovld read_imagei(read_only image3d_t image, int4 coord); uint4 __purefn __ovld read_imageui(read_only image3d_t image, int4 coord); -#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2) // Image read functions returning half4 type #ifdef cl_khr_fp16 @@ -14457,7 +14457,7 @@ half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, half4 __purefn __ovld read_imageh(read_only image2d_t image, sampler_t sampler, float2 coord); half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, int4 coord); half4 __purefn __ovld read_imageh(read_only image3d_t image, sampler_t sampler, float4 coord); -#if __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_1_2) half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, int2 coord); half4 __purefn __ovld read_imageh(read_only image1d_array_t image, sampler_t sampler, float2 coord); half4 __purefn __ovld read_imageh(read_only image2d_array_t image, sampler_t sampler, int4 coord); @@ -14471,11 +14471,11 @@ half4 __purefn __ovld read_imageh(read_only image3d_t image, int4 coord); half4 __purefn __ovld read_imageh(read_only image1d_array_t image, int2 coord); half4 __purefn __ovld read_imageh(read_only image2d_array_t image, int4 coord); half4 __purefn __ovld read_imageh(read_only image1d_buffer_t image, int coord); -#endif // __OPENCL_C_VERSION__ >= CL_VERSION_1_2 +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= 
CL_VERSION_1_2) #endif //cl_khr_fp16 // Image read functions for read_write images -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) float4 __purefn __ovld read_imagef(read_write image1d_t image, int coord); int4 __purefn __ovld read_imagei(read_write image1d_t image, int coord); uint4 __purefn __ovld read_imageui(read_write image1d_t image, int coord); @@ -14518,7 +14518,7 @@ float __purefn __ovld read_imagef(read_write image2d_msaa_depth_t image, int2 co float __purefn __ovld read_imagef(read_write image2d_array_msaa_depth_t image, int4 coord, int sample); #endif //cl_khr_gl_msaa_sharing -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #ifdef cl_khr_mipmap_image float4 __purefn __ovld read_imagef(read_write image1d_t image, sampler_t sampler, float coord, float lod); int4 __purefn __ovld read_imagei(read_write image1d_t image, sampler_t sampler, float coord, float lod); @@ -14569,7 +14569,7 @@ int4 __purefn __ovld read_imagei(read_write image3d_t image, sampler_t sampler, uint4 __purefn __ovld read_imageui(read_write image3d_t image, sampler_t sampler, float4 coord, float4 gradientX, float4 gradientY); #endif //cl_khr_mipmap_image -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // Image read functions returning half4 type #ifdef cl_khr_fp16 @@ -14580,7 +14580,7 @@ half4 __purefn __ovld read_imageh(read_write image1d_array_t image, int2 coord); half4 __purefn __ovld read_imageh(read_write image2d_array_t image, int4 coord); half4 __purefn __ovld read_imageh(read_write image1d_buffer_t image, int coord); #endif //cl_khr_fp16 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Write color value to location specified by coordinate @@ -14681,7 +14681,7 @@ void __ovld write_imagef(write_only image2d_array_depth_t image, int4 coord, flo #endif //cl_khr_depth_images // OpenCL Extension v2.0 s9.18 - Mipmaps -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #ifdef cl_khr_mipmap_image void __ovld write_imagef(write_only image1d_t image, int coord, int lod, float4 color); void __ovld write_imagei(write_only image1d_t image, int coord, int lod, int4 color); @@ -14708,7 +14708,7 @@ void __ovld write_imagei(write_only image3d_t image, int4 coord, int lod, int4 c void __ovld write_imageui(write_only image3d_t image, int4 coord, int lod, uint4 color); #endif #endif //cl_khr_mipmap_image -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // Image write functions for half4 type #ifdef cl_khr_fp16 @@ -14723,7 +14723,7 @@ void __ovld write_imageh(write_only image1d_buffer_t image, int coord, half4 col #endif //cl_khr_fp16 // Image write functions for read_write images -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld write_imagef(read_write image2d_t image, int2 coord, float4 color); void __ovld write_imagei(read_write image2d_t image, int2 coord, int4 color); void __ovld write_imageui(read_write image2d_t image, int2 coord, uint4 color); @@ -14755,7 +14755,7 @@ void __ovld write_imagef(read_write image2d_depth_t image, int2 coord, float col void 
__ovld write_imagef(read_write image2d_array_depth_t image, int4 coord, float color); #endif //cl_khr_depth_images -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #ifdef cl_khr_mipmap_image void __ovld write_imagef(read_write image1d_t image, int coord, int lod, float4 color); void __ovld write_imagei(read_write image1d_t image, int coord, int lod, int4 color); @@ -14782,7 +14782,7 @@ void __ovld write_imagei(read_write image3d_t image, int4 coord, int lod, int4 c void __ovld write_imageui(read_write image3d_t image, int4 coord, int lod, uint4 color); #endif #endif //cl_khr_mipmap_image -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // Image write functions for half4 type #ifdef cl_khr_fp16 @@ -14795,7 +14795,7 @@ void __ovld write_imageh(read_write image1d_array_t image, int2 coord, half4 col void __ovld write_imageh(read_write image2d_array_t image, int4 coord, half4 color); void __ovld write_imageh(read_write image1d_buffer_t image, int coord, half4 color); #endif //cl_khr_fp16 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // Note: In OpenCL v1.0/1.1/1.2, image argument of image query builtin functions does not have // access qualifier, which by default assume read_only access qualifier. Image query builtin @@ -14843,7 +14843,7 @@ int __ovld __cnfn get_image_width(write_only image2d_array_msaa_t image); int __ovld __cnfn get_image_width(write_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int __ovld __cnfn get_image_width(read_write image1d_t image); int __ovld __cnfn get_image_width(read_write image1d_buffer_t image); int __ovld __cnfn get_image_width(read_write image2d_t image); @@ -14860,7 +14860,7 @@ int __ovld __cnfn get_image_width(read_write image2d_msaa_depth_t image); int __ovld __cnfn get_image_width(read_write image2d_array_msaa_t image); int __ovld __cnfn get_image_width(read_write image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Return the image height in pixels. @@ -14895,7 +14895,7 @@ int __ovld __cnfn get_image_height(write_only image2d_array_msaa_t image); int __ovld __cnfn get_image_height(write_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int __ovld __cnfn get_image_height(read_write image2d_t image); int __ovld __cnfn get_image_height(read_write image3d_t image); int __ovld __cnfn get_image_height(read_write image2d_array_t image); @@ -14909,7 +14909,7 @@ int __ovld __cnfn get_image_height(read_write image2d_msaa_depth_t image); int __ovld __cnfn get_image_height(read_write image2d_array_msaa_t image); int __ovld __cnfn get_image_height(read_write image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Return the image depth in pixels. 
@@ -14920,12 +14920,12 @@ int __ovld __cnfn get_image_depth(read_only image3d_t image); int __ovld __cnfn get_image_depth(write_only image3d_t image); #endif -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int __ovld __cnfn get_image_depth(read_write image3d_t image); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // OpenCL Extension v2.0 s9.18 - Mipmaps -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #ifdef cl_khr_mipmap_image /** * Return the image miplevels. @@ -14961,7 +14961,7 @@ int __ovld get_image_num_mip_levels(read_write image2d_array_depth_t image); int __ovld get_image_num_mip_levels(read_write image2d_depth_t image); #endif //cl_khr_mipmap_image -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Return the channel data type. Valid values are: @@ -15018,7 +15018,7 @@ int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_t im int __ovld __cnfn get_image_channel_data_type(write_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int __ovld __cnfn get_image_channel_data_type(read_write image1d_t image); int __ovld __cnfn get_image_channel_data_type(read_write image1d_buffer_t image); int __ovld __cnfn get_image_channel_data_type(read_write image2d_t image); @@ -15035,7 +15035,7 @@ int __ovld __cnfn get_image_channel_data_type(read_write image2d_msaa_depth_t im int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_t image); int __ovld __cnfn get_image_channel_data_type(read_write image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Return the image channel order. 
Valid values are: @@ -15090,7 +15090,7 @@ int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_t image) int __ovld __cnfn get_image_channel_order(write_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int __ovld __cnfn get_image_channel_order(read_write image1d_t image); int __ovld __cnfn get_image_channel_order(read_write image1d_buffer_t image); int __ovld __cnfn get_image_channel_order(read_write image2d_t image); @@ -15107,7 +15107,7 @@ int __ovld __cnfn get_image_channel_order(read_write image2d_msaa_depth_t image) int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_t image); int __ovld __cnfn get_image_channel_order(read_write image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Return the 2D image width and height as an int2 @@ -15140,7 +15140,7 @@ int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_t image); int2 __ovld __cnfn get_image_dim(write_only image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int2 __ovld __cnfn get_image_dim(read_write image2d_t image); int2 __ovld __cnfn get_image_dim(read_write image2d_array_t image); #ifdef cl_khr_depth_images @@ -15153,7 +15153,7 @@ int2 __ovld __cnfn get_image_dim(read_write image2d_msaa_depth_t image); int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_t image); int2 __ovld __cnfn get_image_dim(read_write image2d_array_msaa_depth_t image); #endif //cl_khr_gl_msaa_sharing -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Return the 3D image width, height, and depth as an @@ -15165,9 +15165,9 @@ int4 __ovld __cnfn get_image_dim(read_only image3d_t image); #ifdef cl_khr_3d_image_writes int4 __ovld __cnfn get_image_dim(write_only image3d_t image); #endif -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int4 __ovld __cnfn get_image_dim(read_write image3d_t image); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Return the image array size. 
@@ -15193,7 +15193,7 @@ size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_t image_ size_t __ovld __cnfn get_image_array_size(write_only image2d_array_msaa_depth_t image_array); #endif //cl_khr_gl_msaa_sharing -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) size_t __ovld __cnfn get_image_array_size(read_write image1d_array_t image_array); size_t __ovld __cnfn get_image_array_size(read_write image2d_array_t image_array); #ifdef cl_khr_depth_images @@ -15203,7 +15203,7 @@ size_t __ovld __cnfn get_image_array_size(read_write image2d_array_depth_t image size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_t image_array); size_t __ovld __cnfn get_image_array_size(read_write image2d_array_msaa_depth_t image_array); #endif //cl_khr_gl_msaa_sharing -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) /** * Return the number of samples associated with image @@ -15219,17 +15219,17 @@ int __ovld get_image_num_samples(write_only image2d_msaa_depth_t image); int __ovld get_image_num_samples(write_only image2d_array_msaa_t image); int __ovld get_image_num_samples(write_only image2d_array_msaa_depth_t image); -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int __ovld get_image_num_samples(read_write image2d_msaa_t image); int __ovld get_image_num_samples(read_write image2d_msaa_depth_t image); int __ovld get_image_num_samples(read_write image2d_array_msaa_t image); int __ovld get_image_num_samples(read_write image2d_array_msaa_depth_t image); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) #endif // OpenCL v2.0 s6.13.15 - Work-group Functions -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int __ovld __conv work_group_all(int predicate); int __ovld __conv work_group_any(int predicate); @@ -15327,16 +15327,16 @@ double __ovld __conv work_group_scan_inclusive_min(double x); double __ovld __conv work_group_scan_inclusive_max(double x); #endif //cl_khr_fp64 -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // OpenCL v2.0 s6.13.16 - Pipe Functions -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) bool __ovld is_valid_reserve_id(reserve_id_t reserve_id); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // OpenCL v2.0 s6.13.17 - Enqueue Kernels -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) ndrange_t __ovld ndrange_1D(size_t); ndrange_t __ovld ndrange_1D(size_t, size_t); @@ -15350,7 +15350,7 @@ ndrange_t __ovld ndrange_3D(const size_t[3]); ndrange_t __ovld ndrange_3D(const size_t[3], const size_t[3]); ndrange_t __ovld ndrange_3D(const size_t[3], const size_t[3], const size_t[3]); -int __ovld enqueue_marker(queue_t, uint, const __private clk_event_t*, __private clk_event_t*); +int __ovld enqueue_marker(queue_t, uint, const clk_event_t*, clk_event_t*); void __ovld retain_event(clk_event_t); @@ -15365,7 +15365,7 @@ bool __ovld is_valid_event (clk_event_t event); 
void __ovld capture_event_profiling_info(clk_event_t, clk_profiling_info, __global void* value); queue_t __ovld get_default_queue(void); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) // OpenCL Extension v2.0 s9.17 - Sub-groups @@ -15374,16 +15374,16 @@ queue_t __ovld get_default_queue(void); uint __ovld get_sub_group_size(void); uint __ovld get_max_sub_group_size(void); uint __ovld get_num_sub_groups(void); -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) uint __ovld get_enqueued_num_sub_groups(void); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) uint __ovld get_sub_group_id(void); uint __ovld get_sub_group_local_id(void); void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags); -#if __OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld __conv sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope); -#endif //__OPENCL_C_VERSION__ >= CL_VERSION_2_0 +#endif //defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) int __ovld __conv sub_group_all(int predicate); int __ovld __conv sub_group_any(int predicate); @@ -15573,12 +15573,12 @@ uint2 __ovld __conv intel_sub_group_block_read2( read_only image2d_t image, in uint4 __ovld __conv intel_sub_group_block_read4( read_only image2d_t image, int2 coord ); uint8 __ovld __conv intel_sub_group_block_read8( read_only image2d_t image, int2 coord ); -#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) uint __ovld __conv intel_sub_group_block_read(read_write image2d_t image, int2 coord); uint2 __ovld __conv intel_sub_group_block_read2(read_write image2d_t image, int2 coord); uint4 __ovld __conv intel_sub_group_block_read4(read_write image2d_t image, int2 coord); uint8 __ovld __conv intel_sub_group_block_read8(read_write image2d_t image, int2 coord); -#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) uint __ovld __conv intel_sub_group_block_read( const __global uint* p ); uint2 __ovld __conv intel_sub_group_block_read2( const __global uint* p ); @@ -15590,12 +15590,12 @@ void __ovld __conv intel_sub_group_block_write2(write_only image2d_t image, i void __ovld __conv intel_sub_group_block_write4(write_only image2d_t image, int2 coord, uint4 data); void __ovld __conv intel_sub_group_block_write8(write_only image2d_t image, int2 coord, uint8 data); -#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld __conv intel_sub_group_block_write(read_write image2d_t image, int2 coord, uint data); void __ovld __conv intel_sub_group_block_write2(read_write image2d_t image, int2 coord, uint2 data); void __ovld __conv intel_sub_group_block_write4(read_write image2d_t image, int2 coord, uint4 data); void __ovld __conv intel_sub_group_block_write8(read_write image2d_t image, int2 coord, uint8 data); -#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld __conv intel_sub_group_block_write( __global uint* p, uint data ); void __ovld __conv intel_sub_group_block_write2( __global 
uint* p, uint2 data ); @@ -15713,12 +15713,12 @@ uint2 __ovld __conv intel_sub_group_block_read_ui2( read_only image2d_t ima uint4 __ovld __conv intel_sub_group_block_read_ui4( read_only image2d_t image, int2 byte_coord ); uint8 __ovld __conv intel_sub_group_block_read_ui8( read_only image2d_t image, int2 byte_coord ); -#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) uint __ovld __conv intel_sub_group_block_read_ui( read_write image2d_t image, int2 byte_coord ); uint2 __ovld __conv intel_sub_group_block_read_ui2( read_write image2d_t image, int2 byte_coord ); uint4 __ovld __conv intel_sub_group_block_read_ui4( read_write image2d_t image, int2 byte_coord ); uint8 __ovld __conv intel_sub_group_block_read_ui8( read_write image2d_t image, int2 byte_coord ); -#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) uint __ovld __conv intel_sub_group_block_read_ui( const __global uint* p ); uint2 __ovld __conv intel_sub_group_block_read_ui2( const __global uint* p ); @@ -15730,12 +15730,12 @@ void __ovld __conv intel_sub_group_block_write_ui2( read_only image2d_t im void __ovld __conv intel_sub_group_block_write_ui4( read_only image2d_t image, int2 byte_coord, uint4 data ); void __ovld __conv intel_sub_group_block_write_ui8( read_only image2d_t image, int2 byte_coord, uint8 data ); -#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld __conv intel_sub_group_block_write_ui( read_write image2d_t image, int2 byte_coord, uint data ); void __ovld __conv intel_sub_group_block_write_ui2( read_write image2d_t image, int2 byte_coord, uint2 data ); void __ovld __conv intel_sub_group_block_write_ui4( read_write image2d_t image, int2 byte_coord, uint4 data ); void __ovld __conv intel_sub_group_block_write_ui8( read_write image2d_t image, int2 byte_coord, uint8 data ); -#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld __conv intel_sub_group_block_write_ui( __global uint* p, uint data ); void __ovld __conv intel_sub_group_block_write_ui2( __global uint* p, uint2 data ); @@ -15747,12 +15747,12 @@ ushort2 __ovld __conv intel_sub_group_block_read_us2( read_only image2d_t im ushort4 __ovld __conv intel_sub_group_block_read_us4( read_only image2d_t image, int2 coord ); ushort8 __ovld __conv intel_sub_group_block_read_us8( read_only image2d_t image, int2 coord ); -#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) ushort __ovld __conv intel_sub_group_block_read_us(read_write image2d_t image, int2 coord); ushort2 __ovld __conv intel_sub_group_block_read_us2(read_write image2d_t image, int2 coord); ushort4 __ovld __conv intel_sub_group_block_read_us4(read_write image2d_t image, int2 coord); ushort8 __ovld __conv intel_sub_group_block_read_us8(read_write image2d_t image, int2 coord); -#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) ushort __ovld __conv intel_sub_group_block_read_us( const __global ushort* p ); ushort2 __ovld __conv intel_sub_group_block_read_us2( const __global ushort* p ); @@ -15764,12 +15764,12 @@ void __ovld __conv intel_sub_group_block_write_us2(write_only image2d_t i void __ovld __conv 
intel_sub_group_block_write_us4(write_only image2d_t image, int2 coord, ushort4 data); void __ovld __conv intel_sub_group_block_write_us8(write_only image2d_t image, int2 coord, ushort8 data); -#if (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#if defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld __conv intel_sub_group_block_write_us(read_write image2d_t image, int2 coord, ushort data); void __ovld __conv intel_sub_group_block_write_us2(read_write image2d_t image, int2 coord, ushort2 data); void __ovld __conv intel_sub_group_block_write_us4(read_write image2d_t image, int2 coord, ushort4 data); void __ovld __conv intel_sub_group_block_write_us8(read_write image2d_t image, int2 coord, ushort8 data); -#endif // (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) +#endif // defined(__OPENCL_CPP_VERSION__) || (__OPENCL_C_VERSION__ >= CL_VERSION_2_0) void __ovld __conv intel_sub_group_block_write_us( __global ushort* p, ushort data ); void __ovld __conv intel_sub_group_block_write_us2( __global ushort* p, ushort2 data ); diff --git a/lib/Headers/ppc_wrappers/emmintrin.h b/lib/Headers/ppc_wrappers/emmintrin.h index 617ce24acd3f..293276cc9be0 100644 --- a/lib/Headers/ppc_wrappers/emmintrin.h +++ b/lib/Headers/ppc_wrappers/emmintrin.h @@ -35,6 +35,8 @@ #ifndef EMMINTRIN_H_ #define EMMINTRIN_H_ +#if defined(__linux__) && defined(__ppc64__) + #include <altivec.h> /* We need definitions from the SSE header files. */ @@ -2315,4 +2317,8 @@ _mm_castsi128_pd(__m128i __A) return (__m128d) __A; } +#else +#include_next <emmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + #endif /* EMMINTRIN_H_ */ diff --git a/lib/Headers/ppc_wrappers/mm_malloc.h b/lib/Headers/ppc_wrappers/mm_malloc.h index d91d7865c893..24b14c8e07c0 100644 --- a/lib/Headers/ppc_wrappers/mm_malloc.h +++ b/lib/Headers/ppc_wrappers/mm_malloc.h @@ -10,6 +10,8 @@ #ifndef _MM_MALLOC_H_INCLUDED #define _MM_MALLOC_H_INCLUDED +#if defined(__linux__) && defined(__ppc64__) + #include <stdlib.h> /* We can't depend on <stdlib.h> since the prototype of posix_memalign @@ -41,4 +43,8 @@ _mm_free (void * ptr) free (ptr); } +#else +#include_next <mm_malloc.h> +#endif + #endif /* _MM_MALLOC_H_INCLUDED */ diff --git a/lib/Headers/ppc_wrappers/mmintrin.h b/lib/Headers/ppc_wrappers/mmintrin.h index b949653adf5a..c55c44726f00 100644 --- a/lib/Headers/ppc_wrappers/mmintrin.h +++ b/lib/Headers/ppc_wrappers/mmintrin.h @@ -35,6 +35,8 @@ #ifndef _MMINTRIN_H_INCLUDED #define _MMINTRIN_H_INCLUDED +#if defined(__linux__) && defined(__ppc64__) + #include <altivec.h> /* The Intel API is flexible enough that we must allow aliasing with other vector types, and their scalar components. */ @@ -1440,4 +1442,9 @@ extern __inline __m64 return (res.as_m64); #endif } + +#else +#include_next <mmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + #endif /* _MMINTRIN_H_INCLUDED */ diff --git a/lib/Headers/ppc_wrappers/pmmintrin.h b/lib/Headers/ppc_wrappers/pmmintrin.h new file mode 100644 index 000000000000..6d93383d5412 --- /dev/null +++ b/lib/Headers/ppc_wrappers/pmmintrin.h @@ -0,0 +1,150 @@ +/*===---- pmmintrin.h - Implementation of SSE3 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. + + In the specific case of X86 SSE3 intrinsics, the PowerPC VMX/VSX ISA + is a good match for most SIMD operations. However, the horizontal + add/sub operations require the data pairs to be permuted into separate + registers with vertical even/odd alignment. + And the addsub operation requires that the sign of only the even-numbered + elements be flipped (xored with -0.0). + For larger blocks of code using these intrinsic implementations, + the compiler should be able to schedule instructions to avoid + additional latency. + + In the specific case of the monitor and mwait instructions, there is + no direct equivalent in the PowerISA at this time, so those + intrinsics are not implemented. */ +#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning." +#endif + +#ifndef PMMINTRIN_H_ +#define PMMINTRIN_H_ + +#if defined(__linux__) && defined(__ppc64__) + +/* We need definitions from the SSE2 and SSE header files*/ +#include <emmintrin.h> + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_addsub_ps (__m128 __X, __m128 __Y) +{ + const __v4sf even_n0 = {-0.0, 0.0, -0.0, 0.0}; + __v4sf even_neg_Y = vec_xor(__Y, even_n0); + return (__m128) vec_add (__X, even_neg_Y); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_addsub_pd (__m128d __X, __m128d __Y) +{ + const __v2df even_n0 = {-0.0, 0.0}; + __v2df even_neg_Y = vec_xor(__Y, even_n0); + return (__m128d) vec_add (__X, even_neg_Y); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_ps (__m128 __X, __m128 __Y) +{ + __vector unsigned char xform2 = { + 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0A, 0x0B, + 0x10, 0x11, 0x12, 0x13, + 0x18, 0x19, 0x1A, 0x1B + }; + __vector unsigned char xform1 = { + 0x04, 0x05, 0x06, 0x07, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x14, 0x15, 0x16, 0x17, + 0x1C, 0x1D, 0x1E, 0x1F + }; + return (__m128) vec_add (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), + vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_ps (__m128 __X, __m128 __Y) +{ + __vector unsigned char xform2 = { + 0x00, 0x01, 0x02, 0x03, + 0x08, 0x09, 0x0A, 0x0B, + 0x10, 0x11, 0x12, 0x13, + 0x18, 0x19, 0x1A, 0x1B + }; + __vector unsigned char xform1 = { + 0x04, 0x05, 0x06, 0x07, + 0x0C, 0x0D, 0x0E, 0x0F, + 0x14, 0x15, 0x16, 0x17, + 0x1C, 0x1D, 0x1E, 0x1F + }; + return (__m128) vec_sub (vec_perm ((__v4sf) __X, (__v4sf) __Y, xform2), + vec_perm ((__v4sf) __X, (__v4sf) __Y, xform1)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) vec_add (vec_mergeh ((__v2df) __X,
(__v2df)__Y), + vec_mergel ((__v2df) __X, (__v2df)__Y)); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pd (__m128d __X, __m128d __Y) +{ + return (__m128d) vec_sub (vec_mergeh ((__v2df) __X, (__v2df)__Y), + vec_mergel ((__v2df) __X, (__v2df)__Y)); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movehdup_ps (__m128 __X) +{ + return (__m128)vec_mergeo ((__v4su)__X, (__v4su)__X); +} + +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_moveldup_ps (__m128 __X) +{ + return (__m128)vec_mergee ((__v4su)__X, (__v4su)__X); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_loaddup_pd (double const *__P) +{ + return (__m128d) vec_splats (*__P); +} + +extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_movedup_pd (__m128d __X) +{ + return _mm_shuffle_pd (__X, __X, _MM_SHUFFLE2 (0,0)); +} + +extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_lddqu_si128 (__m128i const *__P) +{ + return (__m128i) (vec_vsx_ld(0, (signed int const *)__P)); +} + +/* POWER8 / POWER9 have no equivalent for _mm_monitor or _mm_mwait. */ + +#else +#include_next <pmmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + +#endif /* PMMINTRIN_H_ */ diff --git a/lib/Headers/ppc_wrappers/smmintrin.h b/lib/Headers/ppc_wrappers/smmintrin.h new file mode 100644 index 000000000000..56ef6ba76b06 --- /dev/null +++ b/lib/Headers/ppc_wrappers/smmintrin.h @@ -0,0 +1,85 @@ +/*===---- smmintrin.h - Implementation of SSE4 intrinsics on PowerPC -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. + + NOTE: This is NOT a complete implementation of the SSE4 intrinsics! */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64/powerpc64le. + + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#error \ + "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
+#endif + +#ifndef SMMINTRIN_H_ +#define SMMINTRIN_H_ + +#if defined(__linux__) && defined(__ppc64__) + +#include <altivec.h> +#include <emmintrin.h> + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi8(__m128i __X, const int __N) { + return (unsigned char)((__v16qi)__X)[__N & 15]; +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi32(__m128i __X, const int __N) { + return ((__v4si)__X)[__N & 3]; +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_epi64(__m128i __X, const int __N) { + return ((__v2di)__X)[__N & 1]; +} + +extern __inline int + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_extract_ps(__m128 __X, const int __N) { + return ((__v4si)__X)[__N & 3]; +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blend_epi16(__m128i __A, __m128i __B, const int __imm8) { + __v16qi __charmask = vec_splats((signed char)__imm8); + __charmask = vec_gb(__charmask); + __v8hu __shortmask = (__v8hu)vec_unpackh(__charmask); +#ifdef __BIG_ENDIAN__ + __shortmask = vec_reve(__shortmask); +#endif + return (__m128i)vec_sel((__v8hu)__A, (__v8hu)__B, __shortmask); +} + +extern __inline __m128i + __attribute__((__gnu_inline__, __always_inline__, __artificial__)) + _mm_blendv_epi8(__m128i __A, __m128i __B, __m128i __mask) { + const __v16qu __seven = vec_splats((unsigned char)0x07); + __v16qu __lmask = vec_sra((__v16qu)__mask, __seven); + return (__m128i)vec_sel((__v16qu)__A, (__v16qu)__B, __lmask); +} + +#else +#include_next <smmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + +#endif /* SMMINTRIN_H_ */ diff --git a/lib/Headers/ppc_wrappers/tmmintrin.h b/lib/Headers/ppc_wrappers/tmmintrin.h new file mode 100644 index 000000000000..b5a935d5e47e --- /dev/null +++ b/lib/Headers/ppc_wrappers/tmmintrin.h @@ -0,0 +1,495 @@ +/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +/* Implemented from the specification included in the Intel C++ Compiler + User Guide and Reference, version 9.0. */ + +#ifndef NO_WARN_X86_INTRINSICS +/* This header is distributed to simplify porting x86_64 code that + makes explicit use of Intel intrinsics to powerpc64le. + + It is the user's responsibility to determine if the results are + acceptable and make additional changes as necessary. + + Note that much code that uses Intel intrinsics can be rewritten in + standard C or GNU C extensions, which are more portable and better + optimized across multiple targets. */ +#endif + +#ifndef TMMINTRIN_H_ +#define TMMINTRIN_H_ + +#if defined(__linux__) && defined(__ppc64__) + +#include <altivec.h> + +/* We need definitions from the SSE header files.
*/ +#include <pmmintrin.h> + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi16 (__m128i __A) +{ + return (__m128i) vec_abs ((__v8hi) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi32 (__m128i __A) +{ + return (__m128i) vec_abs ((__v4si) __A); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_epi8 (__m128i __A) +{ + return (__m128i) vec_abs ((__v16qi) __A); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi16 (__m64 __A) +{ + __v8hi __B = (__v8hi) (__v2du) { __A, __A }; + return (__m64) ((__v2du) vec_abs (__B))[0]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi32 (__m64 __A) +{ + __v4si __B = (__v4si) (__v2du) { __A, __A }; + return (__m64) ((__v2du) vec_abs (__B))[0]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_abs_pi8 (__m64 __A) +{ + __v16qi __B = (__v16qi) (__v2du) { __A, __A }; + return (__m64) ((__v2du) vec_abs (__B))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count) +{ + if (__builtin_constant_p (__count) && __count < 16) + { +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_reve ((__v16qu) __A); + __B = (__m128i) vec_reve ((__v16qu) __B); +#endif + __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count); +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_reve ((__v16qu) __A); +#endif + return __A; + } + + if (__count == 0) + return __B; + + if (__count >= 16) + { + if (__count >= 32) + { + const __v16qu zero = { 0 }; + return (__m128i) zero; + } + else + { + const __v16qu __shift = + vec_splats ((unsigned char) ((__count - 16) * 8)); +#ifdef __LITTLE_ENDIAN__ + return (__m128i) vec_sro ((__v16qu) __A, __shift); +#else + return (__m128i) vec_slo ((__v16qu) __A, __shift); +#endif + } + } + else + { + const __v16qu __shiftA = + vec_splats ((unsigned char) ((16 - __count) * 8)); + const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8)); +#ifdef __LITTLE_ENDIAN__ + __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA); + __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB); +#else + __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA); + __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB); +#endif + return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B); + } +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count) +{ + if (__count < 16) + { + __v2du __C = { __B, __A }; +#ifdef __LITTLE_ENDIAN__ + const __v4su __shift = { __count << 3, 0, 0, 0 }; + __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift); +#else + const __v4su __shift = { 0, 0, 0, __count << 3 }; + __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift); +#endif + return (__m64) __C[0]; + } + else + { + const __m64 __zero = { 0 }; + return __zero; + } +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_epi16 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); + __v8hi __D = vec_perm 
((__v8hi) __A, (__v8hi) __B, __Q); + return (__m128i) vec_add (__C, __D); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_epi32 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; + __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P); + __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q); + return (__m128i) vec_add (__C, __D); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pi16 (__m64 __A, __m64 __B) +{ + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; + __v8hi __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_add (__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadd_pi32 (__m64 __A, __m64 __B) +{ + __v4si __C = (__v4si) (__v2du) { __A, __B }; + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 }; + __v4si __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_add (__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadds_epi16 (__m128i __A, __m128i __B) +{ + __v4si __C = { 0 }, __D = { 0 }; + __C = vec_sum4s ((__v8hi) __A, __C); + __D = vec_sum4s ((__v8hi) __B, __D); + __C = (__v4si) vec_packs (__C, __D); + return (__m128i) __C; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hadds_pi16 (__m64 __A, __m64 __B) +{ + const __v4si __zero = { 0 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + __v4si __D = vec_sum4s (__C, __zero); + __C = vec_packs (__D, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_epi16 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); + __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); + return (__m128i) vec_sub (__C, __D); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_epi32 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; + __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P); + __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q); + return (__m128i) vec_sub (__C, __D); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pi16 (__m64 __A, __m64 __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + __v8hi __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_sub 
(__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsub_pi32 (__m64 __A, __m64 __B) +{ + const __v16qu __P = + { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 }; + const __v16qu __Q = + { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 }; + __v4si __C = (__v4si) (__v2du) { __A, __B }; + __v4si __D = vec_perm (__C, __C, __Q); + __C = vec_perm (__C, __C, __P); + __C = vec_sub (__C, __D); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubs_epi16 (__m128i __A, __m128i __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P); + __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q); + return (__m128i) vec_subs (__C, __D); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_hsubs_pi16 (__m64 __A, __m64 __B) +{ + const __v16qu __P = + { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 }; + const __v16qu __Q = + { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __B }; + __v8hi __D = vec_perm (__C, __C, __P); + __v8hi __E = vec_perm (__C, __C, __Q); + __C = vec_subs (__D, __E); + return (__m64) ((__v2du) __C)[1]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_epi8 (__m128i __A, __m128i __B) +{ + const __v16qi __zero = { 0 }; + __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero); + __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B); + return (__m128i) vec_sel (__C, __zero, __select); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_shuffle_pi8 (__m64 __A, __m64 __B) +{ + const __v16qi __zero = { 0 }; + __v16qi __C = (__v16qi) (__v2du) { __A, __A }; + __v16qi __D = (__v16qi) (__v2du) { __B, __B }; + __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero); + __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D); + __C = vec_sel (__C, __zero, __select); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi8 (__m128i __A, __m128i __B) +{ + const __v16qi __zero = { 0 }; + __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero); + __v16qi __selectpos = + (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero)); + __v16qi __conv = vec_add (__selectneg, __selectpos); + return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi16 (__m128i __A, __m128i __B) +{ + const __v8hi __zero = { 0 }; + __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero); + __v8hi __selectpos = + (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero)); + __v8hi __conv = vec_add (__selectneg, __selectpos); + return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv); +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_epi32 (__m128i __A, __m128i __B) +{ + const __v4si __zero = { 0 }; + __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero); + __v4si __selectpos = + (__v4si) vec_neg ((__v4si) 
vec_cmpgt ((__v4si) __B, __zero)); + __v4si __conv = vec_add (__selectneg, __selectpos); + return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi8 (__m64 __A, __m64 __B) +{ + const __v16qi __zero = { 0 }; + __v16qi __C = (__v16qi) (__v2du) { __A, __A }; + __v16qi __D = (__v16qi) (__v2du) { __B, __B }; + __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi16 (__m64 __A, __m64 __B) +{ + const __v8hi __zero = { 0 }; + __v8hi __C = (__v8hi) (__v2du) { __A, __A }; + __v8hi __D = (__v8hi) (__v2du) { __B, __B }; + __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_sign_pi32 (__m64 __A, __m64 __B) +{ + const __v4si __zero = { 0 }; + __v4si __C = (__v4si) (__v2du) { __A, __A }; + __v4si __D = (__v4si) (__v2du) { __B, __B }; + __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddubs_epi16 (__m128i __A, __m128i __B) +{ + __v8hi __unsigned = vec_splats ((signed short) 0x00ff); + __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned); + __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned); + __v8hi __E = vec_unpackh ((__v16qi) __B); + __v8hi __F = vec_unpackl ((__v16qi) __B); + __C = vec_mul (__C, __E); + __D = vec_mul (__D, __F); + const __v16qu __odds = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __evens = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __E = vec_perm (__C, __D, __odds); + __F = vec_perm (__C, __D, __evens); + return (__m128i) vec_adds (__E, __F); +} + +extern __inline __m64 +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_maddubs_pi16 (__m64 __A, __m64 __B) +{ + __v8hi __C = (__v8hi) (__v2du) { __A, __A }; + __C = vec_unpackl ((__v16qi) __C); + const __v8hi __unsigned = vec_splats ((signed short) 0x00ff); + __C = vec_and (__C, __unsigned); + __v8hi __D = (__v8hi) (__v2du) { __B, __B }; + __D = vec_unpackl ((__v16qi) __D); + __D = vec_mul (__C, __D); + const __v16qu __odds = + { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }; + const __v16qu __evens = + { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }; + __C = vec_perm (__D, __D, __odds); + __D = vec_perm (__D, __D, __evens); + __C = vec_adds (__C, __D); + return (__m64) ((__v2du) (__C))[0]; +} + +extern __inline __m128i +__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhrs_epi16 (__m128i __A, __m128i __B) +{ + __v4si __C = vec_unpackh ((__v8hi) __A); + __v4si __D = vec_unpackh ((__v8hi) __B); + __C = vec_mul (__C, __D); + __D = vec_unpackl ((__v8hi) __A); + __v4si __E = vec_unpackl ((__v8hi) __B); + __D = vec_mul (__D, __E); + const __v4su __shift = vec_splats ((unsigned int) 14); + __C = vec_sr (__C, __shift); + __D = vec_sr (__D, __shift); + const __v4si __ones = vec_splats ((signed int) 1); + __C = vec_add (__C, __ones); + __C = vec_sr (__C, (__v4su) __ones); + __D = vec_add (__D, __ones); + __D = vec_sr (__D, (__v4su) __ones); + return (__m128i) vec_pack (__C, __D); +} + +extern __inline __m64 
+__attribute__((__gnu_inline__, __always_inline__, __artificial__)) +_mm_mulhrs_pi16 (__m64 __A, __m64 __B) +{ + __v4si __C = (__v4si) (__v2du) { __A, __A }; + __C = vec_unpackh ((__v8hi) __C); + __v4si __D = (__v4si) (__v2du) { __B, __B }; + __D = vec_unpackh ((__v8hi) __D); + __C = vec_mul (__C, __D); + const __v4su __shift = vec_splats ((unsigned int) 14); + __C = vec_sr (__C, __shift); + const __v4si __ones = vec_splats ((signed int) 1); + __C = vec_add (__C, __ones); + __C = vec_sr (__C, (__v4su) __ones); + __v8hi __E = vec_pack (__C, __D); + return (__m64) ((__v2du) (__E))[0]; +} + +#else +#include_next <tmmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + +#endif /* TMMINTRIN_H_ */ diff --git a/lib/Headers/ppc_wrappers/xmmintrin.h b/lib/Headers/ppc_wrappers/xmmintrin.h index 1b322b66519a..0f429fa04081 100644 --- a/lib/Headers/ppc_wrappers/xmmintrin.h +++ b/lib/Headers/ppc_wrappers/xmmintrin.h @@ -34,6 +34,8 @@ #ifndef _XMMINTRIN_H_INCLUDED #define _XMMINTRIN_H_INCLUDED +#if defined(__linux__) && defined(__ppc64__) + /* Define four value permute mask */ #define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z)) @@ -1835,4 +1837,8 @@ do { \ /* For backward source compatibility. */ //# include <emmintrin.h> +#else +#include_next <xmmintrin.h> +#endif /* defined(__linux__) && defined(__ppc64__) */ + #endif /* _XMMINTRIN_H_INCLUDED */
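
Taken together, the new ppc_wrappers headers let x86 SSE3/SSSE3/SSE4 source build unmodified for powerpc64le, while every other target falls through to the vendor headers via #include_next. Below is a minimal sketch of a consumer of these wrappers; the compile line is an illustrative assumption, not part of this commit (any driver invocation targeting Linux/ppc64 with NO_WARN_X86_INTRINSICS defined should behave equivalently), e.g. clang -target powerpc64le-linux-gnu -mvsx -DNO_WARN_X86_INTRINSICS demo.c

#include <stdio.h>
/* tmmintrin.h chains through pmmintrin.h and emmintrin.h, so a single
   include provides the SSE3 and SSSE3 intrinsics used below. */
#include <tmmintrin.h>

int main(void) {
  __m128 x = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* lanes 0..3 = 1,2,3,4 */
  __m128 y = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f); /* lanes 0..3 = 5,6,7,8 */
  float h[4], a[4];
  short s[8];

  /* SSE3 horizontal add, implemented above with two vec_perm and one
     vec_add: expected result {3, 7, 11, 15}. */
  _mm_storeu_ps(h, _mm_hadd_ps(x, y));

  /* SSE3 addsub, implemented by xoring the even lanes of y with -0.0
     before a plain vector add: expected result {-4, 8, -4, 12}. */
  _mm_storeu_ps(a, _mm_addsub_ps(x, y));

  /* SSSE3 packed absolute value, a single vec_abs on the VSX side. */
  _mm_storeu_si128((__m128i *)s,
                   _mm_abs_epi16(_mm_set_epi16(-8, 7, -6, 5, -4, 3, -2, 1)));

  printf("hadd:   %g %g %g %g\n", h[0], h[1], h[2], h[3]);
  printf("addsub: %g %g %g %g\n", a[0], a[1], a[2], a[3]);
  printf("abs:    %d %d %d %d %d %d %d %d\n",
         s[0], s[1], s[2], s[3], s[4], s[5], s[6], s[7]);
  return 0;
}

On x86_64 the same translation unit compiles against the usual Intel headers through the #include_next fallbacks. That is exactly the porting model the NO_WARN_X86_INTRINSICS comments describe: results should be verified on the PowerPC side rather than assumed bit-identical.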