1 files changed, 162 insertions, 183 deletions
diff --git a/lib/Headers/smmintrin.h b/lib/Headers/smmintrin.h
index c2fa5a452bcec..4806b3e4e150d 100644
--- a/lib/Headers/smmintrin.h
+++ b/lib/Headers/smmintrin.h
@@ -21,13 +21,13 @@
  *===-----------------------------------------------------------------------===
  */
 
-#ifndef _SMMINTRIN_H
-#define _SMMINTRIN_H
+#ifndef __SMMINTRIN_H
+#define __SMMINTRIN_H
 
 #include <tmmintrin.h>
 
 /* Define the default attributes for the functions in this file. */
-#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1")))
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128)))
 
 /* SSE4 Rounding macros. */
 #define _MM_FROUND_TO_NEAREST_INT    0x00
@@ -46,7 +46,7 @@
 #define _MM_FROUND_RINT      (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION)
 #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION)
 
-/// \brief Rounds up each element of the 128-bit vector of [4 x float] to an
+/// Rounds up each element of the 128-bit vector of [4 x float] to an
 ///    integer and returns the rounded values in a 128-bit vector of
 ///    [4 x float].
 ///
@@ -63,7 +63,7 @@
 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_ceil_ps(X)       _mm_round_ps((X), _MM_FROUND_CEIL)
 
-/// \brief Rounds up each element of the 128-bit vector of [2 x double] to an
+/// Rounds up each element of the 128-bit vector of [2 x double] to an
 ///    integer and returns the rounded values in a 128-bit vector of
 ///    [2 x double].
 ///
@@ -80,7 +80,7 @@
 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_ceil_pd(X)       _mm_round_pd((X), _MM_FROUND_CEIL)
 
-/// \brief Copies three upper elements of the first 128-bit vector operand to
+/// Copies three upper elements of the first 128-bit vector operand to
 ///    the corresponding three upper elements of the 128-bit result vector of
 ///    [4 x float]. Rounds up the lowest element of the second 128-bit vector
 ///    operand to an integer and copies it to the lowest element of the 128-bit
@@ -105,7 +105,7 @@
 ///    values.
 #define _mm_ceil_ss(X, Y)    _mm_round_ss((X), (Y), _MM_FROUND_CEIL)
 
-/// \brief Copies the upper element of the first 128-bit vector operand to the
+/// Copies the upper element of the first 128-bit vector operand to the
 ///    corresponding upper element of the 128-bit result vector of [2 x double].
 ///    Rounds up the lower element of the second 128-bit vector operand to an
 ///    integer and copies it to the lower element of the 128-bit result vector
@@ -130,7 +130,7 @@
 ///    values.
 #define _mm_ceil_sd(X, Y)    _mm_round_sd((X), (Y), _MM_FROUND_CEIL)
 
-/// \brief Rounds down each element of the 128-bit vector of [4 x float] to an
+/// Rounds down each element of the 128-bit vector of [4 x float] to an
 ///    an integer and returns the rounded values in a 128-bit vector of
 ///    [4 x float].
 ///
@@ -147,7 +147,7 @@
 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
 #define _mm_floor_ps(X)      _mm_round_ps((X), _MM_FROUND_FLOOR)
 
-/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an
+/// Rounds down each element of the 128-bit vector of [2 x double] to an
 ///    integer and returns the rounded values in a 128-bit vector of
 ///    [2 x double].
 ///
@@ -164,7 +164,7 @@
 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
 #define _mm_floor_pd(X)      _mm_round_pd((X), _MM_FROUND_FLOOR)
 
-/// \brief Copies three upper elements of the first 128-bit vector operand to
+/// Copies three upper elements of the first 128-bit vector operand to
 ///    the corresponding three upper elements of the 128-bit result vector of
 ///    [4 x float]. Rounds down the lowest element of the second 128-bit vector
 ///    operand to an integer and copies it to the lowest element of the 128-bit
@@ -189,7 +189,7 @@
 ///    values.
 #define _mm_floor_ss(X, Y)   _mm_round_ss((X), (Y), _MM_FROUND_FLOOR)
 
-/// \brief Copies the upper element of the first 128-bit vector operand to the
+/// Copies the upper element of the first 128-bit vector operand to the
 ///    corresponding upper element of the 128-bit result vector of [2 x double].
 ///    Rounds down the lower element of the second 128-bit vector operand to an
 ///    integer and copies it to the lower element of the 128-bit result vector
@@ -214,7 +214,7 @@
 ///    values.
 #define _mm_floor_sd(X, Y)   _mm_round_sd((X), (Y), _MM_FROUND_FLOOR)
 
-/// \brief Rounds each element of the 128-bit vector of [4 x float] to an
+/// Rounds each element of the 128-bit vector of [4 x float] to an
 ///    integer value according to the rounding control specified by the second
 ///    argument and returns the rounded values in a 128-bit vector of
 ///    [4 x float].
@@ -244,10 +244,10 @@
 ///      10: Upward (toward positive infinity) \n
 ///      11: Truncated
 /// \returns A 128-bit vector of [4 x float] containing the rounded values.
-#define _mm_round_ps(X, M) __extension__ ({ \
-  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); })
+#define _mm_round_ps(X, M) \
+  (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))
 
-/// \brief Copies three upper elements of the first 128-bit vector operand to
+/// Copies three upper elements of the first 128-bit vector operand to
 ///    the corresponding three upper elements of the 128-bit result vector of
 ///    [4 x float]. Rounds the lowest element of the second 128-bit vector
 ///    operand to an integer value according to the rounding control specified
@@ -285,11 +285,11 @@
 ///      11: Truncated
 /// \returns A 128-bit vector of [4 x float] containing the copied and rounded
 ///    values.
-#define _mm_round_ss(X, Y, M) __extension__ ({ \
+#define _mm_round_ss(X, Y, M) \
   (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \
-                                 (__v4sf)(__m128)(Y), (M)); })
+                                 (__v4sf)(__m128)(Y), (M))
 
-/// \brief Rounds each element of the 128-bit vector of [2 x double] to an
+/// Rounds each element of the 128-bit vector of [2 x double] to an
 ///    integer value according to the rounding control specified by the second
 ///    argument and returns the rounded values in a 128-bit vector of
 ///    [2 x double].
@@ -319,10 +319,10 @@
 ///      10: Upward (toward positive infinity) \n
 ///      11: Truncated
 /// \returns A 128-bit vector of [2 x double] containing the rounded values.
-#define _mm_round_pd(X, M) __extension__ ({ \
-  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); })
+#define _mm_round_pd(X, M) \
+  (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))
 
-/// \brief Copies the upper element of the first 128-bit vector operand to the
+/// Copies the upper element of the first 128-bit vector operand to the
 ///    corresponding upper element of the 128-bit result vector of [2 x double].
 ///    Rounds the lower element of the second 128-bit vector operand to an
 ///    integer value according to the rounding control specified by the third
@@ -360,12 +360,12 @@
 ///      11: Truncated
 /// \returns A 128-bit vector of [2 x double] containing the copied and rounded
 ///    values.
-#define _mm_round_sd(X, Y, M) __extension__ ({ \
+#define _mm_round_sd(X, Y, M) \
   (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \
-                                  (__v2df)(__m128d)(Y), (M)); })
+                                  (__v2df)(__m128d)(Y), (M))
 
 /* SSE4 Packed Blending Intrinsics.  */
-/// \brief Returns a 128-bit vector of [2 x double] where the values are
+/// Returns a 128-bit vector of [2 x double] where the values are
 ///    selected from either the first or second operand as specified by the
 ///    third operand, the control mask.
 ///
@@ -389,13 +389,11 @@
 ///    When a mask bit is 1, the corresponding 64-bit element in operand \a V2
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [2 x double] containing the copied values.
-#define _mm_blend_pd(V1, V2, M) __extension__ ({ \
-  (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \
-                                   (__v2df)(__m128d)(V2), \
-                                   (((M) & 0x01) ? 2 : 0), \
-                                   (((M) & 0x02) ? 3 : 1)); })
+#define _mm_blend_pd(V1, V2, M) \
+  (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \
+                                    (__v2df)(__m128d)(V2), (int)(M))
 
-/// \brief Returns a 128-bit vector of [4 x float] where the values are selected
+/// Returns a 128-bit vector of [4 x float] where the values are selected
 ///    from either the first or second operand as specified by the third
 ///    operand, the control mask.
 ///
@@ -419,14 +417,11 @@
 ///    When a mask bit is 1, the corresponding 32-bit element in operand \a V2
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [4 x float] containing the copied values.
-#define _mm_blend_ps(V1, V2, M) __extension__ ({ \
-  (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \
-                                  (((M) & 0x01) ? 4 : 0), \
-                                  (((M) & 0x02) ? 5 : 1), \
-                                  (((M) & 0x04) ? 6 : 2), \
-                                  (((M) & 0x08) ? 7 : 3)); })
-
-/// \brief Returns a 128-bit vector of [2 x double] where the values are
+#define _mm_blend_ps(V1, V2, M) \
+  (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \
+                                   (__v4sf)(__m128)(V2), (int)(M))
+
+/// Returns a 128-bit vector of [2 x double] where the values are
 ///    selected from either the first or second operand as specified by the
 ///    third operand, the control mask.
 ///
@@ -453,7 +448,7 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M)
                                             (__v2df)__M);
 }
 
-/// \brief Returns a 128-bit vector of [4 x float] where the values are
+/// Returns a 128-bit vector of [4 x float] where the values are
 ///    selected from either the first or second operand as specified by the
 ///    third operand, the control mask.
 ///
@@ -480,7 +475,7 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
                                            (__v4sf)__M);
 }
 
-/// \brief Returns a 128-bit vector of [16 x i8] where the values are selected
+/// Returns a 128-bit vector of [16 x i8] where the values are selected
 ///    from either of the first or second operand as specified by the third
 ///    operand, the control mask.
 ///
@@ -493,7 +488,7 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M)
 /// \param __V2
 ///    A 128-bit vector of [16 x i8].
 /// \param __M
-///    A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying
+///    A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying
 ///    how the values are to be copied. The position of the mask bit corresponds
 ///    to the most significant bit of a copied value. When a mask bit is 0, the
 ///    corresponding 8-bit element in operand \a __V1 is copied to the same
@@ -507,7 +502,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
                                                (__v16qi)__M);
 }
 
-/// \brief Returns a 128-bit vector of [8 x i16] where the values are selected
+/// Returns a 128-bit vector of [8 x i16] where the values are selected
 ///    from either of the first or second operand as specified by the third
 ///    operand, the control mask.
 ///
@@ -531,20 +526,12 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M)
 ///    When a mask bit is 1, the corresponding 16-bit element in operand \a V2
 ///    is copied to the same position in the result.
 /// \returns A 128-bit vector of [8 x i16] containing the copied values.
-#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \
-  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \
-                                   (__v8hi)(__m128i)(V2), \
-                                   (((M) & 0x01) ?  8 : 0), \
-                                   (((M) & 0x02) ?  9 : 1), \
-                                   (((M) & 0x04) ? 10 : 2), \
-                                   (((M) & 0x08) ? 11 : 3), \
-                                   (((M) & 0x10) ? 12 : 4), \
-                                   (((M) & 0x20) ? 13 : 5), \
-                                   (((M) & 0x40) ? 14 : 6), \
-                                   (((M) & 0x80) ? 15 : 7)); })
+#define _mm_blend_epi16(V1, V2, M) \
+  (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \
+                                       (__v8hi)(__m128i)(V2), (int)(M))
 
 /* SSE4 Dword Multiply Instructions.  */
-/// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32]
+/// Multiples corresponding elements of two 128-bit vectors of [4 x i32]
 ///    and returns the lower 32 bits of the each product in a 128-bit vector of
 ///    [4 x i32].
 ///
@@ -563,7 +550,7 @@ _mm_mullo_epi32 (__m128i __V1, __m128i __V2)
   return (__m128i) ((__v4su)__V1 * (__v4su)__V2);
 }
 
-/// \brief Multiplies corresponding even-indexed elements of two 128-bit
+/// Multiplies corresponding even-indexed elements of two 128-bit
 ///    vectors of [4 x i32] and returns a 128-bit vector of [2 x i64]
 ///    containing the products.
 ///
@@ -584,7 +571,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 }
 
 /* SSE4 Floating Point Dot Product Instructions.  */
-/// \brief Computes the dot product of the two 128-bit vectors of [4 x float]
+/// Computes the dot product of the two 128-bit vectors of [4 x float]
 ///    and returns it in the elements of the 128-bit result vector of
 ///    [4 x float].
 ///
@@ -616,11 +603,11 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 ///    each [4 x float] subvector. If a bit is set, the dot product is returned
 ///    in the corresponding element; otherwise that element is set to zero.
 /// \returns A 128-bit vector of [4 x float] containing the dot product.
-#define _mm_dp_ps(X, Y, M) __extension__ ({ \
+#define _mm_dp_ps(X, Y, M) \
   (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \
-                               (__v4sf)(__m128)(Y), (M)); })
+                               (__v4sf)(__m128)(Y), (M))
 
-/// \brief Computes the dot product of the two 128-bit vectors of [2 x double]
+/// Computes the dot product of the two 128-bit vectors of [2 x double]
 ///    and returns it in the elements of the 128-bit result vector of
 ///    [2 x double].
 ///
@@ -648,15 +635,15 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2)
 ///    input vectors are used as an input for dot product; otherwise that input
 ///    is treated as zero. Bits [1:0] determine which elements of the result
 ///    will receive a copy of the final dot product, with bit [0] corresponding
-///    to the lowest element and bit [3] corresponding to the highest element of
+///    to the lowest element and bit [1] corresponding to the highest element of
 ///    each [2 x double] vector. If a bit is set, the dot product is returned in
 ///    the corresponding element; otherwise that element is set to zero.
-#define _mm_dp_pd(X, Y, M) __extension__ ({\
+#define _mm_dp_pd(X, Y, M) \
   (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \
-                                (__v2df)(__m128d)(Y), (M)); })
+                                (__v2df)(__m128d)(Y), (M))
 
 /* SSE4 Streaming Load Hint Instruction.  */
-/// \brief Loads integer values from a 128-bit aligned memory location to a
+/// Loads integer values from a 128-bit aligned memory location to a
 ///    128-bit integer vector.
 ///
 /// \headerfile <x86intrin.h>
@@ -675,7 +662,7 @@ _mm_stream_load_si128 (__m128i const *__V)
 }
 
 /* SSE4 Packed Integer Min/Max Instructions.  */
-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser
 ///    of the two values.
 ///
@@ -694,7 +681,7 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2)
   return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }
 
-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [16 x i8] and returns a 128-bit vector of [16 x i8] containing the
 ///    greater value of the two.
 ///
@@ -713,7 +700,7 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2)
   return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2);
 }
 
-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser
 ///    value of the two.
 ///
@@ -732,7 +719,7 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2)
   return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }
 
-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [8 x u16] and returns a 128-bit vector of [8 x u16] containing the
 ///    greater value of the two.
 ///
@@ -751,7 +738,7 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2)
   return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2);
 }
 
-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser
 ///    value of the two.
 ///
@@ -770,7 +757,7 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2)
   return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2);
 }
 
-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [4 x i32] and returns a 128-bit vector of [4 x i32] containing the
 ///    greater value of the two.
 ///
@@ -789,7 +776,7 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2)
   return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2);
 }
 
-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser
 ///    value of the two.
 ///
@@ -808,7 +795,7 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2)
   return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2);
 }
 
-/// \brief Compares the corresponding elements of two 128-bit vectors of
+/// Compares the corresponding elements of two 128-bit vectors of
 ///    [4 x u32] and returns a 128-bit vector of [4 x u32] containing the
 ///    greater value of the two.
 ///
@@ -828,7 +815,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 }
 
 /* SSE4 Insertion and Extraction from XMM Register Instructions.  */
-/// \brief Takes the first argument \a X and inserts an element from the second
+/// Takes the first argument \a X and inserts an element from the second
 ///    argument \a Y as selected by the third argument \a N. That result then
 ///    has elements zeroed out also as selected by the third argument \a N. The
 ///    resulting 128-bit vector of [4 x float] is then returned.
@@ -866,11 +853,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///      11: Copies the selected bits from \a Y to result bits [127:96]. \n
 ///    Bits[3:0]: If any of these bits are set, the corresponding result
 ///    element is cleared.
-/// \returns A 128-bit vector of [4 x float] containing the copied single-
-///    precision floating point elements from the operands.
+/// \returns A 128-bit vector of [4 x float] containing the copied
+///    single-precision floating point elements from the operands.
 #define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N))
 
-/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
+/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and
 ///    returns it, using the immediate value parameter \a N as a selector.
 ///
 /// \headerfile <x86intrin.h>
@@ -893,15 +880,14 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    11: Bits [127:96] of parameter \a X are returned.
 /// \returns A 32-bit integer containing the extracted 32 bits of float data.
 #define _mm_extract_ps(X, N) (__extension__                      \
-                              ({ union { int __i; float __f; } __t;  \
-                                 __v4sf __a = (__v4sf)(__m128)(X);       \
-                                 __t.__f = __a[(N) & 3];                 \
-                                 __t.__i;}))
+  ({ union { int __i; float __f; } __t;  \
+     __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \
+     __t.__i;}))
 
 /* Miscellaneous insert and extract macros.  */
 /* Extract a single-precision float from X at index N into D.  */
-#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \
-                                                    (D) = __a[N]; }))
+#define _MM_EXTRACT_FLOAT(D, X, N) \
+  { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); }
 
 /* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create
    an index suitable for _mm_insert_ps.  */
@@ -912,7 +898,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
                                              _MM_MK_INSERTPS_NDX((N), 0, 0x0e))
 
 /* Insert int into packed integer array at index.  */
-/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of
+/// Constructs a 128-bit vector of [16 x i8] by first making a copy of
 ///    the 128-bit integer vector parameter, and then inserting the lower 8 bits
 ///    of an integer parameter \a I into an offset specified by the immediate
 ///    value parameter \a N.
@@ -952,12 +938,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    1110: Bits [119:112] of the result are used for insertion. \n
 ///    1111: Bits [127:120] of the result are used for insertion.
 /// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi8(X, I, N) (__extension__                           \
-                                  ({ __v16qi __a = (__v16qi)(__m128i)(X); \
-                                     __a[(N) & 15] = (I);                 \
-                                     (__m128i)__a;}))
+#define _mm_insert_epi8(X, I, N) \
+  (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \
+                                        (int)(I), (int)(N))
 
-/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of
+/// Constructs a 128-bit vector of [4 x i32] by first making a copy of
 ///    the 128-bit integer vector parameter, and then inserting the 32-bit
 ///    integer parameter \a I at the offset specified by the immediate value
 ///    parameter \a N.
@@ -985,13 +970,12 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    10: Bits [95:64] of the result are used for insertion. \n
 ///    11: Bits [127:96] of the result are used for insertion.
 /// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi32(X, I, N) (__extension__                         \
-                                   ({ __v4si __a = (__v4si)(__m128i)(X); \
-                                      __a[(N) & 3] = (I);                \
-                                      (__m128i)__a;}))
+#define _mm_insert_epi32(X, I, N) \
+  (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \
+                                       (int)(I), (int)(N))
 
 #ifdef __x86_64__
-/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of
+/// Constructs a 128-bit vector of [2 x i64] by first making a copy of
 ///    the 128-bit integer vector parameter, and then inserting the 64-bit
 ///    integer parameter \a I, using the immediate value parameter \a N as an
 ///    insertion location selector.
@@ -1017,16 +1001,15 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    0: Bits [63:0] of the result are used for insertion. \n
 ///    1: Bits [127:64] of the result are used for insertion. \n
 /// \returns A 128-bit integer vector containing the constructed values.
-#define _mm_insert_epi64(X, I, N) (__extension__                         \
-                                   ({ __v2di __a = (__v2di)(__m128i)(X); \
-                                      __a[(N) & 1] = (I);                \
-                                      (__m128i)__a;}))
+#define _mm_insert_epi64(X, I, N) \
+  (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \
+                                       (long long)(I), (int)(N))
 #endif /* __x86_64__ */
 
 /* Extract int from packed integer array at index.  This returns the element
  * as a zero extended value, so it is unsigned.
  */
-/// \brief Extracts an 8-bit element from the 128-bit integer vector of
+/// Extracts an 8-bit element from the 128-bit integer vector of
 ///    [16 x i8], using the immediate value parameter \a N as a selector.
 ///
 /// \headerfile <x86intrin.h>
@@ -1061,11 +1044,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 /// \returns  An unsigned integer, whose lower 8 bits are selected from the
 ///    128-bit integer vector parameter and the remaining bits are assigned
 ///    zeros.
-#define _mm_extract_epi8(X, N) (__extension__                           \
-                                ({ __v16qi __a = (__v16qi)(__m128i)(X); \
-                                   (int)(unsigned char) __a[(N) & 15];}))
+#define _mm_extract_epi8(X, N) \
+  (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \
+                                                   (int)(N))
 
-/// \brief Extracts a 32-bit element from the 128-bit integer vector of
+/// Extracts a 32-bit element from the 128-bit integer vector of
 ///    [4 x i32], using the immediate value parameter \a N as a selector.
 ///
 /// \headerfile <x86intrin.h>
@@ -1087,12 +1070,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    11: Bits [127:96] of the parameter \a X are exracted.
 /// \returns  An integer, whose lower 32 bits are selected from the 128-bit
 ///    integer vector parameter and the remaining bits are assigned zeros.
-#define _mm_extract_epi32(X, N) (__extension__                         \
-                                 ({ __v4si __a = (__v4si)(__m128i)(X); \
-                                    (int)__a[(N) & 3];}))
+#define _mm_extract_epi32(X, N) \
+  (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))
 
 #ifdef __x86_64__
-/// \brief Extracts a 64-bit element from the 128-bit integer vector of
+/// Extracts a 64-bit element from the 128-bit integer vector of
 ///    [2 x i64], using the immediate value parameter \a N as a selector.
 ///
 /// \headerfile <x86intrin.h>
@@ -1111,13 +1093,12 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2)
 ///    0: Bits [63:0] are returned. \n
 ///    1: Bits [127:64] are returned. \n
 /// \returns  A 64-bit integer.
-#define _mm_extract_epi64(X, N) (__extension__                         \
-                                 ({ __v2di __a = (__v2di)(__m128i)(X); \
-                                    (long long)__a[(N) & 1];}))
+#define _mm_extract_epi64(X, N) \
+  (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))
 #endif /* __x86_64 */
 
 /* SSE4 128-bit Packed Integer Comparisons.  */
-/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// Tests whether the specified bits in a 128-bit integer vector are all
 ///    zeros.
 ///
 /// \headerfile <x86intrin.h>
@@ -1135,7 +1116,7 @@ _mm_testz_si128(__m128i __M, __m128i __V)
   return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V);
 }
 
-/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// Tests whether the specified bits in a 128-bit integer vector are all
 ///    ones.
 ///
 /// \headerfile <x86intrin.h>
@@ -1153,7 +1134,7 @@ _mm_testc_si128(__m128i __M, __m128i __V)
   return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V);
 }
 
-/// \brief Tests whether the specified bits in a 128-bit integer vector are
+/// Tests whether the specified bits in a 128-bit integer vector are
 ///    neither all zeros nor all ones.
 ///
 /// \headerfile <x86intrin.h>
@@ -1172,7 +1153,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
   return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V);
 }
 
-/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// Tests whether the specified bits in a 128-bit integer vector are all
 ///    ones.
 ///
 /// \headerfile <x86intrin.h>
@@ -1189,7 +1170,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
 ///    otherwise.
 #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V)))
 
-/// \brief Tests whether the specified bits in a 128-bit integer vector are
+/// Tests whether the specified bits in a 128-bit integer vector are
 ///    neither all zeros nor all ones.
 ///
 /// \headerfile <x86intrin.h>
@@ -1208,7 +1189,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
 ///    FALSE otherwise.
 #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V))
 
-/// \brief Tests whether the specified bits in a 128-bit integer vector are all
+/// Tests whether the specified bits in a 128-bit integer vector are all
 ///    zeros.
 ///
 /// \headerfile <x86intrin.h>
@@ -1227,7 +1208,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V)
 #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V))
 
 /* SSE4 64-bit Packed Integer Comparisons.  */
-/// \brief Compares each of the corresponding 64-bit values of the 128-bit
+/// Compares each of the corresponding 64-bit values of the 128-bit
 ///    integer vectors for equality.
 ///
 /// \headerfile <x86intrin.h>
@@ -1246,7 +1227,7 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2)
 }
 
 /* SSE4 Packed Integer Sign-Extension.  */
-/// \brief Sign-extends each of the lower eight 8-bit integer elements of a
+/// Sign-extends each of the lower eight 8-bit integer elements of a
 ///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
 ///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
 ///    are unused.
@@ -1267,7 +1248,7 @@ _mm_cvtepi8_epi16(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
 
-/// \brief Sign-extends each of the lower four 8-bit integer elements of a
+/// Sign-extends each of the lower four 8-bit integer elements of a
 ///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
 ///    128-bit vector of [4 x i32]. The upper twelve elements of the input
 ///    vector are unused.
@@ -1277,8 +1258,8 @@ _mm_cvtepi8_epi16(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign-
-///    extended to 32-bit values.
+///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
+///    sign-extended to 32-bit values.
 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi32(__m128i __V)
@@ -1288,7 +1269,7 @@ _mm_cvtepi8_epi32(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si);
 }
 
-/// \brief Sign-extends each of the lower two 8-bit integer elements of a
+/// Sign-extends each of the lower two 8-bit integer elements of a
 ///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
 ///    vector are unused.
@@ -1298,8 +1279,8 @@ _mm_cvtepi8_epi32(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign-
-///    extended to 64-bit values.
+///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
+///    sign-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi8_epi64(__m128i __V)
@@ -1309,7 +1290,7 @@ _mm_cvtepi8_epi64(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di);
 }
 
-/// \brief Sign-extends each of the lower four 16-bit integer elements of a
+/// Sign-extends each of the lower four 16-bit integer elements of a
 ///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
 ///    a 128-bit vector of [4 x i32]. The upper four elements of the input
 ///    vector are unused.
@@ -1319,8 +1300,8 @@ _mm_cvtepi8_epi64(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign-
-///    extended to 32-bit values.
+///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
+///    sign-extended to 32-bit values.
 /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi32(__m128i __V)
@@ -1328,7 +1309,7 @@ _mm_cvtepi16_epi32(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si);
 }
 
-/// \brief Sign-extends each of the lower two 16-bit integer elements of a
+/// Sign-extends each of the lower two 16-bit integer elements of a
 ///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper six elements of the input
 ///    vector are unused.
@@ -1338,8 +1319,8 @@ _mm_cvtepi16_epi32(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign-
-///    extended to 64-bit values.
+///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
+///     sign-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi16_epi64(__m128i __V)
@@ -1347,7 +1328,7 @@ _mm_cvtepi16_epi64(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di);
 }
 
-/// \brief Sign-extends each of the lower two 32-bit integer elements of a
+/// Sign-extends each of the lower two 32-bit integer elements of a
 ///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
 ///    are unused.
@@ -1357,8 +1338,8 @@ _mm_cvtepi16_epi64(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign-
-///    extended to 64-bit values.
+///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
+///    sign-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepi32_epi64(__m128i __V)
@@ -1367,7 +1348,7 @@ _mm_cvtepi32_epi64(__m128i __V)
 }
 
 /* SSE4 Packed Integer Zero-Extension.  */
-/// \brief Zero-extends each of the lower eight 8-bit integer elements of a
+/// Zero-extends each of the lower eight 8-bit integer elements of a
 ///    128-bit vector of [16 x i8] to 16-bit values and returns them in a
 ///    128-bit vector of [8 x i16]. The upper eight elements of the input vector
 ///    are unused.
@@ -1377,8 +1358,8 @@ _mm_cvtepi32_epi64(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero-
-///    extended to 16-bit values.
+///    A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are
+///    zero-extended to 16-bit values.
 /// \returns A 128-bit vector of [8 x i16] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi16(__m128i __V)
@@ -1386,7 +1367,7 @@ _mm_cvtepu8_epi16(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi);
 }
 
-/// \brief Zero-extends each of the lower four 8-bit integer elements of a
+/// Zero-extends each of the lower four 8-bit integer elements of a
 ///    128-bit vector of [16 x i8] to 32-bit values and returns them in a
 ///    128-bit vector of [4 x i32]. The upper twelve elements of the input
 ///    vector are unused.
@@ -1396,8 +1377,8 @@ _mm_cvtepu8_epi16(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero-
-///    extended to 32-bit values.
+///    A 128-bit vector of [16 x i8]. The lower four 8-bit elements are
+///    zero-extended to 32-bit values.
 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi32(__m128i __V)
@@ -1405,7 +1386,7 @@ _mm_cvtepu8_epi32(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si);
 }
 
-/// \brief Zero-extends each of the lower two 8-bit integer elements of a
+/// Zero-extends each of the lower two 8-bit integer elements of a
 ///    128-bit integer vector of [16 x i8] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper fourteen elements of the input
 ///    vector are unused.
@@ -1415,8 +1396,8 @@ _mm_cvtepu8_epi32(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero-
-///    extended to 64-bit values.
+///    A 128-bit vector of [16 x i8]. The lower two 8-bit elements are
+///    zero-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu8_epi64(__m128i __V)
@@ -1424,7 +1405,7 @@ _mm_cvtepu8_epi64(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di);
 }
 
-/// \brief Zero-extends each of the lower four 16-bit integer elements of a
+/// Zero-extends each of the lower four 16-bit integer elements of a
 ///    128-bit integer vector of [8 x i16] to 32-bit values and returns them in
 ///    a 128-bit vector of [4 x i32]. The upper four elements of the input
 ///    vector are unused.
@@ -1434,8 +1415,8 @@ _mm_cvtepu8_epi64(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero-
-///    extended to 32-bit values.
+///    A 128-bit vector of [8 x i16]. The lower four 16-bit elements are
+///    zero-extended to 32-bit values.
 /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi32(__m128i __V)
@@ -1443,7 +1424,7 @@ _mm_cvtepu16_epi32(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si);
 }
 
-/// \brief Zero-extends each of the lower two 16-bit integer elements of a
+/// Zero-extends each of the lower two 16-bit integer elements of a
 ///    128-bit integer vector of [8 x i16] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper six elements of the input vector
 ///    are unused.
@@ -1453,8 +1434,8 @@ _mm_cvtepu16_epi32(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero-
-///    extended to 64-bit values.
+///    A 128-bit vector of [8 x i16]. The lower two 16-bit elements are
+///    zero-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu16_epi64(__m128i __V)
@@ -1462,7 +1443,7 @@ _mm_cvtepu16_epi64(__m128i __V)
   return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di);
 }
 
-/// \brief Zero-extends each of the lower two 32-bit integer elements of a
+/// Zero-extends each of the lower two 32-bit integer elements of a
 ///    128-bit integer vector of [4 x i32] to 64-bit values and returns them in
 ///    a 128-bit vector of [2 x i64]. The upper two elements of the input vector
 ///    are unused.
@@ -1472,8 +1453,8 @@ _mm_cvtepu16_epi64(__m128i __V)
 /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction.
 ///
 /// \param __V
-///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero-
-///    extended to 64-bit values.
+///    A 128-bit vector of [4 x i32]. The lower two 32-bit elements are
+///    zero-extended to 64-bit values.
 /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values.
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_cvtepu32_epi64(__m128i __V)
@@ -1482,7 +1463,7 @@ _mm_cvtepu32_epi64(__m128i __V)
 }
 
 /* SSE4 Pack with Unsigned Saturation.  */
-/// \brief Converts 32-bit signed integers from both 128-bit integer vector
+/// Converts 32-bit signed integers from both 128-bit integer vector
 ///    operands into 16-bit unsigned integers, and returns the packed result.
 ///    Values greater than 0xFFFF are saturated to 0xFFFF. Values less than
 ///    0x0000 are saturated to 0x0000.
@@ -1511,7 +1492,7 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
 }
 
 /* SSE4 Multiple Packed Sums of Absolute Difference.  */
-/// \brief Subtracts 8-bit unsigned integer values and computes the absolute
+/// Subtracts 8-bit unsigned integer values and computes the absolute
 ///    values of the differences to the corresponding bits in the destination.
 ///    Then sums of the absolute differences are returned according to the bit
 ///    fields in the immediate operand.
@@ -1534,23 +1515,23 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2)
 ///    \code
 ///    // M2 represents bit 2 of the immediate operand
 ///    // M10 represents bits [1:0] of the immediate operand
-///    i = M2 * 4
-///    j = M10 * 4
+///    i = M2 * 4;
+///    j = M10 * 4;
 ///    for (k = 0; k < 8; k = k + 1) {
-///      d0 = abs(X[i + k + 0] - Y[j + 0])
-///      d1 = abs(X[i + k + 1] - Y[j + 1])
-///      d2 = abs(X[i + k + 2] - Y[j + 2])
-///      d3 = abs(X[i + k + 3] - Y[j + 3])
-///      r[k] = d0 + d1 + d2 + d3
+///      d0 = abs(X[i + k + 0] - Y[j + 0]);
+///      d1 = abs(X[i + k + 1] - Y[j + 1]);
+///      d2 = abs(X[i + k + 2] - Y[j + 2]);
+///      d3 = abs(X[i + k + 3] - Y[j + 3]);
+///      r[k] = d0 + d1 + d2 + d3;
 ///    }
 ///    \endcode
 /// \returns A 128-bit integer vector containing the sums of the sets of
 ///    absolute differences between both operands.
-#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \
+#define _mm_mpsadbw_epu8(X, Y, M) \
   (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \
-                                      (__v16qi)(__m128i)(Y), (M)); })
+                                      (__v16qi)(__m128i)(Y), (M))
 
-/// \brief Finds the minimum unsigned 16-bit element in the input 128-bit
+/// Finds the minimum unsigned 16-bit element in the input 128-bit
 ///    vector of [8 x u16] and returns it and along with its index.
 ///
 /// \headerfile <x86intrin.h>
@@ -1604,7 +1585,7 @@ _mm_minpos_epu16(__m128i __V)
 #define _SIDD_UNIT_MASK                 0x40
 
 /* SSE4.2 Packed Comparison Intrinsics.  */
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns a 128-bit integer vector representing the result
 ///    mask of the comparison.
@@ -1660,7 +1641,7 @@ _mm_minpos_epu16(__m128i __V)
   (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \
                                        (__v16qi)(__m128i)(B), (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns an integer representing the result index of the
 ///    comparison.
@@ -1714,7 +1695,7 @@ _mm_minpos_epu16(__m128i __V)
   (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \
                                    (__v16qi)(__m128i)(B), (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns a 128-bit integer vector representing the result
 ///    mask of the comparison.
@@ -1775,7 +1756,7 @@ _mm_minpos_epu16(__m128i __V)
                                        (__v16qi)(__m128i)(B), (int)(LB), \
                                        (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns an integer representing the result index of the
 ///    comparison.
@@ -1835,7 +1816,7 @@ _mm_minpos_epu16(__m128i __V)
                                    (int)(M))
 
 /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading.  */
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
 ///    string in \a B is the maximum, otherwise, returns 0.
@@ -1885,7 +1866,7 @@ _mm_minpos_epu16(__m128i __V)
   (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns
 ///    0.
@@ -1934,7 +1915,7 @@ _mm_minpos_epu16(__m128i __V)
   (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
 ///
@@ -1982,7 +1963,7 @@ _mm_minpos_epu16(__m128i __V)
   (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
 ///    the maximum, otherwise, returns 0.
@@ -2032,7 +2013,7 @@ _mm_minpos_epu16(__m128i __V)
   (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with implicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
 ///    the maximum, otherwise, returns 0.
@@ -2082,7 +2063,7 @@ _mm_minpos_epu16(__m128i __V)
   (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \
                                     (__v16qi)(__m128i)(B), (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the bit mask is zero and the length of the
 ///    string in \a B is the maximum, otherwise, returns 0.
@@ -2137,7 +2118,7 @@ _mm_minpos_epu16(__m128i __V)
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise,
 ///    returns 0.
@@ -2191,7 +2172,7 @@ _mm_minpos_epu16(__m128i __V)
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns bit 0 of the resulting bit mask.
 ///
@@ -2244,7 +2225,7 @@ _mm_minpos_epu16(__m128i __V)
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the length of the string in \a A is less than
 ///    the maximum, otherwise, returns 0.
@@ -2299,7 +2280,7 @@ _mm_minpos_epu16(__m128i __V)
                                     (__v16qi)(__m128i)(B), (int)(LB), \
                                     (int)(M))
 
-/// \brief Uses the immediate operand \a M to perform a comparison of string
+/// Uses the immediate operand \a M to perform a comparison of string
 ///    data with explicitly defined lengths that is contained in source operands
 ///    \a A and \a B. Returns 1 if the length of the string in \a B is less than
 ///    the maximum, otherwise, returns 0.
@@ -2354,7 +2335,7 @@ _mm_minpos_epu16(__m128i __V)
                                     (int)(M))
 
 /* SSE4.2 Compare Packed Data -- Greater Than.  */
-/// \brief Compares each of the corresponding 64-bit values of the 128-bit
+/// Compares each of the corresponding 64-bit values of the 128-bit
 ///    integer vectors to determine if the values in the first operand are
 ///    greater than those in the second operand.
 ///
@@ -2374,7 +2355,7 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2)
 }
 
 /* SSE4.2 Accumulate CRC32.  */
-/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
 ///    unsigned char operand.
 ///
 /// \headerfile <x86intrin.h>
@@ -2394,7 +2375,7 @@ _mm_crc32_u8(unsigned int __C, unsigned char __D)
   return __builtin_ia32_crc32qi(__C, __D);
 }
 
-/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
 ///    unsigned short operand.
 ///
 /// \headerfile <x86intrin.h>
@@ -2414,7 +2395,7 @@ _mm_crc32_u16(unsigned int __C, unsigned short __D)
   return __builtin_ia32_crc32hi(__C, __D);
 }
 
-/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of
+/// Adds the first unsigned integer operand to the CRC-32C checksum of
 ///    the second unsigned integer operand.
 ///
 /// \headerfile <x86intrin.h>
@@ -2435,7 +2416,7 @@ _mm_crc32_u32(unsigned int __C, unsigned int __D)
 }
 
 #ifdef __x86_64__
-/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the
+/// Adds the unsigned integer operand to the CRC-32C checksum of the
 ///    unsigned 64-bit integer operand.
 ///
 /// \headerfile <x86intrin.h>
@@ -2458,8 +2439,6 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D)
 
 #undef __DEFAULT_FN_ATTRS
 
-#ifdef __POPCNT__
 #include <popcntintrin.h>
-#endif
 
-#endif /* _SMMINTRIN_H */
+#endif /* __SMMINTRIN_H */