diff options
Diffstat (limited to 'lib/Headers/smmintrin.h')
-rw-r--r-- | lib/Headers/smmintrin.h | 345 |
1 files changed, 162 insertions, 183 deletions
diff --git a/lib/Headers/smmintrin.h b/lib/Headers/smmintrin.h index c2fa5a452bcec..4806b3e4e150d 100644 --- a/lib/Headers/smmintrin.h +++ b/lib/Headers/smmintrin.h @@ -21,13 +21,13 @@ *===-----------------------------------------------------------------------=== */ -#ifndef _SMMINTRIN_H -#define _SMMINTRIN_H +#ifndef __SMMINTRIN_H +#define __SMMINTRIN_H #include <tmmintrin.h> /* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128))) /* SSE4 Rounding macros. */ #define _MM_FROUND_TO_NEAREST_INT 0x00 @@ -46,7 +46,7 @@ #define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) #define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) -/// \brief Rounds up each element of the 128-bit vector of [4 x float] to an +/// Rounds up each element of the 128-bit vector of [4 x float] to an /// integer and returns the rounded values in a 128-bit vector of /// [4 x float]. /// @@ -63,7 +63,7 @@ /// \returns A 128-bit vector of [4 x float] containing the rounded values. #define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) -/// \brief Rounds up each element of the 128-bit vector of [2 x double] to an +/// Rounds up each element of the 128-bit vector of [2 x double] to an /// integer and returns the rounded values in a 128-bit vector of /// [2 x double]. /// @@ -80,7 +80,7 @@ /// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL) -/// \brief Copies three upper elements of the first 128-bit vector operand to +/// Copies three upper elements of the first 128-bit vector operand to /// the corresponding three upper elements of the 128-bit result vector of /// [4 x float]. 
Rounds up the lowest element of the second 128-bit vector /// operand to an integer and copies it to the lowest element of the 128-bit @@ -105,7 +105,7 @@ /// values. #define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL) -/// \brief Copies the upper element of the first 128-bit vector operand to the +/// Copies the upper element of the first 128-bit vector operand to the /// corresponding upper element of the 128-bit result vector of [2 x double]. /// Rounds up the lower element of the second 128-bit vector operand to an /// integer and copies it to the lower element of the 128-bit result vector @@ -130,7 +130,7 @@ /// values. #define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL) -/// \brief Rounds down each element of the 128-bit vector of [4 x float] to an +/// Rounds down each element of the 128-bit vector of [4 x float] to an /// an integer and returns the rounded values in a 128-bit vector of /// [4 x float]. /// @@ -147,7 +147,7 @@ /// \returns A 128-bit vector of [4 x float] containing the rounded values. #define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR) -/// \brief Rounds down each element of the 128-bit vector of [2 x double] to an +/// Rounds down each element of the 128-bit vector of [2 x double] to an /// integer and returns the rounded values in a 128-bit vector of /// [2 x double]. /// @@ -164,7 +164,7 @@ /// \returns A 128-bit vector of [2 x double] containing the rounded values. #define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR) -/// \brief Copies three upper elements of the first 128-bit vector operand to +/// Copies three upper elements of the first 128-bit vector operand to /// the corresponding three upper elements of the 128-bit result vector of /// [4 x float]. Rounds down the lowest element of the second 128-bit vector /// operand to an integer and copies it to the lowest element of the 128-bit @@ -189,7 +189,7 @@ /// values. 
#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR) -/// \brief Copies the upper element of the first 128-bit vector operand to the +/// Copies the upper element of the first 128-bit vector operand to the /// corresponding upper element of the 128-bit result vector of [2 x double]. /// Rounds down the lower element of the second 128-bit vector operand to an /// integer and copies it to the lower element of the 128-bit result vector @@ -214,7 +214,7 @@ /// values. #define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR) -/// \brief Rounds each element of the 128-bit vector of [4 x float] to an +/// Rounds each element of the 128-bit vector of [4 x float] to an /// integer value according to the rounding control specified by the second /// argument and returns the rounded values in a 128-bit vector of /// [4 x float]. @@ -244,10 +244,10 @@ /// 10: Upward (toward positive infinity) \n /// 11: Truncated /// \returns A 128-bit vector of [4 x float] containing the rounded values. -#define _mm_round_ps(X, M) __extension__ ({ \ - (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)); }) +#define _mm_round_ps(X, M) \ + (__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M)) -/// \brief Copies three upper elements of the first 128-bit vector operand to +/// Copies three upper elements of the first 128-bit vector operand to /// the corresponding three upper elements of the 128-bit result vector of /// [4 x float]. Rounds the lowest element of the second 128-bit vector /// operand to an integer value according to the rounding control specified @@ -285,11 +285,11 @@ /// 11: Truncated /// \returns A 128-bit vector of [4 x float] containing the copied and rounded /// values. 
-#define _mm_round_ss(X, Y, M) __extension__ ({ \ +#define _mm_round_ss(X, Y, M) \ (__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (M)); }) + (__v4sf)(__m128)(Y), (M)) -/// \brief Rounds each element of the 128-bit vector of [2 x double] to an +/// Rounds each element of the 128-bit vector of [2 x double] to an /// integer value according to the rounding control specified by the second /// argument and returns the rounded values in a 128-bit vector of /// [2 x double]. @@ -319,10 +319,10 @@ /// 10: Upward (toward positive infinity) \n /// 11: Truncated /// \returns A 128-bit vector of [2 x double] containing the rounded values. -#define _mm_round_pd(X, M) __extension__ ({ \ - (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)); }) +#define _mm_round_pd(X, M) \ + (__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M)) -/// \brief Copies the upper element of the first 128-bit vector operand to the +/// Copies the upper element of the first 128-bit vector operand to the /// corresponding upper element of the 128-bit result vector of [2 x double]. /// Rounds the lower element of the second 128-bit vector operand to an /// integer value according to the rounding control specified by the third @@ -360,12 +360,12 @@ /// 11: Truncated /// \returns A 128-bit vector of [2 x double] containing the copied and rounded /// values. -#define _mm_round_sd(X, Y, M) __extension__ ({ \ +#define _mm_round_sd(X, Y, M) \ (__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (M)); }) + (__v2df)(__m128d)(Y), (M)) /* SSE4 Packed Blending Intrinsics. */ -/// \brief Returns a 128-bit vector of [2 x double] where the values are +/// Returns a 128-bit vector of [2 x double] where the values are /// selected from either the first or second operand as specified by the /// third operand, the control mask. 
/// @@ -389,13 +389,11 @@ /// When a mask bit is 1, the corresponding 64-bit element in operand \a V2 /// is copied to the same position in the result. /// \returns A 128-bit vector of [2 x double] containing the copied values. -#define _mm_blend_pd(V1, V2, M) __extension__ ({ \ - (__m128d)__builtin_shufflevector((__v2df)(__m128d)(V1), \ - (__v2df)(__m128d)(V2), \ - (((M) & 0x01) ? 2 : 0), \ - (((M) & 0x02) ? 3 : 1)); }) +#define _mm_blend_pd(V1, V2, M) \ + (__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \ + (__v2df)(__m128d)(V2), (int)(M)) -/// \brief Returns a 128-bit vector of [4 x float] where the values are selected +/// Returns a 128-bit vector of [4 x float] where the values are selected /// from either the first or second operand as specified by the third /// operand, the control mask. /// @@ -419,14 +417,11 @@ /// When a mask bit is 1, the corresponding 32-bit element in operand \a V2 /// is copied to the same position in the result. /// \returns A 128-bit vector of [4 x float] containing the copied values. -#define _mm_blend_ps(V1, V2, M) __extension__ ({ \ - (__m128)__builtin_shufflevector((__v4sf)(__m128)(V1), (__v4sf)(__m128)(V2), \ - (((M) & 0x01) ? 4 : 0), \ - (((M) & 0x02) ? 5 : 1), \ - (((M) & 0x04) ? 6 : 2), \ - (((M) & 0x08) ? 7 : 3)); }) - -/// \brief Returns a 128-bit vector of [2 x double] where the values are +#define _mm_blend_ps(V1, V2, M) \ + (__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \ + (__v4sf)(__m128)(V2), (int)(M)) + +/// Returns a 128-bit vector of [2 x double] where the values are /// selected from either the first or second operand as specified by the /// third operand, the control mask. 
/// @@ -453,7 +448,7 @@ _mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) (__v2df)__M); } -/// \brief Returns a 128-bit vector of [4 x float] where the values are +/// Returns a 128-bit vector of [4 x float] where the values are /// selected from either the first or second operand as specified by the /// third operand, the control mask. /// @@ -480,7 +475,7 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) (__v4sf)__M); } -/// \brief Returns a 128-bit vector of [16 x i8] where the values are selected +/// Returns a 128-bit vector of [16 x i8] where the values are selected /// from either of the first or second operand as specified by the third /// operand, the control mask. /// @@ -493,7 +488,7 @@ _mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) /// \param __V2 /// A 128-bit vector of [16 x i8]. /// \param __M -/// A 128-bit vector operand, with mask bits 127, 119, 111 ... 7 specifying +/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying /// how the values are to be copied. The position of the mask bit corresponds /// to the most significant bit of a copied value. When a mask bit is 0, the /// corresponding 8-bit element in operand \a __V1 is copied to the same @@ -507,7 +502,7 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) (__v16qi)__M); } -/// \brief Returns a 128-bit vector of [8 x i16] where the values are selected +/// Returns a 128-bit vector of [8 x i16] where the values are selected /// from either of the first or second operand as specified by the third /// operand, the control mask. /// @@ -531,20 +526,12 @@ _mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) /// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 /// is copied to the same position in the result. /// \returns A 128-bit vector of [8 x i16] containing the copied values. 
-#define _mm_blend_epi16(V1, V2, M) __extension__ ({ \ - (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(V1), \ - (__v8hi)(__m128i)(V2), \ - (((M) & 0x01) ? 8 : 0), \ - (((M) & 0x02) ? 9 : 1), \ - (((M) & 0x04) ? 10 : 2), \ - (((M) & 0x08) ? 11 : 3), \ - (((M) & 0x10) ? 12 : 4), \ - (((M) & 0x20) ? 13 : 5), \ - (((M) & 0x40) ? 14 : 6), \ - (((M) & 0x80) ? 15 : 7)); }) +#define _mm_blend_epi16(V1, V2, M) \ + (__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \ + (__v8hi)(__m128i)(V2), (int)(M)) /* SSE4 Dword Multiply Instructions. */ -/// \brief Multiples corresponding elements of two 128-bit vectors of [4 x i32] +/// Multiples corresponding elements of two 128-bit vectors of [4 x i32] /// and returns the lower 32 bits of the each product in a 128-bit vector of /// [4 x i32]. /// @@ -563,7 +550,7 @@ _mm_mullo_epi32 (__m128i __V1, __m128i __V2) return (__m128i) ((__v4su)__V1 * (__v4su)__V2); } -/// \brief Multiplies corresponding even-indexed elements of two 128-bit +/// Multiplies corresponding even-indexed elements of two 128-bit /// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] /// containing the products. /// @@ -584,7 +571,7 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) } /* SSE4 Floating Point Dot Product Instructions. */ -/// \brief Computes the dot product of the two 128-bit vectors of [4 x float] +/// Computes the dot product of the two 128-bit vectors of [4 x float] /// and returns it in the elements of the 128-bit result vector of /// [4 x float]. /// @@ -616,11 +603,11 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) /// each [4 x float] subvector. If a bit is set, the dot product is returned /// in the corresponding element; otherwise that element is set to zero. /// \returns A 128-bit vector of [4 x float] containing the dot product. 
-#define _mm_dp_ps(X, Y, M) __extension__ ({ \ +#define _mm_dp_ps(X, Y, M) \ (__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (M)); }) + (__v4sf)(__m128)(Y), (M)) -/// \brief Computes the dot product of the two 128-bit vectors of [2 x double] +/// Computes the dot product of the two 128-bit vectors of [2 x double] /// and returns it in the elements of the 128-bit result vector of /// [2 x double]. /// @@ -648,15 +635,15 @@ _mm_mul_epi32 (__m128i __V1, __m128i __V2) /// input vectors are used as an input for dot product; otherwise that input /// is treated as zero. Bits [1:0] determine which elements of the result /// will receive a copy of the final dot product, with bit [0] corresponding -/// to the lowest element and bit [3] corresponding to the highest element of +/// to the lowest element and bit [1] corresponding to the highest element of /// each [2 x double] vector. If a bit is set, the dot product is returned in /// the corresponding element; otherwise that element is set to zero. -#define _mm_dp_pd(X, Y, M) __extension__ ({\ +#define _mm_dp_pd(X, Y, M) \ (__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (M)); }) + (__v2df)(__m128d)(Y), (M)) /* SSE4 Streaming Load Hint Instruction. */ -/// \brief Loads integer values from a 128-bit aligned memory location to a +/// Loads integer values from a 128-bit aligned memory location to a /// 128-bit integer vector. /// /// \headerfile <x86intrin.h> @@ -675,7 +662,7 @@ _mm_stream_load_si128 (__m128i const *__V) } /* SSE4 Packed Integer Min/Max Instructions. */ -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser /// of the two values. 
/// @@ -694,7 +681,7 @@ _mm_min_epi8 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the /// greater value of the two. /// @@ -713,7 +700,7 @@ _mm_max_epi8 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser /// value of the two. /// @@ -732,7 +719,7 @@ _mm_min_epu16 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the /// greater value of the two. /// @@ -751,7 +738,7 @@ _mm_max_epu16 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser /// value of the two. /// @@ -770,7 +757,7 @@ _mm_min_epi32 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the /// greater value of the two. 
/// @@ -789,7 +776,7 @@ _mm_max_epi32 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser /// value of the two. /// @@ -808,7 +795,7 @@ _mm_min_epu32 (__m128i __V1, __m128i __V2) return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); } -/// \brief Compares the corresponding elements of two 128-bit vectors of +/// Compares the corresponding elements of two 128-bit vectors of /// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the /// greater value of the two. /// @@ -828,7 +815,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) } /* SSE4 Insertion and Extraction from XMM Register Instructions. */ -/// \brief Takes the first argument \a X and inserts an element from the second +/// Takes the first argument \a X and inserts an element from the second /// argument \a Y as selected by the third argument \a N. That result then /// has elements zeroed out also as selected by the third argument \a N. The /// resulting 128-bit vector of [4 x float] is then returned. @@ -866,11 +853,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 11: Copies the selected bits from \a Y to result bits [127:96]. \n /// Bits[3:0]: If any of these bits are set, the corresponding result /// element is cleared. -/// \returns A 128-bit vector of [4 x float] containing the copied single- -/// precision floating point elements from the operands. +/// \returns A 128-bit vector of [4 x float] containing the copied +/// single-precision floating point elements from the operands. 
#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) -/// \brief Extracts a 32-bit integer from a 128-bit vector of [4 x float] and +/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and /// returns it, using the immediate value parameter \a N as a selector. /// /// \headerfile <x86intrin.h> @@ -893,15 +880,14 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 11: Bits [127:96] of parameter \a X are returned. /// \returns A 32-bit integer containing the extracted 32 bits of float data. #define _mm_extract_ps(X, N) (__extension__ \ - ({ union { int __i; float __f; } __t; \ - __v4sf __a = (__v4sf)(__m128)(X); \ - __t.__f = __a[(N) & 3]; \ - __t.__i;})) + ({ union { int __i; float __f; } __t; \ + __t.__f = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); \ + __t.__i;})) /* Miscellaneous insert and extract macros. */ /* Extract a single-precision float from X at index N into D. */ -#define _MM_EXTRACT_FLOAT(D, X, N) (__extension__ ({ __v4sf __a = (__v4sf)(X); \ - (D) = __a[N]; })) +#define _MM_EXTRACT_FLOAT(D, X, N) \ + { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } /* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create an index suitable for _mm_insert_ps. */ @@ -912,7 +898,7 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) /* Insert int into packed integer array at index. */ -/// \brief Constructs a 128-bit vector of [16 x i8] by first making a copy of +/// Constructs a 128-bit vector of [16 x i8] by first making a copy of /// the 128-bit integer vector parameter, and then inserting the lower 8 bits /// of an integer parameter \a I into an offset specified by the immediate /// value parameter \a N. @@ -952,12 +938,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 1110: Bits [119:112] of the result are used for insertion. \n /// 1111: Bits [127:120] of the result are used for insertion. 
/// \returns A 128-bit integer vector containing the constructed values. -#define _mm_insert_epi8(X, I, N) (__extension__ \ - ({ __v16qi __a = (__v16qi)(__m128i)(X); \ - __a[(N) & 15] = (I); \ - (__m128i)__a;})) +#define _mm_insert_epi8(X, I, N) \ + (__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \ + (int)(I), (int)(N)) -/// \brief Constructs a 128-bit vector of [4 x i32] by first making a copy of +/// Constructs a 128-bit vector of [4 x i32] by first making a copy of /// the 128-bit integer vector parameter, and then inserting the 32-bit /// integer parameter \a I at the offset specified by the immediate value /// parameter \a N. @@ -985,13 +970,12 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 10: Bits [95:64] of the result are used for insertion. \n /// 11: Bits [127:96] of the result are used for insertion. /// \returns A 128-bit integer vector containing the constructed values. -#define _mm_insert_epi32(X, I, N) (__extension__ \ - ({ __v4si __a = (__v4si)(__m128i)(X); \ - __a[(N) & 3] = (I); \ - (__m128i)__a;})) +#define _mm_insert_epi32(X, I, N) \ + (__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \ + (int)(I), (int)(N)) #ifdef __x86_64__ -/// \brief Constructs a 128-bit vector of [2 x i64] by first making a copy of +/// Constructs a 128-bit vector of [2 x i64] by first making a copy of /// the 128-bit integer vector parameter, and then inserting the 64-bit /// integer parameter \a I, using the immediate value parameter \a N as an /// insertion location selector. @@ -1017,16 +1001,15 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 0: Bits [63:0] of the result are used for insertion. \n /// 1: Bits [127:64] of the result are used for insertion. \n /// \returns A 128-bit integer vector containing the constructed values. 
-#define _mm_insert_epi64(X, I, N) (__extension__ \ - ({ __v2di __a = (__v2di)(__m128i)(X); \ - __a[(N) & 1] = (I); \ - (__m128i)__a;})) +#define _mm_insert_epi64(X, I, N) \ + (__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \ + (long long)(I), (int)(N)) #endif /* __x86_64__ */ /* Extract int from packed integer array at index. This returns the element * as a zero extended value, so it is unsigned. */ -/// \brief Extracts an 8-bit element from the 128-bit integer vector of +/// Extracts an 8-bit element from the 128-bit integer vector of /// [16 x i8], using the immediate value parameter \a N as a selector. /// /// \headerfile <x86intrin.h> @@ -1061,11 +1044,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// \returns An unsigned integer, whose lower 8 bits are selected from the /// 128-bit integer vector parameter and the remaining bits are assigned /// zeros. -#define _mm_extract_epi8(X, N) (__extension__ \ - ({ __v16qi __a = (__v16qi)(__m128i)(X); \ - (int)(unsigned char) __a[(N) & 15];})) +#define _mm_extract_epi8(X, N) \ + (int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ + (int)(N)) -/// \brief Extracts a 32-bit element from the 128-bit integer vector of +/// Extracts a 32-bit element from the 128-bit integer vector of /// [4 x i32], using the immediate value parameter \a N as a selector. /// /// \headerfile <x86intrin.h> @@ -1087,12 +1070,11 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 11: Bits [127:96] of the parameter \a X are exracted. /// \returns An integer, whose lower 32 bits are selected from the 128-bit /// integer vector parameter and the remaining bits are assigned zeros. 
-#define _mm_extract_epi32(X, N) (__extension__ \ - ({ __v4si __a = (__v4si)(__m128i)(X); \ - (int)__a[(N) & 3];})) +#define _mm_extract_epi32(X, N) \ + (int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N)) #ifdef __x86_64__ -/// \brief Extracts a 64-bit element from the 128-bit integer vector of +/// Extracts a 64-bit element from the 128-bit integer vector of /// [2 x i64], using the immediate value parameter \a N as a selector. /// /// \headerfile <x86intrin.h> @@ -1111,13 +1093,12 @@ _mm_max_epu32 (__m128i __V1, __m128i __V2) /// 0: Bits [63:0] are returned. \n /// 1: Bits [127:64] are returned. \n /// \returns A 64-bit integer. -#define _mm_extract_epi64(X, N) (__extension__ \ - ({ __v2di __a = (__v2di)(__m128i)(X); \ - (long long)__a[(N) & 1];})) +#define _mm_extract_epi64(X, N) \ + (long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N)) #endif /* __x86_64 */ /* SSE4 128-bit Packed Integer Comparisons. */ -/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// Tests whether the specified bits in a 128-bit integer vector are all /// zeros. /// /// \headerfile <x86intrin.h> @@ -1135,7 +1116,7 @@ _mm_testz_si128(__m128i __M, __m128i __V) return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); } -/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// Tests whether the specified bits in a 128-bit integer vector are all /// ones. /// /// \headerfile <x86intrin.h> @@ -1153,7 +1134,7 @@ _mm_testc_si128(__m128i __M, __m128i __V) return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); } -/// \brief Tests whether the specified bits in a 128-bit integer vector are +/// Tests whether the specified bits in a 128-bit integer vector are /// neither all zeros nor all ones. 
/// /// \headerfile <x86intrin.h> @@ -1172,7 +1153,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V) return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); } -/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// Tests whether the specified bits in a 128-bit integer vector are all /// ones. /// /// \headerfile <x86intrin.h> @@ -1189,7 +1170,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V) /// otherwise. #define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) -/// \brief Tests whether the specified bits in a 128-bit integer vector are +/// Tests whether the specified bits in a 128-bit integer vector are /// neither all zeros nor all ones. /// /// \headerfile <x86intrin.h> @@ -1208,7 +1189,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V) /// FALSE otherwise. #define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) -/// \brief Tests whether the specified bits in a 128-bit integer vector are all +/// Tests whether the specified bits in a 128-bit integer vector are all /// zeros. /// /// \headerfile <x86intrin.h> @@ -1227,7 +1208,7 @@ _mm_testnzc_si128(__m128i __M, __m128i __V) #define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) /* SSE4 64-bit Packed Integer Comparisons. */ -/// \brief Compares each of the corresponding 64-bit values of the 128-bit +/// Compares each of the corresponding 64-bit values of the 128-bit /// integer vectors for equality. /// /// \headerfile <x86intrin.h> @@ -1246,7 +1227,7 @@ _mm_cmpeq_epi64(__m128i __V1, __m128i __V2) } /* SSE4 Packed Integer Sign-Extension. */ -/// \brief Sign-extends each of the lower eight 8-bit integer elements of a +/// Sign-extends each of the lower eight 8-bit integer elements of a /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector /// are unused. 
@@ -1267,7 +1248,7 @@ _mm_cvtepi8_epi16(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); } -/// \brief Sign-extends each of the lower four 8-bit integer elements of a +/// Sign-extends each of the lower four 8-bit integer elements of a /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a /// 128-bit vector of [4 x i32]. The upper twelve elements of the input /// vector are unused. @@ -1277,8 +1258,8 @@ _mm_cvtepi8_epi16(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVSXBD / PMOVSXBD </c> instruction. /// /// \param __V -/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are sign- -/// extended to 32-bit values. +/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are +/// sign-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi32(__m128i __V) @@ -1288,7 +1269,7 @@ _mm_cvtepi8_epi32(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); } -/// \brief Sign-extends each of the lower two 8-bit integer elements of a +/// Sign-extends each of the lower two 8-bit integer elements of a /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input /// vector are unused. @@ -1298,8 +1279,8 @@ _mm_cvtepi8_epi32(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVSXBQ / PMOVSXBQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are sign- -/// extended to 64-bit values. +/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are +/// sign-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. 
static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi8_epi64(__m128i __V) @@ -1309,7 +1290,7 @@ _mm_cvtepi8_epi64(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); } -/// \brief Sign-extends each of the lower four 16-bit integer elements of a +/// Sign-extends each of the lower four 16-bit integer elements of a /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in /// a 128-bit vector of [4 x i32]. The upper four elements of the input /// vector are unused. @@ -1319,8 +1300,8 @@ _mm_cvtepi8_epi64(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVSXWD / PMOVSXWD </c> instruction. /// /// \param __V -/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are sign- -/// extended to 32-bit values. +/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are +/// sign-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi32(__m128i __V) @@ -1328,7 +1309,7 @@ _mm_cvtepi16_epi32(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); } -/// \brief Sign-extends each of the lower two 16-bit integer elements of a +/// Sign-extends each of the lower two 16-bit integer elements of a /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper six elements of the input /// vector are unused. @@ -1338,8 +1319,8 @@ _mm_cvtepi16_epi32(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVSXWQ / PMOVSXWQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are sign- -/// extended to 64-bit values. +/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are +/// sign-extended to 64-bit values. 
/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi16_epi64(__m128i __V) @@ -1347,7 +1328,7 @@ _mm_cvtepi16_epi64(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); } -/// \brief Sign-extends each of the lower two 32-bit integer elements of a +/// Sign-extends each of the lower two 32-bit integer elements of a /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector /// are unused. @@ -1357,8 +1338,8 @@ _mm_cvtepi16_epi64(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVSXDQ / PMOVSXDQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are sign- -/// extended to 64-bit values. +/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are +/// sign-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepi32_epi64(__m128i __V) @@ -1367,7 +1348,7 @@ _mm_cvtepi32_epi64(__m128i __V) } /* SSE4 Packed Integer Zero-Extension. */ -/// \brief Zero-extends each of the lower eight 8-bit integer elements of a +/// Zero-extends each of the lower eight 8-bit integer elements of a /// 128-bit vector of [16 x i8] to 16-bit values and returns them in a /// 128-bit vector of [8 x i16]. The upper eight elements of the input vector /// are unused. @@ -1377,8 +1358,8 @@ _mm_cvtepi32_epi64(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXBW / PMOVZXBW </c> instruction. /// /// \param __V -/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are zero- -/// extended to 16-bit values. +/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are +/// zero-extended to 16-bit values. 
/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi16(__m128i __V) @@ -1386,7 +1367,7 @@ _mm_cvtepu8_epi16(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); } -/// \brief Zero-extends each of the lower four 8-bit integer elements of a +/// Zero-extends each of the lower four 8-bit integer elements of a /// 128-bit vector of [16 x i8] to 32-bit values and returns them in a /// 128-bit vector of [4 x i32]. The upper twelve elements of the input /// vector are unused. @@ -1396,8 +1377,8 @@ _mm_cvtepu8_epi16(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXBD / PMOVZXBD </c> instruction. /// /// \param __V -/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are zero- -/// extended to 32-bit values. +/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are +/// zero-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi32(__m128i __V) @@ -1405,7 +1386,7 @@ _mm_cvtepu8_epi32(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); } -/// \brief Zero-extends each of the lower two 8-bit integer elements of a +/// Zero-extends each of the lower two 8-bit integer elements of a /// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input /// vector are unused. @@ -1415,8 +1396,8 @@ _mm_cvtepu8_epi32(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXBQ / PMOVZXBQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are zero- -/// extended to 64-bit values. +/// A 128-bit vector of [16 x i8]. 
The lower two 8-bit elements are +/// zero-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu8_epi64(__m128i __V) @@ -1424,7 +1405,7 @@ _mm_cvtepu8_epi64(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); } -/// \brief Zero-extends each of the lower four 16-bit integer elements of a +/// Zero-extends each of the lower four 16-bit integer elements of a /// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in /// a 128-bit vector of [4 x i32]. The upper four elements of the input /// vector are unused. @@ -1434,8 +1415,8 @@ _mm_cvtepu8_epi64(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXWD / PMOVZXWD </c> instruction. /// /// \param __V -/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are zero- -/// extended to 32-bit values. +/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are +/// zero-extended to 32-bit values. /// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi32(__m128i __V) @@ -1443,7 +1424,7 @@ _mm_cvtepu16_epi32(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); } -/// \brief Zero-extends each of the lower two 16-bit integer elements of a +/// Zero-extends each of the lower two 16-bit integer elements of a /// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper six elements of the input vector /// are unused. @@ -1453,8 +1434,8 @@ _mm_cvtepu16_epi32(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXWQ / PMOVZXWQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are zero- -/// extended to 64-bit values. 
+/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are +/// zero-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu16_epi64(__m128i __V) @@ -1462,7 +1443,7 @@ _mm_cvtepu16_epi64(__m128i __V) return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); } -/// \brief Zero-extends each of the lower two 32-bit integer elements of a +/// Zero-extends each of the lower two 32-bit integer elements of a /// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in /// a 128-bit vector of [2 x i64]. The upper two elements of the input vector /// are unused. @@ -1472,8 +1453,8 @@ _mm_cvtepu16_epi64(__m128i __V) /// This intrinsic corresponds to the <c> VPMOVZXDQ / PMOVZXDQ </c> instruction. /// /// \param __V -/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are zero- -/// extended to 64-bit values. +/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are +/// zero-extended to 64-bit values. /// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_cvtepu32_epi64(__m128i __V) @@ -1482,7 +1463,7 @@ _mm_cvtepu32_epi64(__m128i __V) } /* SSE4 Pack with Unsigned Saturation. */ -/// \brief Converts 32-bit signed integers from both 128-bit integer vector +/// Converts 32-bit signed integers from both 128-bit integer vector /// operands into 16-bit unsigned integers, and returns the packed result. /// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than /// 0x0000 are saturated to 0x0000. @@ -1511,7 +1492,7 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2) } /* SSE4 Multiple Packed Sums of Absolute Difference. 
*/ -/// \brief Subtracts 8-bit unsigned integer values and computes the absolute +/// Subtracts 8-bit unsigned integer values and computes the absolute /// values of the differences to the corresponding bits in the destination. /// Then sums of the absolute differences are returned according to the bit /// fields in the immediate operand. @@ -1534,23 +1515,23 @@ _mm_packus_epi32(__m128i __V1, __m128i __V2) /// \code /// // M2 represents bit 2 of the immediate operand /// // M10 represents bits [1:0] of the immediate operand -/// i = M2 * 4 -/// j = M10 * 4 +/// i = M2 * 4; +/// j = M10 * 4; /// for (k = 0; k < 8; k = k + 1) { -/// d0 = abs(X[i + k + 0] - Y[j + 0]) -/// d1 = abs(X[i + k + 1] - Y[j + 1]) -/// d2 = abs(X[i + k + 2] - Y[j + 2]) -/// d3 = abs(X[i + k + 3] - Y[j + 3]) -/// r[k] = d0 + d1 + d2 + d3 +/// d0 = abs(X[i + k + 0] - Y[j + 0]); +/// d1 = abs(X[i + k + 1] - Y[j + 1]); +/// d2 = abs(X[i + k + 2] - Y[j + 2]); +/// d3 = abs(X[i + k + 3] - Y[j + 3]); +/// r[k] = d0 + d1 + d2 + d3; /// } /// \endcode /// \returns A 128-bit integer vector containing the sums of the sets of /// absolute differences between both operands. -#define _mm_mpsadbw_epu8(X, Y, M) __extension__ ({ \ +#define _mm_mpsadbw_epu8(X, Y, M) \ (__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ - (__v16qi)(__m128i)(Y), (M)); }) + (__v16qi)(__m128i)(Y), (M)) -/// \brief Finds the minimum unsigned 16-bit element in the input 128-bit +/// Finds the minimum unsigned 16-bit element in the input 128-bit /// vector of [8 x u16] and returns it along with its index. /// /// \headerfile <x86intrin.h> @@ -1604,7 +1585,7 @@ _mm_minpos_epu16(__m128i __V) #define _SIDD_UNIT_MASK 0x40 /* SSE4.2 Packed Comparison Intrinsics. */ -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B.
Returns a 128-bit integer vector representing the result /// mask of the comparison. @@ -1660,7 +1641,7 @@ _mm_minpos_epu16(__m128i __V) (__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns an integer representing the result index of the /// comparison. @@ -1714,7 +1695,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns a 128-bit integer vector representing the result /// mask of the comparison. @@ -1775,7 +1756,7 @@ _mm_minpos_epu16(__m128i __V) (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns an integer representing the result index of the /// comparison. @@ -1835,7 +1816,7 @@ _mm_minpos_epu16(__m128i __V) (int)(M)) /* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the bit mask is zero and the length of the /// string in \a B is the maximum, otherwise, returns 0. 
@@ -1885,7 +1866,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns /// 0. @@ -1934,7 +1915,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns bit 0 of the resulting bit mask. /// @@ -1982,7 +1963,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the length of the string in \a A is less than /// the maximum, otherwise, returns 0. @@ -2032,7 +2013,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with implicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the length of the string in \a B is less than /// the maximum, otherwise, returns 0. 
@@ -2082,7 +2063,7 @@ _mm_minpos_epu16(__m128i __V) (int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \ (__v16qi)(__m128i)(B), (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the bit mask is zero and the length of the /// string in \a B is the maximum, otherwise, returns 0. @@ -2137,7 +2118,7 @@ _mm_minpos_epu16(__m128i __V) (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise, /// returns 0. @@ -2191,7 +2172,7 @@ _mm_minpos_epu16(__m128i __V) (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns bit 0 of the resulting bit mask. /// @@ -2244,7 +2225,7 @@ _mm_minpos_epu16(__m128i __V) (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the length of the string in \a A is less than /// the maximum, otherwise, returns 0. 
@@ -2299,7 +2280,7 @@ _mm_minpos_epu16(__m128i __V) (__v16qi)(__m128i)(B), (int)(LB), \ (int)(M)) -/// \brief Uses the immediate operand \a M to perform a comparison of string +/// Uses the immediate operand \a M to perform a comparison of string /// data with explicitly defined lengths that is contained in source operands /// \a A and \a B. Returns 1 if the length of the string in \a B is less than /// the maximum, otherwise, returns 0. @@ -2354,7 +2335,7 @@ _mm_minpos_epu16(__m128i __V) (int)(M)) /* SSE4.2 Compare Packed Data -- Greater Than. */ -/// \brief Compares each of the corresponding 64-bit values of the 128-bit +/// Compares each of the corresponding 64-bit values of the 128-bit /// integer vectors to determine if the values in the first operand are /// greater than those in the second operand. /// @@ -2374,7 +2355,7 @@ _mm_cmpgt_epi64(__m128i __V1, __m128i __V2) } /* SSE4.2 Accumulate CRC32. */ -/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// Adds the unsigned integer operand to the CRC-32C checksum of the /// unsigned char operand. /// /// \headerfile <x86intrin.h> @@ -2394,7 +2375,7 @@ _mm_crc32_u8(unsigned int __C, unsigned char __D) return __builtin_ia32_crc32qi(__C, __D); } -/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// Adds the unsigned integer operand to the CRC-32C checksum of the /// unsigned short operand. /// /// \headerfile <x86intrin.h> @@ -2414,7 +2395,7 @@ _mm_crc32_u16(unsigned int __C, unsigned short __D) return __builtin_ia32_crc32hi(__C, __D); } -/// \brief Adds the first unsigned integer operand to the CRC-32C checksum of +/// Adds the first unsigned integer operand to the CRC-32C checksum of /// the second unsigned integer operand. 
/// /// \headerfile <x86intrin.h> @@ -2435,7 +2416,7 @@ _mm_crc32_u32(unsigned int __C, unsigned int __D) } #ifdef __x86_64__ -/// \brief Adds the unsigned integer operand to the CRC-32C checksum of the +/// Adds the unsigned integer operand to the CRC-32C checksum of the /// unsigned 64-bit integer operand. /// /// \headerfile <x86intrin.h> @@ -2458,8 +2439,6 @@ _mm_crc32_u64(unsigned long long __C, unsigned long long __D) #undef __DEFAULT_FN_ATTRS -#ifdef __POPCNT__ #include <popcntintrin.h> -#endif -#endif /* _SMMINTRIN_H */ +#endif /* __SMMINTRIN_H */ |